PiperOrigin-RevId: 509849832pull/2891/merge
| @ -0,0 +1,71 @@ | |||
| /* | |||
| * Copyright (C) 2022 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata; | |||
| import static com.google.common.base.CharMatcher.whitespace; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static java.lang.Integer.parseUnsignedInt; | |||
| import com.google.common.base.CharMatcher; | |||
| import com.google.common.base.Splitter; | |||
| import com.google.common.collect.ContiguousSet; | |||
| import com.google.common.collect.ImmutableSortedSet; | |||
| import java.util.List; | |||
| import java.util.NavigableSet; | |||
| import java.util.TreeSet; | |||
| /** Parses strings of form "4,7-9,11" which are used as length specifiers across LPN metadata */ | |||
| public final class LengthsParser { | |||
| private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults(whitespace()); | |||
| private static final Splitter RANGE_SPLITTER = | |||
| Splitter.on('-').trimResults(whitespace()).limit(2); | |||
| private static final CharMatcher ALLOWED_CHARACTERS = | |||
| CharMatcher.inRange('0', '9').or(CharMatcher.anyOf("-,")).or(whitespace()); | |||
| /** Returns the set of integers specified by this string. */ | |||
| public static ImmutableSortedSet<Integer> parseLengths(String s) { | |||
| checkArgument( | |||
| ALLOWED_CHARACTERS.matchesAllOf(s), | |||
| "Length specifier contains forbidden characters: %s", | |||
| s); | |||
| NavigableSet<Integer> lengths = new TreeSet<>(); | |||
| for (String lengthOrRange : COMMA_SPLITTER.split(s)) { | |||
| if (lengthOrRange.contains("-")) { | |||
| List<String> lohi = RANGE_SPLITTER.splitToList(lengthOrRange); | |||
| int lo = parseUnsignedInt(lohi.get(0)); | |||
| int hi = parseUnsignedInt(lohi.get(1)); | |||
| checkArgument(lo < hi, "Invalid range: %s-%s", lo, hi); | |||
| checkArgument( | |||
| lengths.isEmpty() || lo > lengths.last(), | |||
| "Numbers in length specifier are out of order: %s", | |||
| s); | |||
| lengths.addAll(ContiguousSet.closed(lo, hi)); | |||
| } else { | |||
| int length = parseUnsignedInt(lengthOrRange); | |||
| checkArgument( | |||
| lengths.isEmpty() || length > lengths.last(), | |||
| "Numbers in length specifier are out of order: %s", | |||
| s); | |||
| lengths.add(length); | |||
| } | |||
| } | |||
| return ImmutableSortedSet.copyOf(lengths); | |||
| } | |||
| private LengthsParser() {} | |||
| } | |||
| @ -0,0 +1,317 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode.State; | |||
| /** | |||
| * Matches phone number regular expressions based on compact compiled data generated by | |||
| * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler | |||
| * MatcherCompiler}. Typically the phone number regular expression will be compiled at build time | |||
| * and the resulting matcher data will be packaged into the binary which needs it, or downloaded at | |||
| * run time. | |||
| * <p> | |||
| * This class is designed to be lightweight and fast, and should be simple to implement in many | |||
| * different languages (C++, Python, JS, etc.). | |||
| * | |||
| * TODO: Consider UnisgnedBytes.toInt(x) to avoid lots of (x & 0xFF). | |||
| */ | |||
| public abstract class DigitSequenceMatcher { | |||
| /** Possible result types returned by a match operation. */ | |||
| public enum Result { | |||
| /** The match operation was a success and the input was matched. */ | |||
| MATCHED, | |||
| /** The match operation failed because unexpected input was encountered. */ | |||
| INVALID, | |||
| /** | |||
| * The match operation failed because the input terminated too soon (ie, the input was a | |||
| * valid prefix for the matcher). | |||
| */ | |||
| TOO_SHORT, | |||
| /** | |||
| * The match operation failed due to the existence of additional input after matching had | |||
| * completed (ie, the the input would have matched if it were shorter). | |||
| */ | |||
| TOO_LONG; | |||
| } | |||
| /** An iterator of {@code int}, used to supply the matcher with a sequence of input digits. */ | |||
| public interface DigitSequence { | |||
| /** Returns true if there are more digits available. */ | |||
| boolean hasNext(); | |||
| /** | |||
| * Return the next digit value (from 0 to 9 inclusive, not a char value). The matcher does not | |||
| * test for invalid digits, so returning values outside this range will have undefined results, | |||
| * including false positive results. | |||
| */ | |||
| int next(); | |||
| } | |||
| /** Internal abstraction to allow matching over either byte arrays or strings. */ | |||
| interface DataView { | |||
| /** Return the unsigned byte value at the given offset from the current position. */ | |||
| int peekByte(int offset); | |||
| /** Return the unsigned byte value at the current position and move ahead 1 byte. */ | |||
| int readByte(); | |||
| /** Return the unsigned short value at the current position and move ahead 2 bytes. */ | |||
| int readShort(); | |||
| /** Return the unsigned int value at the current position and move ahead 4 bytes. */ | |||
| int readInt(); | |||
| /** Adjust the current position by the given (non-negative) offset. */ | |||
| State branch(int offset); | |||
| /** | |||
| * Adjust the current position by the unsigned byte offset value read from the current | |||
| * position plus the given index. This is used to implement maps and branching ranges. | |||
| */ | |||
| State jumpTable(int index); | |||
| } | |||
| /** | |||
| * Creates a new matcher which reads instructions directly from the given byte array. Typically | |||
| * it is expected that this method will consume byte arrays packaged into a binary at build time | |||
| * (the MatcherCompiler is not suitable for direct parsing of regular expressions at run time). | |||
| * <p> | |||
| * See {@code MatcherCompiler.compile(...)}. | |||
| */ | |||
| public static DigitSequenceMatcher create(byte[] data) { | |||
| if (data.length == 0) { | |||
| throw new IllegalArgumentException("matcher data cannot be empty"); | |||
| } | |||
| return new ByteArrayMatcher(data); | |||
| } | |||
| /** | |||
| * Creates a new matcher which reads instructions from the given string. Typically it is expected | |||
| * that this method will be used when matcher data is packaged as literal Java string constants | |||
| * in (auto-generated) source files. | |||
| * <p> | |||
| * See {@code MatcherCompiler.compileToUnquotedJavaSourceString(...)}. | |||
| */ | |||
| public static DigitSequenceMatcher create(String data) { | |||
| if (data.isEmpty()) { | |||
| throw new IllegalArgumentException("matcher data cannot be empty"); | |||
| } | |||
| return new StringMatcher(data); | |||
| } | |||
| abstract DataView newDataView(); | |||
| abstract int size(); | |||
| /** Matches the input against this matcher, returning a result code. */ | |||
| public Result match(DigitSequence in) { | |||
| State state = runMatcher(in); | |||
| switch (state) { | |||
| case TERMINAL: | |||
| return !in.hasNext() ? Result.MATCHED : Result.TOO_LONG; | |||
| case TRUNCATED: | |||
| return Result.TOO_SHORT; | |||
| case INVALID: | |||
| return Result.INVALID; | |||
| default: | |||
| throw new AssertionError("unexpected state: " + state); | |||
| } | |||
| } | |||
| private State runMatcher(DigitSequence in) { | |||
| DataView data = newDataView(); | |||
| State state; | |||
| do { | |||
| state = OpCode.decode(data.peekByte(0)).execute(data, in); | |||
| } while (state == State.CONTINUE); | |||
| return state; | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| int size = size(); | |||
| StringBuilder out = new StringBuilder(size + " :: [ "); | |||
| DataView data = newDataView(); | |||
| while (size-- > 0) { | |||
| out.append(Integer.toHexString(data.readByte())).append(", "); | |||
| } | |||
| out.setLength(out.length() - 2); | |||
| out.append(" ]"); | |||
| return out.toString(); | |||
| } | |||
| /** A matcher for reading instructions from a byte array. */ | |||
| private static final class ByteArrayMatcher extends DigitSequenceMatcher { | |||
| private class ByteArrayData implements DataView { | |||
| int position = 0; | |||
| @Override public int peekByte(int offset) { | |||
| return bytes[position + offset] & 0xFF; | |||
| } | |||
| @Override public int readByte() { | |||
| return bytes[position++] & 0xFF; | |||
| } | |||
| @Override public int readShort() { | |||
| return (readByte() << 8) | readByte(); | |||
| } | |||
| @Override public int readInt() { | |||
| return (readShort() << 16) | readShort(); | |||
| } | |||
| @Override public State branch(int offset) { | |||
| position += offset; | |||
| return offset != 0 ? State.CONTINUE : State.TERMINAL; | |||
| } | |||
| @Override public State jumpTable(int index) { | |||
| return branch(peekByte(index)); | |||
| } | |||
| } | |||
| private final byte[] bytes; | |||
| private ByteArrayMatcher(byte[] data) { | |||
| this.bytes = data; | |||
| } | |||
| @Override | |||
| DataView newDataView() { | |||
| return new ByteArrayData(); | |||
| } | |||
| @Override | |||
| int size() { | |||
| return bytes.length; | |||
| } | |||
| } | |||
| /** A matcher for reading instructions from a String. */ | |||
| private static final class StringMatcher extends DigitSequenceMatcher { | |||
| /* | |||
| * Note: Using unsigned shift "x >>> 1" is more likely to be free as part of a data load | |||
| * instruction than "x / 2". | |||
| */ | |||
| private class StringData implements DataView { | |||
| int position = 0; | |||
| @Override public int peekByte(int offset) { | |||
| offset += position; | |||
| int data = bytes.charAt(offset >>> 1); | |||
| // char := hi [ even-byte | odd-byte ] lo | |||
| return (offset & 1) != 0 ? data & 0xFF : data >>> 8; | |||
| } | |||
| @Override public int readByte() { | |||
| int data = bytes.charAt(position >>> 1); | |||
| // char := hi [ even-byte | odd-byte ] lo | |||
| data = (position & 1) != 0 ? data & 0xFF : data >>> 8; | |||
| position += 1; | |||
| return data; | |||
| } | |||
| @Override public int readShort() { | |||
| int data = bytes.charAt(position >>> 1); | |||
| // Adding 2 early does not affect odd/even (but does reference next char). | |||
| position += 2; | |||
| if ((position & 1) != 0) { | |||
| data = ((data & 0xFF) << 8) | (bytes.charAt(position >>> 1) >>> 8); | |||
| } | |||
| return data; | |||
| } | |||
| @Override public int readInt() { | |||
| return (readShort() << 16) | readShort(); | |||
| } | |||
| @Override public State branch(int offset) { | |||
| position += offset; | |||
| return offset != 0 ? State.CONTINUE : State.TERMINAL; | |||
| } | |||
| @Override public State jumpTable(int index) { | |||
| return branch(peekByte(index)); | |||
| } | |||
| } | |||
| private final String bytes; | |||
| private StringMatcher(String bytes) { | |||
| this.bytes = bytes; | |||
| } | |||
| @Override | |||
| DataView newDataView() { | |||
| return new StringData(); | |||
| } | |||
| @Override | |||
| int size() { | |||
| int size = 2 * bytes.length(); | |||
| if ((bytes.charAt(bytes.length() - 1) & 0xFF) == 0xFF) { | |||
| size -= 1; | |||
| } | |||
| return size; | |||
| } | |||
| } | |||
| /** An iterator of {@code int} that yields a sequence of input digits from a string. */ | |||
| private static final class StringDigits implements DigitSequence { | |||
| private final CharSequence number; | |||
| private int n = 0; | |||
| private StringDigits(CharSequence number) { | |||
| this.number = number; | |||
| } | |||
| @Override public int next() { | |||
| if (n < 0 || n >= number.length()) { | |||
| throw new IndexOutOfBoundsException( | |||
| "index '" + n + "' out of bounds for input: " + number); | |||
| } | |||
| char c = number.charAt(n); | |||
| if (c < '0' || c > '9') { | |||
| throw new IllegalArgumentException( | |||
| "non-digit character '" + c + "' [" + ((int) c) + "] at index " + n + " in: " + number); | |||
| } | |||
| n++; | |||
| return c - '0'; | |||
| } | |||
| @Override public boolean hasNext() { | |||
| return n < number.length(); | |||
| } | |||
| } | |||
| /** | |||
| * Returns an instance of DigitSequence based on the input string. The input string may only | |||
| * contain digits. | |||
| */ | |||
| public static DigitSequence digitsFromString(CharSequence number) { | |||
| return new StringDigits(number); | |||
| } | |||
| /** A matcher has no internal state and is just a factory for data specific implementations. */ | |||
| private DigitSequenceMatcher() { } | |||
| } | |||
| @ -0,0 +1,262 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DataView; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence; | |||
| /** | |||
| * Implementation of instructions for the phone number matcher state machine. | |||
| * <p> | |||
| * <h3>Jump Tables</h3> | |||
| * | |||
| * Several instructions use a "jump table" concept which is simply a contiguous region of bytes | |||
| * containing offsets from which a new position is calculated. The new position is the current | |||
| * position (at the start of the jump table) plus the value of the chosen jump offset. | |||
| * | |||
| * <pre>{@code | |||
| * [ ... | JUMP_0 | JUMP_1 | ... | JUMP_N | ... | DEST | ... | |||
| * position --^ ^ ^ | |||
| * `---index ---' | | |||
| * offset `---------------- [ position + index ] -----' | |||
| * | |||
| * position = position + unsignedByteValueAt(position + index) | |||
| * }</pre> | |||
| * | |||
| * A jump offset of zero signifies that the state jumped to is terminal (this avoids having to jump | |||
| * to a termination byte). A jump table will always occur immediately after an associated | |||
| * instruction and the instruction's stated size includes the number of bytes in the jump table. | |||
| */ | |||
| public enum OpCode { | |||
| /** | |||
| * Jumps ahead by between 1 and 4095 bytes from the end of this opcode. This opcode does not | |||
| * consume any input. | |||
| * <p> | |||
| * This is a variable length instruction, taking one byte for offsets up to 15 and (if EXT is set) | |||
| * two bytes for larger offsets up to 4095. The jump offset signifies how many bytes to skip after | |||
| * this instruction. | |||
| * <p> | |||
| * As a special case, a single byte branch with a jump offset of zero (represented by a single | |||
| * zero byte) can be used to signify that the current state is terminal and the state machine | |||
| * should exit (a zero jump offset never makes sense in any instruction). | |||
| * | |||
| * <pre>{@code | |||
| * [ 0 | 0 | JUMP ] | |||
| * [ 0 | 1 | JUMP | EXT_JUMP ] | |||
| * <3>.<1>.<-- 4 -->.<---- 8 ----> | |||
| * }</pre> | |||
| */ | |||
| BRANCH(0) { | |||
| @Override | |||
| State execute(DataView data, DigitSequence ignored) { | |||
| int op = data.readByte(); | |||
| int offset = op & 0xF; | |||
| if ((op & (1 << 4)) != 0) { | |||
| offset = (offset << 8) + data.readByte(); | |||
| } | |||
| return data.branch(offset); | |||
| } | |||
| }, | |||
| /** | |||
| * Accepts a single input (and transition to a single state). Inputs not matching "VAL" are | |||
| * invalid from the current state. If "TRM" is set then the state being transitioned from may | |||
| * terminate. | |||
| * | |||
| * <pre>{@code | |||
| * [ 1 |TRM| VAL ] | |||
| * <3>.<1>.<- 4 -> | |||
| * }</pre> | |||
| */ | |||
| SINGLE(1) { | |||
| @Override | |||
| State execute(DataView data, DigitSequence in) { | |||
| int op = data.readByte(); | |||
| if (!in.hasNext()) { | |||
| return ((op & (1 << 4)) != 0) ? State.TERMINAL : State.TRUNCATED; | |||
| } | |||
| int n = in.next(); | |||
| return ((op & 0xF) == n) ? State.CONTINUE : State.INVALID; | |||
| } | |||
| }, | |||
| /** | |||
| * Accept any input to transition to a single state one or more times. | |||
| * <p> | |||
| * If "TRM" is set then every state that is transitioned from may terminate. | |||
| * | |||
| * <pre>{@code | |||
| * [ 2 |TRM| NUM-1 ] | |||
| * <3>.<1>.<- 4 -> | |||
| * }</pre> | |||
| */ | |||
| ANY(2) { | |||
| @Override | |||
| State execute(DataView data, DigitSequence in) { | |||
| int op = data.readByte(); | |||
| int num = (op & 0xF) + 1; | |||
| boolean isTerminating = (op & (1 << 4)) != 0; | |||
| while (num-- > 0) { | |||
| if (!in.hasNext()) { | |||
| return isTerminating ? State.TERMINAL : State.TRUNCATED; | |||
| } | |||
| in.next(); | |||
| } | |||
| return State.CONTINUE; | |||
| } | |||
| }, | |||
| /** | |||
| * Accepts multiple inputs to transition to one or two states. The bit-set has the Nth bit set if | |||
| * we should accept digit N (bit-0 is the lowest bit of the 2 byte form of the instruction). | |||
| * <p> | |||
| * This is a variable length instruction which either treats non-matched inputs as invalid | |||
| * (2 byte form) or branches to one of two states via a 2-entry jump table (4 byte form). | |||
| * <p> | |||
| * If "TRM" is set then the state being transitioned from may terminate. | |||
| * | |||
| * <pre>{@code | |||
| * [ 3 |TRM| 0 |---| BIT SET ] | |||
| * [ 3 |TRM| 1 |---| BIT SET | JUMP_IN | JUMP_OUT ] | |||
| * <3>.<1>.<1>.<1>.<--- 10 --->.<--- 8 --->.<--- 8 ---> | |||
| * }</pre> | |||
| */ | |||
| RANGE(3) { | |||
| @Override | |||
| State execute(DataView data, DigitSequence in) { | |||
| int op = data.readShort(); | |||
| if (!in.hasNext()) { | |||
| return ((op & (1 << 12)) != 0) ? State.TERMINAL : State.TRUNCATED; | |||
| } | |||
| int n = in.next(); | |||
| if ((op & (1 << 11)) == 0) { | |||
| // 2 byte form, non-matched input is invalid. | |||
| return ((op & (1 << n)) != 0) ? State.CONTINUE : State.INVALID; | |||
| } | |||
| // 4 byte form uses jump table (use bitwise negation so a set bit becomes a 0 index). | |||
| return data.jumpTable((~op >>> n) & 1); | |||
| } | |||
| }, | |||
| /** | |||
| * Accept multiple inputs to transition to between one and ten states via jump offsets. Inputs | |||
| * not encoded in "CODED MAP" are invalid from the current state. | |||
| * <p> | |||
| * Because there is no room for a termination bit in this instruction, there is an alternate | |||
| * version, {@code TMAP}, which should be used when transitioning from a terminating state. | |||
| * <p> | |||
| * TODO: Figure out if we can save one bit here and merge MAP and TMAP. | |||
| * | |||
| * <pre>{@code | |||
| * [ 4 | CODED MAP | JUMP_1 | ... | JUMP_N ] | |||
| * <3>.<-------- 29 -------->.<--- 8 --->. ... .<--- 8 ---> | |||
| * }</pre> | |||
| */ | |||
| MAP(4) { | |||
| @Override | |||
| State execute(DataView data, DigitSequence in) { | |||
| return map(data, in, State.TRUNCATED); | |||
| } | |||
| }, | |||
| /** | |||
| * Like {@code MAP} but transitions from a terminating state. | |||
| */ | |||
| TMAP(5) { | |||
| @Override | |||
| State execute(DataView data, DigitSequence in) { | |||
| return map(data, in, State.TERMINAL); | |||
| } | |||
| }; | |||
| /** The types of states that the state-machine can be in. */ | |||
| public enum State { | |||
| CONTINUE, TERMINAL, INVALID, TRUNCATED; | |||
| } | |||
| private static final OpCode[] VALUES = values(); | |||
| /** | |||
| * Encode maps as 29 bits where each digit takes a different number of bits to encode its offset. | |||
| * Specifically: | |||
| * <ul> | |||
| * <li>The first entry (matching 0) has only two possible values (it is either not present or maps | |||
| * to the first entry in the jump table), so takes only 1 bit. | |||
| * <li>The second entry (matching 1) has three possible values (not present or maps to either the | |||
| * first or second entry in the jump table), so it takes 2 bits. | |||
| * <li>In general the entry matching digit N has (N+1) possible states and takes log2(N+1) bits. | |||
| * </ul> | |||
| */ | |||
| private static final long MAP_SHIFT_BITS = 0L << 0 | // 1 bit (1x, mask=1) | |||
| 1L << 5 | 3L << 10 | // 2 bits (2x, mask=3) | |||
| 5L << 15 | 8L << 20 | 11L << 25 | 14L << 30 | // 3 bits (4x, mask=7) | |||
| 17L << 35 | 21L << 40 | 25L << 45; // 4 bits (3x, mask=F) | |||
| /** | |||
| * A table of values with which to mask the coded jump table map, after shifting it. Each nibble | |||
| * is a mask of up to 4 bits to extract the encoded index from a map instruction after it has | |||
| * been shifted. | |||
| */ | |||
| private static final long MAP_MASK_BITS = 0xFFF7777331L; | |||
| /** | |||
| * Returns the number of bits we must shift the coded jump table map for a digit with value | |||
| * {@code n} such that the jump index is in the lowest bits. | |||
| */ | |||
| public static int getMapShift(int n) { | |||
| return (int) (MAP_SHIFT_BITS >>> (5 * n)) & 0x1F; | |||
| } | |||
| /** | |||
| * Returns a mask we must apply to the shifted jump table map to extract only the jump index from | |||
| * the lowest bits. | |||
| */ | |||
| public static int getMapMask(int n) { | |||
| return (int) (MAP_MASK_BITS >>> (4 * n)) & 0xF; | |||
| } | |||
| /** | |||
| * Executes a map instruction by decoding the map data and selecting a jump offset to apply. | |||
| */ | |||
| private static State map(DataView data, DigitSequence in, State noInputState) { | |||
| int op = data.readInt(); | |||
| if (!in.hasNext()) { | |||
| return noInputState; | |||
| } | |||
| int n = in.next(); | |||
| // Coded indices are 1-to-10 (0 is the "invalid" state). | |||
| int index = ((op >>> getMapShift(n)) & getMapMask(n)); | |||
| if (index == 0) { | |||
| return State.INVALID; | |||
| } | |||
| // Jump offsets are zero based. | |||
| return data.jumpTable(index - 1); | |||
| } | |||
| /** | |||
| * Returns the opcode associated with the given unsigned byte value (the first byte of any | |||
| * instruction). | |||
| */ | |||
| static OpCode decode(int unsignedByte) { | |||
| return VALUES[unsignedByte >>> 5]; | |||
| } | |||
| private OpCode(int code) { | |||
| // Assertion checks during enum creation. Opcodes must be 3 bits and match the ordinal of the | |||
| // enum (this prevents issues if reordering enums occurs). | |||
| if ((code & ~0x7) != 0 || code != ordinal()) { | |||
| throw new AssertionError("bad opcode value: " + code); | |||
| } | |||
| } | |||
| abstract State execute(DataView data, DigitSequence in); | |||
| } | |||
| @ -0,0 +1,247 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.collect.Iterables; | |||
| import com.google.common.collect.Lists; | |||
| import com.google.common.io.ByteArrayDataOutput; | |||
| import com.google.common.io.ByteStreams; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler.Sequence; | |||
| import java.io.ByteArrayOutputStream; | |||
| import java.io.IOException; | |||
| import java.util.ArrayList; | |||
| import java.util.Collections; | |||
| import java.util.Comparator; | |||
| import java.util.HashMap; | |||
| import java.util.HashSet; | |||
| import java.util.Iterator; | |||
| import java.util.LinkedHashSet; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| import java.util.Set; | |||
| /** | |||
| * Renders the final bytecode representation for the matcher by connecting sequences of operations | |||
| * together and fixing-up offsets and branch instructions. This is essentially the higher-level | |||
| * aspect of matcher bytecode compilation. | |||
| * <p> | |||
| * Unlike {@link MatcherCompiler} in which a lot of the data is immutable (because sequences can | |||
| * be defined in isolation), there's a lot of mutable state in this class due to the need to build | |||
| * and manage offsets between the sequences, which relies on the order in which other sequences | |||
| * have been rendered. | |||
| */ | |||
| class MatcherBytes { | |||
| /** | |||
| * A partial order on byte sequences based on their size. This is not "equivalent to equals" and | |||
| * must not be used to construct an ordered set. | |||
| */ | |||
| private static final Comparator<SequenceBytes> DECREASING_BY_SIZE = | |||
| new Comparator<SequenceBytes>() { | |||
| @Override public int compare(SequenceBytes lhs, SequenceBytes rhs) { | |||
| return Integer.compare(rhs.size(), lhs.size()); | |||
| } | |||
| }; | |||
| /** | |||
| * Sequences we have not considered for rendering yet. | |||
| */ | |||
| private final List<Sequence> remainingSequences; | |||
| /** | |||
| * Candidate sequences whose dependent sequences have all been rendered, and which may themselves | |||
| * now be rendered. | |||
| */ | |||
| private final Set<Sequence> canditiateSequences = new LinkedHashSet<>(); | |||
| /** | |||
| * Sequences which have been rendered (used to determine when other sequences become renderable). | |||
| */ | |||
| private final Set<Sequence> compiledSequences = new HashSet<>(); | |||
| /** | |||
| * A map from which are final nodes of a sequence to the sequence they belong to. The key set of | |||
| * this map is a subset of all nodes. | |||
| */ | |||
| private final Map<DfaNode, SequenceBytes> sequenceMap = new HashMap<>(); | |||
| /** | |||
| * A list of compiled byte sequences in reverse order (ie, the sequence with the terminal node | |||
| * in it is first in this list and the sequence with the initial node is last). Compilation | |||
| * occurs in reverse order to allow offsets between sequences to be calculated as we go. | |||
| */ | |||
| private final List<SequenceBytes> reverseOrder = new ArrayList<>(); | |||
| /** Statistics instance for collecting information about the compilation. */ | |||
| private final Statistics stats; | |||
/**
 * Creates a renderer for the given sequences.
 *
 * @param allSequences every sequence to be rendered; the iterable is copied, not retained
 * @param stats the statistics instance passed on to the rendered sequences (must not be null)
 */
MatcherBytes(Iterable<Sequence> allSequences, Statistics stats) {
  // Our set of remaining sequences just starts out as a private copy of all the sequences.
  List<Sequence> copyOfAll = Lists.newArrayList(allSequences);
  // Sequences are processed in reverse order, so work through a reversed view of the copy.
  remainingSequences = Lists.reverse(copyOfAll);
  this.stats = Preconditions.checkNotNull(stats);
}
| /** | |||
| * Compiles all sequences into a single byte buffer suitable for use by a | |||
| * {@code DigitSequenceMatcher}. | |||
| */ | |||
| byte[] compile() { | |||
| int totalSequenceCount = remainingSequences.size(); | |||
| // Sequences with not dependent sequences are compiled first. | |||
| compileFinalSequences(); | |||
| // Determine new candidate sequences. | |||
| while (compiledSequences.size() < totalSequenceCount) { | |||
| // We won't always add a new candidate sequence each time around the loop, but the set | |||
| // should never be emptied until the final sequence is processed. | |||
| for (Iterator<Sequence> it = remainingSequences.iterator(); it.hasNext();) { | |||
| Sequence s = it.next(); | |||
| if (compiledSequences.containsAll(s.unorderedOutSequences())) { | |||
| canditiateSequences.add(s); | |||
| it.remove(); | |||
| } | |||
| } | |||
| // Compile the next candidate sequence. | |||
| Sequence toCompile = Iterables.get(canditiateSequences, 0); | |||
| reverseOrder.add(compile(toCompile)); | |||
| compiledSequences.add(toCompile); | |||
| canditiateSequences.remove(toCompile); | |||
| } | |||
| // We should have always exhausted the candidate sequences when we've finished rendering. | |||
| Preconditions.checkState(remainingSequences.isEmpty()); | |||
| Preconditions.checkState(canditiateSequences.isEmpty()); | |||
| return concatSequenceBytesInForwardOrder(); | |||
| } | |||
| /** | |||
| * Compiles any sequences which have no dependencies and orders them by size to heuristically | |||
| * reduce the size of branch offsets needed to reach them. | |||
| */ | |||
| private void compileFinalSequences() { | |||
| for (Iterator<Sequence> it = remainingSequences.iterator(); it.hasNext();) { | |||
| Sequence s = it.next(); | |||
| if (s.isFinal()) { | |||
| reverseOrder.add(compile(s)); | |||
| compiledSequences.add(s); | |||
| it.remove(); | |||
| } | |||
| } | |||
| // They are ordered by size (shortest first) because this will tend to reduce the number of | |||
| // 2-byte branch instructions needed to jump to them. | |||
| Collections.sort(reverseOrder, DECREASING_BY_SIZE); | |||
| } | |||
| /** | |||
| * Compiles a sequence for which all dependent sequences have already been compiled, and | |||
| * registers the result in the sequence map under the sequence's initial state. | |||
| */ | |||
| private SequenceBytes compile(Sequence sequence) { | |||
| // Note: even a non-branching sequence has an out state here unless it is final. | |||
| Map<DfaNode, Integer> jumpOffsets = new HashMap<>(); | |||
| for (DfaNode outState : sequence.getOutStates()) { | |||
| SequenceBytes target = sequenceMap.get(outState); | |||
| // Walk back from the end of the compiled list, summing the sizes of everything that sits | |||
| // between the sequence being compiled and the start of the target sequence. | |||
| int distance = 0; | |||
| int index = reverseOrder.size() - 1; | |||
| while (index >= 0 && reverseOrder.get(index) != target) { | |||
| distance += reverseOrder.get(index).size(); | |||
| index--; | |||
| } | |||
| // If we would explicitly jump to a terminator sequence, we can just exit unconditionally | |||
| // at this point (a zero distance is left alone and handled when the branch is written). | |||
| boolean jumpsToTerminator = distance > 0 && target.isTerminator(); | |||
| jumpOffsets.put(outState, jumpsToTerminator ? Operation.TERMINATION_OFFSET : distance); | |||
| } | |||
| SequenceBytes result = new SequenceBytes(sequence, jumpOffsets, stats); | |||
| sequenceMap.put(sequence.getInitialState(), result); | |||
| return result; | |||
| } | |||
| /** | |||
| * Creates the final, single buffer of bytecode instructions for the matcher by concatenating | |||
| * the bytes of every compiled sequence, iterating the reverse-ordered list from its last | |||
| * element back to its first. | |||
| * | |||
| * @return the concatenated bytes of all compiled sequences | |||
| */ | |||
| private byte[] concatSequenceBytesInForwardOrder() { | |||
| try { | |||
| ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); | |||
| for (int n = reverseOrder.size() - 1; n >= 0; n--) { | |||
| outBuffer.write(reverseOrder.get(n).getBytes()); | |||
| } | |||
| return outBuffer.toByteArray(); | |||
| } catch (IOException e) { | |||
| // Keep the caught exception as the cause so that an "impossible" failure stays diagnosable. | |||
| throw new AssertionError("ByteArrayOutputStream cannot throw IOException", e); | |||
| } | |||
| } | |||
| /** | |||
| * Renders a sequence (along with a map of branch offsets) to its bytecode form. | |||
| * | |||
| * <p>NOTE(review): {@code Operation.writeTo()} also records {@code TERMINATING} for terminating | |||
| * operations, so the explicit record below may count the last operation twice; confirm whether | |||
| * that is intended. | |||
| */ | |||
| private static byte[] renderSequence( | |||
| Sequence sequence, Map<DfaNode, Integer> offsetMap, Statistics stats) { | |||
| // Because our operations come from a sequence, only the last one could possibly be branching. | |||
| List<Operation> ops = sequence.createOps(); | |||
| ByteArrayOutputStream rendered = new ByteArrayOutputStream(); | |||
| ByteArrayDataOutput outBytes = ByteStreams.newDataOutput(rendered); | |||
| // Everything before the last operation is written without any offset information. | |||
| int lastIndex = ops.size() - 1; | |||
| for (int n = 0; n < lastIndex; n++) { | |||
| ops.get(n).writeTo(outBytes, null, stats); | |||
| } | |||
| Operation lastOp = Iterables.getLast(ops); | |||
| if (lastOp.isTerminating()) { | |||
| stats.record(Statistics.Type.TERMINATING); | |||
| } | |||
| if (lastOp.isBranching()) { | |||
| // A branching operation uses the offset map directly to fill in its jump table information. | |||
| lastOp.writeTo(outBytes, offsetMap, stats); | |||
| return rendered.toByteArray(); | |||
| } | |||
| // A non-branching operation does not use offsets itself, but an explicit branch (or | |||
| // terminator) instruction may need to follow it. | |||
| lastOp.writeTo(outBytes, null, stats); | |||
| if (offsetMap.isEmpty()) { | |||
| return rendered.toByteArray(); | |||
| } | |||
| // When adding a branch instruction, there should only be a single offset to use. | |||
| int offset = Iterables.getOnlyElement(offsetMap.values()); | |||
| if (offset < 0) { | |||
| // This is a terminal instruction and the matcher should exit. | |||
| Preconditions.checkArgument(offset == Operation.TERMINATION_OFFSET); | |||
| Operation.writeTerminator(outBytes, stats); | |||
| } else { | |||
| // The offset could still be zero, but this is handled correctly by writeBranch(). | |||
| Operation.writeBranch(outBytes, offset, stats); | |||
| } | |||
| return rendered.toByteArray(); | |||
| } | |||
| /** | |||
| * A single compiled sequence of operations. This is just a holder for a {@link Sequence} and the | |||
| * compiled bytes it produces. | |||
| */ | |||
| static class SequenceBytes { | |||
| private final Sequence sequence; | |||
| private final byte[] bytes; | |||
| SequenceBytes(Sequence sequence, Map<DfaNode, Integer> offsetMap, Statistics stats) { | |||
| this.sequence = sequence; | |||
| this.bytes = renderSequence(sequence, offsetMap, stats); | |||
| } | |||
| Sequence getSequence() { | |||
| return sequence; | |||
| } | |||
| boolean isTerminator() { | |||
| return sequence.isFinal() && sequence.size() == 1; | |||
| } | |||
| int size() { | |||
| return bytes.length; | |||
| } | |||
| byte[] getBytes() { | |||
| return bytes; | |||
| } | |||
| } | |||
| } | |||
| @ -0,0 +1,299 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.common.collect.ImmutableMap.toImmutableMap; | |||
| import static com.google.common.collect.ImmutableSet.toImmutableSet; | |||
| import static java.lang.Integer.numberOfTrailingZeros; | |||
| import com.google.common.base.Joiner; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.collect.ImmutableList; | |||
| import com.google.common.collect.ImmutableMap; | |||
| import com.google.common.collect.ImmutableSet; | |||
| import com.google.common.collect.Iterables; | |||
| import com.google.common.graph.MutableValueGraph; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.common.graph.ValueGraphBuilder; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; | |||
| import java.util.ArrayList; | |||
| import java.util.Comparator; | |||
| import java.util.LinkedHashMap; | |||
| import java.util.List; | |||
| import java.util.Set; | |||
| import java.util.function.Function; | |||
| /** | |||
| * Compiles non-capturing phone number regular expressions into sequences of bytes suitable for | |||
| * creating {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher | |||
| * DigitSequenceMatcher} instances. | |||
| */ | |||
| public final class MatcherCompiler { | |||
| /** | |||
| * Compiles the given {@code RangeTree} into a sequence of bytes suitable for creating a | |||
| * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher | |||
| * DigitSequenceMatcher}. | |||
| */ | |||
| public static byte[] compile(RangeTree dfa) { | |||
| return compile(dfa, Statistics.NO_OP); | |||
| } | |||
| /** | |||
| * As {@link #compile(RangeTree)} but additionally accepts a {@link Statistics} instance | |||
| * to record metrics about the compilation. | |||
| */ | |||
| public static byte[] compile(RangeTree dfa, Statistics stats) { | |||
| return new MatcherCompiler(dfa).compile(stats); | |||
| } | |||
| /** The DFA from which the matcher data is to be compiled. */ | |||
| private final ValueGraph<DfaNode, DfaEdge> dfa; | |||
| /** The unique initial node of the DFA. */ | |||
| private final DfaNode init; | |||
| /** | |||
| * A map from nodes which are at the beginning of a sequence to that sequence. Not all nodes | |||
| * will be present in the key set of this map. | |||
| */ | |||
| private final ImmutableMap<DfaNode, Sequence> seqStart; | |||
| /** | |||
| * Builds a graph directly from the DFA in a RangeTree. | |||
| * | |||
| * <p>Rather than deal with the DFA tree directly (which is deliberately opaque as a data | |||
| * structure) we serialize it into a more malleable ValueGraph. This allows simpler graph | |||
| * traversal while maintaining a simple-as-possible node/edge structure. It's okay to reuse the | |||
| * RangeTree types {@code DfaNode} and {@code DfaEdge} here because they have the expected | |||
| * semantics (e.g. conforming to equals/hashcode etc...) but care must be taken not to keep the | |||
| * instances around for a long time, since this will keep larger parts of the original DFA alive | |||
| * in the garbage collector (but this is fine since only bytes are returned from this class). | |||
| * | |||
| * <p>The given tree must not be empty. | |||
| */ | |||
| private static ValueGraph<DfaNode, DfaEdge> buildGraph(RangeTree dfa) { | |||
| Preconditions.checkArgument(!dfa.isEmpty()); | |||
| MutableValueGraph<DfaNode, DfaEdge> dfaGraph = | |||
| ValueGraphBuilder.directed().allowsSelfLoops(false).build(); | |||
| dfaGraph.addNode(dfa.getInitial()); | |||
| dfa.accept(new DfaVisitor() { | |||
| @Override | |||
| public void visit(DfaNode source, DfaEdge edge, DfaNode target) { | |||
| boolean unseenTarget = dfaGraph.addNode(target); | |||
| dfaGraph.putEdgeValue(source, target, edge); | |||
| if (!unseenTarget) { | |||
| // Already visited via another edge; do not descend again. | |||
| return; | |||
| } | |||
| target.accept(this); | |||
| } | |||
| }); | |||
| return dfaGraph; | |||
| } | |||
| /** | |||
| * Creates a {@code MatcherCompiler} from the given automaton by generating all the | |||
| * {@code Sequence}'s of operations necessary to represent it, starting from the initial node. | |||
| */ | |||
| MatcherCompiler(RangeTree ranges) { | |||
| // The graph must be assigned first because building sequences reads it. | |||
| this.dfa = buildGraph(ranges); | |||
| this.init = ranges.getInitial(); | |||
| LinkedHashMap<DfaNode, Sequence> sequencesByStart = new LinkedHashMap<>(); | |||
| buildSequencesFrom(this.init, sequencesByStart); | |||
| this.seqStart = ImmutableMap.copyOf(sequencesByStart); | |||
| } | |||
| /** | |||
| * Returns the output targets of the given node sorted according to the lowest "accepting" digit | |||
| * on the corresponding edge (i.e. the lowest set bit of the edge's digit mask). This ordering | |||
| * is necessary for stability, but also correctness when building mapping operations. Apart | |||
| * from special cases (e.g. only one output) this is the only method which should be used to | |||
| * obtain output nodes. | |||
| */ | |||
| private ImmutableSet<DfaNode> sortedOutputs(DfaNode source) { | |||
| Comparator<DfaNode> byLowestDigit = Comparator.comparingInt( | |||
| target -> numberOfTrailingZeros(dfa.edgeValue(source, target).get().getDigitMask())); | |||
| return dfa.successors(source).stream().sorted(byLowestDigit).collect(toImmutableSet()); | |||
| } | |||
| /** | |||
| * Returns the single output target of the given node (or throws an exception if the node does | |||
| * not have exactly one successor). | |||
| */ | |||
| private DfaNode singleOutput(DfaNode source) { | |||
| Set<DfaNode> successors = dfa.successors(source); | |||
| return Iterables.getOnlyElement(successors); | |||
| } | |||
| /** | |||
| * Builds the output map (target node to digit mask of the connecting edge) from a given node | |||
| * in the DFA in the correct order. The map is collected from the already sorted outputs of the | |||
| * node, so its keys are ordered by the lowest digit in their mask. This is necessary for | |||
| * correct behaviour in the "Mapping" operation. | |||
| */ | |||
| private ImmutableMap<DfaNode, Integer> getOutMap(DfaNode source) { | |||
| ImmutableSet<DfaNode> targets = sortedOutputs(source); | |||
| Function<DfaNode, Integer> digitMaskOf = | |||
| target -> dfa.edgeValue(source, target).get().getDigitMask(); | |||
| return targets.stream().collect(toImmutableMap(Function.identity(), digitMaskOf)); | |||
| } | |||
| /** | |||
| * Recursively builds sequences by traversing the DFA and grouping successive sub-sequences of | |||
| * nodes which neither branch, nor are branched to. Each such sub-sequence is represented by a | |||
| * {@code Sequence} instance (a list of non-branching operations, optionally terminated with a | |||
| * branching operation). Nodes which already start a sequence in the given map are skipped. | |||
| */ | |||
| private void buildSequencesFrom(DfaNode start, LinkedHashMap<DfaNode, Sequence> map) { | |||
| if (map.containsKey(start)) { | |||
| return; | |||
| } | |||
| ImmutableList.Builder<DfaNode> members = ImmutableList.builder(); | |||
| DfaNode last = start; | |||
| members.add(last); | |||
| // Keep extending while the current node has exactly one way out and the node it leads to | |||
| // cannot be reached from anywhere else. | |||
| while (dfa.outDegree(last) == 1) { | |||
| DfaNode next = singleOutput(last); | |||
| if (dfa.inDegree(next) > 1) { | |||
| break; | |||
| } | |||
| last = next; | |||
| members.add(last); | |||
| } | |||
| map.put(start, new Sequence(members.build())); | |||
| // Recurse from the outputs at the end of the sequence according to their edge values. | |||
| // IMPORTANT: We must not use the graph's raw successors here since we need the order of | |||
| // insertion to be well defined and ValueGraph does not make good enough promises about node | |||
| // ordering. | |||
| for (DfaNode out : sortedOutputs(last)) { | |||
| buildSequencesFrom(out, map); | |||
| } | |||
| } | |||
| /** | |||
| * Creates a {@code MatcherBytes} instance for this compiler's sequences and compiles it to | |||
| * render the output bytes. | |||
| */ | |||
| byte[] compile(Statistics stats) { | |||
| MatcherBytes matcherBytes = createMatcherBytes(stats); | |||
| return matcherBytes.compile(); | |||
| } | |||
| /** | |||
| * Creates a mutable {@code MatcherBytes} instance which will render the output bytes from all | |||
| * the sequences built by this compiler. | |||
| */ | |||
| MatcherBytes createMatcherBytes(Statistics stats) { | |||
| return new MatcherBytes(this.seqStart.values(), stats); | |||
| } | |||
| /** | |||
| * A contiguous sub-sequence of nodes in the DFA which neither branch, nor are branched to. | |||
| * <p> | |||
| * The important property of a {@code Sequence} is that branching may only occur at the end of a | |||
| * {@code Sequence} and branches may only jump to the start of another {@code Sequence}. This | |||
| * makes it easier to separate the compilation of operations (inside sequences) from the | |||
| * management of branches and offsets (between sequences). | |||
| */ | |||
| class Sequence { | |||
| private final ImmutableList<DfaNode> nodes; | |||
| Sequence(ImmutableList<DfaNode> nodes) { | |||
| checkArgument(!nodes.isEmpty()); | |||
| this.nodes = nodes; | |||
| } | |||
| private Operation getOp(DfaNode node) { | |||
| return Operation.from(node.canTerminate(), getOutMap(node)); | |||
| } | |||
| /** | |||
| * Returns the operations representing this sequence, merging successive operations where | |||
| * possible. The final list of operations is guaranteed to have at most one branching operation | |||
| * which (if present) will always be the last element in the list. | |||
| */ | |||
| List<Operation> createOps() { | |||
| List<Operation> ops = new ArrayList<>(); | |||
| Operation current = getOp(nodes.get(0)); | |||
| for (int n = 1; n < nodes.size(); n++) { | |||
| Operation next = getOp(nodes.get(n)); | |||
| Operation merged = current.mergeWith(next); | |||
| if (merged != null) { | |||
| current = merged; | |||
| } else { | |||
| ops.add(current); | |||
| current = next; | |||
| } | |||
| } | |||
| ops.add(current); | |||
| return ops; | |||
| } | |||
| DfaNode getInitialState() { | |||
| return Iterables.get(nodes, 0); | |||
| } | |||
| DfaNode getFinalState() { | |||
| return Iterables.getLast(nodes); | |||
| } | |||
| Set<DfaNode> getOutStates() { | |||
| return sortedOutputs(getFinalState()); | |||
| } | |||
| /** | |||
| * Not the same as "terminating" for an operation. A sequence is "final" if no other sequences | |||
| * follow it. Normally there is only one final sequence in a normalized DFA, even if that | |||
| * sequence contains only a single terminating node. However not all terminating nodes are | |||
| * in final sequences. | |||
| */ | |||
| boolean isFinal() { | |||
| return getOutStates().isEmpty(); | |||
| } | |||
| /** Returns the number of nodes that this sequence represents. */ | |||
| int size() { | |||
| return nodes.size(); | |||
| } | |||
| ImmutableSet<Sequence> unorderedOutSequences() { | |||
| return getOutStates().stream().map(seqStart::get).collect(toImmutableSet()); | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| return toString(new StringBuilder(), 0).toString(); | |||
| } | |||
| private StringBuilder toString(StringBuilder buf, int indent) { | |||
| List<Operation> ops = createOps(); | |||
| appendIndent(buf, indent).append( | |||
| String.format("{%s} %s", nodes.get(0), Joiner.on(" >> ").join(ops))); | |||
| ImmutableList<DfaNode> outs = Iterables.getLast(ops).getOuts(); | |||
| if (!outs.isEmpty()) { | |||
| buf.append(" {\n"); | |||
| for (DfaNode out : outs) { | |||
| seqStart.get(out).toString(buf, indent + 1); | |||
| } | |||
| appendIndent(buf, indent).append("}\n"); | |||
| } else { | |||
| buf.append('\n'); | |||
| } | |||
| return buf; | |||
| } | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| return seqStart.get(init).toString(); | |||
| } | |||
| /** Appends one indent unit per level to the given builder, which is returned for chaining. */ | |||
| private static StringBuilder appendIndent(StringBuilder out, int indent) { | |||
| int remaining = indent; | |||
| while (remaining-- > 0) { | |||
| out.append(" "); | |||
| } | |||
| return out; | |||
| } | |||
| } | |||
| @ -0,0 +1,600 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; | |||
| import static com.google.common.collect.ImmutableList.toImmutableList; | |||
| import static com.google.common.collect.ImmutableSetMultimap.flatteningToImmutableSetMultimap; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; | |||
| import static java.lang.Integer.numberOfTrailingZeros; | |||
| import static java.util.stream.Collectors.joining; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.collect.ImmutableList; | |||
| import com.google.common.collect.ImmutableMap; | |||
| import com.google.common.collect.ImmutableSet; | |||
| import com.google.common.collect.ImmutableSetMultimap; | |||
| import com.google.common.collect.Iterables; | |||
| import com.google.common.io.ByteArrayDataOutput; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.Statistics.Type; | |||
| import java.util.ArrayList; | |||
| import java.util.Collection; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| import java.util.Map.Entry; | |||
| /** | |||
| * A specific instance of a number matching operation derived from a DFA. Operations are created by | |||
| * analyzing a sequence in a DFA and knowing how to write the corresponding instruction(s) as bytes | |||
| * (to be processed by DigitSequenceMatcher or similar). | |||
| */ | |||
| abstract class Operation { | |||
| /** Represents the digits which can be accepted during matching operations. */ | |||
| private enum Digit { | |||
| // Order of enums must match the digit value itself (this is checked for in the constructor). | |||
| ZERO(0), ONE(1), TWO(2), THREE(3), FOUR(4), FIVE(5), SIX(6), SEVEN(7), EIGHT(8), NINE(9); | |||
| private static final Digit[] VALUES = values(); | |||
| // Iteration order is order of enum declaration (and thus also the value order). | |||
| public static final ImmutableSet<Digit> ALL = ImmutableSet.copyOf(VALUES); | |||
| Digit(int value) { | |||
| // No need to store the digit value if we know it matches our ordinal value. | |||
| Preconditions.checkArgument(value == ordinal()); | |||
| } | |||
| /** Returns the digit corresponding to the integral value in the range {@code 0...9}. */ | |||
| public static Digit of(int n) { | |||
| return VALUES[n]; | |||
| } | |||
| /** | |||
| * Returns the set of digits corresponding to a bit-mask in which bits 0 to 9 represent the | |||
| * corresponding digits. | |||
| */ | |||
| public static ImmutableSet<Digit> fromMask(int mask) { | |||
| Preconditions.checkArgument(mask >= 1 && mask <= ALL_DIGITS_MASK); | |||
| if (mask == ALL_DIGITS_MASK) { | |||
| return ALL; | |||
| } | |||
| ImmutableSet.Builder<Digit> digits = ImmutableSet.builder(); | |||
| for (int n = 0; n <= 9; n++) { | |||
| if ((mask & (1 << n)) != 0) { | |||
| digits.add(VALUES[n]); | |||
| } | |||
| } | |||
| return digits.build(); | |||
| } | |||
| /** Returns the integer value of this digit instance. */ | |||
| public int value() { | |||
| return ordinal(); | |||
| } | |||
| } | |||
| /** | |||
| * An invalid jump offset indicating that instead of jumping to a new instruction, the state | |||
| * machine can just terminate (used to avoid jumping directly to the termination instruction). | |||
| */ | |||
| static final int TERMINATION_OFFSET = -1; | |||
| /** The number of bytes required by a "long" branch instruction. */ | |||
| private static final int LONG_BRANCH_SIZE = 2; | |||
| private final boolean isTerminating; | |||
| private final boolean isBranching; | |||
| /** Only the nested operation types in this class can invoke this constructor. */ | |||
| private Operation(boolean isTerminating, boolean isBranching) { | |||
| this.isBranching = isBranching; | |||
| this.isTerminating = isTerminating; | |||
| } | |||
| /** | |||
| * Returns whether this operation can terminate the state machine when it has been reached. | |||
| * The value is fixed when the operation is constructed. | |||
| */ | |||
| boolean isTerminating() { | |||
| return this.isTerminating; | |||
| } | |||
| /** | |||
| * Returns whether this operation is branching. A branching operation normally has more than | |||
| * one output node it can reach, although the {@code Terminal} operation is also constructed | |||
| * as branching despite having no output nodes. | |||
| */ | |||
| boolean isBranching() { | |||
| return this.isBranching; | |||
| } | |||
| /** | |||
| * Returns the output nodes of this operation. For branching operations the order of multiple | |||
| * output nodes is defined by the operation itself (most operations are not branching and have | |||
| * only one output state anyway). The {@code Terminal} operation returns an empty list. | |||
| */ | |||
| abstract ImmutableList<DfaNode> getOuts(); | |||
| /** | |||
| * Returns the op-code for this operation, used when writing out instruction bytes. The | |||
| * operations in this class shift the op-code's ordinal into the highest bits of the first | |||
| * byte, short or int that they write. | |||
| */ | |||
| abstract OpCode getOpCode(); | |||
| /** | |||
| * Writes this operation out as a series of instruction bytes. This is invoked by | |||
| * {@code writeTo()} after any terminating statistic has been recorded. | |||
| * | |||
| * @param out the destination for the instruction bytes. | |||
| * @param offsetMap jump offsets keyed by output node; this can be {@code null} when the | |||
| *     operation is written without offset information, so only branching operations should | |||
| *     read from it. | |||
| * @param stats the statistics instance in which to record metrics. | |||
| */ | |||
| abstract void writeImpl( | |||
| ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats); | |||
| /** | |||
| * Writes this operation to the given output, first recording a {@code TERMINATING} statistic | |||
| * if this operation is terminating and then delegating to {@code writeImpl()}. | |||
| */ | |||
| void writeTo(ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats) { | |||
| boolean terminating = isTerminating(); | |||
| if (terminating) { | |||
| stats.record(Type.TERMINATING); | |||
| } | |||
| this.writeImpl(out, offsetMap, stats); | |||
| } | |||
| /** | |||
| * Merges two adjacent operations (a poor man's compiler optimization). Useful for collapsing | |||
| * sequences of "ANY" operations. If this instruction cannot be merged with the given "next" | |||
| * instruction then it should return {@code null}, which is the default behavior. | |||
| * | |||
| * @param next the operation following this operation which we will try and merge with. | |||
| * @return the merged operation, or {@code null} if the operations cannot be merged (this base | |||
| *     implementation always returns {@code null}). | |||
| */ | |||
| Operation mergeWith(Operation next) { | |||
| return null; | |||
| } | |||
| /** | |||
| * Writes a branch instruction into the output byte sequence. A zero jump writes nothing, a | |||
| * jump below 16 is written as a single byte, and any larger jump (which must be below 0x1000) | |||
| * is written as two bytes with bit 12 set. | |||
| */ | |||
| static void writeBranch(ByteArrayDataOutput out, int jump, Statistics stats) { | |||
| Preconditions.checkArgument(jump >= 0 && jump < 0x1000, "invalid jump: " + jump); | |||
| if (jump == 0) { | |||
| // No instruction is needed; only the statistic is recorded. | |||
| stats.record(Type.CONTINUATION); | |||
| return; | |||
| } | |||
| int branchOpCode = OpCode.BRANCH.ordinal(); | |||
| if (jump < 16) { | |||
| stats.record(Type.SHORT_BRANCH); | |||
| out.writeByte((branchOpCode << 5) | jump); | |||
| return; | |||
| } | |||
| Type branchType = jump < 0x100 ? Type.MEDIUM_BRANCH : Type.LONG_BRANCH; | |||
| stats.record(branchType); | |||
| out.writeShort((branchOpCode << 13) | (1 << 12) | jump); | |||
| } | |||
| /** | |||
| * Writes a termination byte into the output byte sequence. A {@code FINAL} statistic is | |||
| * recorded and then a single zero byte is written. | |||
| */ | |||
| static void writeTerminator(ByteArrayDataOutput out, Statistics stats) { | |||
| stats.record(Type.FINAL); | |||
| out.writeByte(0); | |||
| } | |||
| /** | |||
| * Creates a new operation to represent the output state transition given by {@code outMasks}. | |||
| * Note that where multiple nodes exist in {@code outMasks}, their ordering must be consistent | |||
| * with the {@code Mapping} operation (whereby nodes are ordered by the lowest bit set in the | |||
| * corresponding mask). | |||
| */ | |||
| static Operation from(boolean isTerminating, ImmutableMap<DfaNode, Integer> outMasks) { | |||
| if (outMasks.isEmpty()) { | |||
| // No out nodes; then it's a "Terminal" operation. | |||
| Preconditions.checkState(isTerminating); | |||
| return new Operation.Terminal(); | |||
| } | |||
| ImmutableList<DfaNode> outStates = outMasks.keySet().asList(); | |||
| switch (outStates.size()) { | |||
| case 1: { | |||
| int digitMask = outMasks.get(Iterables.getOnlyElement(outStates)); | |||
| if (Integer.bitCount(digitMask) == 1) { | |||
| // One output state reached by a single input; then it's a "Single" operation. | |||
| return new Operation.Single(isTerminating, numberOfTrailingZeros(digitMask), outStates); | |||
| } | |||
| if (digitMask == ALL_DIGITS_MASK) { | |||
| // One output state reached by any input; then it's an "Any" operation. | |||
| return new Operation.Any(isTerminating, 1, outStates); | |||
| } | |||
| // One output state reached by other general input; then it's a "Range" operation. | |||
| return new Operation.Range(isTerminating, digitMask, outStates); | |||
| } | |||
| case 2: { | |||
| // Test if the 2 disjoint masks cover all inputs. If so, use a shorter branch operation. | |||
| List<Integer> masks = outMasks.values().asList(); | |||
| int firstMask = masks.get(0); | |||
| if ((firstMask | masks.get(1)) == ALL_DIGITS_MASK) { | |||
| // One of two output nodes reached by any input; then it's a branching "Range" operation. | |||
| return new Operation.Range(isTerminating, firstMask, outStates); | |||
| } | |||
| break; | |||
| } | |||
| default: | |||
| break; | |||
| } | |||
| // Any other combination of nodes or inputs; then it's a "Mapping" operation. This code relies | |||
| // on the ordering of entries in the output map to correspond to edge order. | |||
| return new Operation.Mapping(isTerminating, outMasks); | |||
| } | |||
| /** Represents a state with no legal outputs, which must be a terminal state in the matcher. */ | |||
| private static final class Terminal extends Operation { | |||
| Terminal() { | |||
| // Constructed as both terminating and branching. | |||
| super(true, true); | |||
| } | |||
| /** A terminal operation has no output nodes. */ | |||
| @Override | |||
| ImmutableList<DfaNode> getOuts() { | |||
| return ImmutableList.of(); | |||
| } | |||
| @Override | |||
| OpCode getOpCode() { | |||
| return OpCode.BRANCH; | |||
| } | |||
| /** Writes only the terminator; the offset map is never read. */ | |||
| @Override | |||
| void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> ignored, Statistics stats) { | |||
| Operation.writeTerminator(out, stats); | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| return "TERMINAL"; | |||
| } | |||
| } | |||
| /** | |||
| * Respresents a state which can be transitioned from to a single output state via a single input | |||
| * (eg, "0" or "9"). | |||
| */ | |||
| private static final class Single extends Operation { | |||
| private final Digit digit; | |||
| private final ImmutableList<DfaNode> outs; | |||
| Single(boolean isTerminating, int digit, ImmutableList<DfaNode> outs) { | |||
| super(isTerminating, false); | |||
| Preconditions.checkArgument(outs.size() == 1); | |||
| this.digit = Digit.of(digit); | |||
| this.outs = outs; | |||
| } | |||
| @Override | |||
| OpCode getOpCode() { | |||
| return OpCode.SINGLE; | |||
| } | |||
| @Override ImmutableList<DfaNode> getOuts() { | |||
| return outs; | |||
| } | |||
| @Override | |||
| void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> unused, Statistics stats) { | |||
| // <--------- 1 byte ---------> | |||
| // [ OPCODE | TRM | VALUE ] | |||
| out.writeByte((getOpCode().ordinal() << 5) | |||
| | (isTerminating() ? (1 << 4) : 0) | |||
| | digit.value()); | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| return format(digit.value()); | |||
| } | |||
| } | |||
| /** | |||
| * Represents a state which can be transitioned from to a single output state via any input | |||
| * (ie, "\d"). Successive "Any" operations can be merged to represent a repeated sequence | |||
| * (eg, "\d{5}"). | |||
| */ | |||
| private static final class Any extends Operation { | |||
| private final int count; | |||
| private final ImmutableList<DfaNode> outs; | |||
| Any(boolean isTerminating, int count, ImmutableList<DfaNode> outs) { | |||
| super(isTerminating, false); | |||
| Preconditions.checkArgument(outs.size() == 1); | |||
| Preconditions.checkArgument(count > 0); | |||
| this.count = count; | |||
| this.outs = outs; | |||
| } | |||
| @Override | |||
| OpCode getOpCode() { | |||
| return OpCode.ANY; | |||
| } | |||
| @Override | |||
| ImmutableList<DfaNode> getOuts() { | |||
| return this.outs; | |||
| } | |||
| @Override | |||
| void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> ignored, Statistics stats) { | |||
| // <--------- 1 byte ---------> | |||
| // [ OPCODE | TRM | COUNT-1 ] | |||
| int anyN = (getOpCode().ordinal() << 5) | (isTerminating() ? (1 << 4) : 0); | |||
| // Each instruction can encode at most 16 repetitions, so emit full ones while more remain. | |||
| int remaining; | |||
| for (remaining = count; remaining > 16; remaining -= 16) { | |||
| out.writeByte(anyN | 15); | |||
| } | |||
| out.writeByte(anyN | (remaining - 1)); | |||
| } | |||
| /** Merges with a following "Any" operation that has the same terminating flag. */ | |||
| @Override | |||
| public Operation mergeWith(Operation next) { | |||
| if (next.getOpCode() != OpCode.ANY || isTerminating() != next.isTerminating()) { | |||
| return null; | |||
| } | |||
| Any following = (Any) next; | |||
| return new Any(isTerminating(), this.count + following.count, following.outs); | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| return format(this.count); | |||
| } | |||
| } | |||
| /** | |||
| * Represents a state which can be transitioned from via an arbitrary set of inputs to either | |||
| * one or two output nodes (eg, "[23-69]" or "[0-4]X|[5-9]Y"). In the case where there are two | |||
| * output nodes, any input must reach one of the two possible nodes (ie, there is no invalid | |||
| * input). | |||
| */ | |||
| private static final class Range extends Operation { | |||
| private final ImmutableSet<Digit> digits; | |||
| private final ImmutableList<DfaNode> outs; | |||
| Range(boolean isTerminating, int digitMask, ImmutableList<DfaNode> outs) { | |||
| super(isTerminating, outs.size() == 2); | |||
| Preconditions.checkArgument(outs.size() <= 2); | |||
| this.digits = Digit.fromMask(digitMask); | |||
| this.outs = outs; | |||
| } | |||
| @Override | |||
| OpCode getOpCode() { | |||
| return OpCode.RANGE; | |||
| } | |||
| /** | |||
| * For branching Range operations (with 2 output nodes), the order is that the state matched | |||
| * by {@code digits} is the first state and the state reached by any other input is second. | |||
| */ | |||
| @Override ImmutableList<DfaNode> getOuts() { | |||
| return outs; | |||
| } | |||
| @Override | |||
| void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats) { | |||
| // <-------------- 2 bytes --------------> <-------- 2 bytes ---------> | |||
| // [ OPCODE | TRM | 0 | BIT SET ] | |||
| // [ OPCODE | TRM | 1 | BIT SET | JUMP_IN | JUMP_OUT ] | |||
| out.writeShort((getOpCode().ordinal() << 13) | |||
| | (isTerminating() ? (1 << 12) : 0) | |||
| | (isBranching() ? (1 << 11) : 0) | |||
| | asBitMask(digits)); | |||
| if (isBranching()) { | |||
| writeJumpTable(out, ImmutableList.of( | |||
| offsetMap.get(outs.get(0)), offsetMap.get(outs.get(1))), stats); | |||
| } | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| return format(asRangeString(digits)); | |||
| } | |||
| } | |||
| /** | |||
| * Represents a state in the matcher which can be transitioned from via an arbitrary set of | |||
| * inputs, to an arbitrary set of nodes. This is the most general form of operation and (apart | |||
| * from branches) provides the only truly necessary instruction in the matcher; everything else | |||
| * is just some specialization of this operation. | |||
| */ | |||
| private static final class Mapping extends Operation { | |||
| private final ImmutableSetMultimap<DfaNode, Digit> nodeMap; | |||
| Mapping(boolean isTerminating, ImmutableMap<DfaNode, Integer> outMasks) { | |||
| super(isTerminating, true); | |||
| this.nodeMap = outMasks.entrySet().stream() | |||
| .collect(flatteningToImmutableSetMultimap( | |||
| Entry::getKey, e -> Digit.fromMask(e.getValue()).stream())); | |||
| } | |||
| @Override | |||
| OpCode getOpCode() { | |||
| return isTerminating() ? OpCode.TMAP : OpCode.MAP; | |||
| } | |||
| /** | |||
| * For Mapping operations, output node order is defined by the lowest digit by which that | |||
| * node can be reached. For example, if a map operation can reach three nodes {@code A}, | |||
| * {@code B} and {@code C} via inputs in the ranges {@code [1-38]}, {@code [4-6]} and | |||
| * {@code [09]} respectively, then they will be ordered {@code (C, A, B)}. | |||
| */ | |||
| @Override ImmutableList<DfaNode> getOuts() { | |||
| return nodeMap.keySet().asList(); | |||
| } | |||
| @Override | |||
| void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats) { | |||
| // <------------ 4 bytes ------------> <-- 1 byte per offset ---> | |||
| // [ OPCODE | CODED MAP | JUMP_1 | ... | JUMP_N ] | |||
| out.writeInt((getOpCode().ordinal() << 29) | asCodedMap(nodeMap)); | |||
| ImmutableList<Integer> offsets = | |||
| getOuts().stream().map(offsetMap::get).collect(toImmutableList()); | |||
| writeJumpTable(out, offsets, stats); | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| return format(nodeMap.asMap().values().stream() | |||
| .map(Operation::asRangeString).collect(joining(", "))); | |||
| } | |||
| } | |||
| String format(Object extra) { | |||
| return String.format("%s%s : %s", getOpCode(), isTerminating() ? "*" : "", extra); | |||
| } | |||
| /** | |||
| * Returns an integer with the lowest 10 bits set in accordance with the digits in the given set. | |||
| */ | |||
| private static int asBitMask(ImmutableSet<Digit> digits) { | |||
| int bitMask = 0; | |||
| for (Digit digit : digits) { | |||
| bitMask |= (1 << digit.value()); | |||
| } | |||
| return bitMask; | |||
| } | |||
| /** | |||
| * Returns a integer with the lowest 29 bits set to encode an arbitrary mapping from input digit | |||
| * to an output index. The 29 bits are partitioned such that lower inputs require fewer bits to | |||
| * encode (output indices are assigned as they are encountered, starting at the first input). | |||
| * Each digit can then be quickly mapped to either its 1-indexed output node, or 0 if the input | |||
| * was invalid. | |||
| */ | |||
| private static int asCodedMap(ImmutableSetMultimap<DfaNode, Digit> nodeMap) { | |||
| int codedMap = 0; | |||
| List<DfaNode> outs = nodeMap.keySet().asList(); | |||
| for (int n = 0; n < outs.size(); n++) { | |||
| for (Digit digit : nodeMap.get(outs.get(n))) { | |||
| // Coded indices are 1-to-10 (0 is the "invalid" node). | |||
| codedMap |= ((n + 1) << OpCode.getMapShift(digit.value())); | |||
| } | |||
| } | |||
| return codedMap; | |||
| } | |||
| /** | |||
| * Writes a sequence of offsets representing a unsigned byte-based jump table after either a | |||
| * Mapping or Range instruction. This accounts correctly for the need to introduce a new | |||
| * "trampoline" branch instruction after the jump table (when the desired offset is too large | |||
| * to fit in a single unsigned byte). | |||
| * <p> | |||
| * Offsets are either: | |||
| * <ul> | |||
| * <li>The number of bytes to jump from the end of the current {@code Sequence} bytes to the | |||
| * start of the destination {@code Sequence} bytes. | |||
| * <li>{@code -1} to indicate that a terminal node has been reached. | |||
| * </ul> | |||
| * <p> | |||
| * Note that the offset written into the jump table itself must be relative to the beginning of | |||
| * the jump table and so must be adjusted by the number of bytes in the jump table and any other | |||
| * branch instructions that follow it. This it probably the most awkward logic in the entire | |||
| * compiler. | |||
| */ | |||
| static void writeJumpTable(ByteArrayDataOutput out, List<Integer> offsets, | |||
| Statistics stats) { | |||
| int jumpTableSize = offsets.size(); | |||
| boolean needsExtraBranches = false; | |||
| for (int n = 0; n < jumpTableSize && !needsExtraBranches; n++) { | |||
| // Check whether the adjusted offset (ie, the one we would write) will fit in a byte. | |||
| // It's no issue to have offsets of -1 as it can never trigger "needsExtraBranches". | |||
| needsExtraBranches = (offsets.get(n) + jumpTableSize >= 0x100); | |||
| } | |||
| if (needsExtraBranches) { | |||
| // We only get here if at least one offset (after adjustment by the original jump table size) | |||
| // would not fit into a byte. Now we must calculate exactly how many extra branches we are | |||
| // going to need. For this we must assume the worst case adjustment of "3 x jumpTableSize" | |||
| // which is 1 byte for the jump table offset and 2 bytes for the extra branch for every entry. | |||
| // This is pessimistic because there will now be cases where we write a trampoline jump for | |||
| // an offset that could have fitted had we not assumed that we might need the extra space for | |||
| // the branch. However these cases are rare enough that we choose to ignore them. | |||
| int maxOffsetAdjust = ((1 + LONG_BRANCH_SIZE) * jumpTableSize); | |||
| int extraBranchCount = 0; | |||
| for (int n = 0; n < jumpTableSize; n++) { | |||
| if (offsets.get(n) + maxOffsetAdjust >= 0x100) { | |||
| extraBranchCount += 1; | |||
| } | |||
| } | |||
| // Now we know a reasonable upper bound for how many extra branches are needed, use this to | |||
| // adjust the actual offsets and write them. When a "trampoline" branch instruction is needed | |||
| // we split the offset so the jump table jumps to the branch instruction and that jumps the | |||
| // rest. Branch instructions are positioned, in order, immediately after the jump table. | |||
| List<Integer> extraBranchOffsets = new ArrayList<>(); | |||
| int totalOffsetAdjust = jumpTableSize + (LONG_BRANCH_SIZE * extraBranchCount); | |||
| for (int n = 0; n < jumpTableSize; n++) { | |||
| int offset = offsets.get(n); | |||
| if (offset >= 0) { | |||
| int worstCaseOffset = offset + maxOffsetAdjust; | |||
| // Get the actual total offset we want to jump by. | |||
| offset += totalOffsetAdjust; | |||
| // Use the worst case offset here so we repeat exactly the same decision as the loop | |||
| // above (otherwise we might add fewer branches which would screw up our offsets). | |||
| if (worstCaseOffset >= 0x100) { | |||
| // Split the original offset, recording the jump to the trampoline branch as well as | |||
| // the branch offset itself. Note that the offset adjustment changes as more trampoline | |||
| // branches are encountered (but the overall offset jumped remains the same). | |||
| int extraBranchIndex = extraBranchOffsets.size(); | |||
| // This offset will always be small (max jump table is 10 entries, so offset to the | |||
| // last possible branch will be at most 28 bytes). | |||
| int branchInstructionOffset = jumpTableSize + (LONG_BRANCH_SIZE * extraBranchIndex); | |||
| // Subtract one additional branch instruction here because when we trampoline jump, we | |||
| // jump to the start of the branch instruction, but jump away from the end of it. | |||
| extraBranchOffsets.add((offset - branchInstructionOffset) - LONG_BRANCH_SIZE); | |||
| offset = branchInstructionOffset; | |||
| } | |||
| // Write the total offset (offset must be < 0x100 here as worstCaseOffset was < 0x100). | |||
| Preconditions.checkState(offset < 0x100, "jump too long: %s", offset); | |||
| out.writeByte(offset); | |||
| } else { | |||
| // If the destination of this jump would just be a termination instruction, just write | |||
| // the termination byte here directly (no point jumping to the termination byte). | |||
| Preconditions.checkArgument(offset == TERMINATION_OFFSET, "bad offset: %s", offset); | |||
| writeTerminator(out, stats); | |||
| } | |||
| } | |||
| // Write out the trampoline jumps in the order they were found. | |||
| for (int offset : extraBranchOffsets) { | |||
| stats.record(Type.DOUBLE_JUMP); | |||
| Operation.writeBranch(out, offset, stats); | |||
| } | |||
| } else { | |||
| // In the simple case, there are no extra branches, so we just write the offsets we have. | |||
| // This has the same effect as running the code above with (extraBranchCount == 0) but can be | |||
| // reached more optimistically because we don't need to account for the worst case offset | |||
| // adjustment when deciding if it's safe to just use the offsets we were given. It's a form | |||
| // of hysteresis between the no-branch and extra-branch cases. | |||
| for (int n = 0; n < jumpTableSize; n++) { | |||
| int offset = offsets.get(n); | |||
| if (offset >= 0) { | |||
| offset += jumpTableSize; | |||
| Preconditions.checkState(offset < 0x100, "jump too long: " + offset); | |||
| out.writeByte(offset); | |||
| } else { | |||
| writeTerminator(out, stats); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // Helper function for asRanges() to print a single range (eg, "[014-7]"). | |||
| private static String asRangeString(Collection<Digit> digits) { | |||
| StringBuilder out = new StringBuilder(); | |||
| out.append("["); | |||
| Digit lhs = null; | |||
| Digit rhs = null; | |||
| for (Digit digit : digits) { | |||
| if (lhs != null) { | |||
| if (digit.value() == rhs.value() + 1) { | |||
| rhs = digit; | |||
| continue; | |||
| } | |||
| if (rhs != lhs) { | |||
| if (rhs.value() > lhs.value() + 1) { | |||
| out.append("-"); | |||
| } | |||
| out.append(rhs.value()); | |||
| } | |||
| } | |||
| lhs = digit; | |||
| rhs = digit; | |||
| out.append(lhs.value()); | |||
| } | |||
| if (rhs != lhs) { | |||
| if (rhs.value() > lhs.value() + 1) { | |||
| out.append("-"); | |||
| } | |||
| out.append(rhs.value()); | |||
| } | |||
| out.append("]"); | |||
| return out.toString(); | |||
| } | |||
| } | |||
| @ -0,0 +1,44 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; | |||
/**
 * A minimal sink for statistics gathered while compiling regular expressions. It exists so that
 * the effect of proposed changes to the byte-code definition on the size of compiled matcher
 * bytes can be quantified.
 */
public interface Statistics {
  /** A sink which ignores everything recorded to it. */
  Statistics NO_OP = type -> { };

  /** The type of things we are counting. */
  enum Type {
    SHORT_BRANCH,
    MEDIUM_BRANCH,
    LONG_BRANCH,
    DOUBLE_JUMP,
    CONTINUATION,
    TERMINATING,
    FINAL
  }

  /** Records an operation of the specified type during bytecode compilation. */
  void record(Type type);
}
| @ -0,0 +1,181 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import com.google.auto.value.AutoValue; | |||
| import com.google.common.annotations.VisibleForTesting; | |||
| import com.google.common.base.Preconditions; | |||
| import java.util.Optional; | |||
| /** | |||
| * Represents an NFA graph which accepts sequences of inputs of any digit (also known as "any-digit | |||
| * sequences"), possibly of variable length. For example, an {@code AnyPath} instance might accept | |||
| * a single input of any digit (i.e. equivalent to the regular expression {@code "\d"}), or it might | |||
| * accept sequences of any digits of length 4 or 6 (i.e. equivalent to the regular expression | |||
| * {@code "\d{4}\d{2}?"}. | |||
| * | |||
| * <p>As {@code AnyPath} instances are all restricted to only accepting any-digits sequences, the | |||
| * only interesting thing about them is the set of sequence lengths they accept. | |||
| */ | |||
| @AutoValue | |||
| abstract class AnyPath implements Comparable<AnyPath> { | |||
| /** | |||
| * The special empty path which matches zero length input. This is useful as an identity value | |||
| * when constructing other paths but should never be a path in the graph. | |||
| */ | |||
| public static final AnyPath EMPTY = new AutoValue_AnyPath(0x1); | |||
| /** The path matching exactly one input of any digit. */ | |||
| public static final AnyPath SINGLE = of(0x2); | |||
| /** The path matching one or zero inputs of any digit. */ | |||
| public static final AnyPath OPTIONAL = of(0x3); | |||
| @VisibleForTesting | |||
| static AnyPath of(int mask) { | |||
| Preconditions.checkArgument(mask > 1, "invalid path mask: %s", mask); | |||
| return new AutoValue_AnyPath(mask); | |||
| } | |||
| /** | |||
| * Returns a bit-mask representing the lengths of any-digit sequences accepted by this path. | |||
| * If bit-N is set, then this path accepts an N-length sequence of any digits. | |||
| */ | |||
| abstract int mask(); | |||
| /** Returns whether this path accepts an any-digit sequence of length {@code n}.*/ | |||
| public boolean acceptsLength(int n) { | |||
| Preconditions.checkArgument(n >= 0 && n < 32, "invalid path length: %s", n); | |||
| return (mask() & (1 << n)) != 0; | |||
| } | |||
| /** Returns the maximum length any-sequence that this path will accept. */ | |||
| public int maxLength() { | |||
| return (31 - Integer.numberOfLeadingZeros(mask())); | |||
| } | |||
| /** | |||
| * Returns whether this path is empty (i.e. accepts only zero length sequences). This is only | |||
| * useful when constructing paths and empty paths should never appear in an NFA graph. | |||
| */ | |||
| public boolean isEmpty() { | |||
| return mask() == 0x1; | |||
| } | |||
| /** | |||
| * Extends this path by one input, potentially setting all input as optional. For example (using | |||
| * 'x' to represent a single "any digit" input): | |||
| * <ul> | |||
| * <li>{@code "xx".extend(false) == "xxx"} | |||
| * <li>{@code "xx".extend(true) == "(xxx)?"} | |||
| * <li>{@code "xx(x)?".extend(false) == "xxx(x)?"} | |||
| * <li>{@code "xx(x)?".extend(true) == "(xxx(x)?)?"} | |||
| * </ul> | |||
| */ | |||
| public AnyPath extend(boolean allOptional) { | |||
| return of((mask() << 1) | (allOptional ? 0x1 : 0x0)); | |||
| } | |||
| /** | |||
| * Joins the given path to this one, results in a new path which is equivalent to the | |||
| * concatenation of the regular expressions they represent. For example (using | |||
| * 'x' to represent a single "any digit" input): | |||
| * <ul> | |||
| * <li>{@code "xx".join("xx") == "xxxx"} | |||
| * <li>{@code "xx".join("x?") == "xx(x)?"} | |||
| * </ul> | |||
| */ | |||
| public AnyPath join(AnyPath other) { | |||
| int newMask = 0; | |||
| // Include the length itself (which is always accepted). | |||
| for (int n = 0; n <= other.maxLength(); n++) { | |||
| if (other.acceptsLength(n)) { | |||
| newMask |= mask() << n; | |||
| } | |||
| } | |||
| return of(newMask); | |||
| } | |||
| /** | |||
| * Returns a new path which is equal to this path, except that it also accepts zero length | |||
| * sequences. | |||
| */ | |||
| public AnyPath makeOptional() { | |||
| return of(mask() | 0x1); | |||
| } | |||
| /** | |||
| * Attempts to "factor" this path by the given path to produce a path such that | |||
| * {@code p.factor(q).join(q)} is equivalent to {@code p}. This is useful when trying to | |||
| * determine longest common paths. Factorizing may not succeed in cases where no common path | |||
| * exists (e.g. {@code "xx(xx)?".factor("x?")} fails because there is no way to join anything | |||
| * to the path {@code "x?"} to make it accept exactly 2 or 4 length any-digit sequences). | |||
| */ | |||
| public Optional<AnyPath> factor(AnyPath other) { | |||
| int factor = mask() / other.mask(); | |||
| if (factor > 1 && (other.mask() * factor) == mask()) { | |||
| return Optional.of(of(factor)); | |||
| } else { | |||
| return Optional.empty(); | |||
| } | |||
| } | |||
| @Override | |||
| public int compareTo(AnyPath other) { | |||
| return Integer.compare(mask(), other.mask()); | |||
| } | |||
| @Override | |||
| public final String toString() { | |||
| // A non-obvious algorithm for getting a reasonable toString() using x's. | |||
| // Best understood via examples: | |||
| // | |||
| // 0001 is invalid as we cannot represent an optional zero-length sequence. | |||
| // | |||
| // Hi-bit-1 ==> 1 x | |||
| // 0010 -> x, 0011 -> (x)? | |||
| // | |||
| // Hi-bit-2 ==> 2 x's | |||
| // 0100 -> xx, 0101 -> (xx)?, 0110 -> x(x)?, 0111 -> (x(x)?)? | |||
| // | |||
| // Hi-bit-3 ==> 3 x's | |||
| // 1000 -> xxx, 1001 -> (xxx)?, 1010 -> x(xx)?, 1011 -> (x(xx)?)? | |||
| // 1100 -> xx(x)?, 1101 -> (xx(x)?)?, 1110 -> x(x(x)?)?, 1111 -> (x(x(x)?)?)? | |||
| // | |||
| // Rules: | |||
| // * For hi-bit M, there are M x's in the string. | |||
| // * For N < M; if bit-N is set, then a group starts after the Nth-x. | |||
| if (mask() == 0x1) { | |||
| return "<EMPTY>"; | |||
| } | |||
| StringBuilder out = new StringBuilder(); | |||
| for (int n = 0; n < maxLength(); n++) { | |||
| out.append('x'); | |||
| } | |||
| // Loop high-to-low to prevent earlier insertions messing with the index. | |||
| for (int n = maxLength() - 1; n >= 0; n--) { | |||
| if (acceptsLength(n)) { | |||
| out.insert(n, '('); | |||
| } | |||
| } | |||
| // The number of opened groups was the number of set bits - 1. | |||
| for (int n = Integer.bitCount(mask()) - 1; n > 0; n--) { | |||
| out.append(")?"); | |||
| } | |||
| return out.toString(); | |||
| } | |||
| } | |||
| @ -0,0 +1,351 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.collect.ImmutableList; | |||
| import com.google.common.collect.ImmutableSortedSet; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import java.util.Collection; | |||
| import java.util.List; | |||
| import java.util.Set; | |||
| import java.util.stream.Collectors; | |||
| import java.util.stream.Stream; | |||
| /** | |||
| * Value type for edges in NFA graphs of phone number regular expressions. Outside this package, | |||
| * this type is mainly used for examining NFA graphs which represent a regular expression, | |||
| * generated via {@link RangeTreeConverter#toNfaGraph}.. | |||
| * | |||
| * <p>Note that the ordering of edges is carefully designed to attempt to replicate as much of the | |||
| * existing intuition about ordering in regular expressions as possible. This should result in any | |||
| * generated expressions being as close to existing hand edited expressions as possible. | |||
| */ | |||
| public abstract class Edge implements Comparable<Edge> { | |||
| /** API for visiting composite edges; see also {@link #accept(Visitor)}. */ | |||
| public interface Visitor { | |||
| /** Visits a leaf node simple edge. */ | |||
| void visit(SimpleEdge edge); | |||
| /** | |||
| * Visits a composited sequence of edges. Note that sequences only ever contain disjunctions or | |||
| * simple edges, but never other sequences. For edges "a", "b", "c", this represents the | |||
| * concatenated edge "abc". | |||
| */ | |||
| void visitSequence(List<Edge> edges); | |||
| /** | |||
| * Visits a disjunction of parallel edges. Note that disjunctions only ever contain sequences | |||
| * or simple edges, but never other disjunctions. For edges "a", "b", "c", this represents the | |||
| * disjunctive group "(a|b|c)". | |||
| */ | |||
| void visitGroup(Set<Edge> edges, boolean isOptional); | |||
| } | |||
| // The singleton epsilon edge. | |||
| private static final SimpleEdge EPSILON = new SimpleEdge(); | |||
| // The singleton edge matching any digit (i.e. 'x' or '\d'). | |||
| private static final SimpleEdge ANY = new SimpleEdge(ALL_DIGITS_MASK, false); | |||
| // The singleton edge optionally matching any digit (i.e. 'x?' or '\d?'). | |||
| private static final SimpleEdge OPTIONAL_ANY = ANY.optional(); | |||
| /** Returns an edge which accepts digits 0 to 9 according tothe bits set in the given mask. */ | |||
| public static SimpleEdge fromMask(int digitMask) { | |||
| return digitMask == ALL_DIGITS_MASK ? ANY : new SimpleEdge(digitMask, false); | |||
| } | |||
| /** | |||
| * Returns the epsilon edge which accepts zero length input and transitions immediately. This | |||
| * edge should only ever appear parallel to other edges, and not as the only transition between | |||
| * two nodes. | |||
| */ | |||
| public static SimpleEdge epsilon() { | |||
| return EPSILON; | |||
| } | |||
| /** Returns the edge which accepts any digit {@code [0-9]}. */ | |||
| public static SimpleEdge any() { | |||
| return ANY; | |||
| } | |||
| /** Returns the edge which optionally accepts any digit {@code [0-9]}. */ | |||
| public static SimpleEdge optionalAny() { | |||
| return OPTIONAL_ANY; | |||
| } | |||
| /** | |||
| * Returns the ordered concatenation of the given edges. If either edge is a concatenation, it | |||
| * is first expanded, so that the resulting edge contains only simple edges or disjunctions. | |||
| */ | |||
| public static Edge concatenation(Edge lhs, Edge rhs) { | |||
| checkArgument(!lhs.equals(EPSILON) && !rhs.equals(EPSILON), "cannot concatenate epsilon edges"); | |||
| // Don't make concatenations of concatenations; flatten them out so you only have singletons | |||
| // or disjunctions. This is equivalent to writing "xyz" instead of "x(yz)". | |||
| List<Edge> edges = Stream.of(lhs, rhs) | |||
| .flatMap( | |||
| e -> (e instanceof Concatenation) ? ((Concatenation) e).edges.stream() : Stream.of(e)) | |||
| .collect(Collectors.toList()); | |||
| return new Concatenation(edges); | |||
| } | |||
| /** | |||
| * Returns the disjunction of the given edges. If either edge is already a concatenation, it | |||
| * is first expanded, so that the resulting edge contains only simple edges or disjunctions. | |||
| */ | |||
| public static Edge disjunction(Collection<Edge> edges) { | |||
| // Don't make disjunctions of disjunctions; flatten them out so you only have singletons, | |||
| // concatenations or epsilon. This is equivalent to writing "(x|y|z)" instead of "(x|(y|z))". | |||
| List<Edge> allEdges = edges.stream() | |||
| .flatMap( | |||
| e -> (e instanceof Disjunction) ? ((Disjunction) e).edges.stream() : Stream.of(e)) | |||
| .sorted() | |||
| .distinct() | |||
| .collect(Collectors.toList()); | |||
| // There should only ever be one epsilon when we make a disjunction (disjunctions are made when | |||
| // subgraphs collapse and each subgraph should only have one epsilon to make it optional). | |||
| // Epsilons sort to-the-left of everything, so if there is an epsilon it must be the first edge. | |||
| boolean isOptional = allEdges.get(0) == EPSILON; | |||
| if (isOptional) { | |||
| allEdges = allEdges.subList(1, allEdges.size()); | |||
| } | |||
| Preconditions.checkState(!allEdges.contains(EPSILON)); | |||
| return new Disjunction(allEdges, isOptional); | |||
| } | |||
| /** An edge optionally matching a single input token, or the epsilon transition. */ | |||
| public static final class SimpleEdge extends Edge { | |||
| private final int digitMask; | |||
| private final boolean isOptional; | |||
| // Constructor for singleton epsilon edge. | |||
| private SimpleEdge() { | |||
| this.digitMask = 0; | |||
| // An optional epsilon makes no real sense. | |||
| this.isOptional = false; | |||
| } | |||
| private SimpleEdge(int digitMask, boolean isOptional) { | |||
| checkArgument(digitMask > 0 && digitMask < (1 << 10), "invalid bit mask %s", digitMask); | |||
| this.digitMask = digitMask; | |||
| this.isOptional = isOptional; | |||
| } | |||
| /** Returns the mask of digits accepted by this edge. */ | |||
| public int getDigitMask() { | |||
| return digitMask; | |||
| } | |||
| /** Returns whether this edge is optional. */ | |||
| public boolean isOptional() { | |||
| return isOptional; | |||
| } | |||
| /** Returns an optional version of this, non-optional edge. */ | |||
| public SimpleEdge optional() { | |||
| Preconditions.checkState(digitMask != 0, "cannot make epsilon optional"); | |||
| Preconditions.checkState(!isOptional, "edge already optional"); | |||
| return new SimpleEdge(digitMask, true); | |||
| } | |||
| @Override | |||
| public void accept(Visitor visitor) { | |||
| visitor.visit(this); | |||
| } | |||
| @Override | |||
| public boolean equals(Object obj) { | |||
| return (obj instanceof SimpleEdge) && digitMask == ((SimpleEdge) obj).digitMask; | |||
| } | |||
| @Override | |||
| public int hashCode() { | |||
| return digitMask; | |||
| } | |||
| @Override | |||
| public int compareTo(Edge rhs) { | |||
| if (rhs instanceof SimpleEdge) { | |||
| return compare((SimpleEdge) rhs); | |||
| } else { | |||
| // Composite types know how to compare themselves to SimpleEdges, so delegate to them but | |||
| // remember to invert the result since we are reversing the comparison order. | |||
| return -rhs.compareTo(this); | |||
| } | |||
| } | |||
| private int compare(SimpleEdge rhs) { | |||
| if (isOptional != rhs.isOptional) { | |||
| // Optional edges sort to-the-right of non-optional things. | |||
| return isOptional ? 1 : -1; | |||
| } | |||
| if (digitMask == rhs.digitMask) { | |||
| return 0; | |||
| } | |||
| if (digitMask == 0 || rhs.digitMask == 0) { | |||
| // Epsilon sorts to-the-left of everything. | |||
| return digitMask == 0 ? -1 : 1; | |||
| } | |||
| // Unlike many other places where range specifications are used, we cannot guarantee the | |||
| // ranges are disjoint here, so we sort on the reversed bitmask to favour the lowest set bit. | |||
| // This sorts 'x' ([0-9]) to the left of everything, and epsilon to the right of everything. | |||
| // I.e. "x" < "0", "0" < "1", "[0-3]" < "[0-2]", "9" < epsilon. | |||
| // | |||
| // Remember to logical-shift back down to avoid negative values. | |||
| int reverseLhsMask = (Integer.reverse(digitMask) >>> 22); | |||
| int reverseRhsMask = (Integer.reverse(rhs.digitMask) >>> 22); | |||
| // Compare in the opposite order, so the largest reversed value is ordered "to the left". | |||
| return Integer.compare(reverseRhsMask, reverseLhsMask); | |||
| } | |||
| } | |||
| // A sequence of edges (disjunctions or simple edges). | |||
| private static final class Concatenation extends Edge { | |||
| private final ImmutableList<Edge> edges; | |||
| private Concatenation(Collection<Edge> edges) { | |||
| this.edges = ImmutableList.copyOf(edges); | |||
| } | |||
| @Override | |||
| public void accept(Visitor visitor) { | |||
| visitor.visitSequence(edges); | |||
| } | |||
| @Override | |||
| public boolean equals(Object obj) { | |||
| return (obj instanceof Concatenation) && edges.equals(((Concatenation) obj).edges); | |||
| } | |||
| @Override | |||
| public int hashCode() { | |||
| return edges.hashCode(); | |||
| } | |||
| @Override | |||
| public int compareTo(Edge rhs) { | |||
| if (rhs instanceof Concatenation) { | |||
| return compareEdges(edges, ((Concatenation) rhs).edges); | |||
| } else { | |||
| // Compare our first edge to the non-concatenation. If this compares as equal, order the | |||
| // concatenation between simple edges and disjunctions to break the tie and avoid implying | |||
| // that a concatenation and a non-concatenation are equal. | |||
| int comparison = -rhs.compareTo(edges.get(0)); | |||
| return comparison != 0 ? comparison : (rhs instanceof SimpleEdge ? 1 : -1); | |||
| } | |||
| } | |||
| } | |||
| // A disjunctive group of edges (sequences or simple edges). | |||
| private static final class Disjunction extends Edge { | |||
| private final ImmutableSortedSet<Edge> edges; | |||
| private final boolean isOptional; | |||
| private Disjunction(Collection<Edge> edges, boolean isOptional) { | |||
| checkArgument(!edges.isEmpty()); | |||
| this.edges = ImmutableSortedSet.copyOf(edges); | |||
| this.isOptional = isOptional; | |||
| } | |||
| @Override | |||
| public void accept(Visitor visitor) { | |||
| visitor.visitGroup(edges, isOptional); | |||
| } | |||
| @Override | |||
| public boolean equals(Object obj) { | |||
| return (obj instanceof Disjunction) && edges.equals(((Disjunction) obj).edges); | |||
| } | |||
| @Override | |||
| public int hashCode() { | |||
| // Negate bits here to be different from Concatenation. | |||
| return ~edges.hashCode(); | |||
| } | |||
| @Override | |||
| public int compareTo(Edge rhs) { | |||
| if (rhs instanceof Disjunction) { | |||
| return compareEdges(edges.asList(), ((Disjunction) rhs).edges.asList()); | |||
| } else { | |||
| // Compare our first edge to the non-disjunction. If this compares as equal, order the | |||
| // disjunction to the right of the other edge to break the tie and avoid implying that | |||
| // a disjunction and a non-disjunction are equal. | |||
| int comparison = -rhs.compareTo(edges.asList().get(0)); | |||
| return comparison == 0 ? 1 : comparison; | |||
| } | |||
| } | |||
| } | |||
| /** | |||
| * Accepts a visitor on this edge, visiting any sub-edges from which it is composed. This is a | |||
| * double-dispatch visitor to avoid anyone processing edges needing to know about specific types. | |||
| * Only the immediate edge is visited and the visitor is then responsible for visiting child | |||
| * edges. | |||
| */ | |||
| public abstract void accept(Visitor visitor); | |||
| // Compare lists according to elements, and tie break on length if different. This is effectively | |||
| // a lexicographical ordering. | |||
| private static int compareEdges(ImmutableList<Edge> lhs, ImmutableList<Edge> rhs) { | |||
| int minSize = Math.min(lhs.size(), rhs.size()); | |||
| for (int n = 0; n < minSize; n++) { | |||
| int compared = lhs.get(n).compareTo(rhs.get(n)); | |||
| if (compared != 0) { | |||
| return compared; | |||
| } | |||
| } | |||
| return Integer.compare(lhs.size(), rhs.size()); | |||
| } | |||
| @Override | |||
| public String toString() { | |||
| StringBuilder out = new StringBuilder(); | |||
| accept(new Visitor() { | |||
| @Override | |||
| public void visit(SimpleEdge e) { | |||
| if (e.equals(Edge.epsilon())) { | |||
| // Epsilon cannot be optional. | |||
| out.append("e"); | |||
| } else { | |||
| int m = e.getDigitMask(); | |||
| out.append(m == ALL_DIGITS_MASK ? "x" : RangeSpecification.toString(m)); | |||
| if (e.isOptional()) { | |||
| out.append('?'); | |||
| } | |||
| } | |||
| } | |||
| @Override | |||
| public void visitSequence(List<Edge> edges) { | |||
| edges.forEach(e -> e.accept(this)); | |||
| } | |||
| @Override | |||
| public void visitGroup(Set<Edge> edges, boolean isOptional) { | |||
| out.append("("); | |||
| edges.forEach(e -> { | |||
| e.accept(this); | |||
| out.append("|"); | |||
| }); | |||
| out.setLength(out.length() - 1); | |||
| out.append(isOptional ? ")?" : ")"); | |||
| } | |||
| }); | |||
| return out.toString(); | |||
| } | |||
| } | |||
| @ -0,0 +1,343 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.common.base.Preconditions.checkState; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; | |||
| import com.google.common.collect.Iterables; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.Visitor; | |||
| import java.util.List; | |||
| import java.util.Optional; | |||
| import java.util.Set; | |||
| import javax.annotation.Nullable; | |||
| /** Writes an NFA graph edge instance as a regular expression. */ | |||
| final class EdgeWriter implements Visitor { | |||
| // Regex constant strings pulled out for some degree of readability. | |||
| private static final String DOT_MATCH = "."; | |||
| private static final String DIGIT_MATCH = "\\d"; | |||
| private static final String OPTIONAL_MARKER = "?"; | |||
| private static final String GROUP_START = "(?:"; | |||
| private static final String GROUP_DISJUNCTION = "|"; | |||
| private static final String GROUP_END = ")"; | |||
| private static final String OPTIONAL_GROUP_END = GROUP_END + OPTIONAL_MARKER; | |||
| /** | |||
| * Returns a regular expression corresponding to the structure of the given edge. This method | |||
| * does not apply any specific optimizations to the edge it is given and any optimizations which | |||
| * affect the output must have already been applied to the graph from which the input edge was | |||
| * derived. | |||
| * | |||
| * @param edge A collapsed edge typically derived from serializing an NFA graph. | |||
| * @param useDotMatch true if {@code '.'} should be used to "match any digit" (instead of | |||
| * {@code '\\d'}) which results in shorter output. | |||
| */ | |||
| public static String toRegex(Edge edge, boolean useDotMatch) { | |||
| EdgeWriter writer = new EdgeWriter(useDotMatch); | |||
| edge.accept(writer); | |||
| return writer.out.toString(); | |||
| } | |||
| // The token to match any input digit (e.g. "\\d" or "."). | |||
| private final String anyToken; | |||
| // Accumulated regular expression appended to during visitation. | |||
| private final StringBuilder out = new StringBuilder(); | |||
| // Flag to determine when the top-level edge visited is a group, because if it is we can often | |||
| // omit the explicit grouping tokens and save some space. | |||
| private boolean isTopLevelGroup = true; | |||
// Selects the token used to match any digit: "." when useDotMatch is set, otherwise "\\d".
private EdgeWriter(boolean useDotMatch) {
  if (useDotMatch) {
    this.anyToken = DOT_MATCH;
  } else {
    this.anyToken = DIGIT_MATCH;
  }
}
| @Override | |||
| public void visit(SimpleEdge e) { | |||
| checkArgument(!e.equals(Edge.epsilon()), "unexpected bare epsilon"); | |||
| isTopLevelGroup = false; | |||
| // It's easier to just attempt to extract an "any" edge as that code already has to work for | |||
| // simple edges when they are inside other composite edges. Optionality is encoded into the | |||
| // resulting AnyPath and handled by appendRegex(), so we don't need to handle it again here. | |||
| Optional<AnyPath> any = AnyPathVisitor.extractAnyPath(e); | |||
| if (any.isPresent()) { | |||
| appendRegex(out, any.get().mask()); | |||
| } else { | |||
| // Not an "any" edge so append the usual range representation (e.g. "6" or "[014-9]"). | |||
| out.append(RangeSpecification.toString(e.getDigitMask())); | |||
| if (e.isOptional()) { | |||
| out.append(OPTIONAL_MARKER); | |||
| } | |||
| } | |||
| } | |||
| @Override | |||
| public void visitSequence(List<Edge> edges) { | |||
| checkArgument(!edges.isEmpty(), "sequences must have at least one edge"); | |||
| isTopLevelGroup = false; | |||
| // At this level a sequence might be a mix of normal and "any" edges (e.g. "123xxxx"). To | |||
| // cope with this, track and accumulate the un-written "any" edge, and emit it just before | |||
| // any other output (or at the end). | |||
| AnyPath any = AnyPath.EMPTY; | |||
| for (Edge e : edges) { | |||
| Optional<AnyPath> next = AnyPathVisitor.extractAnyPath(e); | |||
| if (next.isPresent()) { | |||
| any = any.join(next.get()); | |||
| continue; | |||
| } | |||
| // Here we have a "normal" edge, but we still might need to emit a collected "any" edge. | |||
| if (!any.isEmpty()) { | |||
| appendRegex(out, any.mask()); | |||
| any = AnyPath.EMPTY; | |||
| } | |||
| // This recursion only happens when this was not an "any" edge (though it may still be a | |||
| // composite that contains other "any" edges). | |||
| e.accept(this); | |||
| } | |||
| // If the last thing we saw in this sequence was an "any" edge, don't forget to emit it. | |||
| if (!any.isEmpty()) { | |||
| appendRegex(out, any.mask()); | |||
| } | |||
| } | |||
| @Override | |||
| public void visitGroup(Set<Edge> edges, boolean isOptional) { | |||
| checkArgument(!edges.isEmpty(), "groups must have at least one edge"); | |||
| // The very top-level group is almost always non-optional and can be omitted for length | |||
| // (ie. "(?:a|b|c)" can just be "a|b|c"). | |||
| boolean canSkipParens = isTopLevelGroup && !isOptional; | |||
| // Unset this before recursing. | |||
| isTopLevelGroup = false; | |||
| // We have exactly one case where an "any" edge needs to be handled for groups, and that's | |||
| // when there's an optional any group that's not part of an enclosing sequence (e.g. "(xx)?"). | |||
| if (edges.size() == 1 && isOptional) { | |||
| Optional<AnyPath> any = AnyPathVisitor.extractAnyPath(Iterables.getOnlyElement(edges)); | |||
| if (any.isPresent()) { | |||
| // Remember to account for the optionality of the outer group. | |||
| appendRegex(out, any.get().makeOptional().mask()); | |||
| return; | |||
| } | |||
| } | |||
| if (!canSkipParens) { | |||
| out.append(GROUP_START); | |||
| } | |||
| for (Edge e : edges) { | |||
| e.accept(this); | |||
| out.append(GROUP_DISJUNCTION); | |||
| } | |||
| // Easier to just remove the disjunction we know was added last than track state in the loop. | |||
| out.setLength(out.length() - GROUP_DISJUNCTION.length()); | |||
| if (!canSkipParens) { | |||
| out.append(isOptional ? OPTIONAL_GROUP_END : GROUP_END); | |||
| } | |||
| } | |||
| /** | |||
| * Recursive visitor to extract "any" sequences from edges (simple or composite). A sequence of | |||
| * edges is an "any path" if all edges accept any digit. Composite edges already enforce the | |||
| * requirement that epsilon edges don't exist directly (they are represented via optionality). | |||
| */ | |||
| private static final class AnyPathVisitor implements Visitor { | |||
| /** | |||
| * Returns the longest "any" sequence represented by the given edge (if the edge represents an | |||
| * any sequence). If present, the result is non-empty. | |||
| */ | |||
| @Nullable | |||
| public static Optional<AnyPath> extractAnyPath(Edge e) { | |||
| AnyPathVisitor visitor = new AnyPathVisitor(); | |||
| e.accept(visitor); | |||
| return Optional.ofNullable(visitor.path); | |||
| } | |||
| // Accumulate value during visitation and set to null to abort. | |||
| @Nullable | |||
| private AnyPath path = AnyPath.EMPTY; | |||
| @Override | |||
| public void visit(SimpleEdge edge) { | |||
| checkState(path != null, "path should never be null at start of recursion"); | |||
| if (edge.getDigitMask() == ALL_DIGITS_MASK) { | |||
| path = path.join(edge.isOptional() ? AnyPath.OPTIONAL : AnyPath.SINGLE); | |||
| } else { | |||
| path = null; | |||
| } | |||
| } | |||
| @Override | |||
| public void visitSequence(List<Edge> edges) { | |||
| checkState(path != null, "path should never be null at start of recursion"); | |||
| // Looking for a complete sequence of "any edges" (partial sequences in a concatenation are | |||
| // taken care of by the caller). | |||
| for (Edge e : edges) { | |||
| Optional<AnyPath> next = AnyPathVisitor.extractAnyPath(e); | |||
| if (next.isPresent()) { | |||
| path = path.join(next.get()); | |||
| } else { | |||
| path = null; | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| @Override | |||
| public void visitGroup(Set<Edge> edges, boolean isOptional) { | |||
| checkState(path != null, "path should never be null at start of recursion"); | |||
| // Looking for a group like (xxx(xx)?)? which contains one edge only. We just recurse into | |||
| // that edge and then make the result optional (a disjuction with only one edge must be | |||
| // optional or else it should have been a concatenation). | |||
| if (edges.size() > 1) { | |||
| path = null; | |||
| return; | |||
| } | |||
| checkState(isOptional, "single edge disjunctions should be optional"); | |||
| Edge e = Iterables.getOnlyElement(edges); | |||
| e.accept(this); | |||
| if (path != null) { | |||
| path = path.makeOptional(); | |||
| } | |||
| } | |||
| } | |||
| // The code below here is really a bit squiffy and relies on a whole bunch of bit fiddling to | |||
| // do what it does. The good news is that it's easy to unit-test the heck out of, so that's | |||
| // what I've done. Don't look too hard at what's going on unless you're a bit of a masochist. | |||
| /** | |||
| * Appends the regular expression corresponding to the given AnyPath mask value. This is a | |||
| * bit-mask where the Nth bit corresponds to accepting an any digit sequence of length N. | |||
| * | |||
| * <p>For example: | |||
| * <ul> | |||
| * <li> {@code 00000010} accepts only length 1 (e.g. "\d") | |||
| * <li> {@code 00000011} accepts lengths 0 or 1 (e.g. "\d?") | |||
| * <li> {@code 00001000} accepts only length 3 (e.g. "\d{3}") | |||
| * <li> {@code 00011100} accepts lengths 2-4 (e.g. "\d{2,4}") | |||
| * <li> {@code 11101100} accepts lengths 0,2,3,5,6,7 (e.g. "(?:\d\d(?:\d(?:\d{2,4})?)?)?") | |||
| * </ul> | |||
| */ | |||
| private void appendRegex(StringBuilder out, int mask) { | |||
| checkArgument(mask > 1, "unexpected mask value %s", mask); | |||
| // Deal with optionality separately. | |||
| boolean allOptional = (mask & 0x1) != 0; | |||
| mask &= ~0x1; | |||
| // We are looking for bit patterns like '1111000' for contiguous ranges (e.g. {3,7}). | |||
| // Find the lo/hi size of the next contiguous range (inclusive). | |||
| int lo = Integer.numberOfTrailingZeros(mask); | |||
| int hi = Integer.numberOfTrailingZeros(~(mask >>> lo)) + (lo - 1); | |||
| // If all the bits are accounted for (nothing above the "hi" bit) then this was the last | |||
| // contiguous range and we don't need to recurse (so no more groups need to be opened). | |||
| if (mask < (1 << (hi + 1))) { | |||
| // Writes a contiguous range as a single token with optionality (e.g. "\d", "(?:\d{2,4})?"). | |||
| appendAnyRange(out, lo, hi, allOptional); | |||
| return; | |||
| } | |||
| // This is about the entire group, not the subgroup we are about to recurse into. | |||
| if (allOptional) { | |||
| out.append(GROUP_START); | |||
| } | |||
| // IMPORTANT: If we are recursing, we must not attempt to emit the entire group here, only the | |||
| // shortest matching length. | |||
| // | |||
| // Mask "11101100" does NOT represent "\d{2,3}(?:\d{2,4})?" as that can match 4-digits too. | |||
| // Instead it should generate "\d\d(?:\d(?:\d{2,4})?)?", where the 3 digit match is part of an | |||
| // optional group. | |||
| appendRequiredAnyRange(out, lo); | |||
| // Recurse using the mask that's had the match we just emitted "factored out". This is always | |||
| // optional because bit-0 is what was the lowest set bit in our mask. | |||
| appendRegex(out, mask >>> lo); | |||
| if (allOptional) { | |||
| out.append(OPTIONAL_GROUP_END); | |||
| } | |||
| } | |||
| /** | |||
| * Appends regular expression tokens that accept any digits for a single length. | |||
| * | |||
| * <p>For example: | |||
| * <ol> | |||
| * <li>{@code n=1}: {@code "\d"} | |||
| * <li>{@code n=2}: {@code "\d\d"} (this could be extended if using '.') | |||
| * <li>{@code otherwise}: {@code "\d{n}"} | |||
| * </ol> | |||
| */ | |||
| private void appendRequiredAnyRange(StringBuilder out, int n) { | |||
| checkArgument(n >= 1, "bad any length %s", n); | |||
| out.append(anyToken); | |||
| if (n == 2) { | |||
| // Only safe to do this if the group is not optional ("\d\d?" != "(?:\d{2})?"). | |||
| out.append(anyToken); | |||
| } else if (n > 2) { | |||
| out.append('{').append(n).append('}'); | |||
| } | |||
| } | |||
| /** | |||
| * Appends regular expression tokens that accept any digits in a contiguous range of lengths. | |||
| * | |||
| * <p>For example: | |||
| * <ol> | |||
| * <li>{@code lo=1, hi=1, optional=false}: {@code "\d"} | |||
| * <li>{@code lo=1, hi=1, optional=true}: {@code "\d?"} | |||
| * <li>{@code lo=2, hi=2, optional=true}: {@code "(?:\d{2})?"} | |||
| * <li>{@code lo=3, hi=6, optional=false}: {@code "\d{3,6}"} | |||
| * <li>{@code lo=3, hi=6, optional=true}: {@code "(?:\d{3,6})?"} | |||
| * <li>{@code lo=1, hi=4, optional=true}: {@code "\d{0,4}"} (not {@code (?:\d{1,4})?}) | |||
| * <li>{@code lo=2, hi=2, optional=false}: {@code "\d\d"} (special case for size) | |||
| * <li>{@code lo=1, hi=2, optional=false}: {@code "\d\d?"} (special case for size) | |||
| * </ol> | |||
| */ | |||
| private void appendAnyRange(StringBuilder out, int lo, int hi, boolean optional) { | |||
| checkArgument(lo >= 1 && hi >= lo, "bad range arguments %s, %s", lo, hi); | |||
| if (lo == hi) { | |||
| if (!optional) { | |||
| // Required single length. | |||
| appendRequiredAnyRange(out, lo); | |||
| } else { | |||
| // Optional single length. | |||
| if (lo > 1) { | |||
| out.append(GROUP_START).append(anyToken); | |||
| out.append('{').append(lo).append('}'); | |||
| out.append(OPTIONAL_GROUP_END); | |||
| } else { | |||
| out.append(anyToken).append(OPTIONAL_MARKER); | |||
| } | |||
| } | |||
| } else if (lo == 1 && hi == 2 && !optional) { | |||
| // Special case for "\d\d?" as it's shorter than "\d{1,2}" (and even shorter with '.'). | |||
| // Even though we append the "optional marker" (i.e. '?') here it's got nothing to do | |||
| // with the entire group being optional. That would be "(?:\d{1,2})?" which is "\d{0,2}". | |||
| out.append(anyToken).append(anyToken).append(OPTIONAL_MARKER); | |||
| } else if (lo == 1 && optional) { | |||
| // Special case to write "\d{0,N}" instead of "(?:\d{1,N})?" | |||
| out.append(anyToken).append("{0,").append(hi).append('}'); | |||
| } else { | |||
| if (optional) { | |||
| out.append(GROUP_START); | |||
| } | |||
| // General case. | |||
| out.append(anyToken).append('{').append(lo).append(',').append(hi).append('}'); | |||
| if (optional) { | |||
| out.append(OPTIONAL_GROUP_END); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @ -0,0 +1,195 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import com.google.auto.value.AutoValue; | |||
| import com.google.common.annotations.VisibleForTesting; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.collect.Iterables; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import java.util.ArrayList; | |||
| import java.util.Comparator; | |||
| import java.util.HashMap; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| import java.util.PriorityQueue; | |||
| import java.util.function.Function; | |||
| /** | |||
| * Flattens an NFA graph of simple edges into a composite edge which represents all the same | |||
| * transitions in a strict tree structure (i.e. nestable sub-groups). This can entail some | |||
| * duplication of edges, but this should be kept to a minimum and favours duplicating trailing | |||
| * paths to avoid introducing additional non-determinism. | |||
| */ | |||
| final class NfaFlattener { | |||
| /** | |||
| * Flattens the given NFA graph into a single composite edge composed of concatenation and | |||
| * disjunction. The resulting edge can be visited using the {@code Edge.Visitor} class. | |||
| */ | |||
| public static Edge flatten(ValueGraph<Node, SimpleEdge> graph) { | |||
| return new NfaFlattener(graph).flatten(); | |||
| } | |||
| /* | |||
| * A simple pair of edge value and target node which represents the current state along any path | |||
| * in the NFA graph. Path followers may be joined (if they point at the same node) but can only | |||
| * be split by recursion into the new subgraph. | |||
| */ | |||
| @AutoValue | |||
| abstract static class PathFollower { | |||
| private static PathFollower of(Node node, Edge edge) { | |||
| return new AutoValue_NfaFlattener_PathFollower(node, edge); | |||
| } | |||
| /** The target node that this follower points to. */ | |||
| abstract Node node(); | |||
| /** A composite edge representing everything up to the target node in the current sub-graph. */ | |||
| abstract Edge edge(); | |||
| } | |||
| // The graph being flattened. | |||
| private final ValueGraph<Node, SimpleEdge> graph; | |||
| // An ordering for the work queue which ensures that followers with the same node are adjacent. | |||
| private final Comparator<PathFollower> queueOrder; | |||
| private NfaFlattener(ValueGraph<Node, SimpleEdge> graph) { | |||
| this.graph = graph; | |||
| this.queueOrder = Comparator | |||
| .comparing(PathFollower::node, nodeOrdering(graph)) | |||
| .thenComparing(PathFollower::edge); | |||
| } | |||
| private Edge flatten() { | |||
| // Sub-graph visitation only works for graphs which branch from and collapse to a single node. | |||
| // An NFA graph could be multiple sequential edges or a sequence of edges and sub-graphs. | |||
| // Handle that in this outer loop rather than complicate the visitor (already quite complex). | |||
| PathFollower out = visitSubgraph(Node.INITIAL); | |||
| while (out.node() != Node.TERMINAL) { | |||
| PathFollower subgraph = visitSubgraph(out.node()); | |||
| out = PathFollower.of(subgraph.node(), Edge.concatenation(out.edge(), subgraph.edge())); | |||
| } | |||
| return out.edge(); | |||
| } | |||
| /** | |||
| * Visits the sub-graph rooted at the given node, following all out-edges until they eventually | |||
| * re-join. Because the given graph has only one terminal node and no cycles, all sub-graphs must | |||
| * eventually rejoin at some point. If during visitation of a sub-graph, a node with multiple | |||
| * out-edges is reached, then the sub-graph it starts is recursively visited. Note that as "inner" | |||
| * sub-graphs must terminate at or before their parent graph, nesting is assured. | |||
| * | |||
| * <p>The key to the implementation of this algorithm is that visitation occurs in breadth-first | |||
| * order defined according to the reachability of the nodes in the graph. This ensures that when | |||
| * an edge follower which reaches a node at which other edges join together is processed (i.e. | |||
| * when it gets to the head of the queue) all the other followers that can also reach that node | |||
| * must also be present in a contiguous sequence at the front of the queue. | |||
| */ | |||
| private PathFollower visitSubgraph(Node node) { | |||
| Preconditions.checkArgument(graph.outDegree(node) > 0, "cannot recurse from the terminal node"); | |||
| if (graph.outDegree(node) == 1) { | |||
| // Visit the trivial "subgraph" that's really just a single edge. Note that this code could | |||
| // loop and concatenate all sequential single edges, but it also works fine to rely on the | |||
| // recursion of the caller (the advantage of doing it this, simpler, way means that this code | |||
| // doesn't have to know about termination due to reaching the terminal node). | |||
| Node target = Iterables.getOnlyElement(graph.successors(node)); | |||
| return PathFollower.of(target, graph.edgeValue(node, target).get()); | |||
| } | |||
| // A work-queue of the path followers, ordered primarily by the node they target. This results | |||
| // in the followers at any "point of collapse" being adjacent in the queue. | |||
| PriorityQueue<PathFollower> followerQueue = new PriorityQueue<>(queueOrder); | |||
| for (Node t : graph.successors(node)) { | |||
| followerQueue.add(PathFollower.of(t, graph.edgeValue(node, t).get())); | |||
| } | |||
| while (true) { | |||
| // Get the set of followers that share the same target node at the head of the queue. The | |||
| // ordering in the queue ensures that followers for the same target are always adjacent. | |||
| PathFollower follower = followerQueue.remove(); | |||
| Node target = follower.node(); | |||
| List<Edge> joiningEdges = collectJoiningEdges(followerQueue, target); | |||
| if (joiningEdges != null) { | |||
| // Replace any joined followers with their disjunction (they all have the same target). | |||
| joiningEdges.add(follower.edge()); | |||
| follower = PathFollower.of(target, Edge.disjunction(joiningEdges)); | |||
| } | |||
| if (followerQueue.isEmpty()) { | |||
| // If we just processed the last "joining" paths then this sub-graph has been collapsed | |||
| // into a single edge and we just return the current follower. Note that we can join edges | |||
| // without ending recursion (when 3 followers become 2) but we can only end recursion after | |||
| // joining at least 2 edges at the terminal sub-graph node. | |||
| return follower; | |||
| } | |||
| // Recurse into the next sub-graph (possibly just a single edge) which is just concatenated | |||
| // onto the current follower. | |||
| PathFollower subgraph = visitSubgraph(target); | |||
| followerQueue.add( | |||
| PathFollower.of(subgraph.node(), Edge.concatenation(follower.edge(), subgraph.edge()))); | |||
| } | |||
| } | |||
| // Collects the edges of any followers at the front of the queue which share the same target node | |||
| // as the given follower. If the node is not a target of any other followers then return null. | |||
| private static List<Edge> collectJoiningEdges(PriorityQueue<PathFollower> queue, Node target) { | |||
| // It's really common for edges not to join, so avoid making the list unless necessary. | |||
| if (!nextFollowerJoinsTarget(queue, target)) { | |||
| return null; | |||
| } | |||
| List<Edge> joiningEdges = new ArrayList<>(); | |||
| do { | |||
| joiningEdges.add(queue.remove().edge()); | |||
| } while (nextFollowerJoinsTarget(queue, target)); | |||
| return joiningEdges; | |||
| } | |||
| // Checks if the head of the queue is a follower with the same target node. | |||
| private static boolean nextFollowerJoinsTarget(PriorityQueue<PathFollower> queue, Node target) { | |||
| return !queue.isEmpty() && queue.peek().node().equals(target); | |||
| } | |||
| /** | |||
| * Returns a total ordering of nodes in this graph based on the maximum path length from the | |||
| * initial node. If path lengths are equal for two nodes, then the node ID is used to tie break. | |||
| * | |||
| * <p>The property of this ordering that is critical to the node flattening algorithm is that if | |||
| * {@code a < b}, then no path exists in the graph where {@code b} precedes {@code a}. This | |||
| * ensures that path followers are processed consistently with the "node reachability" and if | |||
| * several path followers target the same node, then they are adjacent in the follower queue. | |||
| * | |||
| * <p>Using the node ID as a tie-break is safe, because while node IDs are assigned arbitrarily, | |||
| * they only apply between nodes in the same path length "bucket", so it cannot violate the total | |||
| * ordering requirement, since any order within a "bucket" is equally good. | |||
| */ | |||
| // Note: If there are graph cycles this will not terminate, but that implies bad bugs elsewhere. | |||
| @VisibleForTesting | |||
| static Comparator<Node> nodeOrdering(ValueGraph<Node, ?> graph) { | |||
| Map<Node, Integer> map = new HashMap<>(); | |||
| recursivelySetMaxPathLength(Node.INITIAL, 0, graph, map); | |||
| // We have to cast the "get" method since it accepts "Object", not "Node" on a map. | |||
| return Comparator.comparing((Function<Node, Integer>) map::get).thenComparing(Node::id); | |||
| } | |||
| private static void recursivelySetMaxPathLength( | |||
| Node node, int length, ValueGraph<Node, ?> graph, Map<Node, Integer> map) { | |||
| // Only continue if at least some paths can be lengthened from here onwards. | |||
| if (length > map.getOrDefault(node, -1)) { | |||
| map.put(node, length); | |||
| for (Node target : graph.successors(node)) { | |||
| recursivelySetMaxPathLength(target, length + 1, graph, map); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @ -0,0 +1,51 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import com.google.auto.value.AutoValue; | |||
| /** | |||
| * Value type for nodes in NFA graphs of phone number regular expressions. This is basically a | |||
| * trivial wrapper for an {@code int}, but it makes a lot of other pieces of code type safe. | |||
| * Outside this package, this type is mainly used for examining NFA graphs which represent a | |||
| * regular expression, generated via {@link RangeTreeConverter#toNfaGraph}. | |||
| */ | |||
| @AutoValue | |||
| public abstract class Node implements Comparable<Node> { | |||
| /** The unique initial node in an NFA graph with in-order zero. */ | |||
| public static final Node INITIAL = new AutoValue_Node(0); | |||
| /** The unique terminal node in an NFA graph with out-order zero. */ | |||
| public static final Node TERMINAL = new AutoValue_Node(1); | |||
| /** Returns a new node whose ID is one greater than this node. */ | |||
| public Node createNext() { | |||
| return (id() == 0) ? TERMINAL : new AutoValue_Node(id() + 1); | |||
| } | |||
| /** Returns the numeric ID of this node, which must be unique within an NFA graph. */ | |||
| abstract int id(); | |||
| @Override | |||
| public int compareTo(Node o) { | |||
| return Integer.compare(id(), o.id()); | |||
| } | |||
| @Override | |||
| public final String toString() { | |||
| return Integer.toString(id()); | |||
| } | |||
| } | |||
| @ -0,0 +1,123 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.base.Preconditions.checkState; | |||
| import com.google.common.graph.ElementOrder; | |||
| import com.google.common.graph.MutableValueGraph; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.common.graph.ValueGraphBuilder; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import java.util.HashMap; | |||
| import java.util.Map; | |||
| /** | |||
| * Converts DFA {@link RangeTree}s to NFA {@link ValueGraph}s. The resulting graph has almost | |||
| * exactly the same node and edge structure as the original DFA, with the following exceptions: | |||
| * <ol> | |||
| * <li>Nodes which could optionally terminate now have 'epsilon' edges connecting them to the | |||
| * terminal node. | |||
| * <li>If an optionally terminating node connects directly to the terminal node, then a special | |||
| * "optional edge" is used (this is because the {@link ValueGraph} structure allows only one | |||
| * value for each edge, so you can't have an epsilon edge that goes between the same source and | |||
| target as another edge). | |||
| * </ol> | |||
| */ | |||
| public final class RangeTreeConverter { | |||
| /** | |||
| * Returns the directed NFA graph representation of a {@link RangeTree}. The returned graph is | |||
| * not a DFA and may contain epsilon transitions. Nodes are assigned in visitation order, except | |||
| * for the initial and terminal nodes which are always present in the graph. | |||
| */ | |||
| public static ValueGraph<Node, SimpleEdge> toNfaGraph(RangeTree ranges) { | |||
| NfaVisitor visitor = new NfaVisitor(ranges.getInitial()); | |||
| ranges.accept(visitor); | |||
| return visitor.graph; | |||
| } | |||
| private static class NfaVisitor implements DfaVisitor { | |||
| private final MutableValueGraph<Node, SimpleEdge> graph = ValueGraphBuilder | |||
| .directed() | |||
| .allowsSelfLoops(false) | |||
| // Stable ordering should help keep any generated structures (regex, graph files) stable. | |||
| .nodeOrder(ElementOrder.<Node>natural()) | |||
| .build(); | |||
| // Map of nodes added to the new graph (keyed by the corresponding DFA node). | |||
| private final Map<DfaNode, Node> nodeMap = new HashMap<>(); | |||
| // The last node we added. | |||
| private Node lastAdded; | |||
| private NfaVisitor(DfaNode initial) { | |||
| // Add initial and terminal nodes first (there's always exactly one of each). | |||
| graph.addNode(Node.INITIAL); | |||
| graph.addNode(Node.TERMINAL); | |||
| // During visitation we check only target nodes to add epsilon edges, but we may also need | |||
| // to add an epsilon from the very top if the DFA can match the empty input. | |||
| if (initial.canTerminate()) { | |||
| graph.putEdgeValue(Node.INITIAL, Node.TERMINAL, Edge.epsilon()); | |||
| } | |||
| nodeMap.put(initial, Node.INITIAL); | |||
| nodeMap.put(RangeTree.getTerminal(), Node.TERMINAL); | |||
| lastAdded = Node.TERMINAL; | |||
| } | |||
| @Override | |||
| public void visit(DfaNode dfaSource, DfaEdge dfaEdge, DfaNode dfaTarget) { | |||
| SimpleEdge simpleEdge = Edge.fromMask(dfaEdge.getDigitMask()); | |||
| Node source = nodeMap.get(dfaSource); | |||
| Node target = getTarget(dfaTarget); | |||
| boolean wasNewNode = graph.addNode(target); | |||
| // The only chance of an existing edge is if an epsilon was already added immediately before | |||
| // visiting this edge. This can only occur if (target == TERMINAL) however. | |||
| SimpleEdge epsilon = graph.putEdgeValue(source, target, simpleEdge); | |||
| if (epsilon != null) { | |||
| checkState(target.equals(Node.TERMINAL) && epsilon.equals(Edge.epsilon()), | |||
| "unexpected edge during visitation: %s -- %s --> %s", source, epsilon, target); | |||
| // Re-add the edge, but this time make it optional (because that's what epsilon means). | |||
| graph.putEdgeValue(source, target, simpleEdge.optional()); | |||
| } | |||
| // Only recurse if the target node was newly added to the graph in this visitation. | |||
| if (wasNewNode) { | |||
| // The TERMINAL node is always in the map so (target != TERMINAL) here. This means we | |||
| // never risk adding a loop in the graph. The epsilon may end up being swapped out for | |||
| // an optional edge when we visit the dfaTarget, but that's fine. | |||
| if (dfaTarget.canTerminate()) { | |||
| graph.putEdgeValue(target, Node.TERMINAL, Edge.epsilon()); | |||
| } | |||
| dfaTarget.accept(this); | |||
| } | |||
| } | |||
| // Gets or creates a new target node, adding it to the node map (but not to the graph itself). | |||
| private Node getTarget(DfaNode gnode) { | |||
| Node target = nodeMap.get(gnode); | |||
| if (target != null) { | |||
| return target; | |||
| } | |||
| lastAdded = lastAdded.createNext(); | |||
| nodeMap.put(gnode, lastAdded); | |||
| return lastAdded; | |||
| } | |||
| } | |||
| private RangeTreeConverter() {} | |||
| } | |||
| @ -0,0 +1,118 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import com.google.common.base.CharMatcher; | |||
| import com.google.common.base.Preconditions; | |||
| /** | |||
| * Simple indenting formatter for regular expressions and other similar nested syntax. Obviously | |||
| * the results are not the same from a match perspective as the new string contains whitespace. | |||
| */ | |||
| public final class RegexFormatter { | |||
| /** Option for how to handle formatting of groups. */ | |||
| public enum FormatOption { | |||
| PRESERVE_CAPTURING_GROUPS, | |||
| FORCE_NON_CAPTURING_GROUPS, | |||
| FORCE_CAPTURING_GROUPS, | |||
| } | |||
| // We only care about 3 specific tokens, so this code can be used to print strings which look | |||
| // similar (nested, disjunctive groups) such as the toString() of the Edge class. | |||
| private static final CharMatcher tokens = CharMatcher.anyOf("()|"); | |||
| /** | |||
| * Formats a regular expression (or similar nested group syntax) using the following rules: | |||
| * <ol> | |||
| * <li>Newline after opening '(?:' and increase indent. | |||
| * <li>Newline after '|' | |||
| * <li>Decrease indent and add newline before closing ')' | |||
| * </ol> | |||
| */ | |||
| public static String format(String regex, FormatOption formatOption) { | |||
| return new RegexFormatter(regex, formatOption).format(); | |||
| } | |||
| private final StringBuilder out = new StringBuilder(); | |||
| private final String regex; | |||
| private final FormatOption formatOption; | |||
| private RegexFormatter(String regex, FormatOption formatOption) { | |||
| this.regex = CharMatcher.whitespace().removeFrom(regex); | |||
| this.formatOption = Preconditions.checkNotNull(formatOption); | |||
| } | |||
| private String format() { | |||
| recurse(0, 0); | |||
| return out.toString(); | |||
| } | |||
| // Assume at line start. | |||
| private int recurse(int pos, int level) { | |||
| while (pos < regex.length()) { | |||
| indent(level); | |||
| // Optionally printing closing group from previous recursion. | |||
| if (regex.charAt(pos) == ')') { | |||
| out.append(')'); | |||
| pos++; | |||
| } | |||
| int nextToken = tokens.indexIn(regex, pos); | |||
| if (nextToken == -1) { | |||
| out.append(regex.substring(pos, regex.length())); | |||
| return regex.length(); | |||
| } | |||
| out.append(regex.substring(pos, nextToken)); | |||
| pos = nextToken; | |||
| switch (regex.charAt(pos)) { | |||
| case '(': | |||
| out.append("("); | |||
| pos++; | |||
| if (regex.indexOf("?:", pos) == pos) { | |||
| if (formatOption != FormatOption.FORCE_CAPTURING_GROUPS) { | |||
| out.append("?:"); | |||
| } | |||
| pos += 2; | |||
| } else if (formatOption == FormatOption.FORCE_NON_CAPTURING_GROUPS) { | |||
| out.append("?:"); | |||
| } | |||
| out.append('\n'); | |||
| pos = recurse(pos, level + 1); | |||
| break; | |||
| case '|': | |||
| out.append("|\n"); | |||
| pos++; | |||
| break; | |||
| case ')': | |||
| // Just exit recursion and let the parent write the ')', so don't update our position. | |||
| out.append("\n"); | |||
| return pos; | |||
| default: | |||
| throw new AssertionError(); | |||
| } | |||
| } | |||
| return pos; | |||
| } | |||
| private void indent(int level) { | |||
| while (level-- > 0) { | |||
| out.append(" "); | |||
| } | |||
| } | |||
| } | |||
| @ -0,0 +1,171 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.ALLOW_EDGE_SPLITTING; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES; | |||
| import static java.util.stream.Collectors.joining; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import java.util.Optional; | |||
| /** Produces partially optimized regular expressions from {@code RangeTree}s. */ | |||
| public final class RegexGenerator { | |||
| private static final RegexGenerator BASIC = new RegexGenerator(false, false, false, false); | |||
| // NOTE: Tail optimization should remain disabled since it seems to undo some of the benefits of | |||
| // subgroup optimization. At some point the code can probably just be removed. | |||
| private static final RegexGenerator DEFAULT_XML = | |||
| BASIC.withDfaFactorization().withSubgroupOptimization(); | |||
| /** | |||
| * Returns a basic regular expression generator with no optional optimizations enabled. This will | |||
| * produce regular expressions with a simpler structure than other generators but output will | |||
| * almost always be longer. | |||
| */ | |||
| public static RegexGenerator basic() { | |||
| return BASIC; | |||
| } | |||
| /** | |||
| * Returns the default regex generator for XML data. This should be used by any tool wishing to | |||
| * obtain the same regular expressions as the legacy XML data. It is deliberately not specified | |||
| * as to which optimizations are enabled for this regular expression generator. | |||
| */ | |||
| public static RegexGenerator defaultXmlGenerator() { | |||
| return DEFAULT_XML; | |||
| } | |||
| /** | |||
| * Returns a new regular expression generator which uses the {@code '.'} token for matching any | |||
| * digit (rather than {@code '\d'}). This results in shorter output, but possibly at the cost of | |||
| * performance on certain platforms (and a degree of readability). | |||
| */ | |||
| public RegexGenerator withDotMatch() { | |||
| Preconditions.checkState(!this.useDotMatch, "Dot-matching already enabled"); | |||
| return new RegexGenerator(true, this.factorizeDfa, this.optimizeSubgroups, this.optimizeTail); | |||
| } | |||
| /** | |||
| * Returns a new regular expression generator which applies a length-based factorization of the | |||
| * DFA graph in an attempt to reduce the number of problematic terminating states. This results | |||
| * in regular expressions with additional non-determinism, but which can greatly reduce size. | |||
| */ | |||
| public RegexGenerator withDfaFactorization() { | |||
| Preconditions.checkState(!this.factorizeDfa, "Length based factorizing already enabled"); | |||
| return new RegexGenerator(this.useDotMatch, true, this.optimizeSubgroups, this.optimizeTail); | |||
| } | |||
| /** | |||
| * Returns a new regular expression generator which applies experimental factorization of the | |||
| * DFA graph in an attempt to identify and handle subgroups which would cause repetition. This | |||
| * results in regular expressions with additional non-determinism, but which can greatly reduce | |||
| * size. | |||
| */ | |||
| public RegexGenerator withSubgroupOptimization() { | |||
| Preconditions.checkState(!this.optimizeSubgroups, "Subgroup optimization already enabled"); | |||
| return new RegexGenerator(this.useDotMatch, this.factorizeDfa, true, this.optimizeTail); | |||
| } | |||
| /** | |||
| * Returns a new regular expression generator which applies tail optimization to the intermediate | |||
| * NFA graph to factor out common trailing paths. This results in a small size improvement to | |||
| * many cases and does not adversely affect readability. | |||
| */ | |||
| public RegexGenerator withTailOptimization() { | |||
| Preconditions.checkState(!this.optimizeTail, "Tail optimization already enabled"); | |||
| return new RegexGenerator(this.useDotMatch, this.factorizeDfa, this.optimizeSubgroups, true); | |||
| } | |||
| private final boolean useDotMatch; | |||
| private final boolean factorizeDfa; | |||
| private final boolean optimizeSubgroups; | |||
| private final boolean optimizeTail; | |||
| private RegexGenerator( | |||
| boolean useDotMatch, boolean factorizeDfa, boolean optimizeSubgroups, boolean optimizeTail) { | |||
| this.useDotMatch = useDotMatch; | |||
| this.factorizeDfa = factorizeDfa; | |||
| this.optimizeSubgroups = optimizeSubgroups; | |||
| this.optimizeTail = optimizeTail; | |||
| } | |||
| /** | |||
| * Generates a regular expression from a range tree, applying the configured options for this | |||
| * generator. | |||
| */ | |||
| public String toRegex(RangeTree ranges) { | |||
| // The regex of the empty range is "a regex that matches nothing". This is meaningless. | |||
| checkArgument(!ranges.isEmpty(), | |||
| "cannot generate regular expression from empty ranges"); | |||
| // We cannot generate any regular expressions if there are no explicit state transitions in the | |||
| // graph (i.e. we can generate "(?:<re>)?" but only if "<re>" is non-empty). We just get | |||
| // "the regex that always immediately terminates after no input". This is also meaningless. | |||
| checkArgument(!ranges.getInitial().equals(RangeTree.getTerminal()), | |||
| "range tree must not contain only the empty digit sequence: %s", ranges); | |||
| String regex = generateFactorizedRegex(ranges); | |||
| if (optimizeSubgroups) { | |||
| regex = recursivelyOptimizeSubgroups(ranges, regex); | |||
| } | |||
| return regex; | |||
| } | |||
| private String recursivelyOptimizeSubgroups(RangeTree ranges, String regex) { | |||
| Optional<RangeTree> subgraphRanges = SubgroupOptimizer.extractRepeatingSubgraph(ranges); | |||
| if (subgraphRanges.isPresent()) { | |||
| RangeTree leftoverRanges = ranges.subtract(subgraphRanges.get()); | |||
| String leftoverRegex = generateFactorizedRegex(leftoverRanges); | |||
| leftoverRegex = recursivelyOptimizeSubgroups(leftoverRanges, leftoverRegex); | |||
| String optimizedRegex = leftoverRegex + "|" + generateFactorizedRegex(subgraphRanges.get()); | |||
| if (optimizedRegex.length() < regex.length()) { | |||
| regex = optimizedRegex; | |||
| } | |||
| } | |||
| return regex; | |||
| } | |||
| private String generateFactorizedRegex(RangeTree ranges) { | |||
| String regex = regexOf(ranges); | |||
| if (factorizeDfa) { | |||
| regex = generateFactorizedRegex(ranges, regex, REQUIRE_EQUAL_EDGES); | |||
| regex = generateFactorizedRegex(ranges, regex, ALLOW_EDGE_SPLITTING); | |||
| } | |||
| return regex; | |||
| } | |||
| private String generateFactorizedRegex(RangeTree dfa, String bestRegex, MergeStrategy strategy) { | |||
| String factoredRegex = RangeTreeFactorizer.factor(dfa, strategy).stream() | |||
| .map(this::regexOf) | |||
| .collect(joining("|")); | |||
| return factoredRegex.length() < bestRegex.length() ? factoredRegex : bestRegex; | |||
| } | |||
| private String regexOf(RangeTree ranges) { | |||
| ValueGraph<Node, SimpleEdge> nfa = RangeTreeConverter.toNfaGraph(ranges); | |||
| if (optimizeTail) { | |||
| nfa = TrailingPathOptimizer.optimize(nfa); | |||
| } | |||
| return EdgeWriter.toRegex(NfaFlattener.flatten(nfa), useDotMatch); | |||
| } | |||
| } | |||
| @ -0,0 +1,190 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.common.base.Preconditions.checkNotNull; | |||
| import static com.google.common.collect.ImmutableList.toImmutableList; | |||
| import com.google.common.annotations.VisibleForTesting; | |||
| import com.google.common.collect.ImmutableList; | |||
| import com.google.common.collect.LinkedHashMultiset; | |||
| import com.google.common.collect.Multiset; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; | |||
| import java.util.ArrayList; | |||
| import java.util.List; | |||
| import java.util.Optional; | |||
| import java.util.stream.IntStream; | |||
| import javax.annotation.Nullable; | |||
| /** | |||
| * An optimization for RangeTree DFAs which attempts to isolate and extract subgraphs which would | |||
| * otherwise cause a lot of repetition in the generated regular expression. | |||
| */ | |||
| public final class SubgroupOptimizer { | |||
| /** | |||
| * Returns the subgraph which is likely to cause the most repetition in the regular expression | |||
| * of the given DFA. Subtracting the result out of the original range tree and generating two | |||
| * distinct regular expressions is likely to be shorter than the regular expression of the | |||
| * original range. | |||
| */ | |||
| public static Optional<RangeTree> extractRepeatingSubgraph(RangeTree ranges) { | |||
| return LinkNodeVisitor | |||
| .findBridgingNode(ranges) | |||
| .flatMap(n -> SubgraphExtractionVisitor.extractSubgraph(ranges, n)); | |||
| } | |||
| /** | |||
| * A visitor which applies two types of weights to every interior node in a DFA. | |||
| * <ul> | |||
| * <li>A count of incoming edges to that node. | |||
| * <li>A count of all edges in the subgraph rooted at that node. | |||
| * </ul> | |||
| * These are then multiplied together using the cost function: | |||
| * <pre>cost(n) = subgraph-weight(n) * (in-order(n) - 1)</pre> | |||
| * get get a proxy for the cost of additional duplicates likely to be created by this node. | |||
| */ | |||
| static class LinkNodeVisitor implements DfaVisitor { | |||
| // Reasonable approximation for the cost of an edge in a subgraph is the length of the | |||
| // corresponding range specification (it doesn't work so well for repeated edges like | |||
| // 'xxxxxxxx' --> "\d{8}", but it's good to help break ties in the cost function). | |||
| private static final ImmutableList<Integer> EDGE_WEIGHTS = | |||
| IntStream.rangeClosed(1, 0x3FF) | |||
| .mapToObj(m -> RangeSpecification.toString(m).length()) | |||
| .collect(toImmutableList()); | |||
| // Important to use "linked" multisets here (at least for the one we iterate over) since | |||
| // otherwise we end up with non-deterministic regular expression generation. | |||
| private final Multiset<DfaNode> inOrder = LinkedHashMultiset.create(); | |||
| private final Multiset<DfaNode> subgraphWeight = LinkedHashMultiset.create(); | |||
| /** | |||
| * Returns the interior node whose subgraph is likely to cause the most repetition in the | |||
| * regular expression of the given DFA. | |||
| */ | |||
| static Optional<DfaNode> findBridgingNode(RangeTree ranges) { | |||
| checkArgument(!ranges.isEmpty(), "cannot visit empty ranges"); | |||
| LinkNodeVisitor v = new LinkNodeVisitor(); | |||
| ranges.accept(v); | |||
| return Optional.ofNullable(v.getHighestCostNode()); | |||
| } | |||
| private static int getEdgeWeight(DfaEdge edge) { | |||
| // Subtract 1 since the array is 1-based (a zero edge mask is not legal). | |||
| return EDGE_WEIGHTS.get(edge.getDigitMask() - 1); | |||
| } | |||
| @VisibleForTesting | |||
| int getSubgraphWeight(DfaNode n) { | |||
| return subgraphWeight.count(n); | |||
| } | |||
| @VisibleForTesting | |||
| int getInOrder(DfaNode n) { | |||
| return inOrder.count(n); | |||
| } | |||
| // This returns null if no edge has a cost greater than zero. Since the cost function uses | |||
| // (in-order(n) - 1) this is trivially true for any graph where all interior nodes have only | |||
| // a single in-edge (the terminal node can have more than one in-edge, but it has a weight of | |||
| // zero and the initial node is never considered a candidate). | |||
| @VisibleForTesting | |||
| @Nullable | |||
| DfaNode getHighestCostNode() { | |||
| DfaNode node = null; | |||
| int maxWeight = 0; | |||
| for (DfaNode n : inOrder.elementSet()) { | |||
| int weight = getSubgraphWeight(n) * (getInOrder(n) - 1); | |||
| if (weight > maxWeight) { | |||
| maxWeight = weight; | |||
| node = n; | |||
| } | |||
| } | |||
| return node; | |||
| } | |||
| @Override | |||
| public void visit(DfaNode source, DfaEdge edge, DfaNode target) { | |||
| // The weight is zero only if we haven't visited this node before (or it's the terminal). | |||
| int targetWeight = subgraphWeight.count(target); | |||
| if (targetWeight == 0 && !target.equals(RangeTree.getTerminal())) { | |||
| target.accept(this); | |||
| targetWeight = subgraphWeight.count(target); | |||
| } | |||
| // Add an extra one for the edge we are processing now and increment our target's in-order. | |||
| subgraphWeight.add(source, targetWeight + getEdgeWeight(edge)); | |||
| inOrder.add(target); | |||
| } | |||
| } | |||
| /** | |||
| * A visitor to extract the subgraph of a DFA which passes through a specified interior | |||
| * "bridging" node. | |||
| */ | |||
| private static class SubgraphExtractionVisitor implements DfaVisitor { | |||
| private final DfaNode bridgingNode; | |||
| private final List<RangeSpecification> paths = new ArrayList<>(); | |||
| private RangeSpecification path = RangeSpecification.empty(); | |||
| private boolean sawBridgingNode = false; | |||
| private boolean splitHappens = false; | |||
| /** Returns the subgraph which passes through the specified node. */ | |||
| static Optional<RangeTree> extractSubgraph(RangeTree ranges, DfaNode node) { | |||
| SubgraphExtractionVisitor v = new SubgraphExtractionVisitor(node); | |||
| ranges.accept(v); | |||
| // Only return proper subgraphs. | |||
| return v.splitHappens ? Optional.of(RangeTree.from(v.paths)) : Optional.empty(); | |||
| } | |||
| private SubgraphExtractionVisitor(DfaNode bridgingNode) { | |||
| this.bridgingNode = checkNotNull(bridgingNode); | |||
| } | |||
| @Override | |||
| public void visit(DfaNode source, DfaEdge edge, DfaNode target) { | |||
| RangeSpecification oldPath = path; | |||
| path = path.extendByMask(edge.getDigitMask()); | |||
| // Potentially emit paths for any terminating node (not just the end of the graph). We have | |||
| // to extract the entire sub-graph _after_ the bridging node, including terminating nodes. | |||
| if (target.canTerminate()) { | |||
| // Emit path if we are "below" the bridging node. | |||
| if (sawBridgingNode) { | |||
| paths.add(path); | |||
| } else { | |||
| // Records that there were other paths not in the subgroup (since we only want to return | |||
| // a new DFA that's a proper subgraph of the original graph). | |||
| splitHappens = true; | |||
| } | |||
| } | |||
| if (target.equals(bridgingNode)) { | |||
| // Recurse with the flag set to emit paths once we hit the terminal node (note that the | |||
| // bridging node cannot be the terminal node). | |||
| sawBridgingNode = true; | |||
| target.accept(this); | |||
| sawBridgingNode = false; | |||
| } else { | |||
| // Recurse normally regardless of the flag. | |||
| target.accept(this); | |||
| } | |||
| path = oldPath; | |||
| } | |||
| } | |||
| } | |||
| @ -0,0 +1,206 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.collect.ImmutableList.toImmutableList; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; | |||
| import static java.util.Comparator.naturalOrder; | |||
| import static java.util.stream.Collectors.toList; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.collect.ImmutableList; | |||
| import com.google.common.graph.Graphs; | |||
| import com.google.common.graph.MutableValueGraph; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import java.util.Comparator; | |||
| import java.util.HashMap; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| import java.util.Map.Entry; | |||
| import java.util.Optional; | |||
| /** | |||
| * Optimizer for NFA graphs which attempts to restructure the trailing paths to maximize sharing | |||
| * and hopefully minimize the amount of duplication in the resulting regular expression. | |||
| */ | |||
| public final class TrailingPathOptimizer { | |||
| /** | |||
| * Optimizes an NFA graph to make trailing "any digit" sequences common where possible. In many | |||
| * cases this will result in no change to the structure of the NFA (common trailing paths are | |||
| * not a feature of every NFA), but in some cases a substantial reduction in duplication can | |||
| * occur. | |||
| * | |||
| * <p>This is equivalent to recognizing that {@code "12\d{2}\d{2}?|34\d{2}|56\d{3}"} can be | |||
| * written as {@code "(?:12\d{2}?|34|56\d)\d{2}"}. | |||
| */ | |||
| public static ValueGraph<Node, SimpleEdge> optimize(ValueGraph<Node, SimpleEdge> graph) { | |||
| MutableValueGraph<Node, SimpleEdge> out = Graphs.copyOf(graph); | |||
| // Build a map of trailing "any digit" sequences (key is the node it starts from). | |||
| Map<Node, AnyPath> anyPaths = new HashMap<>(); | |||
| recursivelyDetachTrailingPaths(Node.TERMINAL, AnyPath.EMPTY, out, anyPaths); | |||
| // If the terminal node has no "any digit" sequences leading to it, there's nothing we can do | |||
| // (well not in this simplistic algorithm anyway). This should almost never happen for phone | |||
| // number matching graphs as it implies a match expression that can terminate at a precise | |||
| // digit, rather than any digit. The only time this might occur is for short-codes, but due to | |||
| // their size it's likely to be fine if we don't try to aggressively optimize them. | |||
| if (anyPaths.size() == 1 && anyPaths.containsKey(Node.TERMINAL)) { | |||
| return graph; | |||
| } | |||
| // This is just a way to find a node from which we can start generating new nodes. | |||
| Node lastAddedNode = out.nodes().stream().max(naturalOrder()).get(); | |||
| // Process paths from short to long (since some paths are sub-paths of longer ones). | |||
| List<Node> shortestPathsFirst = anyPaths.entrySet().stream() | |||
| .sorted(Comparator.comparing(Entry::getValue)) | |||
| .map(Entry::getKey) | |||
| .collect(toList()); | |||
| Node pathEnd = Node.TERMINAL; | |||
| while (true) { | |||
| // Start with the next path that might be a factor of all the remaining paths. | |||
| Node shortestPathNode = shortestPathsFirst.get(0); | |||
| AnyPath shortestPath = anyPaths.get(shortestPathNode); | |||
| int pathsToFactor = shortestPathsFirst.size() - 1; | |||
| if (pathsToFactor == 0) { | |||
| // If all paths are factored, we're done. | |||
| break; | |||
| } | |||
| // Factor all the remaining paths by the shortest path (where a missing result means it | |||
| // cannot be factored). | |||
| ImmutableList<AnyPath> factored = shortestPathsFirst.stream() | |||
| .skip(1) | |||
| .map(n -> anyPaths.get(n).factor(shortestPath)) | |||
| .filter(Optional::isPresent) | |||
| .map(Optional::get) | |||
| .collect(toImmutableList()); | |||
| // If not all the remaining paths have the shortest path as a common factor, we're done (in | |||
| // this simplistic algorithm we don't consider cases where an AnyPath is the factor of some, | |||
| // but not all, other paths; we could but it's far less likely to reduce regex size). | |||
| if (factored.size() < pathsToFactor) { | |||
| break; | |||
| } | |||
| // Shortest path is a factor of all remaining paths, so add a new path to the graph for it. | |||
| lastAddedNode = addPath(shortestPathNode, pathEnd, shortestPath, lastAddedNode, out); | |||
| // We're done with this path, but might still be able to find more factors of remaining paths. | |||
| anyPaths.remove(shortestPathNode); | |||
| shortestPathsFirst.remove(0); // index, not value. | |||
| // The newly factored edges now replace the original factors in the map. | |||
| for (int n = 0; n < factored.size(); n++) { | |||
| Preconditions.checkState(anyPaths.containsKey(shortestPathsFirst.get(n))); | |||
| anyPaths.put(shortestPathsFirst.get(n), factored.get(n)); | |||
| } | |||
| // We now connect any new factored edges to the node we just added (not the terminal node). | |||
| pathEnd = shortestPathNode; | |||
| } | |||
| // If we exit, we must still reconnect any remaining, unfactored, paths to the graph. | |||
| for (Map.Entry<Node, AnyPath> e : anyPaths.entrySet()) { | |||
| lastAddedNode = addPath(e.getKey(), pathEnd, e.getValue(), lastAddedNode, out); | |||
| } | |||
| return out; | |||
| } | |||
/**
 * Walks backwards from {@code node} (initially the terminal node), detaching trailing edges and
 * recording the "any digit" sequence they formed as an {@code AnyPath}. Recursion halts at the
 * first node for which {@link #beginsAnAnyPath} holds; that node is stored in {@code anyPaths}
 * together with the path accumulated so far, since it must be reconnected to the graph later.
 * The graph is modified in place: traversed edges are deleted, and so is every node the
 * recursion was able to pass through.
 *
 * @param node the node currently being examined
 * @param path the AnyPath accumulated so far for {@code node}
 * @param g the graph being pruned
 * @param anyPaths receives one entry per node at which a detached AnyPath starts
 * @return {@code true} if {@code node} is the start of an AnyPath (recursion stopped here),
 *     {@code false} if recursion continued into its predecessors
 */
private static boolean recursivelyDetachTrailingPaths(
    Node node, AnyPath path, MutableValueGraph<Node, SimpleEdge> g, Map<Node, AnyPath> anyPaths) {
  boolean stopHere = beginsAnAnyPath(node, g);
  if (stopHere) {
    anyPaths.put(node, path);
    return true;
  }
  // Every incoming edge accepts all digits, so it is safe to keep going. Epsilon edges are not
  // followed. The predecessors are copied into a list first because the loop below mutates g.
  List<Node> predecessors = g.predecessors(node).stream()
      .filter(p -> !g.edgeValue(p, node).get().equals(Edge.epsilon()))
      .collect(toList());
  for (Node predecessor : predecessors) {
    // Detach everything above this predecessor first; the result tells us whether the
    // recursion had to stop at the predecessor itself.
    boolean pathStartsAtPredecessor = recursivelyDetachTrailingPaths(
        predecessor, path.extend(canTerminate(predecessor, g)), g, anyPaths);
    g.removeEdge(predecessor, node);
    // Drops the epsilon edge if there is one (a no-op otherwise). The other out-edge of the
    // predecessor accepts all digits, so an epsilon is the only other edge it could have had;
    // once both are gone no out-edges should remain.
    g.removeEdge(predecessor, Node.TERMINAL);
    Preconditions.checkState(
        g.outDegree(predecessor) == 0, "unexpected out edges in trailing graph");
    if (pathStartsAtPredecessor) {
      // This node is a key in anyPaths and must stay in the graph for reconnection.
      continue;
    }
    g.removeNode(predecessor);
  }
  return false;
}
| /** | |||
| * Returns whether the given node has incoming edges that do not just accept "any digit". This is | |||
| * the point at which recursion must stop since AnyPath can only represent "any digit" sequences. | |||
| */ | |||
| private static boolean beginsAnAnyPath(Node target, ValueGraph<Node, SimpleEdge> g) { | |||
| // Obviously we cannot recurse past the initial node. | |||
| if (target == Node.INITIAL) { | |||
| return true; | |||
| } | |||
| return g.predecessors(target).stream() | |||
| .map(s -> g.edgeValue(s, target).get()) | |||
| .filter(e -> !e.equals(Edge.epsilon())) | |||
| .anyMatch(e -> e.getDigitMask() != ALL_DIGITS_MASK); | |||
| } | |||
| /** | |||
| * Returns whether this node can terminate. This logic relies on the input graph not having had | |||
| * its epsilon edges moved (i.e. if an epsilon edge exists it must point to the terminal node). | |||
| * This also looks for special "optional" edges which exist when a non-epsilon edge already | |||
| * exists from this node to the terminal node. | |||
| */ | |||
| private static boolean canTerminate(Node node, ValueGraph<Node, SimpleEdge> g) { | |||
| return g.successors(node).stream() | |||
| .map(t -> g.edgeValue(node, t).get()) | |||
| .anyMatch(e -> e.isOptional() || e.equals(Edge.epsilon())); | |||
| } | |||
/**
 * Writes the given AnyPath into {@code out} as a chain of "any digit" edges running from
 * {@code node} to {@code end}, creating intermediate nodes as required.
 *
 * @param node the node the path starts from
 * @param end the node the path finishes at
 * @param path the AnyPath to materialise
 * @param lastAdded the most recently created node; new nodes are derived from it
 * @param out the graph being written to
 * @return the most recently created node after this call ({@code lastAdded} itself if no new
 *     node was needed)
 */
private static Node addPath(
    Node node, Node end, AnyPath path, Node lastAdded, MutableValueGraph<Node, SimpleEdge> out) {
  // An AnyPath always has length >= 1, so the index of the final edge is never negative.
  int finalEdge = path.maxLength() - 1;
  Node current = node;
  Node newest = lastAdded;
  for (int length = 0; length < finalEdge; length++) {
    if (path.acceptsLength(length)) {
      // The path may stop after 'length' digits, so allow skipping straight to the end.
      out.putEdgeValue(current, end, Edge.epsilon());
    }
    newest = newest.createNext();
    out.addNode(newest);
    out.putEdgeValue(current, newest, Edge.any());
    current = newest;
  }
  // The final edge already goes to 'end', so a parallel epsilon cannot be added alongside it;
  // the special "optional any" edge expresses the same thing.
  if (path.acceptsLength(finalEdge)) {
    out.putEdgeValue(current, end, Edge.optionalAny());
  } else {
    out.putEdgeValue(current, end, Edge.any());
  }
  return newest;
}
// Private constructor: instances cannot be created from outside this class.
private TrailingPathOptimizer() {}
| } | |||
| @ -0,0 +1,76 @@ | |||
| /* | |||
| * Copyright (C) 2022 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static org.junit.Assert.assertThrows; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
/** Tests for {@code LengthsParser.parseLengths}, covering rejected and accepted inputs. */
@RunWith(JUnit4.class)
public final class LengthsParserTest {

  @Test
  public void shouldThrowIfStringContainsForbiddenCharacters() {
    assertRejected("a-6,7");
    assertRejected("8, B, C");
    assertRejected("8, ,10");
    assertRejected("4, +7-9, +11");
  }

  @Test
  public void shouldThrowIfNumbersAreOutOfOrder() {
    assertRejected("9-7");
    assertRejected("8,12-11");
    assertRejected("5,4,7-8");
    assertRejected("6-8, 7-9");
  }

  @Test
  public void shouldThrowIfFormatIsWrong() {
    assertRejected("4-6-8");
    assertRejected("7-");
    assertRejected("3, -7");
    assertRejected("1 2-3 4, 5 6");
  }

  @Test
  public void testParseSingletons() {
    assertThat(LengthsParser.parseLengths("8")).containsExactly(8);
    assertThat(LengthsParser.parseLengths("14")).containsExactly(14);
  }

  @Test
  public void testParseCommaSeparatedNumbers() {
    assertThat(LengthsParser.parseLengths("6,8,9")).containsExactly(6, 8, 9);
    assertThat(LengthsParser.parseLengths("13, 14")).containsExactly(13, 14);
  }

  @Test
  public void testParseRanges() {
    assertThat(LengthsParser.parseLengths("6-8")).containsExactly(6, 7, 8);
    assertThat(LengthsParser.parseLengths("13 - 14")).containsExactly(13, 14);
  }

  @Test
  public void testParseComplex() {
    assertThat(LengthsParser.parseLengths("4,7,9-12")).containsExactly(4, 7, 9, 10, 11, 12);
    assertThat(LengthsParser.parseLengths("4-6, 8, 10-12")).containsExactly(4, 5, 6, 8, 10, 11, 12);
  }

  /** Asserts that parsing {@code spec} throws an {@link IllegalArgumentException}. */
  private static void assertRejected(String spec) {
    assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths(spec));
  }
}
| @ -0,0 +1,210 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.INVALID; | |||
| import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.MATCHED; | |||
| import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.TOO_LONG; | |||
| import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.TOO_SHORT; | |||
| import com.google.common.base.CharMatcher; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler; | |||
| import com.google.i18n.phonenumbers.metadata.regex.RegexGenerator; | |||
| import java.util.Arrays; | |||
| import java.util.regex.Pattern; | |||
| import org.junit.Assert; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class DigitSequenceMatcherTest { | |||
// Checks that digitsFromString("1234") yields the digits 1, 2, 3, 4 in order and then ends.
@Test public void testStringDigits() {
  DigitSequence sequence = DigitSequenceMatcher.digitsFromString("1234");
  for (int expectedDigit = 1; expectedDigit <= 4; expectedDigit++) {
    Assert.assertTrue(sequence.hasNext());
    Assert.assertEquals(expectedDigit, sequence.next());
  }
  Assert.assertFalse(sequence.hasNext());
}
// Exercises one-digit ranges: a literal digit, the "any digit" wildcard and a digit class.
@Test public void testSingleDigitMatching() {
  RangeTree zeroOnly = ranges("0");
  assertNotMatches(zeroOnly, INVALID, "1", "9");
  assertNotMatches(zeroOnly, TOO_LONG, "00");

  RangeTree anyDigit = ranges("x");
  assertMatches(anyDigit, "0", "5", "9");
  assertNotMatches(anyDigit, TOO_SHORT, "");
  assertNotMatches(anyDigit, TOO_LONG, "00");

  RangeTree twoToSix = ranges("[2-6]");
  assertMatches(twoToSix, "2", "3", "4", "5", "6");
  assertNotMatches(twoToSix, INVALID, "0", "1", "7", "8", "9");
  assertNotMatches(twoToSix, TOO_LONG, "26");
}
// Checks a range set where the last digit is optional ("12" and "123" are both valid).
@Test public void testOptional() {
  RangeTree dfa = ranges("12", "123");
  // Use the same tree for the positive and negative checks instead of rebuilding it.
  assertMatches(dfa, "12", "123");
  assertNotMatches(dfa, TOO_SHORT, "1");
  assertNotMatches(dfa, INVALID, "13");
  assertNotMatches(dfa, TOO_LONG, "1233");
}
// Checks ranges that share a prefix and differ only in the number of trailing wildcard digits.
@Test public void testRepetition() {
  RangeTree twoToFourWildcards = ranges("12xx", "12xxx", "12xxxx");
  assertMatches(twoToFourWildcards, "1234", "12345", "123456");
}
// Checks alternation between disjoint ranges, including ranges of different lengths.
@Test public void testOr() {
  RangeTree eitherPair = ranges("01", "23");
  assertMatches(eitherPair, "01", "23");
  assertNotMatches(eitherPair, INVALID, "03", "12");
  assertNotMatches(eitherPair, TOO_SHORT, "0", "2");
  assertNotMatches(eitherPair, TOO_LONG, "011", "233");

  RangeTree mixedLengths = ranges("01", "23", "45", "6789");
  assertMatches(mixedLengths, "01", "23", "45", "6789");
}
// Runs the matcher against a small multi-prefix range set. Spaces in the test numbers are only
// for readability; the assertion helpers strip them before matching.
@Test public void testRealRegexShort() {
  String[] specs = {
    "11[2-7]xxxxxxx",
    "2[02][2-7]xxxxxxx",
    "33[2-7]xxxxxxx",
    "4[04][2-7]xxxxxxx",
    "79[2-7]xxxxxxx",
    "80[2-467]xxxxxxx",
  };
  RangeTree tree = ranges(specs);
  assertMatches(tree, "112 1234567", "797 1234567", "807 1234567");
  assertNotMatches(tree, TOO_SHORT, "112 123", "797 12345", "807 123456");
  assertNotMatches(tree, TOO_LONG, "112 12345678", "797 123456789");
  assertNotMatches(tree, INVALID, "122 1234567", "799 1234567", "805 1234567");
}
// Runs the matcher against a large range set with many distinct three-digit prefixes. Spaces
// in the test numbers are only for readability; the assertion helpers strip them.
@Test public void testRealRegexLong() {
  String[] specs = {
    "12[0-249][2-7]xxxxxx",
    "13[0-25][2-7]xxxxxx",
    "14[145][2-7]xxxxxx",
    "1[59][14][2-7]xxxxxx",
    "16[014][2-7]xxxxxx",
    "17[1257][2-7]xxxxxx",
    "18[01346][2-7]xxxxxx",
    "21[257][2-7]xxxxxx",
    "23[013][2-7]xxxxxx",
    "24[01][2-7]xxxxxx",
    "25[0137][2-7]xxxxxx",
    "26[0158][2-7]xxxxxx",
    "278[2-7]xxxxxx",
    "28[1568][2-7]xxxxxx",
    "29[14][2-7]xxxxxx",
    "326[2-7]xxxxxx",
    "34[1-3][2-7]xxxxxx",
    "35[34][2-7]xxxxxx",
    "36[01489][2-7]xxxxxx",
    "37[02-46][2-7]xxxxxx",
    "38[159][2-7]xxxxxx",
    "41[36][2-7]xxxxxx",
    "42[1-47][2-7]xxxxxx",
    "43[15][2-7]xxxxxx",
    "45[12][2-7]xxxxxx",
    "46[126-9][2-7]xxxxxx",
    "47[0-24-9][2-7]xxxxxx",
    "48[013-57][2-7]xxxxxx",
    "49[014-7][2-7]xxxxxx",
    "5[136][25][2-7]xxxxxx",
    "522[2-7]xxxxxx",
    "54[28][2-7]xxxxxx",
    "55[12][2-7]xxxxxx",
    "5[78]1[2-7]xxxxxx",
    "59[15][2-7]xxxxxx",
    "612[2-7]xxxxxx",
    "6[2-4]1[2-7]xxxxxx",
    "65[17][2-7]xxxxxx",
    "66[13][2-7]xxxxxx",
    "67[14][2-7]xxxxxx",
    "680[2-7]xxxxxx",
    "712[2-7]xxxxxx",
    "72[14][2-7]xxxxxx",
    "73[134][2-7]xxxxxx",
    "74[47][2-7]xxxxxx",
    "75[15][2-7]xxxxxx",
    "7[67]1[2-7]xxxxxx",
    "788[2-7]xxxxxx",
    "816[2-7]xxxxxx",
    "82[014][2-7]xxxxxx",
    "83[126][2-7]xxxxxx",
    "86[136][2-7]xxxxxx",
    "87[078][2-7]xxxxxx",
    "88[34][2-7]xxxxxx",
    "891[2-7]xxxxxx",
  };
  RangeTree tree = ranges(specs);
  assertMatches(tree, "364 2 123456", "674 4 123456", "883 7 123456");
  assertNotMatches(tree, TOO_SHORT, "364 2 123", "674 4 1234", "883 7 12345");
  assertNotMatches(tree, TOO_LONG, "364 2 1234567", "674 4 12345678");
  assertNotMatches(tree, INVALID,
      "365 2 123456", "364 8 123456", "670 4 123456", "670 5 123456", "892 2 123456");
}
// Builds a RangeTree from range specification strings such as "12xx" or "[2-6]".
private static RangeTree ranges(String... lines) {
  return RangeTree.from(Arrays.asList(lines).stream().map(RangeSpecification::parse));
}
// Asserts that every number matches, both via the generated regex and via the compiled matcher.
private static void assertMatches(RangeTree dfa, String... numbers) {
  checkRegex(dfa, true, numbers);
  assertMatcher(DigitSequenceMatcher.create(MatcherCompiler.compile(dfa)), MATCHED, numbers);
}
// Asserts that no number matches: the generated regex must reject each one and the compiled
// matcher must report the given (non-MATCHED) result for each one.
private static void assertNotMatches(RangeTree dfa, Result error, String... numbers) {
  checkArgument(error != MATCHED);
  checkRegex(dfa, false, numbers);
  assertMatcher(DigitSequenceMatcher.create(MatcherCompiler.compile(dfa)), error, numbers);
}
// Sanity-checks the test inputs against the regex generated from the same ranges: each number
// (with whitespace removed) must match the regex exactly when expectMatch is true. Throws
// IllegalArgumentException otherwise, with a message that names the actual mismatch.
private static void checkRegex(RangeTree dfa, boolean expectMatch, String... numbers) {
  Pattern pattern = Pattern.compile(RegexGenerator.basic().toRegex(dfa));
  for (String number : numbers) {
    boolean matched = pattern.matcher(noSpace(number)).matches();
    checkArgument(expectMatch == matched,
        "regex for %s %s input %s",
        dfa.asRangeSpecifications(),
        expectMatch ? "could not match" : "unexpectedly matched",
        number);
  }
}
// Asserts that the matcher returns the expected result for each number (whitespace removed).
private static void assertMatcher(
    DigitSequenceMatcher matcher, Result expected, String... numbers) {
  for (String number : numbers) {
    DigitSequence digits = DigitSequenceMatcher.digitsFromString(noSpace(number));
    Result actual = matcher.match(digits);
    Assert.assertEquals(expected, actual);
  }
}
// Returns the input with all whitespace characters removed.
private static String noSpace(String input) {
  CharMatcher whitespace = CharMatcher.whitespace();
  return whitespace.removeFrom(input);
}
| } | |||
| @ -0,0 +1,317 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; | |||
| import static com.google.common.collect.ImmutableList.toImmutableList; | |||
| import static com.google.common.truth.Truth.assertWithMessage; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; | |||
| import static java.lang.Integer.bitCount; | |||
| import static java.lang.Integer.lowestOneBit; | |||
| import static java.lang.Integer.numberOfTrailingZeros; | |||
| import com.google.common.collect.Multimap; | |||
| import com.google.common.collect.MultimapBuilder; | |||
| import com.google.common.collect.SetMultimap; | |||
| import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto; | |||
| import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto.TestCase; | |||
| import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto.Tests; | |||
| import com.google.i18n.phonenumbers.metadata.DigitSequence; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result; | |||
| import com.google.protobuf.ByteString; | |||
| import com.google.protobuf.TextFormat; | |||
| import java.io.IOException; | |||
| import java.io.InputStream; | |||
| import java.io.InputStreamReader; | |||
| import java.io.PrintWriter; | |||
| import java.io.StringWriter; | |||
| import java.nio.charset.StandardCharsets; | |||
| import java.util.HashSet; | |||
| import java.util.List; | |||
| import java.util.Set; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class CompilerRegressionTest { | |||
// Tests that the compiler produces the expected output, byte-for-byte. Every test case in the
// regression data is checked and all mismatches are collected, so a single run reports every
// failing case (along with replacement golden data) rather than only the first one.
@Test
public void testCompiledBytesEqualExpectedMatcherBytes() throws IOException {
  StringWriter buffer = new StringWriter();
  PrintWriter errors = new PrintWriter(buffer);
  try (InputStream data =
      CompilerRegressionTest.class.getResourceAsStream("regression_test_data.textpb")) {
    // getResourceAsStream() returns null for a missing resource instead of throwing; fail with
    // a clear message rather than an unexplained NullPointerException further down.
    assertWithMessage("test resource regression_test_data.textpb not found")
        .that(data).isNotNull();
    Tests.Builder tests = RegressionTestProto.Tests.newBuilder();
    TextFormat.merge(new InputStreamReader(data, StandardCharsets.UTF_8), tests);
    for (TestCase tc : tests.getTestCaseList()) {
      byte[] actual = MatcherCompiler.compile(ranges(tc.getRangeList()));
      byte[] expected = combine(tc.getExpectedList());
      int diffIndex = indexOfDiff(actual, expected);
      if (!tc.getShouldFail()) {
        if (diffIndex != -1) {
          errors.format("FAILED [%s]: First difference at index %d\n", tc.getName(), diffIndex);
          errors.format("Actual : %s\n", formatPbSnippet(actual, diffIndex, 20));
          errors.format("Expected: %s\n", formatPbSnippet(expected, diffIndex, 20));
          writeGoldenPbOutput(actual, errors);
        }
      } else {
        // Cases marked "should fail" carry deliberately wrong bytes, so equality is the error.
        if (diffIndex == -1) {
          errors.format("FAILED [%s]: Expected difference, but got none\n", tc.getName());
        }
      }
    }
  }
  String errorMessage = buffer.toString();
  if (!errorMessage.isEmpty()) {
    assertWithMessage(errorMessage).fail();
  }
}
// Test that the matcher behaves correctly with respect to the input ranges using the expected
// byte sequences. If this test fails, then the matcher implementation is doing something wrong,
// or the expected bytes were generated incorrectly (either by hand or from the compiler).
//
// IMPORTANT: This test checks that the expected bytes (rather than the compiled bytes) match
// the numbers in the ranges. This avoids the risk of bugs in the matcher and the compiler
// somehow cancelling each other out. However, it also means that this test depends on the
// equality test above for validity (i.e. this test can pass even if the matcher compiler is
// broken, so it should not be run in isolation when debugging).
@Test
public void testExpectedMatcherBytesMatchRanges() throws IOException {
  try (InputStream data =
      CompilerRegressionTest.class.getResourceAsStream("regression_test_data.textpb")) {
    // getResourceAsStream() returns null for a missing resource instead of throwing; fail with
    // a clear message rather than an unexplained NullPointerException further down.
    assertWithMessage("test resource regression_test_data.textpb not found")
        .that(data).isNotNull();
    RegressionTestProto.Tests.Builder tests = RegressionTestProto.Tests.newBuilder();
    TextFormat.merge(new InputStreamReader(data, StandardCharsets.UTF_8), tests);
    for (TestCase tc : tests.getTestCaseList()) {
      RangeTree ranges = ranges(tc.getRangeList());
      // If we compiled the ranges here, we could risk a situation where the compiled bytes were
      // broken but the compiler had a corresponding bug that cancelled it out. This test only
      // tests the matcher behaviour, whereas the test above only tests the compiler behaviour.
      DigitSequenceMatcher matcher = DigitSequenceMatcher.create(combine(tc.getExpectedList()));
      Multimap<Result, DigitSequence> numbers = buildTestNumbers(ranges);
      if (!tc.getShouldFail()) {
        testExpectedMatch(tc.getName(), matcher, numbers);
      } else {
        testExpectedFailure(tc.getName(), matcher, numbers);
      }
    }
  }
}
// Asserts that the matcher returns, for every generated sequence, the result it is filed under.
private static void testExpectedMatch(String testName, DigitSequenceMatcher matcher,
    Multimap<Result, DigitSequence> numbers) {
  for (Result wanted : Result.values()) {
    for (DigitSequence digits : numbers.get(wanted)) {
      Result actual = matcher.match(new Sequence(digits));
      assertWithMessage("FAILED [%s]: Sequence %s", testName, digits)
          .that(actual)
          .isEqualTo(wanted);
    }
  }
}
// Counterpart of testExpectedMatch for cases flagged as "should fail": succeeds as soon as any
// generated sequence produces a result other than the one it is filed under, and fails the
// test if every sequence behaves as expected.
private static void testExpectedFailure(String testName, DigitSequenceMatcher matcher,
    Multimap<Result, DigitSequence> numbers) {
  for (Result wanted : Result.values()) {
    for (DigitSequence digits : numbers.get(wanted)) {
      if (matcher.match(new Sequence(digits)) != wanted) {
        return;
      }
    }
  }
  assertWithMessage("FAILED [%s]: Expected at least one failure", testName).fail();
}
| // Magic number: DigitSequences cannot be longer than 18 digits at the moment, so a check is | |||
| // needed to prevent us trying to make a longer-than-allowed sequence in tests. This only | |||
| // happens in the case of a terminal node, since non-terminal paths must be < 17 digits long. | |||
| // If the allowed digits increases, this value can be modified or left as-is. | |||
| private static final int MAX_SEQUENCE_LENGTH = 18; | |||
// Adapts a metadata DigitSequence to the matcher's lightweight, iterator-like sequence
// interface. Each instance can be consumed once, front to back.
private static final class Sequence implements DigitSequenceMatcher.DigitSequence {
  private final DigitSequence digits;
  private int position;

  Sequence(DigitSequence seq) {
    this.digits = seq;
    this.position = 0;
  }

  @Override
  public boolean hasNext() {
    return position < digits.length();
  }

  @Override
  public int next() {
    int digit = digits.getDigit(position);
    position++;
    return digit;
  }
}
// Parses each RangeSpecification string and returns the RangeTree built from all of them.
RangeTree ranges(List<String> specs) {
  return RangeTree.from(
      specs.stream().map(spec -> RangeSpecification.parse(spec)).collect(toImmutableList()));
}
// Builds a map from expected result to test numbers for the given RangeTree, aiming to test
// every branching point in the DFA. Digits are picked pseudo-randomly from each mask, but this
// should not make the test flaky: if it ever fails, that implies a serious problem with the
// matcher compiler or the matcher implementation.
private static Multimap<Result, DigitSequence> buildTestNumbers(RangeTree ranges) {
  SetMultimap<Result, DigitSequence> collected =
      MultimapBuilder.enumKeys(Result.class).treeSetValues().build();
  // The visited set is shared by every visitor created during the traversal.
  ranges.accept(new Visitor(RangeSpecification.empty(), collected, new HashSet<>()));
  return collected;
}
/**
 * DFA visitor that produces a targeted set of test numbers from a range tree, intended to
 * exercise every instruction in the corresponding matcher data so that every "branch"
 * (including early termination) is taken at least once. Where several digits would have the
 * same effect, one of them is chosen at random, because enumerating every combination would
 * require billions of numbers.
 */
private static final class Visitor implements DfaVisitor {
  /** Path from the root to the node whose out-edges this visitor is being shown. */
  private final RangeSpecification prefix;
  /** Shared sink for generated numbers, keyed by the result the matcher should return. */
  private final SetMultimap<Result, DigitSequence> sink;
  /** Shared set of nodes that have already been recursed into. */
  private final Set<DfaNode> seen;
  /** Union of the digit masks of all edges visited so far by this visitor. */
  private int outEdgesMask = 0;

  Visitor(RangeSpecification sourcePath,
      SetMultimap<Result, DigitSequence> numbers,
      Set<DfaNode> visited) {
    this.prefix = sourcePath;
    this.sink = numbers;
    this.seen = visited;
  }

  @Override
  public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
    // Remember which digits leave the source node via this edge.
    int edgeMask = edge.getDigitMask();
    outEdgesMask |= edgeMask;

    // Emit a number for the path that ends at the target of this edge.
    RangeSpecification pathToTarget = prefix.extendByMask(edgeMask);
    Result reached = target.canTerminate() ? Result.MATCHED : Result.TOO_SHORT;
    sink.put(reached, sequenceIn(pathToTarget));

    // Do not descend into a node twice. Nodes reachable via many prefixes would otherwise
    // yield hundreds of numbers; this is only an optimisation (it shortens test time for
    // large ranges) and should not affect DFA node/instruction coverage.
    if (!seen.add(target)) {
      return;
    }

    // Descend with a fresh visitor rooted at our path. Visiting the terminal node is harmless:
    // nothing happens and the child's out-edge mask stays zero.
    Visitor child = new Visitor(pathToTarget, sink, seen);
    target.accept(child);

    // Digits with no edge out of the target give us a number the matcher must reject.
    int missingDigits = ALL_DIGITS_MASK & ~child.outEdgesMask;
    if (missingDigits == 0 || pathToTarget.length() >= MAX_SEQUENCE_LENGTH) {
      return;
    }
    // Running past the terminal node makes the number too long; anywhere else it is invalid.
    Result rejected;
    if (target.equals(RangeTree.getTerminal())) {
      rejected = Result.TOO_LONG;
    } else {
      rejected = Result.INVALID;
    }
    sink.put(rejected, sequenceIn(pathToTarget.extendByMask(missingDigits)));
  }
}
// Returns a digit sequence chosen pseudo-randomly from those described by the given path: for
// each position, one of the digits allowed by that position's bitmask is picked.
private static final DigitSequence sequenceIn(RangeSpecification path) {
  DigitSequence result = DigitSequence.empty();
  for (int position = 0; position < path.length(); position++) {
    int candidates = path.getBitmask(position);
    // Pick how many of the allowed digits to skip: a random value in [0..bitCount), not a bit
    // position. E.g. candidates = 0011010011 ==> 0 <= skip < 5 (allowed digits {0,1,4,6,7}).
    int skip = (int) (bitCount(candidates) * Math.random());
    // Clear that many of the lowest set bits.
    // E.g. skip = 3 ==> candidates = 0011000000 (3 lowest set bits cleared).
    for (; skip > 0; skip--) {
      candidates &= ~lowestOneBit(candidates);
    }
    // The lowest remaining set bit is the chosen digit.
    // E.g. candidates = 0011000000 ==> digit 6.
    result = result.extendBy(numberOfTrailingZeros(candidates));
  }
  return result;
}
// Concatenates the given ByteStrings, in order, into a single byte[] (the regression test file
// is allowed to split expected bytes across several strings for readability).
private static byte[] combine(List<ByteString> bytes) {
  int total = 0;
  for (ByteString part : bytes) {
    total += part.size();
  }
  byte[] merged = new byte[total];
  int position = 0;
  for (ByteString part : bytes) {
    part.copyTo(merged, position);
    position += part.size();
  }
  return merged;
}
// Returns the index of the first position at which the arrays differ, or -1 if they are
// identical. If one array is a strict prefix of the other, the length of the shorter array is
// returned.
private static int indexOfDiff(byte[] a, byte[] b) {
  int common = Math.min(a.length, b.length);
  int index = 0;
  while (index < common && a[index] == b[index]) {
    index++;
  }
  if (index < common) {
    return index;
  }
  return a.length == b.length ? -1 : common;
}
// Renders up to 'length' bytes starting at 'start' as C-style hex escapes (the same form the
// regression test data uses), with "..." on either side where bytes have been left out.
private static String formatPbSnippet(byte[] bytes, int start, int length) {
  StringBuilder snippet = new StringBuilder();
  boolean truncatedBefore = start > 0;
  boolean truncatedAfter = start + length < bytes.length;
  if (truncatedBefore) {
    snippet.append("...");
  }
  appendBytes(snippet, bytes, start, length);
  if (truncatedAfter) {
    snippet.append("...");
  }
  return snippet.toString();
}
// Prints the bytes, 20 per line, in a form that can be pasted into the regression test file as
// new golden data.
private static void writeGoldenPbOutput(byte[] bytes, PrintWriter errors) {
  errors.println("Golden Data:");
  int start = 0;
  while (start < bytes.length) {
    StringBuilder line = appendBytes(new StringBuilder(), bytes, start, 20);
    errors.format(" expected: \"%s\"\n", line);
    start += 20;
  }
}
// Appends up to 'length' bytes, starting at 'start', in C-style hex format (e.g. \xHH) and
// returns the builder that was passed in. Stops at the end of the array.
private static StringBuilder appendBytes(StringBuilder out, byte[] bytes, int start, int length) {
  int limit = Math.min(start + length, bytes.length);
  int index = start;
  while (index < limit) {
    int unsigned = Byte.toUnsignedInt(bytes[index]);
    out.append(String.format("\\x%02x", unsigned));
    index++;
  }
  return out;
}
| } | |||
| @ -0,0 +1,144 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.common.primitives.Bytes.asList; | |||
| import static com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler.compile; | |||
| import com.google.common.truth.Truth; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode; | |||
| import java.util.Arrays; | |||
| import java.util.List; | |||
| import org.junit.Assert; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class MatcherCompilerTest { | |||
| // Zero byte expected at the end of every compiled sequence asserted below. | |||
| private static final Byte TERMINATOR = (byte) 0; | |||
| @Test public void testSingleOperation() { | |||
| byte zero = single(0); | |||
| byte five = single(5); | |||
| byte nine = single(9); | |||
| // One single-digit instruction per digit, followed by the terminator. | |||
| assertCompile(ranges("0"), zero, TERMINATOR); | |||
| assertCompile(ranges("5"), five, TERMINATOR); | |||
| assertCompile(ranges("9"), nine, TERMINATOR); | |||
| assertCompile(ranges("0559"), zero, five, five, nine, TERMINATOR); | |||
| // With "05" also accepted, the instruction following it is expected with bit 4 set. | |||
| byte fiveTerminating = (byte) (five | 0x10); | |||
| assertCompile(ranges("05", "0559"), zero, five, fiveTerminating, nine, TERMINATOR); | |||
| } | |||
| @Test public void testAnyOperation() { | |||
| byte anyOnce = any(1); | |||
| byte anySixteenTimes = any(16); | |||
| assertCompile(ranges("x"), anyOnce, TERMINATOR); | |||
| assertCompile(ranges("xxxx_xxxx_xxxx_xxxx"), anySixteenTimes, TERMINATOR); | |||
| // A 17th digit is expected as a second instruction after the 16-digit one. | |||
| assertCompile(ranges("xxxx_xxxx_xxxx_xxxx_x"), anySixteenTimes, anyOnce, TERMINATOR); | |||
| // With the shorter range also accepted, the trailing instruction is expected with bit 4 set. | |||
| byte anyOnceTerminating = (byte) (anyOnce | 0x10); | |||
| assertCompile(ranges("x", "xx"), anyOnce, anyOnceTerminating, TERMINATOR); | |||
| assertCompile( | |||
| ranges("xxxx_xxxx_xxxx_xxxx", "xxxx_xxxx_xxxx_xxxx_x"), | |||
| anySixteenTimes, anyOnceTerminating, TERMINATOR); | |||
| } | |||
| @Test public void testRangeOperation() { | |||
| int zeroOrNine = range(0, 9); | |||
| int oneToThree = range(1, 2, 3); | |||
| int sevenToNine = range(7, 8, 9); | |||
| // Each range instruction is expected as two bytes, high byte first. | |||
| assertCompile(ranges("[09]"), hi(zeroOrNine), lo(zeroOrNine), TERMINATOR); | |||
| assertCompile( | |||
| ranges("[123][789]"), | |||
| hi(oneToThree), lo(oneToThree), hi(sevenToNine), lo(sevenToNine), TERMINATOR); | |||
| } | |||
| @Test public void testMapOperation() { | |||
| // Ten distinct leading digits force all 10 possible branches to be taken. | |||
| byte[] data = compile(ranges("00", "11", "22", "33", "44", "55", "66", "77", "88", "99")); | |||
| List<Byte> compiled = asList(data); | |||
| // Only the first 4 bytes are checked for exact values. | |||
| List<Byte> expectedHeader = asList((byte) 0x95, (byte) 0x31, (byte) 0xF5, (byte) 0x9D); | |||
| Assert.assertEquals(expectedHeader, compiled.subList(0, 4)); | |||
| // Each branch should jump to a 2 byte sequence between 10 and 28 bytes away (inclusive). | |||
| List<Byte> jumpTable = compiled.subList(4, 14); | |||
| List<Byte> remainder = compiled.subList(14, data.length); | |||
| // TODO: Now that ordering should be consistent, tighten up this test to ensure | |||
| // consistency and remove the shorter consistency test below. | |||
| for (int offset = 0xA; offset <= 0x1C; offset += 2) { | |||
| byte jump = (byte) offset; | |||
| Assert.assertTrue(jumpTable.contains(jump)); | |||
| // The position of the jump in the table is the digit that branch is expected to match. | |||
| int digit = jumpTable.indexOf(jump); | |||
| // Subtract the length of the jump table to get the relative offset in the remaining code. | |||
| int relative = offset - 10; | |||
| // Each jump should end in 2 single-byte instructions (match corresponding digit, terminate). | |||
| Assert.assertEquals(single(digit), remainder.get(relative)); | |||
| Assert.assertEquals(TERMINATOR, remainder.get(relative + 1)); | |||
| } | |||
| } | |||
| @Test public void testConsistentSorting() { | |||
| // The MatcherCompiler output must be stable from run to run; otherwise the generated file | |||
| // changes with each execution and the build becomes non-deterministic. | |||
| byte[] expected = {-128, 0, 0, 29, 3, 5, 7, 32, 0, 33, 0, 34, 0}; | |||
| assertCompile(ranges("00", "11", "22"), expected); | |||
| } | |||
| /** Builds the 1-byte instruction which matches the given digit (0 to 9) exactly once. */ | |||
| private static Byte single(int value) { | |||
| checkArgument(0 <= value && value <= 9); | |||
| int opcodeBits = OpCode.SINGLE.ordinal() << 5; | |||
| return (byte) (opcodeBits | value); | |||
| } | |||
| /** Builds the 1-byte instruction which matches any digit {@code count} times (1 to 16). */ | |||
| private static Byte any(int count) { | |||
| checkArgument(1 <= count && count <= 16); | |||
| int opcodeBits = OpCode.ANY.ordinal() << 5; | |||
| // The count is stored as (count - 1) in the low bits of the instruction. | |||
| return (byte) (opcodeBits | (count - 1)); | |||
| } | |||
| /** Builds the 2-byte instruction (as an int) which matches any one of the given digits. */ | |||
| private static int range(int... digits) { | |||
| int mask = 0; | |||
| for (int i = 0; i < digits.length; i++) { | |||
| int digit = digits[i]; | |||
| checkArgument(digit >= 0 && digit <= 9); | |||
| mask |= (1 << digit); | |||
| } | |||
| return mask | (OpCode.RANGE.ordinal() << 13); | |||
| } | |||
| /** Returns bits 8 to 15 of the given 2-byte instruction. */ | |||
| private static Byte hi(int shortInstruction) { | |||
| int upper = shortInstruction >> 8; | |||
| return (byte) upper; | |||
| } | |||
| /** Returns the low 8 bits of the given 2-byte instruction. */ | |||
| private static Byte lo(int shortInstruction) { | |||
| // The narrowing cast keeps exactly the low 8 bits. | |||
| return (byte) shortInstruction; | |||
| } | |||
| /** Asserts that compiling the given ranges produces exactly the expected bytes. */ | |||
| private void assertCompile(RangeTree dfa, byte... expected) { | |||
| byte[] actual = compile(dfa); | |||
| Truth.assertThat(actual).isEqualTo(expected); | |||
| } | |||
| /** Builds a RangeTree from the given range specification strings. */ | |||
| private static RangeTree ranges(String... lines) { | |||
| return RangeTree.from(Arrays.stream(lines).map(RangeSpecification::parse)); | |||
| } | |||
| } | |||
| @ -0,0 +1,60 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; | |||
| import static com.google.common.primitives.Bytes.asList; | |||
| import com.google.common.collect.ImmutableList; | |||
| import com.google.common.io.ByteArrayDataOutput; | |||
| import com.google.common.io.ByteStreams; | |||
| import junit.framework.Assert; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class OperationTest { | |||
| @Test public void testWriteJumpTableNoExtraBranches() { | |||
| ByteArrayDataOutput outBytes = ByteStreams.newDataOutput(); | |||
| Operation.writeJumpTable(outBytes, ImmutableList.of(0x10, 0x80, 0xFC), Statistics.NO_OP); | |||
| // The jump table size is added to the offsets. | |||
| Assert.assertEquals( | |||
| asList(new byte[] {(byte) 0x13, (byte) 0x83, (byte) 0xFF}), | |||
| asList(outBytes.toByteArray())); | |||
| } | |||
| // An easy way to reason about what the offsets for the branches should be is to consider | |||
| // that the last branch must always have the original offset (it jumps from the very end of | |||
| // the jump table, which is exactly what the original offset specified. The branch before it | |||
| // is the same except that it must jump over the final branch (ie, +2 bytes) and so on. | |||
| // Direct offsets are relative to the start of the jump table however and must be adjusted. | |||
| @Test public void testWriteJumpTableExtraBranches() { | |||
| ByteArrayDataOutput outBytes = ByteStreams.newDataOutput(); | |||
| // Two extra branches needed (0x200 and 0xF7). Worst case adjustment is 9 bytes. | |||
| // Total adjustment is 7 bytes (jump table size + 2 * branch) | |||
| Operation.writeJumpTable(outBytes, ImmutableList.of(0xF7, 0xF6, 0x200), Statistics.NO_OP); | |||
| Assert.assertEquals(asList(new byte[] { | |||
| // Jump table: (offset-to-branch, direct-adjusted-offset, offset-to-branch) | |||
| (byte) 0x03, (byte) 0xFD, (byte) 0x05, | |||
| // Extra branch: offset = 0xF7 + 2 (jumps over last branch) | |||
| (byte) 0x10, (byte) 0xF9, | |||
| // Extra branch: offset = 0x200 (last branch always has original offset) | |||
| (byte) 0x12, (byte) 0x00}), | |||
| asList(outBytes.toByteArray())); | |||
| } | |||
| } | |||
| @ -0,0 +1,295 @@ | |||
| # Copyright (C) 2017 The Libphonenumber Authors. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ---- Manually crafted "unit" tests ---- | |||
| test_case { | |||
| name: "Simple Range" | |||
| range: "1234xxx" | |||
| # 4 single byte, single value instructions: 0x20 + value | |||
| # 1 single byte, "ANY" instruction: 0x40 + (count-1) | |||
| expected: "\x21\x22\x23\x24\x42\x00" | |||
| } | |||
| test_case { | |||
| # NOTE: When the ANY instruction is marked as terminating, it applies when the instruction is | |||
| # reached, not after it's executed (i.e. \x50... is "(\d...)?", and not "\d(...)?"). | |||
| # Match 3 x ANY (0x42), then "terminate or ANY" (0x50), then 2 x ANY | |||
| name: "Variable Any Match #1" | |||
| range: "1xxx" | |||
| range: "1xxxxxx" | |||
| expected: "\x21\x42\x50\x41\x00" | |||
| } | |||
| test_case { | |||
| name: "Variable Any Match #2" | |||
| range: "1xxx" | |||
| range: "1xxxx" | |||
| range: "1xxxxx" | |||
| range: "1xxxxxx" | |||
| # A repeated terminating ANY match applies on every repeat, not just the first time. | |||
| # Match 3 x ANY (0x42 = \d{3}), then 3 x "terminate or ANY" (0x52 = \d{0,3}). | |||
| expected: "\x21\x42\x52\x00" | |||
| } | |||
| test_case { | |||
| name: "Overflow Any Match" | |||
| range: "xxxxxxxxxxxxxxxxxx" | |||
| # 18 'any' digits can't fit in one instruction, so write 2 separate opcodes to match 16 (0x4F) | |||
| # and then 2 (0x41). This will almost never occur since DigitSequence is limited to 18 digits. | |||
| expected: "\x4F\x41\x00" | |||
| } | |||
| test_case { | |||
| name: "Range Matching" | |||
| range: "[0-4]12" | |||
| # First 2 bytes are a "branch" operation (opcode = 0x60 plus mask), but there are no offsets | |||
| # after it (since one "branch" is just to continue matching, while the other is failure). | |||
| expected: "\x60\x1F\x21\x22\x00" | |||
| } | |||
| test_case { | |||
| name: "Range Matching" | |||
| # Requires a 2-way branch in the DFA where both paths cover all input digits [0-9]. | |||
| range: "[0-4]12" | |||
| range: "[5-9]34" | |||
| # First 2 bytes are a 2-way branch operation (opcode = 0x68 plus mask), then 2 jump offsets | |||
| # from the end of the branch instruction. | |||
| expected: "\x68\x1F\x02\x05\x21\x22\x00\x23\x24\x00" | |||
| } | |||
| # ---- Deliberate failure cases ---- | |||
| test_case { | |||
| name: "Modified Single Match Bytecode" | |||
| should_fail: true | |||
| range: "123xxxx" | |||
| range: "123xxxxx" | |||
| range: "123xxxxxx" | |||
| # Expected bytes have been tweaked to accept 4 (\x24), rather than 3 (\x23). | |||
| expected: "\x21\x22\x24\x43\x51\x00" | |||
| } | |||
| test_case { | |||
| name: "Modified Range Bytecode" | |||
| should_fail: true | |||
| range: "1[2-5]xxxx" | |||
| # Expected bytes have been tweaked to accept [7-9] (\x63\x80), rather than [2-5] (\x60\x3C) | |||
| expected: "\x21\x63\x80\x43\x00" | |||
| } | |||
| test_case { | |||
| name: "Modified Any Match Bytecode" | |||
| should_fail: true | |||
| range: "1xxxx" | |||
| # Expected bytes have been tweaked to accept xxx (\x42), rather than xxxx (\x43) | |||
| expected: "\x21\x42\x00" | |||
| } | |||
| # ---- Auto-generated "stress tests" ---- | |||
| test_case { | |||
| name: "GB Mobile" | |||
| range: "7[1-3]xxxxxxxx" | |||
| range: "74[0-46-9]xxxxxxx" | |||
| range: "745[0-689]xxxxxx" | |||
| range: "7457[0-57-9]xxxxx" | |||
| range: "750[0-8]xxxxxx" | |||
| range: "75[13-9]xxxxxxx" | |||
| range: "752[0-35-9]xxxxxx" | |||
| range: "7700[01]xxxxx" | |||
| range: "770[1-9]xxxxxx" | |||
| range: "77[1-7]xxxxxxx" | |||
| range: "778[02-9]xxxxxx" | |||
| range: "779[0-689]xxxxxx" | |||
| range: "78[014-9]xxxxxxx" | |||
| range: "78[23][0-8]xxxxxx" | |||
| range: "79[024-9]xxxxxxx" | |||
| range: "791[02-9]xxxxxx" | |||
| range: "7911[028]xxxxx" | |||
| range: "793[0-689]xxxxxx" | |||
| # Not much insight here - other than it starts by matching a '7' and terminates in one place | |||
| # after matching "any digit" 5 times (which is the shortest trailing match in the ranges). | |||
| expected: "\x27\x8c\xa8\x1a\x2a\x06\x09\x0d\x14\x1c\x20\x40\x10\x1e\x6b\xdf\x1c\x1f\x84\x44" | |||
| expected: "\x92\x5d\x1d\x16\x21\x88\x64\x92\x55\x1d\x0f\x21\x24\x6b\xf3\x09\x10\x82\x22\x49" | |||
| expected: "\x6d\x03\x1b\x18\x40\x10\x19\x6b\x7f\x17\x19\x61\xff\x10\x11\x63\xef\x0e\x68\x01" | |||
| expected: "\x11\x0c\x63\xfd\x07\x63\x7f\x04\x6b\xfd\x02\x0a\x40\x08\x63\xbf\x05\x60\x03\x02" | |||
| expected: "\x61\x05\x44\x00" | |||
| } | |||
| test_case { | |||
| name: "India Fixed Line" | |||
| range: "11[2-7]xxxxxxx" | |||
| range: "12[0-249][2-7]xxxxxx" | |||
| range: "12[35-8]x[2-7]xxxxx" | |||
| range: "13[0-25][2-7]xxxxxx" | |||
| range: "13[346-9]x[2-7]xxxxx" | |||
| range: "14[145][2-7]xxxxxx" | |||
| range: "14[236-9]x[2-7]xxxxx" | |||
| range: "1[59][0235-9]x[2-7]xxxxx" | |||
| range: "1[59][14][2-7]xxxxxx" | |||
| range: "16[014][2-7]xxxxxx" | |||
| range: "16[235-9]x[2-7]xxxxx" | |||
| range: "17[1257][2-7]xxxxxx" | |||
| range: "17[34689]x[2-7]xxxxx" | |||
| range: "18[01346][2-7]xxxxxx" | |||
| range: "18[257-9]x[2-7]xxxxx" | |||
| range: "2[02][2-7]xxxxxxx" | |||
| range: "21[134689]x[2-7]xxxxx" | |||
| range: "21[257][2-7]xxxxxx" | |||
| range: "23[013][2-7]xxxxxx" | |||
| range: "23[24-8]x[2-7]xxxxx" | |||
| range: "24[01][2-7]xxxxxx" | |||
| range: "24[2-8]x[2-7]xxxxx" | |||
| range: "25[0137][2-7]xxxxxx" | |||
| range: "25[25689]x[2-7]xxxxx" | |||
| range: "26[0158][2-7]xxxxxx" | |||
| range: "26[2-4679]x[2-7]xxxxx" | |||
| range: "27[13-79]x[2-7]xxxxx" | |||
| range: "278[2-7]xxxxxx" | |||
| range: "28[1568][2-7]xxxxxx" | |||
| range: "28[2-479]x[2-7]xxxxx" | |||
| range: "29[14][2-7]xxxxxx" | |||
| range: "29[235-9]x[2-7]xxxxx" | |||
| range: "301x[2-7]xxxxx" | |||
| range: "31[79]x[2-7]xxxxx" | |||
| range: "32[1-5]x[2-7]xxxxx" | |||
| range: "326[2-7]xxxxxx" | |||
| range: "33[2-7]xxxxxxx" | |||
| range: "34[13][2-7]xxxxxx" | |||
| range: "342[0189][2-7]xxxxx" | |||
| range: "342[2-7]xxxxxx" | |||
| range: "34[5-8]x[2-7]xxxxx" | |||
| range: "35[125689]x[2-7]xxxxx" | |||
| range: "35[34][2-7]xxxxxx" | |||
| range: "36[01489][2-7]xxxxxx" | |||
| range: "36[235-7]x[2-7]xxxxx" | |||
| range: "37[02-46][2-7]xxxxxx" | |||
| range: "37[157-9]x[2-7]xxxxx" | |||
| range: "38[159][2-7]xxxxxx" | |||
| range: "38[2-467]x[2-7]xxxxx" | |||
| range: "4[04][2-7]xxxxxxx" | |||
| range: "41[14578]x[2-7]xxxxx" | |||
| range: "41[36][2-7]xxxxxx" | |||
| range: "42[1-47][2-7]xxxxxx" | |||
| range: "42[5689]x[2-7]xxxxx" | |||
| range: "43[15][2-7]xxxxxx" | |||
| range: "43[2-467]x[2-7]xxxxx" | |||
| range: "45[12][2-7]xxxxxx" | |||
| range: "45[4-7]x[2-7]xxxxx" | |||
| range: "46[0-26-9][2-7]xxxxxx" | |||
| range: "46[35]x[2-7]xxxxx" | |||
| range: "47[0-24-9][2-7]xxxxxx" | |||
| range: "473x[2-7]xxxxx" | |||
| range: "48[013-57][2-7]xxxxxx" | |||
| range: "48[2689]x[2-7]xxxxx" | |||
| range: "49[014-7][2-7]xxxxxx" | |||
| range: "49[2389]x[2-7]xxxxx" | |||
| range: "51[025][2-7]xxxxxx" | |||
| range: "51[146-9]x[2-7]xxxxx" | |||
| range: "52[14-8]x[2-7]xxxxx" | |||
| range: "522[2-7]xxxxxx" | |||
| range: "53[1346]x[2-7]xxxxx" | |||
| range: "53[25][2-7]xxxxxx" | |||
| range: "54[14-69]x[2-7]xxxxx" | |||
| range: "54[28][2-7]xxxxxx" | |||
| range: "55[12][2-7]xxxxxx" | |||
| range: "55[46]x[2-7]xxxxx" | |||
| range: "56[146-9]x[2-7]xxxxx" | |||
| range: "56[25][2-7]xxxxxx" | |||
| range: "571[2-7]xxxxxx" | |||
| range: "57[2-4]x[2-7]xxxxx" | |||
| range: "581[2-7]xxxxxx" | |||
| range: "58[2-8]x[2-7]xxxxx" | |||
| range: "59[15][2-7]xxxxxx" | |||
| range: "59[246]x[2-7]xxxxx" | |||
| range: "61[1358]x[2-7]xxxxx" | |||
| range: "612[2-7]xxxxxx" | |||
| range: "621[2-7]xxxxxx" | |||
| range: "62[2457]x[2-7]xxxxx" | |||
| range: "631[2-7]xxxxxx" | |||
| range: "63[2-4]x[2-7]xxxxx" | |||
| range: "641[2-7]xxxxxx" | |||
| range: "64[235-7]x[2-7]xxxxx" | |||
| range: "65[17][2-7]xxxxxx" | |||
| range: "65[2-689]x[2-7]xxxxx" | |||
| range: "66[13][2-7]xxxxxx" | |||
| range: "66[24578]x[2-7]xxxxx" | |||
| range: "671[2-7]xxxxxx" | |||
| range: "67[235689]x[2-7]xxxxx" | |||
| range: "674[0189][2-7]xxxxx" | |||
| range: "674[2-7]xxxxxx" | |||
| range: "680[2-7]xxxxxx" | |||
| range: "68[1-6]x[2-7]xxxxx" | |||
| range: "71[013-9]x[2-7]xxxxx" | |||
| range: "712[2-7]xxxxxx" | |||
| range: "72[0235-9]x[2-7]xxxxx" | |||
| range: "72[14][2-7]xxxxxx" | |||
| range: "73[134][2-7]xxxxxx" | |||
| range: "73[2679]x[2-7]xxxxx" | |||
| range: "74[1-35689]x[2-7]xxxxx" | |||
| range: "74[47][2-7]xxxxxx" | |||
| range: "75[15][2-7]xxxxxx" | |||
| range: "75[2-46-9]x[2-7]xxxxx" | |||
| range: "7[67][02-9]x[2-7]xxxxx" | |||
| range: "7[67]1[2-7]xxxxxx" | |||
| range: "78[013-7]x[2-7]xxxxx" | |||
| range: "782[0-6][2-7]xxxxx" | |||
| range: "788[0189][2-7]xxxxx" | |||
| range: "788[2-7]xxxxxx" | |||
| range: "79[0189]x[2-7]xxxxx" | |||
| range: "79[2-7]xxxxxxx" | |||
| range: "80[2-467]xxxxxxx" | |||
| range: "81[1357-9]x[2-7]xxxxx" | |||
| range: "816[2-7]xxxxxx" | |||
| range: "82[014][2-7]xxxxxx" | |||
| range: "82[235-8]x[2-7]xxxxx" | |||
| range: "83[03-57-9]x[2-7]xxxxx" | |||
| range: "83[126][2-7]xxxxxx" | |||
| range: "84[0-24-9]x[2-7]xxxxx" | |||
| range: "85xx[2-7]xxxxx" | |||
| range: "86[136][2-7]xxxxxx" | |||
| range: "86[2457-9]x[2-7]xxxxx" | |||
| range: "87[078][2-7]xxxxxx" | |||
| range: "87[1-6]x[2-7]xxxxx" | |||
| range: "88[1256]x[2-7]xxxxx" | |||
| range: "88[34][2-7]xxxxxx" | |||
| range: "891[2-7]xxxxxx" | |||
| range: "89[2-4]x[2-7]xxxxx" | |||
| expected: "\x81\x0f\xac\x72\x08\x1e\x3b\x58\xad\xcc\x75\x8d\x8b\x0f\xac\x72\xdc\xec\xf4\x08" | |||
| expected: "\x0a\x0c\x0e\x10\x10\xf2\x10\xfa\x11\x00\x11\x06\x11\x0e\x93\x0f\xac\x6d\xc6\x09" | |||
| expected: "\x0b\x0d\x0f\x11\x13\x15\x17\x11\x07\x11\x0f\x11\x17\x11\x1f\x11\x27\x11\x2d\x11" | |||
| expected: "\x35\x11\x3d\x81\x31\xf5\x9d\x09\x0b\x0d\xa9\x0f\x11\x13\x15\x17\x12\x27\x12\x28" | |||
| expected: "\x11\x34\x11\x38\x11\x3d\x11\x41\x11\x43\x11\x45\x93\x0f\xa9\x9d\x8c\x09\x0b\x0d" | |||
| expected: "\x0f\x11\x13\x15\x17\x11\x3c\x11\x40\x11\x44\x11\x48\x11\x4c\x11\x50\x11\x52\x11" | |||
| expected: "\x54\x90\xed\xac\x72\x08\x99\x0a\x0c\x0e\x10\x12\x73\x11\xab\x11\xad\x11\xb1\x11" | |||
| expected: "\xb5\x11\xb9\x11\xdd\x95\x31\xf5\x9d\x63\x0a\x0c\x0e\x10\x12\x14\x16\x18\x1a\x11" | |||
| expected: "\xab\x11\xaf\x11\xb3\x11\xd4\x11\xd5\x11\xb1\x11\xb5\x11\xb9\x11\x44\x93\x0f\xac" | |||
| expected: "\x72\x09\x0b\x0d\x0f\x11\x13\x15\x17\x19\x11\x11\x11\x15\x11\x19\x11\x1d\x11\x21" | |||
| expected: "\x11\x25\x11\x29\x11\x2d\x11\x31\x81\x0f\xac\x72\x08\x0a\x0c\x0e\x10\x12\x14\x16" | |||
| expected: "\x11\x29\x11\x2d\x11\x13\x11\x2f\x11\x33\x11\x37\x11\x3b\x11\x40\x60\xfc\x11\x90" | |||
| expected: "\x6b\x03\x02\x04\x11\x93\x11\x88\x60\xdc\x11\x84\x6a\x17\x02\x04\x11\x80\x11\x85" | |||
| expected: "\x68\x27\x02\x04\x11\x78\x11\x7d\x84\x44\x89\x52\x02\x04\x11\x6e\x11\x73\x6b\xed" | |||
| expected: "\x02\x04\x11\x6d\x11\x64\x68\x13\x02\x04\x11\x5e\x11\x63\x84\x42\x8a\x4a\x02\x04" | |||
| expected: "\x11\x54\x11\x59\x68\x5b\x02\x04\x11\x4c\x11\x51\x82\x24\x51\x32\x02\x04\x11\x49" | |||
| expected: "\x11\x40\x80\x44\x92\x33\x02\x04\x11\x38\x11\x3d\x80\x44\x92\x53\x02\x04\x11\x2e" | |||
| expected: "\x11\x33\x84\x42\x90\x33\x02\x04\x11\x24\x11\x29\x69\x23\x02\x04\x11\x1c\x11\x21" | |||
| expected: "\x82\x42\x49\x22\x02\x04\x11\x19\x11\x10\x84\x24\x4a\x52\x02\x04\x11\x08\x11\x0d" | |||
| expected: "\x84\x44\x91\x52\x02\x04\x10\xfe\x11\x03\x80\x00\x89\x2a\xff\xf8\x80\x66\xd8\x32" | |||
| expected: "\xf2\xf5\xf9\x82\x20\x4a\x4a\xf2\xeb\x6b\x13\xe7\xee\x68\x5d\xe3\xea\x82\x04\x8a" | |||
| expected: "\x52\xdd\xe4\x80\x22\x89\x42\xde\xd7\x84\x42\x91\x2a\xd1\xd8\x80\x04\x8a\x52\xcb" | |||
| expected: "\xd2\x80\x04\x92\x0a\xc5\xcc\x82\x22\x50\x4b\xbf\xc6\x6b\xf7\xbb\xc2\x68\xbb\xb7" | |||
| expected: "\xbe\x68\xf3\xb3\xba\x84\x44\x8a\x0d\xad\xb4\x80\x22\x49\x12\xae\xa7\x80\x00\x51" | |||
| expected: "\x32\xa8\xa1\x82\x40\x49\x12\xa2\x9b\x80\x00\x82\x0a\x95\x9c\x82\x22\x51\x12\x96" | |||
| expected: "\x8f\x80\x00\x02\x52\x89\x90\x80\x44\x92\x52\x83\x8a\x80\x00\x8a\x12\x7d\x84\x80" | |||
| expected: "\x20\x08\x32\x7e\x77\x80\x04\x12\x12\x71\x78\x80\x04\x90\x52\x6b\x72\x84\x42\x92" | |||
| expected: "\x52\x65\x6c\x80\x44\x12\x32\x5f\x66\x84\x40\x93\x52\x59\x60\x5c\x80\x00\x92\x55" | |||
| expected: "\x52\x59\x6b\xfb\x55\x4e\x84\x04\x81\x32\x48\x4f\x82\x24\x4a\x2a\x49\x42\x84\x44" | |||
| expected: "\x8a\x52\x3c\x43\x6b\xfd\x3f\x38\x82\x22\x88\x22\x39\x32\x80\x44\x91\x53\x2c\x33" | |||
| expected: "\x6b\xb9\x2f\x28\x84\x44\x52\x32\x22\x29\x80\x22\x92\x55\x1c\x23\x80\x00\x4a\x4a" | |||
| expected: "\x1d\x16\x80\x62\x49\x33\x17\x19\x13\x21\x10\x11\x62\x80\x0e\x63\xf7\x0b\x40\x09" | |||
| expected: "\x40\x0c\x60\xfc\x09\x6b\x03\x09\x07\x40\x05\x60\x7f\x02\x40\x02\x60\xfc\x44\x00" | |||
| } | |||
| @ -0,0 +1,106 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.EMPTY; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.OPTIONAL; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.SINGLE; | |||
| import com.google.common.collect.ImmutableSortedSet; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class AnyPathTest { | |||
| @Test | |||
| public void testConstants() { | |||
| assertPath(EMPTY, 0); | |||
| assertPath(SINGLE, 1); | |||
| assertPath(OPTIONAL, 0, 1); | |||
| } | |||
| @Test | |||
| public void testExtend() { | |||
| assertThat(EMPTY.extend(false)).isEqualTo(SINGLE); | |||
| assertThat(EMPTY.extend(true)).isEqualTo(OPTIONAL); | |||
| // Non-optional extension is the same as joining with SINGLE (compare testJoin). | |||
| assertPath(SINGLE.extend(false), 2); | |||
| // This is not the same as SINGLE.join(OPTIONAL), which accepts lengths 1 or 2 (see testJoin). | |||
| assertPath(SINGLE.extend(true), 0, 2); | |||
| // Mask 100 (length 2) extends to 1000 (length 3), or to 1001 (length 0 or 3) if optional. | |||
| assertPath(AnyPath.of(0x4).extend(false), 3); | |||
| assertPath(AnyPath.of(0x4).extend(true), 0, 3); | |||
| } | |||
| @Test | |||
| public void testJoin() { | |||
| assertThat(EMPTY.join(SINGLE)).isEqualTo(SINGLE); | |||
| assertThat(EMPTY.join(OPTIONAL)).isEqualTo(OPTIONAL); | |||
| assertPath(SINGLE.join(SINGLE), 2); | |||
| assertPath(SINGLE.join(OPTIONAL), 1, 2); | |||
| assertPath(OPTIONAL.join(OPTIONAL), 0, 1, 2); | |||
| // "(x(x)?)?" == 111 and matches 0 to 2. | |||
| // "(x(x)?)?".join("(x(x)?)?") == "(x(x(x(x)?)?)?)?" == 11111 and matches 0 to 4. | |||
| assertThat(AnyPath.of(0x7).join(AnyPath.of(0x7))).isEqualTo(AnyPath.of(0x1F)); | |||
| // "xx(x)?" == 1100 and matches 2 or 3. | |||
| // "(xx)?" == 0101 and matches 0 or 2. | |||
| // "xx(x)?".join("(xx)?") == "xx(x(x(x)?)?)?" == 111100 and matches 2 to 5. | |||
| assertThat(AnyPath.of(0xC).join(AnyPath.of(0x5))).isEqualTo(AnyPath.of(0x3C)); | |||
| } | |||
| @Test | |||
| public void testMakeOptional() { | |||
| assertThat(OPTIONAL.makeOptional()).isEqualTo(OPTIONAL); | |||
| assertThat(SINGLE.makeOptional()).isEqualTo(OPTIONAL); | |||
| assertPath(AnyPath.of(0x4).makeOptional(), 0, 2); | |||
| } | |||
| @Test | |||
| public void testToString() { | |||
| assertThat(SINGLE.toString()).isEqualTo("x"); | |||
| assertThat(OPTIONAL.toString()).isEqualTo("(x)?"); | |||
| assertThat(AnyPath.of(0x8).toString()).isEqualTo("xxx"); // mask 1000 = exactly 3 digits | |||
| assertThat(AnyPath.of(0xA).toString()).isEqualTo("x(xx)?"); // mask 1010 = 1 or 3 digits | |||
| assertThat(AnyPath.of(0xF).toString()).isEqualTo("(x(x(x)?)?)?"); // mask 1111 = 0 to 3 digits | |||
| } | |||
| // Ordering is important as we need to find the shortest path at certain times. | |||
| @Test | |||
| public void testOrdering() { | |||
| assertThat(SINGLE).isGreaterThan(EMPTY); | |||
| assertThat(OPTIONAL).isGreaterThan(SINGLE); | |||
| assertThat(AnyPath.of(0x8)).isGreaterThan(AnyPath.of(0x4)); | |||
| // Same maximum length (3), so the 2nd highest accepted length is the tie break (1 beats 0). | |||
| // This strategy turns out to match numeric comparison perfectly since set-bits are lengths. | |||
| assertThat(AnyPath.of(0xA)).isGreaterThan(AnyPath.of(0x9)); | |||
| } | |||
| private static void assertPath(AnyPath p, Integer... n) { | |||
| ImmutableSortedSet<Integer> lengths = ImmutableSortedSet.copyOf(n); | |||
| int maxLength = lengths.last(); | |||
| assertThat(p.maxLength()).isEqualTo(maxLength); | |||
| for (int i = 0; i <= maxLength; i++) { | |||
| assertThat(p.acceptsLength(i)).isEqualTo(lengths.contains(i)); | |||
| } | |||
| } | |||
| } | |||
| @ -0,0 +1,224 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; | |||
| import static org.junit.Assert.fail; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.common.collect.ImmutableSet; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.Visitor; | |||
| import java.util.Arrays; | |||
| import java.util.List; | |||
| import java.util.Set; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class EdgeTest { | |||
| @Test | |||
| public void testSimple() { | |||
| assertThat(Edge.fromMask(0x6).getDigitMask()).isEqualTo(0x6); | |||
| assertThat(Edge.fromMask(0x6).isOptional()).isFalse(); | |||
| assertThat(Edge.fromMask(0x3).toString()).isEqualTo("[01]"); // mask 0000000011: digits 0 and 1 | |||
| assertThat(Edge.fromMask(0x300).toString()).isEqualTo("[89]"); // mask 1100000000: digits 8 and 9 | |||
| assertThat(Edge.fromMask(0x1FE).toString()).isEqualTo("[1-8]"); // mask 0111111110: digits 1 to 8 | |||
| assertThat(Edge.fromMask(ALL_DIGITS_MASK).toString()).isEqualTo("x"); // all digits print as "x" | |||
| } | |||
| @Test | |||
| public void testAny() { | |||
| assertThat(Edge.fromMask(ALL_DIGITS_MASK)).isEqualTo(Edge.any()); | |||
| assertThat(Edge.any().optional()).isEqualTo(Edge.optionalAny()); | |||
| assertThat(Edge.any().toString()).isEqualTo("x"); | |||
| // Unlike AnyPath, simple edges are not sequences, so the optional form needs no parentheses. | |||
| assertThat(Edge.optionalAny().toString()).isEqualTo("x?"); | |||
| } | |||
| @Test | |||
| public void testEpsilon() { | |||
| // Epsilon is not optional: it represents a path that non-optionally accepts no input. | |||
| assertThat(Edge.epsilon().isOptional()).isFalse(); | |||
| assertThat(Edge.epsilon().toString()).isEqualTo("e"); | |||
| } | |||
| @Test | |||
| public void testConcatenation() { | |||
| Edge sequence = Edge.concatenation(Edge.fromMask(0x3), Edge.any()); | |||
| assertThat(sequence.toString()).isEqualTo("[01]x"); | |||
| // Only visitSequence() is overridden; the other callbacks in TestingVisitor fail the test. | |||
| TestingVisitor visitor = new TestingVisitor() { | |||
| @Override | |||
| public void visitSequence(List<Edge> edges) { | |||
| // The visited edges must be the concatenated ones, in their original order. | |||
| assertThat(edges).containsExactly(Edge.fromMask(0x3), Edge.any()).inOrder(); | |||
| this.wasTested = true; | |||
| } | |||
| }; | |||
| sequence.accept(visitor); | |||
| // Guards against the callback never having been invoked. | |||
| assertThat(visitor.wasTested).isTrue(); | |||
| } | |||
| @Test | |||
| public void testGroup() { | |||
| Edge disjunction = Edge.disjunction(ImmutableSet.of(Edge.fromMask(0x3), Edge.any())); | |||
| // Only visitGroup() is overridden; the other callbacks in TestingVisitor fail the test. | |||
| TestingVisitor visitor = new TestingVisitor() { | |||
| @Override | |||
| public void visitGroup(Set<Edge> edges, boolean isOptional) { | |||
| // The visited edges are expected in a different order from the one they were given in. | |||
| assertThat(edges).containsExactly(Edge.any(), Edge.fromMask(0x3)).inOrder(); | |||
| assertThat(isOptional).isFalse(); | |||
| this.wasTested = true; | |||
| } | |||
| }; | |||
| disjunction.accept(visitor); | |||
| assertThat(disjunction.toString()).isEqualTo("(x|[01])"); | |||
| // Guards against the callback never having been invoked. | |||
| assertThat(visitor.wasTested).isTrue(); | |||
| } | |||
| @Test | |||
| public void testOptionalGroup() { | |||
| Edge disjunction = | |||
| Edge.disjunction(ImmutableSet.of(Edge.fromMask(0x3), Edge.epsilon(), Edge.any())); | |||
| // Only visitGroup() is overridden; the other callbacks in TestingVisitor fail the test. | |||
| TestingVisitor visitor = new TestingVisitor() { | |||
| @Override | |||
| public void visitGroup(Set<Edge> edges, boolean isOptional) { | |||
| // The edges are reordered and epsilon is removed; the group is reported as optional instead. | |||
| assertThat(edges).containsExactly(Edge.any(), Edge.fromMask(0x3)).inOrder(); | |||
| assertThat(isOptional).isTrue(); | |||
| this.wasTested = true; | |||
| } | |||
| }; | |||
| disjunction.accept(visitor); | |||
| assertThat(disjunction.toString()).isEqualTo("(x|[01])?"); | |||
| // Guards against the callback never having been invoked. | |||
| assertThat(visitor.wasTested).isTrue(); | |||
| } | |||
| @Test | |||
| public void testOrdering() { | |||
| // Testing ordering is important because when generating regular expressions, the edge order | |||
| // defines a lot about the visual order of the final regular expression. This order should be | |||
| // as close to "what a person would consider reasonable" as possible. In fact some of the cases | |||
| // tested here will never occur in real situations (e.g. sequences compared with groups) | |||
| // because of the way composite edges are created. However it seems sensible to test the | |||
| // behaviour nevertheless. | |||
| // Simple Edges | |||
| assertSameOrder(e("0"), e("0")); | |||
| // "0" < "1" - lowest bit set wins | |||
| assertOrdered(e("0"), e("1")); | |||
| // "[01]" < "1" - lowest bit set wins | |||
| assertOrdered(e("[01]"), e("1")); | |||
| // "x" < "9" - lowest bit set wins | |||
| assertOrdered(X, e("9")); | |||
| // Sequences | |||
| // ("0x" < "1") and ("0" < "1x") - first edge in sequence is compared to single edge. | |||
| assertOrdered(seq(e("0"), X), e("1")); | |||
| assertOrdered(e("0"), seq(e("1"), X)); | |||
| // "[01]" < "[01]x" - single edges are "smaller" than sequences of edges if all else is equal. | |||
| assertOrdered(e("[01]"), seq(e("[01]"), X)); | |||
| // "[01]x" == "[01]x" | |||
| assertSameOrder(seq(e("[01]"), X), seq(e("[01]"), X)); | |||
| // "x1" < "x2" - comparing 2 sequences compares all edges. | |||
| assertOrdered(seq(X, e("1")), seq(X, e("2"))); | |||
| // "[01]x" < "[01]xx" - shortest sequence wins in tie break (similar to how "[01]" < "[01]x") | |||
| assertOrdered(seq(e("[01]"), X), seq(e("[01]"), X, X)); | |||
| // Disjunctions | |||
| // "(1|2)" == "(2|1)" - edges are sorted when creating disjunctions | |||
| assertSameOrder(or(e("1"), e("2")), or(e("2"), e("1"))); | |||
| // "(1|2|3)" < "(1|2|4)" - comparing 2 disjunctions compares all edges. | |||
| assertOrdered(or(e("1"), e("2"), e("3")), or(e("1"), e("2"), e("4"))); | |||
| // "(1|2)" < "(1|2|3)" - the disjunction with fewer edges wins in tie break | |||
| assertOrdered(or(e("1"), e("2")), or(e("1"), e("2"), e("3"))); | |||
| // Miscellaneous | |||
| // "1" < "(1|2)" - if first edge matches, single edges sort before groups. | |||
| assertOrdered(e("1"), or(e("1"), e("2"))); | |||
| // "(1|x)" < "1x" - because "(1|x)" is actually "(x|1)" and "x" < "1". | |||
| assertOrdered(or(e("1"), X), seq(e("1"), X)); | |||
| } | |||
| private static void assertSameOrder(Edge lhs, Edge rhs) { | |||
| assertThat(lhs).isEquivalentAccordingToCompareTo(rhs); | |||
| assertThat(lhs).isEqualTo(rhs); | |||
| } | |||
| // Asserts that 'lhs' sorts strictly before 'rhs'. The comparison is checked from both sides, | |||
| // and the two edges must not be equal to each other either. | |||
| private static void assertOrdered(Edge lhs, Edge rhs) { | |||
| assertThat(lhs).isNotEqualTo(rhs); | |||
| // lhs.compareTo(rhs) must report "less than" ... | |||
| assertThat(lhs).isLessThan(rhs); | |||
| // ... and rhs.compareTo(lhs) must agree by reporting "greater than". | |||
| assertThat(rhs).isGreaterThan(lhs); | |||
| } | |||
| // A hand-rolled stand-in for a mock: every callback fails the test unless it is overridden. | |||
| private static class TestingVisitor implements Visitor { | |||
| // Overriding callbacks set this so that the test can confirm they actually ran. | |||
| protected boolean wasTested = false; | |||
| @Override | |||
| public void visit(SimpleEdge edge) { | |||
| rejectCall(); | |||
| } | |||
| @Override | |||
| public void visitSequence(List<Edge> edges) { | |||
| rejectCall(); | |||
| } | |||
| @Override | |||
| public void visitGroup(Set<Edge> edges, boolean isOptional) { | |||
| rejectCall(); | |||
| } | |||
| // Shared failure path for any callback which the current test did not override. | |||
| private static void rejectCall() { | |||
| fail("unexpected call"); | |||
| } | |||
| } | |||
| // Shorthand for the edge which accepts any digit. | |||
| private static final Edge X = e("x"); | |||
| // Test helper: turns a one-digit range specification (e.g. "[01]") into a simple edge. | |||
| private static SimpleEdge e(String s) { | |||
| RangeSpecification parsed = RangeSpecification.parse(s); | |||
| boolean isSingleDigit = parsed.length() == 1; | |||
| Preconditions.checkArgument(isSingleDigit, "only specify single digit ranges"); | |||
| return SimpleEdge.fromMask(parsed.getBitmask(0)); | |||
| } | |||
| // Test helper: concatenates two or more edges, left to right, into a single sequence edge. | |||
| private static Edge seq(Edge first, Edge second, Edge... rest) { | |||
| // Edge.concatenation() rejects epsilon edges itself, so no extra checking is done here. | |||
| Edge sequence = Edge.concatenation(first, second); | |||
| for (int i = 0; i < rest.length; i++) { | |||
| sequence = Edge.concatenation(sequence, rest[i]); | |||
| } | |||
| return sequence; | |||
| } | |||
| // Test helper: builds a group of alternatives which is not optional (no epsilon allowed). | |||
| private static Edge or(Edge... edges) { | |||
| List<Edge> alternatives = Arrays.asList(edges); | |||
| boolean hasEpsilon = alternatives.contains(Edge.epsilon()); | |||
| Preconditions.checkArgument(!hasEpsilon, "use 'opt()' for optional groups"); | |||
| return Edge.disjunction(alternatives); | |||
| } | |||
| } | |||
| @ -0,0 +1,154 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import java.util.ArrayList; | |||
| import java.util.Arrays; | |||
| import java.util.List; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class EdgeWriterTest { | |||
| // Note that this code is tested very thoroughly by any "round-tripping" of regular expressions | |||
| // in the metadata (i.e. generating regular expressions from DFAs and then re-parsing then to | |||
| // ensure that the same DFA is produced). This is part of any acceptance test for generating | |||
| // regular expressions and serves as a far more comprehensive stress test on the code. These | |||
| // tests are thus limited to simpler cases and highlighting interesting behaviour. | |||
| // The 'any digit' edge. | |||
| private static final Edge X = e("x"); | |||
| @Test | |||
| public void testSimple() { | |||
| assertThat(regex(e("0"))).isEqualTo("0"); | |||
| assertThat(regex(e("[0-7]"))).isEqualTo("[0-7]"); | |||
| assertThat(regex(e("[0-9]"))).isEqualTo("\\d"); | |||
| assertThat(regex(X)).isEqualTo("\\d"); | |||
| } | |||
| @Test | |||
| public void testSequences() { | |||
| assertThat(regex(seq(e("0"), e("1"), e("2")))).isEqualTo("012"); | |||
| } | |||
| @Test | |||
| public void testGroups() { | |||
| // Non-optional groups spanning the top level don't need parentheses. | |||
| assertThat(regex(or(e("0"), e("1"), e("2")))).isEqualTo("0|1|2"); | |||
| // Optional groups always need parentheses. | |||
| assertThat(regex(opt(e("0"), e("1"), e("2")))).isEqualTo("(?:0|1|2)?"); | |||
| // Once a group has prefix or suffix, parentheses are needed. | |||
| assertThat(regex( | |||
| seq( | |||
| or(e("0"), e("1")), | |||
| e("2")))) | |||
| .isEqualTo("(?:0|1)2"); | |||
| } | |||
| @Test | |||
| public void testNesting() { | |||
| // Basic nesting is handled by a very straightforward edge visitor, so one non-trivial test | |||
| // will cover all the basic cases ("any digit" sequences are a different matter however). | |||
| assertThat(regex( | |||
| seq( | |||
| e("0"), | |||
| or( | |||
| e("1"), | |||
| seq( | |||
| e("2"), | |||
| opt(e("3"), e("4")))), | |||
| e("5"), e("6")))) | |||
| .isEqualTo("0(?:1|2(?:3|4)?)56"); | |||
| } | |||
| @Test | |||
| public void testAnyDigitSequences() { | |||
| // This is the complex part of efficient regular expression generation. | |||
| assertThat(regex(seq(e("0"), e("1"), X))).isEqualTo("01\\d"); | |||
| // "\d\d" is shorter than "\d{2}" | |||
| assertThat(regex(seq(X, X))).isEqualTo("\\d\\d"); | |||
| assertThat(regex(seq(X, X, X))).isEqualTo("\\d{3}"); | |||
| // Top level optional groups are supported. | |||
| assertThat(regex(opt(seq(X, X)))).isEqualTo("(?:\\d{2})?"); | |||
| // Optional parts go at the end. | |||
| assertThat(regex( | |||
| seq( | |||
| opt(seq(X, X)), | |||
| X, X))) | |||
| .isEqualTo("\\d\\d(?:\\d{2})?"); | |||
| // "(x(x(x)?)?)?" | |||
| Edge anyGrp = opt(seq( | |||
| X, | |||
| opt(seq( | |||
| X, | |||
| opt(X))))); | |||
| // The two cases of a group on its own or as part of a sequence are handled separately, so | |||
| // must be tested separately. | |||
| assertThat(regex(anyGrp)).isEqualTo("\\d{0,3}"); | |||
| assertThat(regex(seq(e("1"), e("2"), anyGrp))).isEqualTo("12\\d{0,3}"); | |||
| // xx(x(x(x)?)?)?" | |||
| assertThat(regex(seq(X, X, anyGrp))).isEqualTo("\\d{2,5}"); | |||
| // Combining "any digit" groups produces minimal representation | |||
| assertThat(regex(seq(anyGrp, anyGrp))).isEqualTo("\\d{0,6}"); | |||
| } | |||
| // Helper to call standard version of regex generator (not using 'dot' for matching). | |||
| private String regex(Edge e) { | |||
| return EdgeWriter.toRegex(e, false /* use dot match */); | |||
| } | |||
| // Creates a simple edge from a range specification string for testing. | |||
| private static SimpleEdge e(String s) { | |||
| RangeSpecification spec = RangeSpecification.parse(s); | |||
| Preconditions.checkArgument(spec.length() == 1, "only specify single digit ranges"); | |||
| return SimpleEdge.fromMask(spec.getBitmask(0)); | |||
| } | |||
| // Creates sequence of edges (wrapping for convenience). | |||
| private static Edge seq(Edge first, Edge second, Edge... rest) { | |||
| // This already rejects epsilon edges. | |||
| Edge edge = Edge.concatenation(first, second); | |||
| for (Edge e : rest) { | |||
| edge = Edge.concatenation(edge, e); | |||
| } | |||
| return edge; | |||
| } | |||
| // Creates a non-optional disjunction of edges. | |||
| private static Edge or(Edge... edges) { | |||
| List<Edge> e = Arrays.asList(edges); | |||
| Preconditions.checkArgument(!e.contains(Edge.epsilon()), "use 'opt()' for optional groups"); | |||
| return Edge.disjunction(e); | |||
| } | |||
| // Creates an optional disjunction of edges. | |||
| private static Edge opt(Edge... edges) { | |||
| List<Edge> e = new ArrayList<>(); | |||
| e.addAll(Arrays.asList(edges)); | |||
| Preconditions.checkArgument(!e.contains(Edge.epsilon()), "don't pass epsilon directly"); | |||
| e.add(Edge.epsilon()); | |||
| return Edge.disjunction(e); | |||
| } | |||
| } | |||
| @ -0,0 +1,98 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.base.Preconditions.checkArgument; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; | |||
| import com.google.common.graph.MutableValueGraph; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.common.graph.ValueGraphBuilder; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| /** Simple fluent API for constructing graphs for testing. */ | |||
| final class NfaBuilder { | |||
| private final MutableValueGraph<Node, SimpleEdge> graph = | |||
| ValueGraphBuilder.directed().allowsSelfLoops(false).build(); | |||
| // The last node added to the graph. | |||
| private Node lastNode; | |||
| /** Creates a new mutable NFA graph. */ | |||
| public NfaBuilder() { | |||
| graph.addNode(INITIAL); | |||
| graph.addNode(TERMINAL); | |||
| lastNode = TERMINAL; | |||
| } | |||
| /** | |||
| * Returns an unmodifiable view of the underlying graph (not a snapshot). If the builder is | |||
| * modified after this method is called, it will affect what was returned. | |||
| */ | |||
| public ValueGraph<Node, SimpleEdge> graph() { | |||
| return graph; | |||
| } | |||
| /** Adds a new path from the given source node, returning the newly created target node. */ | |||
| public Node addPath(Node source, String path) { | |||
| RangeSpecification spec = RangeSpecification.parse(path); | |||
| for (int n = 0; n < spec.length(); n++) { | |||
| lastNode = lastNode.createNext(); | |||
| addEdge(source, lastNode, SimpleEdge.fromMask(spec.getBitmask(n))); | |||
| source = lastNode; | |||
| } | |||
| return lastNode; | |||
| } | |||
| /** Adds a new path between the given source and target (all intermediate nodes are new). */ | |||
| public void addPath(Node source, Node target, String path) { | |||
| RangeSpecification spec = RangeSpecification.parse(path); | |||
| for (int n = 0; n < spec.length() - 1; n++) { | |||
| lastNode = lastNode.createNext(); | |||
| addEdge(source, lastNode, SimpleEdge.fromMask(spec.getBitmask(n))); | |||
| source = lastNode; | |||
| } | |||
| addEdge(source, target, SimpleEdge.fromMask(spec.getBitmask(spec.length() - 1))); | |||
| } | |||
| /** | |||
| * Adds a new path between the given source and target nodes, along with an epsilon edge from the | |||
| * source to the target. | |||
| */ | |||
| public void addOptionalPath(Node source, Node target, String path) { | |||
| addPath(source, target, path); | |||
| addEpsilon(source, target); | |||
| } | |||
| private void addEpsilon(Node s, Node t) { | |||
| checkArgument(graph.nodes().contains(s), "missing source node"); | |||
| checkArgument(graph.nodes().contains(s), "missing target node"); | |||
| SimpleEdge e = graph.putEdgeValue(s, t, Edge.epsilon()); | |||
| if (e != null) { | |||
| // Edge already exists; if not an epsilon, make it optional. | |||
| checkArgument(!e.equals(Edge.epsilon()) && !e.isOptional(), "epsilon already added"); | |||
| graph.putEdgeValue(s, t, e.optional()); | |||
| } | |||
| } | |||
| private void addEdge(Node s, Node t, SimpleEdge e) { | |||
| graph.addNode(s); | |||
| graph.addNode(t); | |||
| checkArgument(graph.putEdgeValue(s, t, e) == null, "edge already exists"); | |||
| } | |||
| } | |||
| @ -0,0 +1,229 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; | |||
| import com.google.common.base.Preconditions; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import java.util.ArrayList; | |||
| import java.util.Arrays; | |||
| import java.util.Comparator; | |||
| import java.util.List; | |||
| import java.util.TreeSet; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class NfaFlattenerTest { | |||
| // Shorthand for the edge which accepts any digit. | |||
| private static final Edge X = e("x"); | |||
| @Test | |||
| public void testSimple() { | |||
| NfaBuilder builder = new NfaBuilder(); | |||
| builder.addPath(INITIAL, TERMINAL, "12"); | |||
| Edge single = NfaFlattener.flatten(builder.graph()); | |||
| assertThat(single).isEqualTo(seq(e("1"), e("2"))); | |||
| assertThat(single.toString()).isEqualTo("12"); | |||
| // A second path between the same two nodes turns the result into a group. | |||
| builder.addPath(INITIAL, TERMINAL, "34"); | |||
| Edge both = NfaFlattener.flatten(builder.graph()); | |||
| Edge expected = or(seq(e("1"), e("2")), seq(e("3"), e("4"))); | |||
| assertThat(both).isEqualTo(expected); | |||
| assertThat(both.toString()).isEqualTo("(12|34)"); | |||
| } | |||
| @Test | |||
| public void testSubgroup() { | |||
| NfaBuilder builder = new NfaBuilder(); | |||
| Node split = builder.addPath(INITIAL, "12"); | |||
| Node join = builder.addPath(split, "34"); | |||
| builder.addPath(split, join, "56"); | |||
| builder.addPath(join, TERMINAL, "78"); | |||
| Edge flattened = NfaFlattener.flatten(builder.graph()); | |||
| // The two paths between 'split' and 'join' become a group in the middle of the sequence. | |||
| Edge subgroup = or(seq(e("3"), e("4")), seq(e("5"), e("6"))); | |||
| assertThat(flattened).isEqualTo(seq(e("1"), e("2"), subgroup, e("7"), e("8"))); | |||
| assertThat(flattened.toString()).isEqualTo("12(34|56)78"); | |||
| } | |||
| @Test | |||
| public void testSubgroupWithEarlyJoining() { | |||
| NfaBuilder builder = new NfaBuilder(); | |||
| // Create a graph with 4 initial paths branching out which collapses to 3, 2 and then 1. | |||
| Node groupStart = builder.addPath(INITIAL, "0"); | |||
| // Add 2 edges to the first join point (if we add only one edge then it clashes with the | |||
| // joining edge, which goes directly from groupStart to firstJoin). | |||
| Node viaOne = builder.addPath(groupStart, "1"); | |||
| Node firstJoin = builder.addPath(viaOne, "2"); | |||
| builder.addPath(groupStart, firstJoin, "3"); | |||
| Node secondJoin = builder.addPath(firstJoin, "4"); | |||
| builder.addPath(groupStart, secondJoin, "5"); | |||
| Node groupEnd = builder.addPath(secondJoin, "6"); | |||
| builder.addPath(groupStart, groupEnd, "7"); | |||
| builder.addPath(groupEnd, TERMINAL, "8"); | |||
| Edge flattened = NfaFlattener.flatten(builder.graph()); | |||
| // The expected structure is built from the innermost group outwards. | |||
| Edge firstGroup = or(seq(e("1"), e("2")), e("3")); | |||
| Edge secondGroup = or(seq(firstGroup, e("4")), e("5")); | |||
| Edge outerGroup = or(seq(secondGroup, e("6")), e("7")); | |||
| assertThat(flattened).isEqualTo(seq(e("0"), outerGroup, e("8"))); | |||
| assertThat(flattened.toString()).isEqualTo("0(((12|3)4|5)6|7)8"); | |||
| } | |||
| @Test | |||
| public void testPathDuplication() { | |||
| NfaBuilder builder = new NfaBuilder(); | |||
| Node groupStart = builder.addPath(INITIAL, "0"); | |||
| Node lhsMid = builder.addPath(groupStart, "1"); | |||
| Node groupEnd = builder.addPath(lhsMid, "2"); | |||
| Node rhsMid = builder.addPath(groupStart, "3"); | |||
| builder.addPath(rhsMid, groupEnd, "4"); | |||
| builder.addPath(groupEnd, TERMINAL, "5"); | |||
| // So far this is a normal nestable graph: | |||
| // ,--1-->()--2--v | |||
| // (I)--0-->() ()--5-->(T) | |||
| // `--3-->()--4--^ | |||
| Edge nestable = NfaFlattener.flatten(builder.graph()); | |||
| Edge simpleGroup = or(seq(e("1"), e("2")), seq(e("3"), e("4"))); | |||
| assertThat(nestable).isEqualTo(seq(e("0"), simpleGroup, e("5"))); | |||
| assertThat(nestable.toString()).isEqualTo("0(12|34)5"); | |||
| // This new path "crosses" the group, creating a non-nestable structure which can only be | |||
| // resolved by duplicating some path (in this case it's the 2nd part of the right-hand-side). | |||
| builder.addPath(lhsMid, rhsMid, "x"); | |||
| Edge crossed = NfaFlattener.flatten(builder.graph()); | |||
| Edge afterOne = or(e("2"), seq(X, e("4"))); | |||
| Edge crossedGroup = or(seq(e("1"), afterOne), seq(e("3"), e("4"))); | |||
| assertThat(crossed).isEqualTo(seq(e("0"), crossedGroup, e("5"))); | |||
| // Note the duplication of the '4' to make the graph nestable. | |||
| assertThat(crossed.toString()).isEqualTo("0(1(x4|2)|34)5"); | |||
| } | |||
| @Test | |||
| public void testNodeOrdering_bug_65250963() { | |||
| // ,--->(C)----------. | |||
| // | v | |||
| // (I)-->(D)-->(B)-->(T) | |||
| // | ^ | |||
| // `--->(A)----' | |||
| NfaBuilder builder = new NfaBuilder(); | |||
| // IMPORTANT: Order of insertion determines the node IDs (A=1, B=2...). The edge index just | |||
| // happens to match node ID for readability, but doesn't affect the test directly. | |||
| Node a = builder.addPath(INITIAL, "1"); | |||
| Node b = builder.addPath(a, "2"); | |||
| Node c = builder.addPath(INITIAL, "3"); | |||
| Node d = builder.addPath(INITIAL, "4"); | |||
| // Now join up remaining paths. | |||
| builder.addPath(d, b, "5"); | |||
| builder.addPath(b, TERMINAL, "6"); | |||
| builder.addPath(c, TERMINAL, "7"); | |||
| Comparator<Node> ordering = NfaFlattener.nodeOrdering(builder.graph()); | |||
| // In the old ordering code, because (B) and (D) are not reachable to/from (C) we would have | |||
| // had the ordering (D < B), (B < C), (C < D) giving a cycle. In the new code, the longest path | |||
| // length to reach (C) is less than (B), so we get (C < B) and we no longer have a cycle. | |||
| // The node ordering is now: (INITIAL, A, C, D, B, TERMINAL) | |||
| TreeSet<Node> sorted = new TreeSet<>(ordering); | |||
| // Nodes go in one at a time, in the same (deliberately unsorted) order as before. | |||
| sorted.addAll(Arrays.asList(INITIAL, TERMINAL, a, b, c, d)); | |||
| assertThat(sorted).containsExactly(INITIAL, a, c, d, b, TERMINAL).inOrder(); | |||
| } | |||
| @Test | |||
| public void testOptionalTopLevelGroup_bug_69101586() { | |||
| // ,--->(e)----. | |||
| // | v | |||
| // (I)-->(A)-->(T) | |||
| NfaBuilder builder = new NfaBuilder(); | |||
| builder.addOptionalPath(INITIAL, TERMINAL, "xx"); | |||
| Edge flattened = NfaFlattener.flatten(builder.graph()); | |||
| Edge expected = opt(seq(X, X)); | |||
| assertThat(flattened).isEqualTo(expected); | |||
| assertThat(flattened.toString()).isEqualTo("(xx)?"); | |||
| } | |||
| // Test helper: turns a one-digit range specification (e.g. "x") into a simple edge. | |||
| private static SimpleEdge e(String s) { | |||
| RangeSpecification parsed = RangeSpecification.parse(s); | |||
| boolean isSingleDigit = parsed.length() == 1; | |||
| Preconditions.checkArgument(isSingleDigit, "only specify single digit ranges"); | |||
| return SimpleEdge.fromMask(parsed.getBitmask(0)); | |||
| } | |||
| // Test helper: concatenates two or more edges, left to right, into a single sequence edge. | |||
| private static Edge seq(Edge first, Edge second, Edge... rest) { | |||
| // Edge.concatenation() rejects epsilon edges itself, so no extra checking is done here. | |||
| Edge sequence = Edge.concatenation(first, second); | |||
| for (int i = 0; i < rest.length; i++) { | |||
| sequence = Edge.concatenation(sequence, rest[i]); | |||
| } | |||
| return sequence; | |||
| } | |||
| // Test helper: builds an optional group by appending an epsilon to the given alternatives. | |||
| private static Edge opt(Edge... edges) { | |||
| List<Edge> alternatives = new ArrayList<>(Arrays.asList(edges)); | |||
| boolean hasEpsilon = alternatives.contains(Edge.epsilon()); | |||
| Preconditions.checkArgument(!hasEpsilon, "don't pass epsilon directly"); | |||
| alternatives.add(Edge.epsilon()); | |||
| return Edge.disjunction(alternatives); | |||
| } | |||
| // Test helper: builds a group of alternatives which is not optional (no epsilon allowed). | |||
| private static Edge or(Edge... edges) { | |||
| List<Edge> alternatives = Arrays.asList(edges); | |||
| boolean hasEpsilon = alternatives.contains(Edge.epsilon()); | |||
| Preconditions.checkArgument(!hasEpsilon, "use 'opt()' for optional groups"); | |||
| return Edge.disjunction(alternatives); | |||
| } | |||
| } | |||
| @ -0,0 +1,68 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class NodeTest { | |||
| @Test | |||
| public void testConstants() { | |||
| assertThat(INITIAL.id()).isEqualTo(0); | |||
| assertThat(TERMINAL.id()).isEqualTo(1); | |||
| assertThat(TERMINAL).isNotEqualTo(INITIAL); | |||
| } | |||
| @Test | |||
| public void testNext() { | |||
| assertThat(INITIAL.createNext()).isSameInstanceAs(TERMINAL); | |||
| assertThat(TERMINAL.createNext()).isNotEqualTo(TERMINAL); | |||
| assertThat(TERMINAL.createNext().id()).isEqualTo(2); | |||
| Node node = INITIAL; | |||
| for (int id = 0; id < 10; id++) { | |||
| assertThat(node.id()).isEqualTo(id); | |||
| node = node.createNext(); | |||
| } | |||
| } | |||
| @Test | |||
| public void testToString() { | |||
| Node node = INITIAL; | |||
| for (int id = 0; id < 10; id++) { | |||
| assertThat(node.toString()).isEqualTo(Integer.toString(id)); | |||
| node = node.createNext(); | |||
| } | |||
| } | |||
| // Consistent ordering helps ensure regular expressions derived from graphs are deterministic. | |||
| @Test | |||
| public void testOrdering() { | |||
| assertThat(TERMINAL).isGreaterThan(INITIAL); | |||
| Node node = INITIAL; | |||
| for (int id = 0; id < 10; id++) { | |||
| Node next = node.createNext(); | |||
| assertThat(next).isGreaterThan(node); | |||
| node = next; | |||
| } | |||
| } | |||
| } | |||
| @ -0,0 +1,154 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.collect.ImmutableList.toImmutableList; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Edge.any; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Edge.epsilon; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Edge.optionalAny; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; | |||
| import com.google.common.collect.Iterables; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import java.util.List; | |||
| import java.util.stream.Stream; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class RangeTreeConverterTest { | |||
| // Simple 4 node DFA. | |||
| // (I) --1--> ( ) --2--> ( ) --x--> (T) | |||
| @Test | |||
| public void testSimple() { | |||
| RangeTree tree = RangeTree.from(specs("12x")); | |||
| ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(tree); | |||
| assertThat(graph.nodes()).hasSize(4); | |||
| Node afterOne = assertOutEdge(graph, INITIAL, edge(1)); | |||
| Node afterTwo = assertOutEdge(graph, afterOne, edge(2)); | |||
| Node end = assertOutEdge(graph, afterTwo, any()); | |||
| assertThat(end).isEqualTo(TERMINAL); | |||
| } | |||
| // Simple 4 node DFA with optional termination immediately before the terminal node. | |||
| // (I) --1--> ( ) --2--> (T) --x--> (T) | |||
| @Test | |||
| public void testWithOptionalEdge() { | |||
| RangeTree tree = RangeTree.from(specs("12x", "12")); | |||
| ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(tree); | |||
| assertThat(graph.nodes()).hasSize(4); | |||
| Node afterOne = assertOutEdge(graph, INITIAL, edge(1)); | |||
| Node afterTwo = assertOutEdge(graph, afterOne, edge(2)); | |||
| // The early termination shows up as the last edge being optional. | |||
| Node end = assertOutEdge(graph, afterTwo, optionalAny()); | |||
| assertThat(end).isEqualTo(TERMINAL); | |||
| } | |||
| // Simple 4 node DFA with optional termination. | |||
| // (I) --1--> (T) --2--> ( ) --x--> (T) | |||
| @Test | |||
| public void testWithEpsilon() { | |||
| RangeTree tree = RangeTree.from(specs("12x", "1")); | |||
| ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(tree); | |||
| assertThat(graph.nodes()).hasSize(4); | |||
| Node afterOne = assertOutEdge(graph, INITIAL, edge(1)); | |||
| assertOutEdges(graph, afterOne, edge(2), epsilon()); | |||
| // The epsilon edge leads straight to the terminal node. | |||
| Node viaEpsilon = follow(graph, afterOne, epsilon()); | |||
| assertThat(viaEpsilon).isEqualTo(TERMINAL); | |||
| // The other out edge is the start of the normal path to the terminal node. | |||
| Node afterTwo = follow(graph, afterOne, edge(2)); | |||
| Node end = follow(graph, afterTwo, any()); | |||
| assertThat(end).isEqualTo(TERMINAL); | |||
| } | |||
| // Simple 5 node DFA with 2 paths. | |||
| // (I) --1--> ( ) --2--> ( ) --x--> (T) | |||
| // `---3--> ( ) --4----^ | |||
| @Test | |||
| public void testMultiplePathsWithCommonTail() { | |||
| RangeTree tree = RangeTree.from(specs("12x", "34x")); | |||
| ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(tree); | |||
| assertThat(graph.nodes()).hasSize(5); | |||
| assertOutEdges(graph, INITIAL, edge(1), edge(3)); | |||
| Node afterOne = follow(graph, INITIAL, edge(1)); | |||
| Node lhsJoin = assertOutEdge(graph, afterOne, edge(2)); | |||
| Node afterThree = follow(graph, INITIAL, edge(3)); | |||
| Node rhsJoin = assertOutEdge(graph, afterThree, edge(4)); | |||
| // Both branches must arrive at the same node before the shared final edge. | |||
| assertThat(lhsJoin).isEqualTo(rhsJoin); | |||
| Node end = assertOutEdge(graph, lhsJoin, any()); | |||
| assertThat(end).isEqualTo(TERMINAL); | |||
| } | |||
| @Test | |||
| public void testOptionalTopLevelGroup_bug_69101586() { | |||
| // Requires making a top level optional group, which is (deliberately) not easy with the | |||
| // DFA tooling since it's pretty rare. This is a DFA which can terminate immediately and will | |||
| // match the empty input (as well as its normal input). | |||
| RangeTree twoDigits = RangeTree.from(specs("xx")); | |||
| RangeTree emptyInput = RangeTree.from(RangeSpecification.empty()); | |||
| RangeTree tree = twoDigits.union(emptyInput); | |||
| ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(tree); | |||
| assertThat(graph.nodes()).hasSize(3); | |||
| Node viaEpsilon = follow(graph, INITIAL, epsilon()); | |||
| assertThat(viaEpsilon).isEqualTo(TERMINAL); | |||
| Node afterFirstDigit = follow(graph, INITIAL, any()); | |||
| Node end = assertOutEdge(graph, afterFirstDigit, any()); | |||
| assertThat(end).isEqualTo(TERMINAL); | |||
| } | |||
| // Returns the simple edge matching exactly this one digit value. | |||
| SimpleEdge edge(int n) { | |||
| int mask = 1 << n; | |||
| return SimpleEdge.fromMask(mask); | |||
| } | |||
| // Parses each of the given strings into a range specification, keeping their order. | |||
| List<RangeSpecification> specs(String... s) { | |||
| return Stream.of(s).map(text -> RangeSpecification.parse(text)).collect(toImmutableList()); | |||
| } | |||
| // Asserts that a node has only one out edge and returns that edge's target. | |||
| Node assertOutEdge(ValueGraph<Node, SimpleEdge> nfa, Node node, SimpleEdge edge) { | |||
| Iterable<Node> successors = nfa.successors(node); | |||
| assertThat(successors).hasSize(1); | |||
| Node onlyTarget = Iterables.getOnlyElement(successors); | |||
| SimpleEdge actual = nfa.edgeValue(node, onlyTarget).get(); | |||
| assertThat(actual).isEqualTo(edge); | |||
| return onlyTarget; | |||
| } | |||
| // Asserts that a node has all the given edges. | |||
| void assertOutEdges(ValueGraph<Node, SimpleEdge> nfa, Node node, SimpleEdge... edges) { | |||
| assertThat(nfa.successors(node)).hasSize(edges.length); | |||
| // Collect the value of the edge leading to each successor. | |||
| List<Edge> outValues = nfa.successors(node).stream() | |||
| .map(target -> nfa.edgeValue(node, target).get()) | |||
| .collect(toImmutableList()); | |||
| assertThat(outValues).containsExactlyElementsIn(edges); | |||
| } | |||
| // Follows the given edge from a node (which must be in the graph), returning the target node | |||
| // (or null if the edge does not exist in the graph). | |||
| Node follow(ValueGraph<Node, SimpleEdge> nfa, Node node, SimpleEdge edge) { | |||
| for (Node target : nfa.successors(node)) { | |||
| if (nfa.edgeValue(node, target).get().equals(edge)) { | |||
| return target; | |||
| } | |||
| } | |||
| return null; | |||
| } | |||
| } | |||
| @ -0,0 +1,107 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.FORCE_CAPTURING_GROUPS; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.FORCE_NON_CAPTURING_GROUPS; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.PRESERVE_CAPTURING_GROUPS; | |||
| import com.google.common.base.Joiner; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| // Tests for RegexFormatter.format(), exercising each of the three FormatOption values. | |||
| @RunWith(JUnit4.class) | |||
| public class RegexFormatterTest { | |||
| // Luckily the formatter cares only about 3 special characters, '(', '|' and ')', so we only need | |||
| // to test a few very straightforward cases to cover everything. | |||
| // | |||
| // A pattern with no groups or alternation is expected to come back unchanged. | |||
| @Test | |||
| public void testSimple() { | |||
| assertThat(RegexFormatter.format("abcd", PRESERVE_CAPTURING_GROUPS)) | |||
| .isEqualTo("abcd"); | |||
| } | |||
| // A single group is expected to be split over several lines with one alternative per line. With | |||
| // PRESERVE_CAPTURING_GROUPS the group opener ("(" or "(?:") is expected to be kept as written. | |||
| @Test | |||
| public void testNested() { | |||
| assertThat(RegexFormatter.format("ab(cd|ef)gh", PRESERVE_CAPTURING_GROUPS)).isEqualTo(lines( | |||
| "ab(", | |||
| " cd|", | |||
| " ef", | |||
| ")gh")); | |||
| assertThat(RegexFormatter.format("ab(?:cd|ef)gh", PRESERVE_CAPTURING_GROUPS)).isEqualTo(lines( | |||
| "ab(?:", | |||
| " cd|", | |||
| " ef", | |||
| ")gh")); | |||
| } | |||
| // A group nested inside another group; both the capturing and the non-capturing form of the | |||
| // inner group are expected to be preserved as written. | |||
| @Test | |||
| public void testDoubleNested() { | |||
| assertThat(RegexFormatter.format("ab(cd(ef|gh)|ij)", PRESERVE_CAPTURING_GROUPS)) | |||
| .isEqualTo(lines( | |||
| "ab(", | |||
| " cd(", | |||
| " ef|", | |||
| " gh", | |||
| " )|", | |||
| " ij", | |||
| ")")); | |||
| assertThat(RegexFormatter.format("ab(cd(?:ef|gh)|ij)", PRESERVE_CAPTURING_GROUPS)) | |||
| .isEqualTo(lines( | |||
| "ab(", | |||
| " cd(?:", | |||
| " ef|", | |||
| " gh", | |||
| " )|", | |||
| " ij", | |||
| ")")); | |||
| } | |||
| // With FORCE_NON_CAPTURING_GROUPS the inner capturing group "(ef|gh)" is expected to be | |||
| // emitted with a "(?:" opener, like the outer group. | |||
| @Test | |||
| public void testForceNonCapturingGroups() { | |||
| assertThat(RegexFormatter.format("ab(?:cd(ef|gh)|ij)", FORCE_NON_CAPTURING_GROUPS)) | |||
| .isEqualTo(lines( | |||
| "ab(?:", | |||
| " cd(?:", | |||
| " ef|", | |||
| " gh", | |||
| " )|", | |||
| " ij", | |||
| ")")); | |||
| } | |||
| // With FORCE_CAPTURING_GROUPS the outer non-capturing group "(?:...)" is expected to be | |||
| // emitted with a plain "(" opener, like the inner group. | |||
| @Test | |||
| public void testForceCapturingGroups() { | |||
| assertThat(RegexFormatter.format("ab(?:cd(ef|gh)|ij)", FORCE_CAPTURING_GROUPS)).isEqualTo(lines( | |||
| "ab(", | |||
| " cd(", | |||
| " ef|", | |||
| " gh", | |||
| " )|", | |||
| " ij", | |||
| ")")); | |||
| } | |||
| // Joins the given expected lines with '\n' (no trailing newline is added). | |||
| private static String lines(String... s) { | |||
| return Joiner.on('\n').join(s); | |||
| } | |||
| } | |||
| @ -0,0 +1,197 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.base.CharMatcher.whitespace; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.RegexGenerator.basic; | |||
| import static java.util.stream.Collectors.joining; | |||
| import com.google.common.collect.ImmutableList; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import java.util.Arrays; | |||
| import java.util.List; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| // Tests the regular expressions produced by RegexGenerator.toRegex() for RangeTrees built from | |||
| // range specifications, both for the basic() generator and with individual options enabled. | |||
| @RunWith(JUnit4.class) | |||
| public class RegexGeneratorTest { | |||
| @Test | |||
| public void testSimple() { | |||
| assertRegex(basic(), ranges("123xxx"), "123\\d{3}"); | |||
| // This could be improved to "..." rather than ".{3}" saving 1 char, probably not worth it. | |||
| assertRegex(basic().withDotMatch(), ranges("123xxx"), "123.{3}"); | |||
| } | |||
| // Four specifications differing only in the number of trailing digits (3 to 6) are expected to | |||
| // yield a single bounded repetition. | |||
| @Test | |||
| public void testVariableLength() { | |||
| assertRegex(basic(), ranges("123xxx", "123xxxx", "123xxxxx", "123xxxxxx"), "123\\d{3,6}"); | |||
| } | |||
| // Without tail optimization each branch is expected to keep its own trailing digits; with it, | |||
| // the shared trailing "\d{3}" is expected to appear once, after the group. | |||
| @Test | |||
| public void testTailOptimization() { | |||
| RangeTree dfa = ranges("123xxx", "123xxxx", "145xxx"); | |||
| assertRegex(basic(), dfa, "1(?:23\\d{3,4}|45\\d{3})"); | |||
| assertRegex(basic().withTailOptimization(), dfa, "1(?:23\\d?|45)\\d{3}"); | |||
| } | |||
| @Test | |||
| public void testDfaFactorization() { | |||
| // Essentially create a "thin" wedge of specific non-determinism with the shorter (5-digit) | |||
| // numbers which prevents the larger ranges from being contiguous in the DFA. | |||
| RangeTree dfa = ranges("1234x", "1256x", "[0-4]xxxxxx", "[0-4]xxxxxxx"); | |||
| assertRegex(basic(), dfa, | |||
| "[02-4]\\d{6,7}|", | |||
| "1(?:[013-9]\\d{5,6}|", | |||
| "2(?:[0-246-9]\\d{4,5}|", | |||
| "3(?:[0-35-9]\\d{3,4}|4\\d(?:\\d{2,3})?)|", | |||
| "5(?:[0-57-9]\\d{3,4}|6\\d(?:\\d{2,3})?)))"); | |||
| assertRegex(basic().withDfaFactorization(), dfa, "[0-4]\\d{6,7}|12(?:34|56)\\d"); | |||
| } | |||
| @Test | |||
| public void testSubgroupOptimization() { | |||
| // The subgraph of "everything except 95, 96 and 100" (this appears in China leading digits). | |||
| RangeTree postgraph = ranges("[02-8]", "1[1-9]", "10[1-9]", "9[0-47-9]"); | |||
| RangeTree pregraph = ranges("123", "234", "345", "456", "567"); | |||
| // Cross product of pre and post paths. | |||
| RangeTree subgraph = RangeTree.from( | |||
| pregraph.asRangeSpecifications().stream() | |||
| .flatMap(a -> postgraph.asRangeSpecifications().stream().map(a::extendBy))); | |||
| // Union in other paths to trigger repetition in the "basic" case. | |||
| RangeTree rest = ranges("128xx", "238xx", "348xx", "458xx", "568xx"); | |||
| RangeTree dfa = rest.union(subgraph); | |||
| assertRegex(basic(), dfa, | |||
| "12(?:3(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", | |||
| "23(?:4(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", | |||
| "34(?:5(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", | |||
| "45(?:6(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", | |||
| "56(?:7(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)"); | |||
| assertRegex(basic().withSubgroupOptimization(), dfa, | |||
| "(?:12|23|34|45|56)8\\d\\d|", | |||
| "(?:123|234|345|456|567)(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])"); | |||
| } | |||
| // The expected regex is spread over several lines purely for readability; assertRegex() removes | |||
| // all whitespace from the expected lines before comparing. | |||
| @Test | |||
| public void testRegression_bug_65250963() { | |||
| RangeTree dfa = ranges( | |||
| "1387", | |||
| "1697", | |||
| "1524", | |||
| "1539", | |||
| "1768", | |||
| "1946"); | |||
| assertRegex(basic(), dfa, | |||
| "1(?:", | |||
| " (?:", | |||
| " 38|", | |||
| " 69", | |||
| " )7|", | |||
| " 5(?:", | |||
| " 24|", | |||
| " 39", | |||
| " )|", | |||
| " 768|", | |||
| " 946", | |||
| ")"); | |||
| } | |||
| // Each assertMatches() call below gives, in order: the expected regex, numbers which must | |||
| // match, numbers which must not match, and the range specifications the regex is generated from. | |||
| @Test | |||
| public void testRegression_bug_68929642() { | |||
| assertMatches( | |||
| "1\\d{6}(?:\\d{2})?", | |||
| ImmutableList.of("1234567", "123456789"), | |||
| ImmutableList.of("12345678"), | |||
| "1xxx_xxx", "1xx_xxx_xxx"); | |||
| assertMatches( | |||
| "1\\d{6}[0-7]?", | |||
| ImmutableList.of("1234567", "12345670"), | |||
| ImmutableList.of("123456", "123456700"), | |||
| "1xxx_xxx", "1x_xxx_xx[0-7]"); | |||
| assertMatches( | |||
| "\\d\\d?", | |||
| ImmutableList.of("1", "12"), | |||
| ImmutableList.of("", "123"), | |||
| "x", "xx"); | |||
| assertMatches( | |||
| "\\d{1,3}", | |||
| ImmutableList.of("1", "12", "123"), | |||
| ImmutableList.of("", "1234"), | |||
| "x", "xx", "xxx"); | |||
| assertMatches( | |||
| "\\d(?:\\d{3}(?:\\d{2})?)?", | |||
| ImmutableList.of("1", "1234", "123456"), | |||
| ImmutableList.of("", "12", "123", "12345", "1234567"), | |||
| "x", "xxxx", "xxx_xxx"); | |||
| assertMatches( | |||
| "(?:\\d\\d(?:\\d(?:\\d{2,4})?)?)?", | |||
| ImmutableList.of("", "12", "123", "12345", "123456", "1234567"), | |||
| ImmutableList.of("1", "1234", "12345678"), | |||
| "", "xx", "xxx", "xx_xxx", "xxx_xxx", "xxxx_xxx"); | |||
| assertMatches( | |||
| "(?:\\d{2})?", | |||
| ImmutableList.of("", "12"), | |||
| ImmutableList.of("1", "123"), | |||
| "", "xx"); | |||
| assertMatches( | |||
| "\\d?", | |||
| ImmutableList.of("", "1"), | |||
| ImmutableList.of("12"), | |||
| "", "x"); | |||
| } | |||
| // Generates a regex from the given range specifications with the basic() generator and asserts | |||
| // that it is exactly the expected pattern. It then checks the given positive/negative match | |||
| // numbers against both the expected pattern and the generated regex. | |||
| private static void assertMatches( | |||
| String pattern, List<String> matchNumbers, List<String> noMatchNumbers, String... specs) { | |||
| String regex = basic().toRegex(ranges(specs)); | |||
| assertThat(regex).isEqualTo(pattern); | |||
| // Test the given positive/negative match numbers and expect the same behaviour from both. | |||
| for (String number : matchNumbers) { | |||
| assertThat(number).matches(pattern); | |||
| assertThat(number).matches(regex); | |||
| } | |||
| for (String number : noMatchNumbers) { | |||
| assertThat(number).doesNotMatch(pattern); | |||
| assertThat(number).doesNotMatch(regex); | |||
| } | |||
| } | |||
| // Asserts that the generator produces, for the given DFA, the regex obtained by removing all | |||
| // whitespace from each expected line and concatenating the results. | |||
| private static void assertRegex(RegexGenerator generator, RangeTree dfa, String... lines) { | |||
| String regex = generator.toRegex(dfa); | |||
| String expected = Arrays.stream(lines).map(whitespace()::removeFrom).collect(joining()); | |||
| assertThat(regex).isEqualTo(expected); | |||
| } | |||
| // Builds a RangeTree from the given strings, each parsed as a RangeSpecification. | |||
| private static RangeTree ranges(String... specs) { | |||
| return RangeTree.from(Arrays.stream(specs).map(RangeSpecification::parse)); | |||
| } | |||
| } | |||
| @ -0,0 +1,80 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.common.truth.Truth8.assertThat; | |||
| import com.google.i18n.phonenumbers.metadata.RangeSpecification; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree; | |||
| import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; | |||
| import com.google.i18n.phonenumbers.metadata.regex.SubgroupOptimizer.LinkNodeVisitor; | |||
| import java.util.Arrays; | |||
| import java.util.Optional; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| @RunWith(JUnit4.class) | |||
| public class SubgraphOptimizerTest { | |||
| // Suffix paths forming the subgraph of "everything except 95, 96 and 100" (this appears in | |||
| // China leading digits). Unlike China, '9' is also allowed to terminate early here, to ensure | |||
| // that the entire subgraph is extracted (including the terminating node). | |||
| private static final RangeTree POSTGRAPH = ranges("[02-9]", "1[1-9]", "10[1-9]", "9[0-47-9]"); | |||
| // Prefix paths which lead into the subgraph. | |||
| private static final RangeTree PREGRAPH = ranges("123", "234", "345", "456", "567"); | |||
| // Every prefix path extended by every suffix path (the cross product of the two). | |||
| private static final RangeTree SUBGRAPH = RangeTree.from( | |||
| PREGRAPH.asRangeSpecifications().stream() | |||
| .flatMap(prefix -> POSTGRAPH.asRangeSpecifications().stream().map(prefix::extendBy))); | |||
| // The subgraph plus extra paths which share edges with it and so cause repetition in regular | |||
| // expressions, and a couple of early terminating paths "on the way to" the subgroup. | |||
| // Note however that a terminating path that reaches the root of the subgraph (e.g. "123") will | |||
| // cause a split in the DFA at the root node (one terminating, one not terminating). | |||
| private static final RangeTree TEST_RANGES = | |||
| SUBGRAPH.union(ranges("128xx", "238xx", "348xx", "458xx", "568xx", "12", "34")); | |||
| @Test | |||
| public void testSubgraphWeightAndInOrder() { | |||
| LinkNodeVisitor visitor = new LinkNodeVisitor(); | |||
| TEST_RANGES.accept(visitor); | |||
| DfaNode costliest = visitor.getHighestCostNode(); | |||
| assertThat(costliest).isNotNull(); | |||
| // One incoming path per PREGRAPH entry reaches the root of POSTGRAPH, 5 in total. | |||
| assertThat(visitor.getInOrder(costliest)).isEqualTo(5); | |||
| // POSTGRAPH has 7 edges whose weights add up to 27: | |||
| // "[02-8]" = 6, "1", "0", "9" = 3, 2 x "[1-9]" = 10, "[0-47-9]" = 8 | |||
| assertThat(visitor.getSubgraphWeight(costliest)).isEqualTo(27); | |||
| } | |||
| @Test | |||
| public void testSubgraphExtraction() { | |||
| assertThat(SubgroupOptimizer.extractRepeatingSubgraph(TEST_RANGES)).hasValue(SUBGRAPH); | |||
| // The "bridge" node is the same, so we extract the whole graph (so we return nothing). | |||
| Optional<RangeTree> fromWholeSubgraph = SubgroupOptimizer.extractRepeatingSubgraph(SUBGRAPH); | |||
| assertThat(fromWholeSubgraph).isEmpty(); | |||
| // There's no repetition in this graph, so return nothing. | |||
| RangeTree withoutRepetition = ranges("123", "234", "345"); | |||
| assertThat(SubgroupOptimizer.extractRepeatingSubgraph(withoutRepetition)).isEmpty(); | |||
| } | |||
| // Builds a RangeTree from the given strings, each parsed as a RangeSpecification. | |||
| private static RangeTree ranges(String... specs) { | |||
| return RangeTree.from(Arrays.stream(specs).map(RangeSpecification::parse)); | |||
| } | |||
| } | |||
| @ -0,0 +1,122 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| package com.google.i18n.phonenumbers.metadata.regex; | |||
| import static com.google.common.truth.Truth.assertThat; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; | |||
| import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; | |||
| import com.google.common.graph.ValueGraph; | |||
| import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; | |||
| import org.junit.Test; | |||
| import org.junit.runner.RunWith; | |||
| import org.junit.runners.JUnit4; | |||
| // Tests for TrailingPathOptimizer.optimize(), comparing each optimized NFA graph against a | |||
| // hand-built expected graph via assertEquivalent(). | |||
| @RunWith(JUnit4.class) | |||
| public class TrailingPathOptimizerTest { | |||
| @Test | |||
| public void testSimple() { | |||
| NfaBuilder nfa = new NfaBuilder(); | |||
| nfa.addPath(INITIAL, TERMINAL, "12xx"); | |||
| nfa.addPath(INITIAL, TERMINAL, "34xxxx"); | |||
| ValueGraph<Node, SimpleEdge> actual = TrailingPathOptimizer.optimize(nfa.graph()); | |||
| // Expect the common trailing "xx" to be factored out at some new join point. | |||
| NfaBuilder expected = new NfaBuilder(); | |||
| Node join = expected.addPath(INITIAL, "12"); | |||
| expected.addPath(INITIAL, join, "34xx"); | |||
| expected.addPath(join, TERMINAL, "xx"); | |||
| assertEquivalent(actual, expected); | |||
| } | |||
| @Test | |||
| public void testTrailingOptionalGroup() { | |||
| NfaBuilder nfa = new NfaBuilder(); | |||
| nfa.addPath(INITIAL, TERMINAL, "12xx"); | |||
| // Add path "34xx(xx)?" | |||
| Node optStart = nfa.addPath(INITIAL, "34xx"); | |||
| nfa.addOptionalPath(optStart, TERMINAL, "xx"); | |||
| ValueGraph<Node, SimpleEdge> actual = TrailingPathOptimizer.optimize(nfa.graph()); | |||
| // Expect the common trailing "xx" to be factored out at some new join point. | |||
| NfaBuilder expected = new NfaBuilder(); | |||
| Node join = expected.addPath(INITIAL, "12"); | |||
| // Add "34(xx)?" up to the joining node. | |||
| optStart = expected.addPath(INITIAL, "34"); | |||
| expected.addOptionalPath(optStart, join, "xx"); | |||
| // Add the trailing "xx". | |||
| expected.addPath(join, TERMINAL, "xx"); | |||
| assertEquivalent(actual, expected); | |||
| } | |||
| @Test | |||
| public void testDoubleRecursion() { | |||
| NfaBuilder nfa = new NfaBuilder(); | |||
| nfa.addPath(INITIAL, TERMINAL, "12xx"); | |||
| nfa.addPath(INITIAL, TERMINAL, "34xxxx"); | |||
| // Add path "56xxxx(xx)?" | |||
| Node optStart = nfa.addPath(INITIAL, "56xxxx"); | |||
| nfa.addOptionalPath(optStart, TERMINAL, "xx"); | |||
| ValueGraph<Node, SimpleEdge> actual = TrailingPathOptimizer.optimize(nfa.graph()); | |||
| // Factoring should be applied twice to pull out 2 lots of "xx". | |||
| // How I wish we had a way to embed proper graphs in JavaDoc! | |||
| // | |||
| // ,-----------12-----------v | |||
| // (I)------34----->(1)--xx-->(2)--xx-->(T) | |||
| // `-56-->()--xx--^ | |||
| // `--e---^ | |||
| // | |||
| NfaBuilder expected = new NfaBuilder(); | |||
| Node secondJoin = expected.addPath(INITIAL, "12"); | |||
| expected.addPath(secondJoin, TERMINAL, "xx"); | |||
| Node firstJoin = expected.addPath(INITIAL, "34"); | |||
| expected.addPath(firstJoin, secondJoin, "xx"); | |||
| optStart = expected.addPath(INITIAL, "56"); | |||
| expected.addOptionalPath(optStart, firstJoin, "xx"); | |||
| assertEquivalent(actual, expected); | |||
| } | |||
| @Test | |||
| public void testNoChangeIfNoCommonFactor() { | |||
| NfaBuilder nfa = new NfaBuilder(); | |||
| nfa.addPath(INITIAL, TERMINAL, "12xxxxxx"); | |||
| // Add path "34xxx(xx)?" which, while it shares 'xxx' with '12xxxxxx', will not be factored | |||
| // because splitting out 'xxx' would make the resulting regular expression longer | |||
| // (e.g. "(?:34\d{2}?|12\d{3})\d{3}" is longer than "34\d{2}?\d{3}|12\d{6}"). | |||
| // | |||
| // Note that there are some cases in which this isn't true (shorter sequences like 'x' might be | |||
| // splittable without cost), but they are unlikely to ever make the expression shorter, | |||
| // especially if they result in adding new parentheses for grouping. | |||
| Node optStart = nfa.addPath(INITIAL, "34xxx"); | |||
| nfa.addOptionalPath(optStart, TERMINAL, "xx"); | |||
| ValueGraph<Node, SimpleEdge> actual = TrailingPathOptimizer.optimize(nfa.graph()); | |||
| // The optimized graph is expected to be equivalent to the unmodified input graph. | |||
| assertEquivalent(actual, nfa); | |||
| } | |||
| // Asserts that the actual graph and the graph built by the expected builder flatten to equal | |||
| // results. | |||
| private static void assertEquivalent(ValueGraph<Node, SimpleEdge> actual, NfaBuilder expected) { | |||
| // This is a somewhat cheeky way to test graph isomorphism and relies on the fact that graph | |||
| // flattening is deterministic according to how edges sort and doesn't care about node values. | |||
| // It also, obviously, relies on the flattening code to be vaguely well tested. | |||
| assertThat(NfaFlattener.flatten(actual)).isEqualTo(NfaFlattener.flatten(expected.graph())); | |||
| } | |||
| } | |||
| @ -0,0 +1,49 @@ | |||
| /* | |||
| * Copyright (C) 2017 The Libphonenumber Authors. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| syntax = "proto3"; | |||
| package i18n.phonenumbers.internal.finitestatematcher.compiler; | |||
| option java_package = "com.google.i18n.phonenumbers.internal.finitestatematcher.compiler"; | |||
| option java_outer_classname = "RegressionTestProto"; | |||
| // A set of regression tests. | |||
| message Tests { | |||
| // The individual regression test entries which make up this set. | |||
| repeated TestCase test_case = 1; | |||
| } | |||
| // A single regression test entry. | |||
| message TestCase { | |||
| // A name for the test, ideally unique. | |||
| string name = 1; | |||
| // If set true, expect that the test will fail 100% of the time. This is | |||
| // useful to test that test numbers have enough coverage to force a failure | |||
| // and is typically achieved by modifying an input range after generating a | |||
| // passing test (or carefully modifying the output bytecodes). Note that not | |||
| // all changes will make a test fail 100% of the time, so care must be taken | |||
| // to avoid creating a flaky test (e.g. don't change a "[0-3]" to "[0-5]", as | |||
| // this only fails if the test number contains a 4 or 5 at the corresponding | |||
| // index, change it to "[4-6]" so there's no overlap and at least one test | |||
| // number that's valid for that range will not be accepted by the matcher). | |||
| bool should_fail = 2; | |||
| // The input ranges (in the form of range specifications) which form the DFA | |||
| // to be tested (e.g. "1[2-5]678xxxxx" etc...). | |||
| repeated string range = 3; | |||
| // The expected output bytes, encoded in test files using C-style hex notation | |||
| // (i.e. \xHH). This can be split over multiple lines for readability. | |||
| // NOTE(review): presumably the repeated chunks are concatenated in order to | |||
| // form the full expected byte sequence - confirm against the test runner. | |||
| repeated bytes expected = 4; | |||
| } | |||