Browse Source

Project import generated by Copybara. (#2890)

PiperOrigin-RevId: 509849832
pull/2891/merge
David Beaumont 3 years ago
committed by GitHub
parent
commit
f63cf2c937
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
45 changed files with 6559 additions and 111 deletions
  1. +71
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/LengthsParser.java
  2. +1
    -1
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java
  3. +317
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcher.java
  4. +262
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/OpCode.java
  5. +247
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherBytes.java
  6. +299
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompiler.java
  7. +600
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Operation.java
  8. +44
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Statistics.java
  9. +2
    -2
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java
  10. +1
    -1
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java
  11. +1
    -1
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java
  12. +74
    -89
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java
  13. +181
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/AnyPath.java
  14. +351
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Edge.java
  15. +343
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriter.java
  16. +195
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattener.java
  17. +51
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Node.java
  18. +123
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverter.java
  19. +118
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatter.java
  20. +171
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexGenerator.java
  21. +190
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/SubgroupOptimizer.java
  22. +206
    -0
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizer.java
  23. +17
    -13
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java
  24. +1
    -1
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java
  25. +1
    -1
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java
  26. +1
    -1
      metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java
  27. +76
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/LengthsParserTest.java
  28. +1
    -1
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java
  29. +210
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcherTest.java
  30. +317
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/CompilerRegressionTest.java
  31. +144
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompilerTest.java
  32. +60
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/OperationTest.java
  33. +295
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/regression_test_data.textpb
  34. +106
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/AnyPathTest.java
  35. +224
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeTest.java
  36. +154
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriterTest.java
  37. +98
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaBuilder.java
  38. +229
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattenerTest.java
  39. +68
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NodeTest.java
  40. +154
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverterTest.java
  41. +107
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatterTest.java
  42. +197
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexGeneratorTest.java
  43. +80
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/SubgraphOptimizerTest.java
  44. +122
    -0
      metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizerTest.java
  45. +49
    -0
      metadata/src/test/proto/regression_test.proto

+ 71
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/LengthsParser.java View File

@ -0,0 +1,71 @@
/*
* Copyright (C) 2022 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static java.lang.Integer.parseUnsignedInt;
import com.google.common.base.CharMatcher;
import com.google.common.base.Splitter;
import com.google.common.collect.ContiguousSet;
import com.google.common.collect.ImmutableSortedSet;
import java.util.List;
import java.util.NavigableSet;
import java.util.TreeSet;
/** Parses strings of form "4,7-9,11" which are used as length specifiers across LPN metadata. */
public final class LengthsParser {
  private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults(whitespace());
  private static final Splitter RANGE_SPLITTER =
      Splitter.on('-').trimResults(whitespace()).limit(2);
  private static final CharMatcher ALLOWED_CHARACTERS =
      CharMatcher.inRange('0', '9').or(CharMatcher.anyOf("-,")).or(whitespace());

  /**
   * Returns the set of integers specified by this string.
   *
   * <p>The specifier is a comma separated list of single lengths ("4") and closed ranges
   * ("7-9"). Entries must appear in strictly increasing order, and the lower bound of a range
   * must be strictly less than its upper bound.
   *
   * @param s the length specifier; may only contain digits, '-', ',' and whitespace
   * @return the specified lengths in ascending order
   * @throws IllegalArgumentException if the specifier contains forbidden characters, has an
   *     element which is not a valid length, contains an invalid range, or is out of order
   */
  public static ImmutableSortedSet<Integer> parseLengths(String s) {
    checkArgument(
        ALLOWED_CHARACTERS.matchesAllOf(s),
        "Length specifier contains forbidden characters: %s",
        s);
    NavigableSet<Integer> lengths = new TreeSet<>();
    for (String lengthOrRange : COMMA_SPLITTER.split(s)) {
      if (lengthOrRange.contains("-")) {
        // The splitter is limited to two parts, so "a-b-c" yields "a" and "b-c" (and the latter
        // then fails to parse as a length).
        List<String> lohi = RANGE_SPLITTER.splitToList(lengthOrRange);
        int lo = parseLength(lohi.get(0));
        int hi = parseLength(lohi.get(1));
        checkArgument(lo < hi, "Invalid range: %s-%s", lo, hi);
        checkInOrder(lengths, lo, s);
        lengths.addAll(ContiguousSet.closed(lo, hi));
      } else {
        int length = parseLength(lengthOrRange);
        checkInOrder(lengths, length, s);
        lengths.add(length);
      }
    }
    return ImmutableSortedSet.copyOf(lengths);
  }

  /** Parses one length, rejecting values which do not fit in a non-negative {@code int}. */
  private static int parseLength(String digits) {
    int value = parseUnsignedInt(digits);
    // parseUnsignedInt() accepts values up to 2^32-1 and represents those above
    // Integer.MAX_VALUE as negative ints, which would silently corrupt the ordering checks.
    checkArgument(value >= 0, "Length is too large: %s", digits);
    return value;
  }

  /** Checks that {@code next} is strictly greater than every length collected so far. */
  private static void checkInOrder(NavigableSet<Integer> lengths, int next, String s) {
    checkArgument(
        lengths.isEmpty() || next > lengths.last(),
        "Numbers in length specifier are out of order: %s",
        s);
  }

  private LengthsParser() {}
}

+ 1
- 1
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java View File

@ -71,7 +71,7 @@ public final class Types {
.put(XML_VOIP, VOIP)
.put(XML_UAN, UAN)
.put(XML_VOICEMAIL, VOICEMAIL)
.build();
.buildOrThrow();
/** Returns the set of valid XML type names. */
public static ImmutableSet<String> getXmlNames() {


+ 317
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcher.java View File

@ -0,0 +1,317 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode.State;
/**
* Matches phone number regular expressions based on compact compiled data generated by
* {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler
* MatcherCompiler}. Typically the phone number regular expression will be compiled at build time
* and the resulting matcher data will be packaged into the binary which needs it, or downloaded at
* run time.
* <p>
* This class is designed to be lightweight and fast, and should be simple to implement in many
* different languages (C++, Python, JS, etc.).
*
* TODO: Consider UnsignedBytes.toInt(x) to avoid lots of (x & 0xFF).
*/
public abstract class DigitSequenceMatcher {
/** Possible result types returned by a match operation. */
public enum Result {
/** The match operation was a success and the input was matched. */
MATCHED,
/** The match operation failed because unexpected input was encountered. */
INVALID,
/**
* The match operation failed because the input terminated too soon (i.e. the input was a
* valid prefix for the matcher).
*/
TOO_SHORT,
/**
* The match operation failed due to the existence of additional input after matching had
* completed (i.e. the input would have matched if it were shorter).
*/
TOO_LONG;
}
/** An iterator of {@code int}, used to supply the matcher with a sequence of input digits. */
public interface DigitSequence {
/** Returns true if there are more digits available. */
boolean hasNext();
/**
* Returns the next digit value (from 0 to 9 inclusive, not a char value). The matcher does not
* test for invalid digits, so returning values outside this range will have undefined results,
* including false positive results.
*/
int next();
}
/**
* Internal abstraction to allow matching over either byte arrays or strings. A view holds a
* current position within the instruction data which the read and branch methods advance.
*/
interface DataView {
/** Returns the unsigned byte value at the given offset from the current position. */
int peekByte(int offset);
/** Returns the unsigned byte value at the current position and moves ahead 1 byte. */
int readByte();
/** Returns the unsigned short value at the current position and moves ahead 2 bytes. */
int readShort();
/** Returns the unsigned int value at the current position and moves ahead 4 bytes. */
int readInt();
/**
* Adjusts the current position by the given (non-negative) offset. The implementations in this
* file return {@code State.TERMINAL} for a zero offset and {@code State.CONTINUE} otherwise.
*/
State branch(int offset);
/**
* Adjusts the current position by the unsigned byte offset value read from the current
* position plus the given index. This is used to implement maps and branching ranges.
*/
State jumpTable(int index);
}
/**
 * Builds a matcher that executes instructions stored in a byte array. The array is normally
 * produced ahead of time and shipped inside the binary; the MatcherCompiler is not intended for
 * parsing regular expressions at run time.
 * <p>
 * See {@code MatcherCompiler.compile(...)}.
 *
 * @param data the compiled matcher instructions (must not be empty)
 * @throws IllegalArgumentException if {@code data} has zero length
 */
public static DigitSequenceMatcher create(byte[] data) {
  boolean hasInstructions = data.length != 0;
  if (hasInstructions) {
    return new ByteArrayMatcher(data);
  }
  throw new IllegalArgumentException("matcher data cannot be empty");
}
/**
 * Builds a matcher that executes instructions stored in a string. This form is intended for
 * matcher data embedded as literal Java string constants in (auto-generated) source files.
 * <p>
 * See {@code MatcherCompiler.compileToUnquotedJavaSourceString(...)}.
 *
 * @param data the compiled matcher instructions (must not be empty)
 * @throws IllegalArgumentException if {@code data} is the empty string
 */
public static DigitSequenceMatcher create(String data) {
  boolean hasInstructions = !data.isEmpty();
  if (hasInstructions) {
    return new StringMatcher(data);
  }
  throw new IllegalArgumentException("matcher data cannot be empty");
}
/** Returns a new view over this matcher's instruction data, positioned at the first byte. */
abstract DataView newDataView();
/** Returns the number of instruction bytes held by this matcher (used by {@link #toString()}). */
abstract int size();
/**
 * Runs this matcher over the given digits and reports the outcome.
 *
 * @param in the digits to match; it is consumed as matching proceeds
 * @return {@code MATCHED} if the state machine terminated with no input left over,
 *     {@code TOO_LONG} if it terminated with input remaining, {@code TOO_SHORT} if the input ran
 *     out first, or {@code INVALID} if an unexpected digit was encountered
 */
public Result match(DigitSequence in) {
  State finalState = runMatcher(in);
  if (finalState == State.TERMINAL) {
    // Reaching a terminal state is only a match if every digit was consumed.
    return in.hasNext() ? Result.TOO_LONG : Result.MATCHED;
  }
  if (finalState == State.TRUNCATED) {
    return Result.TOO_SHORT;
  }
  if (finalState == State.INVALID) {
    return Result.INVALID;
  }
  throw new AssertionError("unexpected state: " + finalState);
}
/**
 * Executes instructions from the start of the matcher data until one of them yields a state
 * other than {@code CONTINUE}, and returns that state.
 */
private State runMatcher(DigitSequence in) {
  DataView view = newDataView();
  while (true) {
    // The opcode is decoded from the byte at the current position without consuming it; the
    // instruction itself is responsible for advancing the view.
    OpCode instruction = OpCode.decode(view.peekByte(0));
    State next = instruction.execute(view, in);
    if (next != State.CONTINUE) {
      return next;
    }
  }
}
/** Renders the matcher data as {@code "<size> :: [ b0, b1, ... ]"} with bytes in hexadecimal. */
@Override
public String toString() {
  int byteCount = size();
  StringBuilder text = new StringBuilder(byteCount + " :: [ ");
  DataView view = newDataView();
  for (int remaining = byteCount; remaining > 0; remaining--) {
    text.append(Integer.toHexString(view.readByte()));
    text.append(", ");
  }
  // Drop the separator left behind by the final element.
  text.setLength(text.length() - 2);
  return text.append(" ]").toString();
}
/** A matcher whose instructions are read directly from a byte array. */
private static final class ByteArrayMatcher extends DigitSequenceMatcher {
  private final byte[] bytes;

  private ByteArrayMatcher(byte[] data) {
    this.bytes = data;
  }

  @Override
  DataView newDataView() {
    return new ByteArrayData();
  }

  @Override
  int size() {
    return bytes.length;
  }

  /** A cursor over the enclosing matcher's byte array. */
  private class ByteArrayData implements DataView {
    int position = 0;

    @Override public int peekByte(int offset) {
      int index = position + offset;
      return 0xFF & bytes[index];
    }

    @Override public int readByte() {
      return 0xFF & bytes[position++];
    }

    @Override public int readShort() {
      // Big-endian: the first byte read is the most significant.
      int high = readByte();
      int low = readByte();
      return (high << 8) | low;
    }

    @Override public int readInt() {
      int high = readShort();
      int low = readShort();
      return (high << 16) | low;
    }

    @Override public State branch(int offset) {
      position += offset;
      // A zero offset signals termination rather than a jump.
      if (offset == 0) {
        return State.TERMINAL;
      }
      return State.CONTINUE;
    }

    @Override public State jumpTable(int index) {
      int jump = peekByte(index);
      return branch(jump);
    }
  }
}
/**
* A matcher for reading instructions from a String. Each {@code char} of the string carries two
* instruction bytes: the byte at the even offset in its high 8 bits and the byte at the odd
* offset in its low 8 bits.
*/
private static final class StringMatcher extends DigitSequenceMatcher {
/*
* Note: Using unsigned shift "x >>> 1" is more likely to be free as part of a data load
* instruction than "x / 2".
*/
private class StringData implements DataView {
// Current offset in bytes (not chars); the containing char is at index (position >>> 1).
int position = 0;
@Override public int peekByte(int offset) {
offset += position;
int data = bytes.charAt(offset >>> 1);
// char := hi [ even-byte | odd-byte ] lo
return (offset & 1) != 0 ? data & 0xFF : data >>> 8;
}
@Override public int readByte() {
int data = bytes.charAt(position >>> 1);
// char := hi [ even-byte | odd-byte ] lo
data = (position & 1) != 0 ? data & 0xFF : data >>> 8;
position += 1;
return data;
}
@Override public int readShort() {
// For an even position the char read here is already the complete 16-bit value.
int data = bytes.charAt(position >>> 1);
// Adding 2 early does not affect odd/even (but does reference next char).
position += 2;
if ((position & 1) != 0) {
// Odd position: the value straddles two chars, so combine the low byte of the first char
// with the high byte of the following one.
data = ((data & 0xFF) << 8) | (bytes.charAt(position >>> 1) >>> 8);
}
return data;
}
@Override public int readInt() {
// Big-endian: the first short read forms the upper 16 bits.
return (readShort() << 16) | readShort();
}
@Override public State branch(int offset) {
position += offset;
// A zero offset signals termination rather than a jump.
return offset != 0 ? State.CONTINUE : State.TERMINAL;
}
@Override public State jumpTable(int index) {
return branch(peekByte(index));
}
}
private final String bytes;
private StringMatcher(String bytes) {
this.bytes = bytes;
}
@Override
DataView newDataView() {
return new StringData();
}
@Override
int size() {
// Two bytes per char, except that a low byte of 0xFF in the final char is not counted.
// NOTE(review): presumably 0xFF is padding written when the data has an odd number of bytes;
// confirm against the code which generates the string.
int size = 2 * bytes.length();
if ((bytes.charAt(bytes.length() - 1) & 0xFF) == 0xFF) {
size -= 1;
}
return size;
}
}
/** A {@code DigitSequence} which yields the digits of a character sequence in order. */
private static final class StringDigits implements DigitSequence {
  private final CharSequence number;
  // Index of the next character to be returned.
  private int n = 0;

  private StringDigits(CharSequence number) {
    this.number = number;
  }

  @Override public boolean hasNext() {
    return n < number.length();
  }

  /**
   * Returns the value (0 to 9) of the next character.
   *
   * @throws IndexOutOfBoundsException if the sequence is exhausted
   * @throws IllegalArgumentException if the next character is not an ASCII digit
   */
  @Override public int next() {
    boolean inBounds = n >= 0 && n < number.length();
    if (!inBounds) {
      throw new IndexOutOfBoundsException(
          "index '" + n + "' out of bounds for input: " + number);
    }
    char c = number.charAt(n);
    boolean isAsciiDigit = c >= '0' && c <= '9';
    if (!isAsciiDigit) {
      throw new IllegalArgumentException(
          "non-digit character '" + c + "' [" + ((int) c) + "] at index " + n + " in: " + number);
    }
    // Only advance once the character is known to be valid.
    n++;
    return c - '0';
  }
}
/**
 * Wraps the given character sequence as a {@code DigitSequence}. The input may only contain
 * the digits '0' to '9'; any other character causes an {@code IllegalArgumentException} when it
 * is read from the returned sequence.
 */
public static DigitSequence digitsFromString(CharSequence number) {
  DigitSequence digits = new StringDigits(number);
  return digits;
}
/**
* A matcher has no internal state and is just a factory for data specific implementations. The
* constructor is private so that only the nested matcher classes in this file can extend it.
*/
private DigitSequenceMatcher() { }
}

+ 262
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/OpCode.java View File

@ -0,0 +1,262 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DataView;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence;
/**
* Implementation of instructions for the phone number matcher state machine.
* <p>
* <h3>Jump Tables</h3>
*
* Several instructions use a "jump table" concept which is simply a contiguous region of bytes
* containing offsets from which a new position is calculated. The new position is the current
* position (at the start of the jump table) plus the value of the chosen jump offset.
*
* <pre>{@code
* [ ... | JUMP_0 | JUMP_1 | ... | JUMP_N | ... | DEST | ...
* position --^ ^ ^
* `---index ---' |
* offset `---------------- [ position + index ] -----'
*
* position = position + unsignedByteValueAt(position + index)
* }</pre>
*
* A jump offset of zero signifies that the state jumped to is terminal (this avoids having to jump
* to a termination byte). A jump table will always occur immediately after an associated
* instruction and the instruction's stated size includes the number of bytes in the jump table.
*/
public enum OpCode {
/**
 * Unconditional jump of between 1 and 4095 bytes, measured from the end of this instruction.
 * No input is consumed.
 * <p>
 * The instruction is variable length: offsets up to 15 fit in a single byte, and when the EXT
 * bit is set a second byte extends the offset to 12 bits (up to 4095).
 * <p>
 * A one byte branch with a zero offset (i.e. a single zero byte) is a special case meaning that
 * the current state is terminal and the state machine should exit; a zero jump offset is never
 * meaningful otherwise.
 *
 * <pre>{@code
 * [ 0 | 0 | JUMP ]
 * [ 0 | 1 | JUMP | EXT_JUMP ]
 * <3>.<1>.<-- 4 -->.<---- 8 ---->
 * }</pre>
 */
BRANCH(0) {
  @Override
  State execute(DataView data, DigitSequence ignored) {
    int firstByte = data.readByte();
    int jump = firstByte & 0xF;
    boolean isExtended = (firstByte & (1 << 4)) != 0;
    if (isExtended) {
      // The low 4 bits of the first byte become the high bits of a 12-bit offset.
      jump = (jump << 8) + data.readByte();
    }
    return data.branch(jump);
  }
},
/**
 * Accepts exactly one input value and transitions to a single state. Any input other than
 * "VAL" is invalid from the current state. If "TRM" is set then the state being transitioned
 * from may terminate.
 *
 * <pre>{@code
 * [ 1 |TRM| VAL ]
 * <3>.<1>.<- 4 ->
 * }</pre>
 */
SINGLE(1) {
  @Override
  State execute(DataView data, DigitSequence in) {
    int op = data.readByte();
    if (in.hasNext()) {
      int digit = in.next();
      int expected = op & 0xF;
      return (expected == digit) ? State.CONTINUE : State.INVALID;
    }
    // Out of input: only acceptable if this state is allowed to terminate.
    boolean mayTerminate = (op & (1 << 4)) != 0;
    return mayTerminate ? State.TERMINAL : State.TRUNCATED;
  }
},
/**
 * Accepts any input value, one or more times, transitioning to a single state. The count is
 * stored as "NUM-1", so between 1 and 16 inputs can be consumed by one instruction.
 * <p>
 * If "TRM" is set then every state that is transitioned from may terminate.
 *
 * <pre>{@code
 * [ 2 |TRM| NUM-1 ]
 * <3>.<1>.<- 4 ->
 * }</pre>
 */
ANY(2) {
  @Override
  State execute(DataView data, DigitSequence in) {
    int op = data.readByte();
    boolean mayTerminate = (op & (1 << 4)) != 0;
    int count = (op & 0xF) + 1;
    for (int consumed = 0; consumed < count; consumed++) {
      if (!in.hasNext()) {
        return mayTerminate ? State.TERMINAL : State.TRUNCATED;
      }
      // The digit's value is irrelevant; it only needs to be consumed.
      in.next();
    }
    return State.CONTINUE;
  }
},
/**
 * Accepts a set of input values, transitioning to one or two states. Bit N of the bit-set is
 * set if digit N is accepted (bit-0 is the lowest bit of the 2 byte form of the instruction).
 * <p>
 * The instruction is variable length. The 2 byte form treats inputs outside the set as invalid;
 * the 4 byte form instead branches to one of two states via a 2-entry jump table.
 * <p>
 * If "TRM" is set then the state being transitioned from may terminate.
 *
 * <pre>{@code
 * [ 3 |TRM| 0 |---| BIT SET ]
 * [ 3 |TRM| 1 |---| BIT SET | JUMP_IN | JUMP_OUT ]
 * <3>.<1>.<1>.<1>.<--- 10 --->.<--- 8 --->.<--- 8 --->
 * }</pre>
 */
RANGE(3) {
  @Override
  State execute(DataView data, DigitSequence in) {
    int op = data.readShort();
    if (!in.hasNext()) {
      boolean mayTerminate = (op & (1 << 12)) != 0;
      return mayTerminate ? State.TERMINAL : State.TRUNCATED;
    }
    int digit = in.next();
    boolean hasJumpTable = (op & (1 << 11)) != 0;
    if (hasJumpTable) {
      // 4 byte form: the bitwise negation turns a set bit into jump table index 0 (JUMP_IN)
      // and a clear bit into index 1 (JUMP_OUT).
      int tableIndex = (~op >>> digit) & 1;
      return data.jumpTable(tableIndex);
    }
    // 2 byte form: input outside the bit-set is invalid.
    boolean accepted = (op & (1 << digit)) != 0;
    return accepted ? State.CONTINUE : State.INVALID;
  }
},
/**
* Accept multiple inputs to transition to between one and ten states via jump offsets. Inputs
* not encoded in "CODED MAP" are invalid from the current state.
* <p>
* Because there is no room for a termination bit in this instruction, there is an alternate
* version, {@code TMAP}, which should be used when transitioning from a terminating state.
* <p>
* TODO: Figure out if we can save one bit here and merge MAP and TMAP.
*
* <pre>{@code
* [ 4 | CODED MAP | JUMP_1 | ... | JUMP_N ]
* <3>.<-------- 29 -------->.<--- 8 --->. ... .<--- 8 --->
* }</pre>
*/
MAP(4) {
@Override
State execute(DataView data, DigitSequence in) {
// Running out of input in a non-terminating state means the input was too short.
return map(data, in, State.TRUNCATED);
}
},
/**
* Like {@code MAP} but transitions from a terminating state, so running out of input yields
* {@code State.TERMINAL} rather than {@code State.TRUNCATED}.
*/
TMAP(5) {
@Override
State execute(DataView data, DigitSequence in) {
return map(data, in, State.TERMINAL);
}
};
/**
* The types of states that the state-machine can be in after executing an instruction:
* <ul>
* <li>{@code CONTINUE}: the instruction succeeded and the next instruction should be executed.
* <li>{@code TERMINAL}: the state machine has finished (a zero jump offset was taken, or the
* input ended in a state which may terminate).
* <li>{@code INVALID}: the input digit was not accepted by the instruction.
* <li>{@code TRUNCATED}: the input ended in a state which may not terminate.
* </ul>
*/
public enum State {
CONTINUE, TERMINAL, INVALID, TRUNCATED;
}
// Cached copy of values(), indexed by the 3-bit opcode (see decode()).
private static final OpCode[] VALUES = values();
/**
* Encode maps as 29 bits where each digit takes a different number of bits to encode its offset.
* Specifically:
* <ul>
* <li>The first entry (matching 0) has only two possible values (it is either not present or maps
* to the first entry in the jump table), so takes only 1 bit.
* <li>The second entry (matching 1) has three possible values (not present or maps to either the
* first or second entry in the jump table), so it takes 2 bits.
* <li>In general the entry matching digit N has (N+2) possible values and takes
* ceil(log2(N+2)) bits.
* </ul>
* This constant packs the ten shift amounts (one per digit) into consecutive 5-bit groups, with
* the shift for digit 0 in the lowest group.
*/
private static final long MAP_SHIFT_BITS = 0L << 0 | // 1 bit (1x, mask=1)
1L << 5 | 3L << 10 | // 2 bits (2x, mask=3)
5L << 15 | 8L << 20 | 11L << 25 | 14L << 30 | // 3 bits (4x, mask=7)
17L << 35 | 21L << 40 | 25L << 45; // 4 bits (3x, mask=F)
/**
* A table of values with which to mask the coded jump table map, after shifting it. Each nibble
* is a mask of up to 4 bits to extract the encoded index from a map instruction after it has
* been shifted. The mask for digit 0 is in the lowest nibble.
*/
private static final long MAP_MASK_BITS = 0xFFF7777331L;
/**
 * Returns how far the coded jump table map must be shifted right so that the jump index for
 * digit {@code n} occupies the lowest bits.
 */
public static int getMapShift(int n) {
  // Shift amounts are packed five bits per digit.
  long shiftedTable = MAP_SHIFT_BITS >>> (5 * n);
  int lowBits = (int) shiftedTable;
  return lowBits & 0x1F;
}
/**
 * Returns the mask to apply to the shifted jump table map so that only the jump index for
 * digit {@code n} remains in the lowest bits.
 */
public static int getMapMask(int n) {
  // Masks are packed one nibble per digit.
  long shiftedTable = MAP_MASK_BITS >>> (4 * n);
  int lowBits = (int) shiftedTable;
  return lowBits & 0xF;
}
/**
 * Executes a map instruction: decodes the coded map for the next input digit and applies the
 * selected jump offset.
 *
 * @param noInputState the state to return if there is no more input
 */
private static State map(DataView data, DigitSequence in, State noInputState) {
  int codedMap = data.readInt();
  if (in.hasNext()) {
    int digit = in.next();
    // Coded indices are 1-to-10 (0 is the "invalid" state).
    int codedIndex = (codedMap >>> getMapShift(digit)) & getMapMask(digit);
    // Jump offsets are zero based.
    return codedIndex != 0 ? data.jumpTable(codedIndex - 1) : State.INVALID;
  }
  return noInputState;
}
/**
 * Returns the opcode for the given unsigned byte value, which must be the first byte of an
 * instruction. The opcode is held in the top 3 bits of that byte.
 */
static OpCode decode(int unsignedByte) {
  int code = unsignedByte >>> 5;
  return VALUES[code];
}
private OpCode(int code) {
// Assertion checks during enum creation. Opcodes must be 3 bits and match the ordinal of the
// enum (this prevents issues if reordering enums occurs).
if ((code & ~0x7) != 0 || code != ordinal()) {
throw new AssertionError("bad opcode value: " + code);
}
}
/**
* Executes this instruction, reading its encoded form from {@code data} at the current position
* and consuming input digits from {@code in} as required, and returns the resulting state.
*/
abstract State execute(DataView data, DigitSequence in);
}

+ 247
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherBytes.java View File

@ -0,0 +1,247 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler.Sequence;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Renders the final bytecode representation for the matcher by connecting sequences of operations
* together and fixing-up offsets and branch instructions. This is essentially the higher-level
* aspect of matcher bytecode compilation.
* <p>
* Unlike {@link MatcherCompiler} in which a lot of the data is immutable (because sequences can
* be defined in isolation), there's a lot of mutable state in this class due to the need to build
* and manage offsets between the sequences, which relies on the order in which other sequences
* have been rendered.
*/
class MatcherBytes {
/**
* A partial order on byte sequences based on their size. This is not "equivalent to equals" and
* must not be used to construct an ordered set.
*/
private static final Comparator<SequenceBytes> DECREASING_BY_SIZE =
new Comparator<SequenceBytes>() {
@Override public int compare(SequenceBytes lhs, SequenceBytes rhs) {
// Arguments are swapped so that larger sequences sort first.
return Integer.compare(rhs.size(), lhs.size());
}
};
/**
* Sequences we have not considered for rendering yet.
*/
private final List<Sequence> remainingSequences;
/**
* Candidate sequences whose dependent sequences have all been rendered, and which may themselves
* now be rendered.
* NOTE(review): the field name is a misspelling of "candidate"; rename in a dedicated change.
*/
private final Set<Sequence> canditiateSequences = new LinkedHashSet<>();
/**
* Sequences which have been rendered (used to determine when other sequences become renderable).
*/
private final Set<Sequence> compiledSequences = new HashSet<>();
/**
* A map from DFA nodes to the compiled sequences they belong to. Entries are added by
* {@code compile(Sequence)}, keyed by the initial state of the compiled sequence, and are looked
* up using the out states of sequences compiled later. The key set of this map is a subset of
* all nodes.
*/
private final Map<DfaNode, SequenceBytes> sequenceMap = new HashMap<>();
/**
* A list of compiled byte sequences in reverse order (ie, the sequence with the terminal node
* in it is first in this list and the sequence with the initial node is last). Compilation
* occurs in reverse order to allow offsets between sequences to be calculated as we go.
*/
private final List<SequenceBytes> reverseOrder = new ArrayList<>();
/** Statistics instance for collecting information about the compilation. */
private final Statistics stats;
/**
 * Creates a renderer for the given sequences.
 *
 * @param allSequences every sequence to be rendered, in sorted (forward) order
 * @param stats the instance in which compilation statistics are recorded (must not be null)
 */
MatcherBytes(Iterable<Sequence> allSequences, Statistics stats) {
  // Every sequence starts out as "remaining". Processing happens back to front, so the sorted
  // sequences are reversed up front.
  List<Sequence> forwardOrder = Lists.newArrayList(allSequences);
  this.remainingSequences = Lists.reverse(forwardOrder);
  this.stats = Preconditions.checkNotNull(stats);
}
/**
 * Renders every sequence and joins the results into one byte buffer suitable for use by a
 * {@code DigitSequenceMatcher}.
 */
byte[] compile() {
  final int expectedCount = remainingSequences.size();
  // Sequences with no dependent sequences are compiled first.
  compileFinalSequences();
  while (compiledSequences.size() < expectedCount) {
    // Promote any remaining sequence whose out sequences have all been compiled. A pass will
    // not always find a new candidate, but the candidate set should never run dry before the
    // final sequence is processed.
    Iterator<Sequence> pending = remainingSequences.iterator();
    while (pending.hasNext()) {
      Sequence sequence = pending.next();
      if (compiledSequences.containsAll(sequence.unorderedOutSequences())) {
        canditiateSequences.add(sequence);
        pending.remove();
      }
    }
    // Compile the oldest candidate (the set preserves insertion order).
    Sequence next = Iterables.get(canditiateSequences, 0);
    SequenceBytes rendered = compile(next);
    reverseOrder.add(rendered);
    compiledSequences.add(next);
    canditiateSequences.remove(next);
  }
  // Once everything is rendered, nothing may be left waiting.
  Preconditions.checkState(remainingSequences.isEmpty());
  Preconditions.checkState(canditiateSequences.isEmpty());
  return concatSequenceBytesInForwardOrder();
}
/**
 * Compiles every sequence which has no dependencies, then orders the results by size as a
 * heuristic for keeping the branch offsets needed to reach them small.
 */
private void compileFinalSequences() {
  Iterator<Sequence> pending = remainingSequences.iterator();
  while (pending.hasNext()) {
    Sequence sequence = pending.next();
    if (!sequence.isFinal()) {
      continue;
    }
    SequenceBytes rendered = compile(sequence);
    reverseOrder.add(rendered);
    compiledSequences.add(sequence);
    pending.remove();
  }
  // The list is kept in reverse output order, so sorting it by decreasing size places the
  // shortest sequences first in the final output. This tends to reduce the number of 2-byte
  // branch instructions needed to jump to them.
  Collections.sort(reverseOrder, DECREASING_BY_SIZE);
}
/**
 * Compiles a sequence for which all dependent sequences have already been compiled, and
 * registers the result in the sequence map under the sequence's initial state.
 */
private SequenceBytes compile(Sequence sequence) {
  // Note: Even non branching sequences will have an out node here.
  Map<DfaNode, Integer> offsets = new HashMap<>();
  for (DfaNode out : sequence.getOutStates()) {
    SequenceBytes target = sequenceMap.get(out);
    // Walk back from the most recently compiled sequence, adding up the sizes of everything
    // which lies between the new sequence and its target.
    int distance = 0;
    int index = reverseOrder.size() - 1;
    while (index >= 0) {
      SequenceBytes between = reverseOrder.get(index);
      if (between == target) {
        break;
      }
      distance += between.size();
      index--;
    }
    if (distance > 0 && target.isTerminator()) {
      // Rather than explicitly jumping to a terminator sequence, just exit unconditionally
      // at this point.
      distance = Operation.TERMINATION_OFFSET;
    }
    offsets.put(out, distance);
  }
  SequenceBytes rendered = new SequenceBytes(sequence, offsets, stats);
  sequenceMap.put(sequence.getInitialState(), rendered);
  return rendered;
}
/**
 * Creates the final, single buffer of bytecode instructions for the matcher by concatenating
 * the compiled sequences from the last element of the reverse-ordered list to the first.
 */
private byte[] concatSequenceBytesInForwardOrder() {
  ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
  for (int n = reverseOrder.size(); n-- > 0; ) {
    byte[] chunk = reverseOrder.get(n).getBytes();
    // The (byte[], int, int) overload declares no checked exception, so no handler is needed.
    outBuffer.write(chunk, 0, chunk.length);
  }
  return outBuffer.toByteArray();
}
/**
 * Renders a sequence (along with a map of branch offsets) to its bytecode form.
 *
 * @param sequence the sequence whose operations are written.
 * @param offsetMap branch offsets keyed by the sequence's out states; a negative value must be
 *     {@code Operation.TERMINATION_OFFSET}.
 * @param stats receives the metrics recorded while writing.
 * @return the bytes of every operation in the sequence plus any trailing branch or terminator.
 */
private static byte[] renderSequence(
    Sequence sequence, Map<DfaNode, Integer> offsetMap, Statistics stats) {
  // Operations come from a single sequence, so only the last one could possibly be branching.
  List<Operation> ops = sequence.createOps();
  Operation lastOp = Iterables.getLast(ops);
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  ByteArrayDataOutput outBytes = ByteStreams.newDataOutput(baos);
  // Everything before the last operation is written without any branch information.
  for (Operation op : ops.subList(0, ops.size() - 1)) {
    op.writeTo(outBytes, null, stats);
  }
  // NOTE(review): Operation.writeTo() also records TERMINATING for a terminating operation, so
  // the last operation appears to be counted twice. Confirm whether this is intended.
  if (lastOp.isTerminating()) {
    stats.record(Statistics.Type.TERMINATING);
  }
  // A branching operation fills in its own jump table from the offset map; a non-branching one
  // ignores offsets and may need an explicit branch (or terminator) written after it.
  boolean branching = lastOp.isBranching();
  lastOp.writeTo(outBytes, branching ? offsetMap : null, stats);
  if (!branching && !offsetMap.isEmpty()) {
    // When adding a branch instruction, there should only be a single offset to use.
    int offset = Iterables.getOnlyElement(offsetMap.values());
    if (offset < 0) {
      // This is a terminal instruction and the matcher should exit.
      Preconditions.checkArgument(offset == Operation.TERMINATION_OFFSET);
      Operation.writeTerminator(outBytes, stats);
    } else {
      // The offset could still be zero, but this is handled correctly by writeBranch().
      Operation.writeBranch(outBytes, offset, stats);
    }
  }
  return baos.toByteArray();
}
/**
* A single compiled sequence of operations. This is just a holder for a {@link Sequence} and the
* compiled bytes it produces. The bytes are rendered once, in the constructor.
*/
static class SequenceBytes {
// The sequence these bytes were rendered from.
private final Sequence sequence;
// The rendered bytecode for the sequence (including any trailing branch or terminator).
private final byte[] bytes;
/**
* Renders the given sequence immediately using the supplied branch offsets.
*
* @param sequence the sequence to render.
* @param offsetMap branch offsets keyed by the out states of the sequence.
* @param stats receives the metrics recorded while rendering.
*/
SequenceBytes(Sequence sequence, Map<DfaNode, Integer> offsetMap, Statistics stats) {
this.sequence = sequence;
this.bytes = renderSequence(sequence, offsetMap, stats);
}
/** Returns the sequence from which the bytes were compiled. */
Sequence getSequence() {
return sequence;
}
/**
* Returns whether the underlying sequence is final and consists of exactly one node.
*/
boolean isTerminator() {
return sequence.isFinal() && sequence.size() == 1;
}
/** Returns the number of compiled bytes. */
int size() {
return bytes.length;
}
/** Returns the compiled bytes. Note that the internal array is returned without copying. */
byte[] getBytes() {
return bytes;
}
}
}

+ 299
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompiler.java View File

@ -0,0 +1,299 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.ImmutableSet.toImmutableSet;
import static java.lang.Integer.numberOfTrailingZeros;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.graph.MutableValueGraph;
import com.google.common.graph.ValueGraph;
import com.google.common.graph.ValueGraphBuilder;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
/**
 * Compiles non-capturing phone number regular expressions (supplied as the DFA of a
 * {@code RangeTree}) into sequences of bytes suitable for creating
 * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher
 * DigitSequenceMatcher} instances.
 */
public final class MatcherCompiler {
  /** The DFA from which the matcher data is to be compiled. */
  private final ValueGraph<DfaNode, DfaEdge> dfa;

  /** The unique initial node of the DFA. */
  private final DfaNode init;

  /**
   * Maps each node that begins a sequence to that sequence. Nodes in the interior of a sequence
   * are not keys of this map.
   */
  private final ImmutableMap<DfaNode, Sequence> seqStart;

  /**
   * Creates a {@code MatcherCompiler} from the given automaton by generating all the
   * {@code Sequence}s of operations necessary to represent it.
   */
  MatcherCompiler(RangeTree ranges) {
    this.dfa = buildGraph(ranges);
    this.init = ranges.getInitial();
    LinkedHashMap<DfaNode, Sequence> sequencesByStart = new LinkedHashMap<>();
    buildSequencesFrom(init, sequencesByStart);
    this.seqStart = ImmutableMap.copyOf(sequencesByStart);
  }

  /**
   * Compiles the given {@code RangeTree} into a sequence of bytes suitable for creating a
   * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher
   * DigitSequenceMatcher}. No statistics are collected.
   */
  public static byte[] compile(RangeTree dfa) {
    return compile(dfa, Statistics.NO_OP);
  }

  /**
   * As {@link #compile(RangeTree)} but additionally accepts a {@link Statistics} instance to
   * record metrics about the compilation.
   */
  public static byte[] compile(RangeTree dfa, Statistics stats) {
    MatcherCompiler compiler = new MatcherCompiler(dfa);
    return compiler.compile(stats);
  }

  /**
   * Builds a graph directly from the DFA in a RangeTree.
   *
   * <p>Rather than deal with the DFA tree directly (which is deliberately opaque as a data
   * structure) it is copied into a more malleable ValueGraph, which allows simpler traversal
   * while keeping the node/edge structure as simple as possible. The RangeTree types
   * {@code DfaNode} and {@code DfaEdge} are reused because they have the expected semantics
   * (equals/hashCode etc.), but instances must not be retained for long since that keeps larger
   * parts of the original DFA reachable. This is fine here because only bytes are returned from
   * this class.
   *
   * @throws IllegalArgumentException if the given range tree is empty.
   */
  private static ValueGraph<DfaNode, DfaEdge> buildGraph(RangeTree dfa) {
    Preconditions.checkArgument(!dfa.isEmpty());
    MutableValueGraph<DfaNode, DfaEdge> graph =
        ValueGraphBuilder.directed().allowsSelfLoops(false).build();
    graph.addNode(dfa.getInitial());
    dfa.accept(new DfaVisitor() {
      @Override
      public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
        // The node must be added before the edge so that first-time visits can be detected.
        boolean unseenTarget = graph.addNode(target);
        graph.putEdgeValue(source, target, edge);
        if (!unseenTarget) {
          return;
        }
        target.accept(this);
      }
    });
    return graph;
  }

  /** Returns the digit mask of the edge joining the given source and target nodes. */
  private int edgeMask(DfaNode source, DfaNode target) {
    return dfa.edgeValue(source, target).get().getDigitMask();
  }

  /**
   * Returns the output targets of the given node sorted according to the lowest "accepting" digit
   * on the corresponding edge. This ordering is necessary for stability, but also for correctness
   * when building mapping operations. Apart from special cases (e.g. only one output) this is the
   * only method which should be used to obtain output nodes.
   */
  private ImmutableSet<DfaNode> sortedOutputs(DfaNode source) {
    Comparator<DfaNode> byLowestDigit =
        Comparator.comparingInt(target -> numberOfTrailingZeros(edgeMask(source, target)));
    return dfa.successors(source).stream().sorted(byLowestDigit).collect(toImmutableSet());
  }

  /** Returns the single output target of the given node (or throws an exception). */
  private DfaNode singleOutput(DfaNode source) {
    return Iterables.getOnlyElement(dfa.successors(source));
  }

  /**
   * Builds the map from each output node of the given node to the digit mask of the edge that
   * reaches it. Entries are collected in the order given by {@link #sortedOutputs(DfaNode)}, so
   * the keys of the returned map are ordered by the lowest digit in their mask. This is necessary
   * for correct behaviour in the "Mapping" operation.
   */
  private ImmutableMap<DfaNode, Integer> getOutMap(DfaNode source) {
    return sortedOutputs(source).stream()
        .collect(toImmutableMap(Function.identity(), target -> edgeMask(source, target)));
  }

  /**
   * Recursively builds sequences by traversing the DFA and grouping successive sub-sequences of
   * nodes which neither branch, nor are branched to. Each such sub-sequence is represented by a
   * {@code Sequence} instance (a list of non-branching operations, optionally terminated with a
   * branching operation). Nodes already present as keys in the map are not processed again.
   */
  private void buildSequencesFrom(DfaNode start, LinkedHashMap<DfaNode, Sequence> map) {
    if (map.containsKey(start)) {
      return;
    }
    ImmutableList.Builder<DfaNode> run = ImmutableList.builder();
    DfaNode tail = start;
    run.add(tail);
    // Extend the run while the tail has exactly one successor and nothing else reaches it.
    while (dfa.outDegree(tail) == 1) {
      DfaNode successor = singleOutput(tail);
      if (dfa.inDegree(successor) > 1) {
        break;
      }
      tail = successor;
      run.add(tail);
    }
    map.put(start, new Sequence(run.build()));
    // Recurse from the outputs at the end of the sequence according to their edge values.
    // IMPORTANT: The graph's own successor ordering must not be used here, since the order of
    // insertion must be well defined and ValueGraph does not make good enough promises about
    // node ordering.
    for (DfaNode out : sortedOutputs(tail)) {
      buildSequencesFrom(out, map);
    }
  }

  /** Creates and compiles a {@code MatcherBytes} instance to render the output bytes. */
  byte[] compile(Statistics stats) {
    MatcherBytes matcherBytes = createMatcherBytes(stats);
    return matcherBytes.compile();
  }

  /** Creates a mutable {@code MatcherBytes} instance which will render the output bytes. */
  MatcherBytes createMatcherBytes(Statistics stats) {
    return new MatcherBytes(seqStart.values(), stats);
  }

  /**
   * A contiguous sub-sequence of nodes in the DFA which neither branch, nor are branched to.
   * <p>
   * The important property of a {@code Sequence} is that branching may only occur at the end of a
   * {@code Sequence} and branches may only jump to the start of another {@code Sequence}. This
   * makes it easier to separate the compilation of operations (inside sequences) from the
   * management of branches and offsets (between sequences).
   */
  class Sequence {
    private final ImmutableList<DfaNode> nodes;

    /** Creates a sequence from a non-empty list of nodes. */
    Sequence(ImmutableList<DfaNode> nodes) {
      checkArgument(!nodes.isEmpty());
      this.nodes = nodes;
    }

    /** Creates the operation representing the transition out of the given node. */
    private Operation getOp(DfaNode node) {
      return Operation.from(node.canTerminate(), getOutMap(node));
    }

    /**
     * Returns the operations representing this sequence, merging successive operations where
     * possible. The final list of operations is guaranteed to have at most one branching
     * operation which (if present) will always be the last element in the list.
     */
    List<Operation> createOps() {
      List<Operation> ops = new ArrayList<>();
      Operation pending = null;
      for (DfaNode node : nodes) {
        Operation op = getOp(node);
        if (pending == null) {
          // First node: there is nothing to merge with yet.
          pending = op;
          continue;
        }
        Operation combined = pending.mergeWith(op);
        if (combined == null) {
          // Not mergeable, so the pending operation is complete.
          ops.add(pending);
          pending = op;
        } else {
          pending = combined;
        }
      }
      ops.add(pending);
      return ops;
    }

    /** Returns the first node of this sequence. */
    DfaNode getInitialState() {
      return nodes.get(0);
    }

    /** Returns the last node of this sequence. */
    DfaNode getFinalState() {
      return Iterables.getLast(nodes);
    }

    /** Returns the nodes reachable from the last node of this sequence, in sorted order. */
    Set<DfaNode> getOutStates() {
      return sortedOutputs(getFinalState());
    }

    /**
     * Not the same as "terminating" for an operation. A sequence is "final" if no other sequences
     * follow it. Normally there is only one final sequence in a normalized DFA, even if that
     * sequence contains only a single terminating node. However not all terminating nodes are
     * in final sequences.
     */
    boolean isFinal() {
      return getOutStates().isEmpty();
    }

    /** Returns the number of nodes that this sequence represents. */
    int size() {
      return nodes.size();
    }

    /** Returns the sequences which start at the out states of this sequence. */
    ImmutableSet<Sequence> unorderedOutSequences() {
      ImmutableSet.Builder<Sequence> following = ImmutableSet.builder();
      for (DfaNode out : getOutStates()) {
        following.add(seqStart.get(out));
      }
      return following.build();
    }

    @Override
    public String toString() {
      return toString(new StringBuilder(), 0).toString();
    }

    /** Appends this sequence, followed recursively by its out sequences, at the given indent. */
    private StringBuilder toString(StringBuilder buf, int indent) {
      List<Operation> ops = createOps();
      String header = String.format("{%s} %s", nodes.get(0), Joiner.on(" >> ").join(ops));
      appendIndent(buf, indent).append(header);
      ImmutableList<DfaNode> outs = Iterables.getLast(ops).getOuts();
      if (outs.isEmpty()) {
        return buf.append('\n');
      }
      buf.append(" {\n");
      for (DfaNode out : outs) {
        seqStart.get(out).toString(buf, indent + 1);
      }
      return appendIndent(buf, indent).append("}\n");
    }
  }

  @Override
  public String toString() {
    return seqStart.get(init).toString();
  }

  /** Appends one indent unit per level to the given builder and returns it. */
  private static StringBuilder appendIndent(StringBuilder out, int indent) {
    for (int remaining = indent; remaining > 0; remaining--) {
      out.append(" ");
    }
    return out;
  }
}

+ 600
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Operation.java View File

@ -0,0 +1,600 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableSetMultimap.flatteningToImmutableSetMultimap;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
import static java.lang.Integer.numberOfTrailingZeros;
import static java.util.stream.Collectors.joining;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSetMultimap;
import com.google.common.collect.Iterables;
import com.google.common.io.ByteArrayDataOutput;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.Statistics.Type;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
/**
* A specific instance of a number matching operation derived from a DFA. Operations are created by
* analyzing a sequence in a DFA and knowing how to write the corresponding instruction(s) as bytes
* (to be processed by DigitSequenceMatcher or similar).
*/
abstract class Operation {
/** Represents the digits which can be accepted during matching operations. */
private enum Digit {
  // Declaration order must match the digit value itself (this is checked in the constructor).
  ZERO(0), ONE(1), TWO(2), THREE(3), FOUR(4), FIVE(5), SIX(6), SEVEN(7), EIGHT(8), NINE(9);

  private static final Digit[] VALUES = values();

  // Iteration order is the order of enum declaration (and thus also the value order).
  public static final ImmutableSet<Digit> ALL = ImmutableSet.copyOf(VALUES);

  Digit(int value) {
    // The value is not stored because it is required to equal the ordinal.
    Preconditions.checkArgument(value == ordinal());
  }

  /** Returns the digit corresponding to the integral value in the range {@code 0...9}. */
  public static Digit of(int n) {
    return VALUES[n];
  }

  /**
   * Returns the set of digits corresponding to a bit-mask in which bits 0 to 9 represent the
   * corresponding digits.
   *
   * @throws IllegalArgumentException if the mask is less than 1 or exceeds the all-digits mask.
   */
  public static ImmutableSet<Digit> fromMask(int mask) {
    Preconditions.checkArgument(1 <= mask && mask <= ALL_DIGITS_MASK);
    if (mask == ALL_DIGITS_MASK) {
      return ALL;
    }
    ImmutableSet.Builder<Digit> selected = ImmutableSet.builder();
    for (Digit digit : VALUES) {
      if (((mask >>> digit.ordinal()) & 1) == 1) {
        selected.add(digit);
      }
    }
    return selected.build();
  }

  /** Returns the integer value of this digit instance. */
  public int value() {
    return ordinal();
  }
}
/**
* An invalid jump offset indicating that instead of jumping to a new instruction, the state
* machine can just terminate (used to avoid jumping directly to the termination instruction).
*/
static final int TERMINATION_OFFSET = -1;
/**
* The number of bytes required by a "long" branch instruction (the two-byte form written by
* {@code writeBranch()} for jumps of 16 or more).
*/
private static final int LONG_BRANCH_SIZE = 2;
// Whether the state machine may terminate when this operation is reached.
private final boolean isTerminating;
// Whether this operation was flagged as branching by the subclass which created it.
private final boolean isBranching;
/**
* Creates an operation with the given flags. The constructor is private, so only the nested
* subclasses of this class can be instantiated.
*/
private Operation(boolean isTerminating, boolean isBranching) {
this.isTerminating = isTerminating;
this.isBranching = isBranching;
}
/**
* Returns whether this operation can terminate the state machine when it has been reached. This
* is the flag given at construction time.
*/
boolean isTerminating() {
return isTerminating;
}
/**
* Returns whether this operation is branching. A branching operation has more than one output
* node it can reach.
* <p>
* Note that this is simply the flag given at construction time: the {@code Terminal} operation,
* which has no output nodes at all, is also flagged as branching.
*/
boolean isBranching() {
return isBranching;
}
/**
* Returns the output nodes of this operation. For branching operations the order of multiple
* output nodes is defined by the operation itself (most operations are not branching and have
* only one output state anyway).
*/
abstract ImmutableList<DfaNode> getOuts();
/** Returns the op-code for this operation, used when writing out instruction bytes. */
abstract OpCode getOpCode();
/**
* Writes this operation out as a series of instruction bytes. Implementations which have no
* jump table to write ignore {@code offsetMap}, and callers pass {@code null} for it when
* writing non-branching operations.
*/
abstract void writeImpl(
ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats);
/**
* Writes this operation to the given output, first recording a {@code TERMINATING} statistic
* if this operation is terminating, and then delegating to {@link #writeImpl}.
*
* @param out the destination for the instruction bytes.
* @param offsetMap branch offsets keyed by output node, passed through to {@link #writeImpl}.
* @param stats receives the metrics recorded while writing.
*/
void writeTo(ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats) {
if (isTerminating()) {
stats.record(Type.TERMINATING);
}
writeImpl(out, offsetMap, stats);
}
/**
* Merges two adjacent operations (a poor man's compiler optimization). Useful for collapsing
* sequences of "ANY" operations. If this instruction cannot be merged with the given "next"
* instruction then it should return {@code null}, which is the default behavior.
*
* @param next the operation following this operation which we will try and merge with.
* @return the merged operation, or {@code null} if the two operations cannot be merged.
*/
Operation mergeWith(Operation next) {
return null;
}
/**
 * Writes a branch instruction into the output byte sequence. Nothing is written for a jump of
 * zero (only a continuation is recorded), a single byte is written for jumps below 16, and a
 * two-byte instruction with bit 12 set is written for anything larger.
 *
 * @throws IllegalArgumentException if the jump is negative or not less than {@code 0x1000}.
 */
static void writeBranch(ByteArrayDataOutput out, int jump, Statistics stats) {
  Preconditions.checkArgument(jump >= 0 && jump < 0x1000, "invalid jump: " + jump);
  if (jump == 0) {
    // Execution simply continues with the next instruction.
    stats.record(Type.CONTINUATION);
    return;
  }
  if (jump < 16) {
    stats.record(Type.SHORT_BRANCH);
    out.writeByte((OpCode.BRANCH.ordinal() << 5) | jump);
    return;
  }
  Type branchType = jump < 0x100 ? Type.MEDIUM_BRANCH : Type.LONG_BRANCH;
  stats.record(branchType);
  int opCodeBits = OpCode.BRANCH.ordinal() << 13;
  int longFormBit = 1 << 12;
  out.writeShort(opCodeBits | longFormBit | jump);
}
/**
* Writes a termination byte (a single zero byte) into the output byte sequence and records a
* {@code FINAL} statistic.
*/
static void writeTerminator(ByteArrayDataOutput out, Statistics stats) {
stats.record(Type.FINAL);
out.writeByte(0);
}
/**
 * Creates a new operation to represent the output state transition given by {@code outMasks}.
 * Note that where multiple nodes exist in {@code outMasks}, their ordering must be consistent
 * with the {@code Mapping} operation (whereby nodes are ordered by the lowest bit set in the
 * corresponding mask).
 *
 * @param isTerminating whether the state being represented may terminate the matcher.
 * @param outMasks the digit mask by which each output node is reached.
 * @throws IllegalStateException if there are no outputs but the state is not terminating.
 */
static Operation from(boolean isTerminating, ImmutableMap<DfaNode, Integer> outMasks) {
  if (outMasks.isEmpty()) {
    // No out nodes; then it's a "Terminal" operation.
    Preconditions.checkState(isTerminating);
    return new Terminal();
  }
  ImmutableList<DfaNode> outStates = outMasks.keySet().asList();
  int outCount = outStates.size();
  if (outCount == 1) {
    int digitMask = Iterables.getOnlyElement(outMasks.values());
    if (Integer.bitCount(digitMask) == 1) {
      // One output state reached by a single input; then it's a "Single" operation.
      return new Single(isTerminating, numberOfTrailingZeros(digitMask), outStates);
    }
    if (digitMask == ALL_DIGITS_MASK) {
      // One output state reached by any input; then it's an "Any" operation.
      return new Any(isTerminating, 1, outStates);
    }
    // One output state reached by some other set of inputs; then it's a "Range" operation.
    return new Range(isTerminating, digitMask, outStates);
  } else if (outCount == 2) {
    // If the 2 disjoint masks cover all inputs, a shorter branching "Range" operation is used.
    ImmutableList<Integer> masks = outMasks.values().asList();
    int firstMask = masks.get(0);
    int secondMask = masks.get(1);
    if ((firstMask | secondMask) == ALL_DIGITS_MASK) {
      return new Range(isTerminating, firstMask, outStates);
    }
  }
  // Any other combination of nodes or inputs; then it's a "Mapping" operation. This relies on
  // the ordering of entries in the output map corresponding to edge order.
  return new Mapping(isTerminating, outMasks);
}
/**
* Represents a state with no legal outputs, which must be a terminal state in the matcher. It
* is always terminating, is flagged as branching, and is written as a single terminator byte.
*/
private static final class Terminal extends Operation {
Terminal() {
// Flags are (isTerminating, isBranching).
super(true, true);
}
@Override
OpCode getOpCode() {
return OpCode.BRANCH;
}
@Override
ImmutableList<DfaNode> getOuts() {
// A terminal state has no output nodes.
return ImmutableList.of();
}
@Override
void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> unused, Statistics stats) {
writeTerminator(out, stats);
}
@Override
public String toString() {
return "TERMINAL";
}
}
/**
 * Represents a state which can be transitioned from to a single output state via a single input
 * (eg, "0" or "9").
 */
private static final class Single extends Operation {
  private final Digit digit;
  private final ImmutableList<DfaNode> outs;

  /**
   * @param isTerminating whether the matcher may terminate at this state.
   * @param digit the value (0 to 9) of the only accepted input.
   * @param outs a list containing exactly one output node.
   */
  Single(boolean isTerminating, int digit, ImmutableList<DfaNode> outs) {
    super(isTerminating, false);
    Preconditions.checkArgument(outs.size() == 1);
    this.digit = Digit.of(digit);
    this.outs = outs;
  }

  @Override
  OpCode getOpCode() {
    return OpCode.SINGLE;
  }

  @Override
  ImmutableList<DfaNode> getOuts() {
    return outs;
  }

  @Override
  void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> unused, Statistics stats) {
    // <--------- 1 byte --------->
    // [ OPCODE | TRM |   VALUE   ]
    int opCodeBits = getOpCode().ordinal() << 5;
    int terminatingBit = isTerminating() ? (1 << 4) : 0;
    out.writeByte(opCodeBits | terminatingBit | digit.value());
  }

  @Override
  public String toString() {
    return format(digit.value());
  }
}
/**
 * Represents a state which can be transitioned from to a single output state via any input
 * (ie, "\d"). Successive "Any" operations can be merged to represent a repeated sequence
 * (eg, "\d{5}").
 */
private static final class Any extends Operation {
  private final int count;
  private final ImmutableList<DfaNode> outs;

  /**
   * @param isTerminating whether the matcher may terminate at this state.
   * @param count the (strictly positive) number of successive inputs accepted.
   * @param outs a list containing exactly one output node.
   */
  Any(boolean isTerminating, int count, ImmutableList<DfaNode> outs) {
    super(isTerminating, false);
    Preconditions.checkArgument(outs.size() == 1);
    Preconditions.checkArgument(count > 0);
    this.count = count;
    this.outs = outs;
  }

  @Override
  OpCode getOpCode() {
    return OpCode.ANY;
  }

  @Override
  ImmutableList<DfaNode> getOuts() {
    return outs;
  }

  @Override
  void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> unused, Statistics stats) {
    // <--------- 1 byte --------->
    // [ OPCODE | TRM |  COUNT-1  ]
    int header = (getOpCode().ordinal() << 5) | (isTerminating() ? (1 << 4) : 0);
    // Each instruction encodes at most 16 repetitions, so longer runs are split up.
    int left = count;
    for (; left > 16; left -= 16) {
      out.writeByte(header | 15);
    }
    out.writeByte(header | (left - 1));
  }

  /**
   * Merges with a following "Any" operation that has the same terminating flag, summing the
   * counts and adopting the following operation's output. Returns {@code null} otherwise.
   */
  @Override
  public Operation mergeWith(Operation next) {
    if (next.getOpCode() != OpCode.ANY || isTerminating() != next.isTerminating()) {
      return null;
    }
    Any following = (Any) next;
    return new Any(isTerminating(), count + following.count, following.outs);
  }

  @Override
  public String toString() {
    return format(count);
  }
}
/**
 * Represents a state which can be transitioned from via an arbitrary set of inputs to either
 * one or two output nodes (eg, "[23-69]" or "[0-4]X|[5-9]Y"). In the case where there are two
 * output nodes, any input must reach one of the two possible nodes (ie, there is no invalid
 * input). The operation is branching exactly when it has two output nodes.
 */
private static final class Range extends Operation {
  private final ImmutableSet<Digit> digits;
  private final ImmutableList<DfaNode> outs;

  /**
   * @param isTerminating whether the matcher may terminate at this state.
   * @param digitMask the mask of inputs which reach the first output node.
   * @param outs the output nodes (at most two).
   */
  Range(boolean isTerminating, int digitMask, ImmutableList<DfaNode> outs) {
    super(isTerminating, outs.size() == 2);
    Preconditions.checkArgument(outs.size() <= 2);
    this.digits = Digit.fromMask(digitMask);
    this.outs = outs;
  }

  @Override
  OpCode getOpCode() {
    return OpCode.RANGE;
  }

  /**
   * For branching Range operations (with 2 output nodes), the order is that the state matched
   * by {@code digits} is the first state and the state reached by any other input is second.
   */
  @Override
  ImmutableList<DfaNode> getOuts() {
    return outs;
  }

  @Override
  void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats) {
    // <-------------- 2 bytes --------------> <-------- 2 bytes --------->
    // [ OPCODE | TRM | 0 |     BIT SET      ]
    // [ OPCODE | TRM | 1 |     BIT SET      |   JUMP_IN   |   JUMP_OUT   ]
    int header = getOpCode().ordinal() << 13;
    if (isTerminating()) {
      header |= 1 << 12;
    }
    if (isBranching()) {
      header |= 1 << 11;
    }
    out.writeShort(header | asBitMask(digits));
    if (!isBranching()) {
      // The offset map is not consulted for a non-branching range.
      return;
    }
    Integer jumpIn = offsetMap.get(outs.get(0));
    Integer jumpOut = offsetMap.get(outs.get(1));
    writeJumpTable(out, ImmutableList.of(jumpIn, jumpOut), stats);
  }

  @Override
  public String toString() {
    return format(asRangeString(digits));
  }
}
/**
 * Represents a state in the matcher which can be transitioned from via an arbitrary set of
 * inputs, to an arbitrary set of nodes. This is the most general form of operation and (apart
 * from branches) provides the only truly necessary instruction in the matcher; everything else
 * is just some specialization of this operation.
 */
private static final class Mapping extends Operation {
  private final ImmutableSetMultimap<DfaNode, Digit> nodeMap;

  /**
   * @param isTerminating whether the matcher may terminate at this state.
   * @param outMasks the digit mask by which each output node is reached; each mask is expanded
   *     into the individual digits it contains.
   */
  Mapping(boolean isTerminating, ImmutableMap<DfaNode, Integer> outMasks) {
    super(isTerminating, true);
    this.nodeMap =
        outMasks.entrySet().stream()
            .collect(
                flatteningToImmutableSetMultimap(
                    Entry::getKey, entry -> Digit.fromMask(entry.getValue()).stream()));
  }

  @Override
  OpCode getOpCode() {
    if (isTerminating()) {
      return OpCode.TMAP;
    }
    return OpCode.MAP;
  }

  /**
   * For Mapping operations, output node order is defined by the lowest digit by which that
   * node can be reached. For example, if a map operation can reach three nodes {@code A},
   * {@code B} and {@code C} via inputs in the ranges {@code [1-38]}, {@code [4-6]} and
   * {@code [09]} respectively, then they will be ordered {@code (C, A, B)}.
   */
  @Override
  ImmutableList<DfaNode> getOuts() {
    return nodeMap.keySet().asList();
  }

  @Override
  void writeImpl(ByteArrayDataOutput out, Map<DfaNode, Integer> offsetMap, Statistics stats) {
    // <------------ 4 bytes ------------> <-- 1 byte per offset --->
    // [ OPCODE |      CODED MAP         |  JUMP_1  |  ...  |  JUMP_N  ]
    int opCodeBits = getOpCode().ordinal() << 29;
    out.writeInt(opCodeBits | asCodedMap(nodeMap));
    // Offsets are listed in the same order as the output nodes.
    ImmutableList<Integer> offsets =
        getOuts().stream().map(target -> offsetMap.get(target)).collect(toImmutableList());
    writeJumpTable(out, offsets, stats);
  }

  @Override
  public String toString() {
    String ranges =
        nodeMap.asMap().values().stream().map(Operation::asRangeString).collect(joining(", "));
    return format(ranges);
  }
}
/**
* Returns a description of this operation consisting of its op-code, a {@code "*"} marker if
* it is terminating, and the given operation specific detail.
*/
String format(Object extra) {
return String.format("%s%s : %s", getOpCode(), isTerminating() ? "*" : "", extra);
}
/**
 * Returns an integer with the lowest 10 bits set in accordance with the digits in the given set
 * (bit {@code n} is set if, and only if, the digit with value {@code n} is present).
 */
private static int asBitMask(ImmutableSet<Digit> digits) {
  return digits.stream()
      .mapToInt(digit -> 1 << digit.value())
      .reduce(0, (mask, bit) -> mask | bit);
}
/**
 * Returns an integer with the lowest 29 bits set to encode an arbitrary mapping from input digit
 * to an output index. The 29 bits are partitioned such that lower inputs require fewer bits to
 * encode (output indices are assigned as they are encountered, starting at the first input).
 * Each digit can then be quickly mapped to either its 1-indexed output node, or 0 if the input
 * was invalid.
 */
private static int asCodedMap(ImmutableSetMultimap<DfaNode, Digit> nodeMap) {
  int codedMap = 0;
  // Coded indices are 1-to-10 (0 is the "invalid" node).
  int codedIndex = 0;
  for (DfaNode out : nodeMap.keySet()) {
    codedIndex++;
    for (Digit digit : nodeMap.get(out)) {
      codedMap |= codedIndex << OpCode.getMapShift(digit.value());
    }
  }
  return codedMap;
}
/**
* Writes a sequence of offsets representing an unsigned byte-based jump table after either a
* Mapping or Range instruction. This accounts correctly for the need to introduce a new
* "trampoline" branch instruction after the jump table (when the desired offset is too large
* to fit in a single unsigned byte).
* <p>
* Offsets are either:
* <ul>
* <li>The number of bytes to jump from the end of the current {@code Sequence} bytes to the
* start of the destination {@code Sequence} bytes.
* <li>{@code -1} to indicate that a terminal node has been reached.
* </ul>
* <p>
* Note that the offset written into the jump table itself must be relative to the beginning of
* the jump table and so must be adjusted by the number of bytes in the jump table and any other
* branch instructions that follow it. This is probably the most awkward logic in the entire
* compiler.
*/
static void writeJumpTable(ByteArrayDataOutput out, List<Integer> offsets,
Statistics stats) {
// The jump table has one entry per element of "offsets"; each entry is produced by exactly one
// write below (either out.writeByte() or writeTerminator()).
int jumpTableSize = offsets.size();
boolean needsExtraBranches = false;
// The loop condition stops the scan at the first offset found not to fit.
for (int n = 0; n < jumpTableSize && !needsExtraBranches; n++) {
// Check whether the adjusted offset (ie, the one we would write) will fit in a byte.
// It's no issue to have offsets of -1 as it can never trigger "needsExtraBranches".
needsExtraBranches = (offsets.get(n) + jumpTableSize >= 0x100);
}
if (needsExtraBranches) {
// We only get here if at least one offset (after adjustment by the original jump table size)
// would not fit into a byte. Now we must calculate exactly how many extra branches we are
// going to need. For this we must assume the worst case adjustment of "3 x jumpTableSize"
// which is 1 byte for the jump table offset and 2 bytes for the extra branch for every entry.
// This is pessimistic because there will now be cases where we write a trampoline jump for
// an offset that could have fitted had we not assumed that we might need the extra space for
// the branch. However these cases are rare enough that we choose to ignore them.
int maxOffsetAdjust = ((1 + LONG_BRANCH_SIZE) * jumpTableSize);
int extraBranchCount = 0;
for (int n = 0; n < jumpTableSize; n++) {
if (offsets.get(n) + maxOffsetAdjust >= 0x100) {
extraBranchCount += 1;
}
}
// Now we know a reasonable upper bound for how many extra branches are needed, use this to
// adjust the actual offsets and write them. When a "trampoline" branch instruction is needed
// we split the offset so the jump table jumps to the branch instruction and that jumps the
// rest. Branch instructions are positioned, in order, immediately after the jump table.
List<Integer> extraBranchOffsets = new ArrayList<>();
// Unlike maxOffsetAdjust, this only counts the trampoline branches that will really be written.
int totalOffsetAdjust = jumpTableSize + (LONG_BRANCH_SIZE * extraBranchCount);
for (int n = 0; n < jumpTableSize; n++) {
int offset = offsets.get(n);
if (offset >= 0) {
int worstCaseOffset = offset + maxOffsetAdjust;
// Get the actual total offset we want to jump by.
offset += totalOffsetAdjust;
// Use the worst case offset here so we repeat exactly the same decision as the loop
// above (otherwise we might add fewer branches which would screw up our offsets).
if (worstCaseOffset >= 0x100) {
// Split the original offset, recording the jump to the trampoline branch as well as
// the branch offset itself. Note that the offset adjustment changes as more trampoline
// branches are encountered (but the overall offset jumped remains the same).
int extraBranchIndex = extraBranchOffsets.size();
// This offset will always be small (max jump table is 10 entries, so offset to the
// last possible branch will be at most 28 bytes).
int branchInstructionOffset = jumpTableSize + (LONG_BRANCH_SIZE * extraBranchIndex);
// Subtract one additional branch instruction here because when we trampoline jump, we
// jump to the start of the branch instruction, but jump away from the end of it.
extraBranchOffsets.add((offset - branchInstructionOffset) - LONG_BRANCH_SIZE);
offset = branchInstructionOffset;
}
// Write the total offset (offset must be < 0x100 here as worstCaseOffset was < 0x100).
Preconditions.checkState(offset < 0x100, "jump too long: %s", offset);
out.writeByte(offset);
} else {
// If the destination of this jump would just be a termination instruction, just write
// the termination byte here directly (no point jumping to the termination byte).
Preconditions.checkArgument(offset == TERMINATION_OFFSET, "bad offset: %s", offset);
writeTerminator(out, stats);
}
}
// Write out the trampoline jumps in the order they were found.
for (int offset : extraBranchOffsets) {
stats.record(Type.DOUBLE_JUMP);
Operation.writeBranch(out, offset, stats);
}
} else {
// In the simple case, there are no extra branches, so we just write the offsets we have.
// This has the same effect as running the code above with (extraBranchCount == 0) but can be
// reached more optimistically because we don't need to account for the worst case offset
// adjustment when deciding if it's safe to just use the offsets we were given. It's a form
// of hysteresis between the no-branch and extra-branch cases.
for (int n = 0; n < jumpTableSize; n++) {
int offset = offsets.get(n);
if (offset >= 0) {
offset += jumpTableSize;
// NOTE(review): this message is built by string concatenation whereas the equivalent check in
// the extra-branch path uses a "%s" template; the resulting text is the same.
Preconditions.checkState(offset < 0x100, "jump too long: " + offset);
out.writeByte(offset);
} else {
// NOTE(review): unlike the extra-branch path, a negative offset is not checked against
// TERMINATION_OFFSET before the terminator is written here.
writeTerminator(out, stats);
}
}
}
}
// Helper function for asRanges() which renders one group of digits as a bracketed range
// expression (eg, "[014-7]"). A run of three or more consecutive digits is collapsed to
// "first-last", a run of exactly two is written as both digits, and an empty collection gives
// "[]". Runs are detected in iteration order, so this presumably expects ascending digits.
private static String asRangeString(Collection<Digit> digits) {
  StringBuilder text = new StringBuilder("[");
  // First and last digit of the consecutive run currently being accumulated.
  Digit runStart = null;
  Digit runEnd = null;
  for (Digit next : digits) {
    boolean inRun = (runStart != null);
    if (inRun && next.value() == runEnd.value() + 1) {
      // Still consecutive, so just extend the current run.
      runEnd = next;
    } else {
      if (inRun && runEnd != runStart) {
        // Finish the previous multi-digit run (its first digit has already been written).
        if (runEnd.value() > runStart.value() + 1) {
          text.append("-");
        }
        text.append(runEnd.value());
      }
      // Start a new run and write its first digit straight away.
      runStart = next;
      runEnd = next;
      text.append(runStart.value());
    }
  }
  // Finish whichever run was still open when the digits ran out.
  if (runEnd != runStart) {
    if (runEnd.value() > runStart.value() + 1) {
      text.append("-");
    }
    text.append(runEnd.value());
  }
  text.append("]");
  return text.toString();
}
}

+ 44
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Statistics.java View File

@ -0,0 +1,44 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
/**
 * A minimal sink for the counters produced while a regular expression is compiled to matcher
 * bytes. Comparing the counts before and after a change to the byte-code definition shows how
 * that change would affect the size of compiled matchers.
 */
public interface Statistics {
  /** The kinds of event that can be counted. */
  enum Type {
    SHORT_BRANCH,
    MEDIUM_BRANCH,
    LONG_BRANCH,
    DOUBLE_JUMP,
    CONTINUATION,
    TERMINATING,
    FINAL
  }

  /** An implementation which ignores everything recorded to it. */
  Statistics NO_OP = type -> { };

  /**
   * Records one occurrence of the given kind of event during bytecode compilation.
   *
   * @param type the kind of event which occurred
   */
  void record(Type type);
}

+ 2
- 2
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java View File

@ -96,7 +96,7 @@ public final class ExamplesTableSchema {
for (Cell<PhoneRegion, ValidNumberType, DigitSequence> c : table.cellSet()) {
out.put(ExampleNumberKey.of(c.getRowKey(), c.getColumnKey()), NUMBER, c.getValue());
}
return CsvTable.from(SCHEMA, out.build());
return CsvTable.from(SCHEMA, out.buildOrThrow());
}
/**
@ -110,7 +110,7 @@ public final class ExamplesTableSchema {
for (ExampleNumberKey k : csv.getKeys()) {
out.put(k.getRegion(), k.getType(), csv.getOrDefault(k, NUMBER));
}
return out.build();
return out.buildOrThrow();
}
private static Stream<String> write(ExampleNumberKey key) {


+ 1
- 1
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java View File

@ -25,7 +25,7 @@ import java.nio.file.Path;
/**
* A CSV provider which reads files rooted in a given directory. The file layout should match that
* in the CSV metadata directory ({@code googledata/third_party/i18n/phonenumbers/metadata}).
* in the CSV metadata directory ({@code third_party/libphonenumber_metadata/metadata}).
*/
public final class FileBasedCsvLoader implements CsvDataProvider {
/** Returns a CSV loader which reads files from the given base directory. */


+ 1
- 1
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java View File

@ -81,7 +81,7 @@ public final class FormatsTableSchema {
formats.getOrDefault(id, NATIONAL_PREFIX_OPTIONAL),
toComment(formats.getOrDefault(id, COMMENT))));
}
return specs.build();
return specs.buildOrThrow();
}
private static Optional<String> toOptional(String s) {


+ 74
- 89
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java View File

@ -23,13 +23,13 @@ import static java.util.Comparator.comparing;
import static java.util.function.Function.identity;
import static java.util.stream.Collectors.joining;
import com.google.common.base.Splitter;
import com.google.common.collect.ContiguousSet;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableRangeSet;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Range;
import com.google.i18n.phonenumbers.metadata.LengthsParser;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion;
import com.google.i18n.phonenumbers.metadata.i18n.SimpleLanguageTag;
@ -49,18 +49,18 @@ import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode;
import com.google.i18n.phonenumbers.metadata.table.Schema;
import java.time.ZoneId;
import java.util.List;
import java.util.NavigableSet;
import java.util.Optional;
import java.util.TreeSet;
import java.util.stream.Stream;
/**
* The schema of the standard "Ranges" table with rows keyed by {@link RangeKey} and columns:
*
* <ol>
* <li>{@link #TYPE}: The semantic type of numbers in a range (note that this is not
* the same as XmlNumberType or ValidNumberType). All ranges should be assigned a type.
* <li>{@link #TARIFF}: The expected cost of numbers in a range (combining TYPE and TARIFF
* can yield the internal ValidNumberType). All ranges should be assigned a tariff.
* <li>{@link #TYPE}: The semantic type of numbers in a range (note that this is not the same as
* XmlNumberType or ValidNumberType). All ranges should be assigned a type.
* <li>{@link #TARIFF}: The expected cost of numbers in a range (combining TYPE and TARIFF can
* yield the internal ValidNumberType). All ranges should be assigned a tariff.
* <li>{@link #AREA_CODE_LENGTH}: The length of an optional prefix which may be removed from
* numbers in a range for local dialling. Local only lengths are derived using this column.
* <li>{@link #NATIONAL_ONLY}: True if numbers in a range cannot be dialled from outside its
@ -72,8 +72,8 @@ import java.util.stream.Stream;
* applied).
* <li>{@link #TIMEZONE}: The timezone names for a range (or empty to imply the default
* timezones). Multiple timezones can be specified if separated by {@code '&'}.
* <li>{@link #REGIONS}: A group of boolean columns in the form "Region:XX", where ranges are
* set {@code true} that range is valid within the region {@code XX}.
* <li>{@link #REGIONS}: A group of boolean columns in the form "Region:XX", where ranges are set
* {@code true} that range is valid within the region {@code XX}.
* <li>{@link #GEOCODES}: A group of String columns in the form "Geocode:XXX" containing the
* geocode string for a range, where {@code XXX} is the language code of the string.
* <li>{@link #PROVENANCE}: Indicates the most important reason for a range to be valid.
@ -81,6 +81,7 @@ import java.util.stream.Stream;
* </ol>
*
* <p>Row keys are serialized via the marshaller and produce leading columns:
*
* <ol>
* <li>{@code Prefix}: The prefix (RangeSpecification) for the ranges in a row (e.g. "12[3-6]").
* <li>{@code Length}: A set of lengths for the ranges in a row (e.g. "9", "8,9" or "5,7-9").
@ -88,16 +89,16 @@ import java.util.stream.Stream;
*/
public final class RangesTableSchema {
/**
* External number type enum. This is technically much better than ValidNumberType since it
* splits type and cost properly. Unfortunately the internal logic of the phonenumber library
* doesn't really cope with this, which is why we convert to {@code XmlRangesSchema} before
* creating legacy data structures.
* External number type enum. This is technically much better than ValidNumberType since it splits
* type and cost properly. Unfortunately the internal logic of the phonenumber library doesn't
* really cope with this, which is why we convert to {@code XmlRangesSchema} before creating
* legacy data structures.
*
* <p>This enum can be modified as new types are requested from data providers, providing the
* type mapping to ValidNumberType is updated appropriately. Note that until it's clear that
* mapping types such as {@link #M2M} to {@link ValidNumberType#UNKNOWN} will work okay, we
* should be very careful about using the additional types. Additional types need to be removed
* before the generated table can be turned into a {@link NumberingScheme}.
* <p>This enum can be modified as new types are requested from data providers, providing the type
* mapping to ValidNumberType is updated appropriately. Note that until it's clear that mapping
* types such as {@link #M2M} to {@link ValidNumberType#UNKNOWN} will work okay, we should be very
* careful about using the additional types. Additional types need to be removed before the
* generated table can be turned into a {@link NumberingScheme}.
*/
public enum ExtType {
/** Default value not permitted in real data. */
@ -125,14 +126,14 @@ public final class RangesTableSchema {
private static final ImmutableMap<ExtType, ValidNumberType> TYPE_MAP =
Stream.of(
ExtType.FIXED_LINE,
ExtType.MOBILE,
ExtType.FIXED_LINE_OR_MOBILE,
ExtType.PAGER,
ExtType.PERSONAL_NUMBER,
ExtType.UAN,
ExtType.VOICEMAIL,
ExtType.VOIP)
ExtType.FIXED_LINE,
ExtType.MOBILE,
ExtType.FIXED_LINE_OR_MOBILE,
ExtType.PAGER,
ExtType.PERSONAL_NUMBER,
ExtType.UAN,
ExtType.VOICEMAIL,
ExtType.VOIP)
.collect(toImmutableMap(identity(), v -> ValidNumberType.valueOf(v.name())));
public Optional<ValidNumberType> toValidNumberType() {
@ -185,9 +186,9 @@ public final class RangesTableSchema {
Column.of(ExtTariff.class, "Tariff", ExtTariff.STANDARD_RATE);
/**
* The "Area Code Length" column in the range table, denoting the length of a prefix which can
* be removed from all numbers in a range to obtain locally diallable numbers. If an
* "area code" is not optional for dialling, then no value should be set here.
* The "Area Code Length" column in the range table, denoting the length of a prefix which can be
* removed from all numbers in a range to obtain locally diallable numbers. If an "area code" is
* not optional for dialling, then no value should be set here.
*/
public static final Column<Integer> AREA_CODE_LENGTH =
Column.ofUnsignedInteger("Area Code Length");
@ -226,12 +227,13 @@ public final class RangesTableSchema {
public static final Column<String> COMMENT = Column.ofString("Comment");
/** Marshaller for constructing CsvTable from RangeTable. */
private static final CsvKeyMarshaller<RangeKey> MARSHALLER = new CsvKeyMarshaller<>(
RangesTableSchema::write,
RangesTableSchema::read,
Optional.of(RangeKey.ORDERING),
"Prefix",
"Length");
private static final CsvKeyMarshaller<RangeKey> MARSHALLER =
new CsvKeyMarshaller<>(
RangesTableSchema::write,
RangesTableSchema::read,
Optional.of(RangeKey.ORDERING),
"Prefix",
"Length");
/** The non-key columns of a range table. */
public static final Schema TABLE_COLUMNS =
@ -251,10 +253,10 @@ public final class RangesTableSchema {
.build();
/**
* The columns for the serialized CSV table. Note that the "REGIONS" column group is replaced
* by the CSV regions multi-value. This allows region codes to be serialized in a single column
* (which is far nicer when looking at data in a spreadsheet). In the range table, this is
* normalized into the boolean column group (because that's far nicer to work with).
* The columns for the serialized CSV table. Note that the "REGIONS" column group is replaced by
* the CSV regions multi-value. This allows region codes to be serialized in a single column (which
* is far nicer when looking at data in a spreadsheet). In the range table, this is normalized
* into the boolean column group (because that's far nicer to work with).
*/
private static final Schema CSV_COLUMNS =
Schema.builder()
@ -289,17 +291,21 @@ public final class RangesTableSchema {
for (Change c : table.toChanges()) {
for (RangeKey k : RangeKey.decompose(c.getRanges())) {
regions.clear();
c.getAssignments().forEach(a -> {
// We special case the regions column, converting a group of boolean columns into a
// multi-value of region codes. If the column is in the group, it must hold Booleans.
if (regionColumns.contains(a.column())) {
if (a.value().map(((Column<Boolean>) a.column())::cast).orElse(Boolean.FALSE)) {
regions.add(REGIONS.getKey(a.column()));
}
} else {
csv.put(k, a);
}
});
c.getAssignments()
.forEach(
a -> {
// We special case the regions column, converting a group of boolean columns into
// a
// multi-value of region codes. If the column is in the group, it must hold
// Booleans.
if (regionColumns.contains(a.column())) {
if (a.value().map(((Column<Boolean>) a.column())::cast).orElse(Boolean.FALSE)) {
regions.add(REGIONS.getKey(a.column()));
}
} else {
csv.put(k, a);
}
});
// We can do this out-of-sequence because the table will order its columns.
if (!regions.isEmpty()) {
csv.put(k, CSV_REGIONS, Regions.of(regions));
@ -311,22 +317,28 @@ public final class RangesTableSchema {
/**
* Converts a {@link RangeKey} based {@link CsvTable} to a {@link RangeTable}, preserving the
* original table columns. The {@link CsvSchema} of the returned table is not guaranteed to be
* the {@link #SCHEMA} instance if the given table had different columns.
* original table columns. The {@link CsvSchema} of the returned table is not guaranteed to be the
* {@link #SCHEMA} instance if the given table had different columns.
*/
public static RangeTable toRangeTable(CsvTable<RangeKey> csv) {
RangeTable.Builder out = RangeTable.builder(TABLE_COLUMNS);
for (RangeKey k : csv.getKeys()) {
Change.Builder change = Change.builder(k.asRangeTree());
csv.getRow(k).forEach((c, v) -> {
// We special case the regions column, converting a comma separated list of region codes
// into a series of boolean column assignments.
if (c.equals(CSV_REGIONS)) {
CSV_REGIONS.cast(v).getValues().forEach(r -> change.assign(REGIONS.getColumn(r), true));
} else {
change.assign(c, v);
}
});
csv.getRow(k)
.forEach(
(c, v) -> {
// We special case the regions column, converting a comma separated list of region
// codes
// into a series of boolean column assignments.
if (c.equals(CSV_REGIONS)) {
CSV_REGIONS
.cast(v)
.getValues()
.forEach(r -> change.assign(REGIONS.getColumn(r), true));
} else {
change.assign(c, v);
}
});
out.apply(change.build(), OverwriteMode.NEVER);
}
return out.build();
@ -339,7 +351,8 @@ public final class RangesTableSchema {
// Shared by ShortcodeTableSchema
public static RangeKey read(List<String> parts) {
return RangeKey.create(RangeSpecification.parse(parts.get(0)), parseLengths(parts.get(1)));
return RangeKey.create(
RangeSpecification.parse(parts.get(0)), LengthsParser.parseLengths(parts.get(1)));
}
private static String formatLength(ImmutableSortedSet<Integer> lengthSet) {
@ -364,33 +377,5 @@ public final class RangesTableSchema {
}
}
private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults();
private static final Splitter RANGE_SPLITTER = Splitter.on('-').trimResults().limit(2);
// Parses a comma separated list of lengths and length ranges (e.g. "9", "8,9" or "5,7-9") into
// the sorted set of lengths it denotes. Entries must be strictly increasing and a range must
// have its low bound strictly below its high bound; anything else is rejected via
// checkArgument().
private static NavigableSet<Integer> parseLengths(String s) {
  NavigableSet<Integer> result = new TreeSet<>();
  for (String part : COMMA_SPLITTER.split(s)) {
    int lo;
    int hi;
    if (part.contains("-")) {
      List<String> bounds = RANGE_SPLITTER.splitToList(part);
      lo = parseInt(bounds.get(0));
      hi = parseInt(bounds.get(1));
      checkArgument(lo < hi, "Invalid range: %s-%s", lo, hi);
    } else {
      // A single length is just the degenerate range [length, length].
      lo = parseInt(part);
      hi = lo;
    }
    // Every new entry must start above everything accepted so far.
    checkArgument(result.isEmpty() || lo > result.last(), "Overlapping ranges: %s", s);
    result.addAll(ContiguousSet.closed(lo, hi));
  }
  return result;
}
// Parses the given string as an unsigned base-10 integer. Malformed input (including a leading
// minus sign) is rejected by Integer.parseUnsignedInt() with a NumberFormatException.
// NOTE(review): values above Integer.MAX_VALUE are accepted by parseUnsignedInt() and come back
// as negative ints; callers here only expect small lengths.
private static int parseInt(String s) {
return Integer.parseUnsignedInt(s, 10);
}
private RangesTableSchema() {}
}

+ 181
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/AnyPath.java View File

@ -0,0 +1,181 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import java.util.Optional;
/**
 * Represents an NFA graph which accepts sequences of inputs of any digit (also known as "any-digit
 * sequences"), possibly of variable length. For example, an {@code AnyPath} instance might accept
 * a single input of any digit (i.e. equivalent to the regular expression {@code "\d"}), or it might
 * accept sequences of any digits of length 4 or 6 (i.e. equivalent to the regular expression
 * {@code "\d{4}(\d{2})?"}).
 *
 * <p>As {@code AnyPath} instances are all restricted to only accepting any-digits sequences, the
 * only interesting thing about them is the set of sequence lengths they accept. That set is held
 * as a bit-mask in an {@code int} (see {@link #mask()}), and since masks must be positive only
 * lengths 0 to 30 can be represented.
 */
@AutoValue
abstract class AnyPath implements Comparable<AnyPath> {
  /**
   * The special empty path which matches zero length input. This is useful as an identity value
   * when constructing other paths but should never be a path in the graph.
   */
  public static final AnyPath EMPTY = new AutoValue_AnyPath(0x1);

  /** The path matching exactly one input of any digit. */
  public static final AnyPath SINGLE = of(0x2);

  /** The path matching one or zero inputs of any digit. */
  public static final AnyPath OPTIONAL = of(0x3);

  /**
   * Returns the path with the given length mask.
   *
   * @param mask the bit-mask of accepted lengths, which must be greater than 1 (the empty path is
   *     only available as {@link #EMPTY})
   * @throws IllegalArgumentException if the mask is 1 or less (this includes negative values)
   */
  @VisibleForTesting
  static AnyPath of(int mask) {
    Preconditions.checkArgument(mask > 1, "invalid path mask: %s", mask);
    return new AutoValue_AnyPath(mask);
  }

  /**
   * Returns a bit-mask representing the lengths of any-digit sequences accepted by this path.
   * If bit-N is set, then this path accepts an N-length sequence of any digits.
   */
  abstract int mask();

  /**
   * Returns whether this path accepts an any-digit sequence of length {@code n}.
   *
   * @throws IllegalArgumentException if {@code n} is negative or greater than 31
   */
  public boolean acceptsLength(int n) {
    Preconditions.checkArgument(n >= 0 && n < 32, "invalid path length: %s", n);
    return (mask() & (1 << n)) != 0;
  }

  /** Returns the maximum length any-digit sequence that this path will accept. */
  public int maxLength() {
    // The index of the highest set bit in the mask.
    return (31 - Integer.numberOfLeadingZeros(mask()));
  }

  /**
   * Returns whether this path is empty (i.e. accepts only zero length sequences). This is only
   * useful when constructing paths and empty paths should never appear in an NFA graph.
   */
  public boolean isEmpty() {
    return mask() == 0x1;
  }

  /**
   * Extends this path by one input, potentially setting all input as optional. For example (using
   * 'x' to represent a single "any digit" input):
   * <ul>
   *   <li>{@code "xx".extend(false) == "xxx"}
   *   <li>{@code "xx".extend(true) == "(xxx)?"}
   *   <li>{@code "xx(x)?".extend(false) == "xxx(x)?"}
   *   <li>{@code "xx(x)?".extend(true) == "(xxx(x)?)?"}
   * </ul>
   */
  public AnyPath extend(boolean allOptional) {
    // Shifting adds one to every accepted length; bit-0 re-admits the zero length sequence.
    return of((mask() << 1) | (allOptional ? 0x1 : 0x0));
  }

  /**
   * Joins the given path to this one, resulting in a new path which is equivalent to the
   * concatenation of the regular expressions they represent. For example (using
   * 'x' to represent a single "any digit" input):
   * <ul>
   *   <li>{@code "xx".join("xx") == "xxxx"}
   *   <li>{@code "xx".join("x?") == "xx(x)?"}
   * </ul>
   */
  public AnyPath join(AnyPath other) {
    int newMask = 0;
    // Every length accepted by the other path shifts this path's accepted lengths by that amount
    // (the bound is inclusive because the maximum length itself is always accepted).
    for (int n = 0; n <= other.maxLength(); n++) {
      if (other.acceptsLength(n)) {
        newMask |= mask() << n;
      }
    }
    return of(newMask);
  }

  /**
   * Returns a new path which is equal to this path, except that it also accepts zero length
   * sequences.
   */
  public AnyPath makeOptional() {
    return of(mask() | 0x1);
  }

  /**
   * Attempts to "factor" this path by the given path to produce a path such that
   * {@code p.factor(q).join(q)} is equivalent to {@code p}. This is useful when trying to
   * determine longest common paths. Factorizing may not succeed in cases where no common path
   * exists (e.g. {@code "xx(xx)?".factor("x?")} fails because there is no way to join anything
   * to the path {@code "x?"} to make it accept exactly 2 or 4 length any-digit sequences).
   *
   * <p>Only the candidate found by integer division of the masks is considered, so an empty
   * result does not prove that no factor exists.
   *
   * @param other the path to factor out of this one
   * @return the factored path, or empty if no suitable path was found
   */
  public Optional<AnyPath> factor(AnyPath other) {
    int factor = mask() / other.mask();
    if (factor > 1 && (other.mask() * factor) == mask()) {
      AnyPath candidate = of(factor);
      // Integer multiplication carries between bits but join() does not, so an exact product is
      // not proof of a valid factor (e.g. 0b1001 == 0b11 * 0b11, but "x?" joined with "x?" is
      // 0b111). Only accept the candidate if joining it really does reproduce this path.
      if (candidate.join(other).mask() == mask()) {
        return Optional.of(candidate);
      }
    }
    return Optional.empty();
  }

  /** Orders paths by the numeric value of their length masks. */
  @Override
  public int compareTo(AnyPath other) {
    return Integer.compare(mask(), other.mask());
  }

  @Override
  public final String toString() {
    // A non-obvious algorithm for getting a reasonable toString() using x's.
    // Best understood via examples:
    //
    // 0001 is the special empty path and is shown as "<EMPTY>" since it has no x's to show.
    //
    // Hi-bit-1 ==> 1 x
    // 0010 -> x, 0011 -> (x)?
    //
    // Hi-bit-2 ==> 2 x's
    // 0100 -> xx, 0101 -> (xx)?, 0110 -> x(x)?, 0111 -> (x(x)?)?
    //
    // Hi-bit-3 ==> 3 x's
    // 1000 -> xxx, 1001 -> (xxx)?, 1010 -> x(xx)?, 1011 -> (x(xx)?)?
    // 1100 -> xx(x)?, 1101 -> (xx(x)?)?, 1110 -> x(x(x)?)?, 1111 -> (x(x(x)?)?)?
    //
    // Rules:
    // * For hi-bit M, there are M x's in the string.
    // * For N < M; if bit-N is set, then a group starts after the Nth-x.
    if (mask() == 0x1) {
      return "<EMPTY>";
    }
    StringBuilder out = new StringBuilder();
    for (int n = 0; n < maxLength(); n++) {
      out.append('x');
    }
    // Loop high-to-low to prevent earlier insertions messing with the index.
    for (int n = maxLength() - 1; n >= 0; n--) {
      if (acceptsLength(n)) {
        out.insert(n, '(');
      }
    }
    // The number of opened groups was the number of set bits - 1.
    for (int n = Integer.bitCount(mask()) - 1; n > 0; n--) {
      out.append(")?");
    }
    return out.toString();
  }
}

+ 351
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Edge.java View File

@ -0,0 +1,351 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSortedSet;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Value type for edges in NFA graphs of phone number regular expressions. Outside this package,
* this type is mainly used for examining NFA graphs which represent a regular expression,
* generated via {@link RangeTreeConverter#toNfaGraph}.
*
* <p>Note that the ordering of edges is carefully designed to attempt to replicate as much of the
* existing intuition about ordering in regular expressions as possible. This should result in any
* generated expressions being as close to existing hand edited expressions as possible.
*/
public abstract class Edge implements Comparable<Edge> {
/**
* API for visiting composite edges; see also {@link #accept(Visitor)}. An edge's {@code accept}
* method calls back whichever one of these methods matches its kind.
*/
public interface Visitor {
/**
* Visits a leaf node simple edge.
*
* @param edge the simple edge being visited
*/
void visit(SimpleEdge edge);
/**
* Visits a composited sequence of edges. Note that sequences only ever contain disjunctions or
* simple edges, but never other sequences. For edges "a", "b", "c", this represents the
* concatenated edge "abc".
*
* @param edges the edges of the sequence, in the order they are concatenated
*/
void visitSequence(List<Edge> edges);
/**
* Visits a disjunction of parallel edges. Note that disjunctions only ever contain sequences
* or simple edges, but never other disjunctions. For edges "a", "b", "c", this represents the
* disjunctive group "(a|b|c)".
*
* @param edges the alternative edges in the group
* @param isOptional presumably whether the group as a whole may be skipped, i.e. "(a|b|c)?"
*     (TODO confirm against the disjunction's accept method)
*/
void visitGroup(Set<Edge> edges, boolean isOptional);
}
// The singleton epsilon edge (built by the no-argument constructor, so its digit mask is zero).
private static final SimpleEdge EPSILON = new SimpleEdge();
// The singleton edge matching any digit (i.e. 'x' or '\d'). fromMask() returns this instance
// whenever it is given ALL_DIGITS_MASK.
private static final SimpleEdge ANY = new SimpleEdge(ALL_DIGITS_MASK, false);
// The singleton edge optionally matching any digit (i.e. 'x?' or '\d?').
private static final SimpleEdge OPTIONAL_ANY = ANY.optional();
/**
 * Returns a non-optional edge which accepts digits 0 to 9 according to the bits set in the given
 * mask. The shared "any digit" edge is returned when every digit is accepted; for any other mask
 * a new edge is created, which fails with {@link IllegalArgumentException} if the mask is not
 * positive or has bits set above bit 9.
 */
public static SimpleEdge fromMask(int digitMask) {
  if (digitMask == ALL_DIGITS_MASK) {
    return ANY;
  }
  return new SimpleEdge(digitMask, false);
}
/**
* Returns the epsilon edge which accepts zero length input and transitions immediately. This
* edge should only ever appear parallel to other edges, and not as the only transition between
* two nodes.
*
* <p>The same shared instance is returned on every call.
*/
public static SimpleEdge epsilon() {
return EPSILON;
}
/**
* Returns the edge which accepts any digit {@code [0-9]}. The same shared, non-optional instance
* is returned on every call.
*/
public static SimpleEdge any() {
return ANY;
}
/**
* Returns the edge which optionally accepts any digit {@code [0-9]}. The same shared instance
* (the optional form of {@link #any()}) is returned on every call.
*/
public static SimpleEdge optionalAny() {
return OPTIONAL_ANY;
}
/**
 * Returns the ordered concatenation of the given edges. If either edge is itself a
 * concatenation, its component edges are used in its place, so that the resulting edge contains
 * only simple edges or disjunctions (i.e. "xyz" rather than "x(yz)").
 *
 * @throws IllegalArgumentException if either edge is the epsilon edge
 */
public static Edge concatenation(Edge lhs, Edge rhs) {
  checkArgument(!lhs.equals(EPSILON) && !rhs.equals(EPSILON), "cannot concatenate epsilon edges");
  // Expand each side to its component edges (a non-concatenation is its own only component).
  Stream<Edge> leftParts =
      (lhs instanceof Concatenation) ? ((Concatenation) lhs).edges.stream() : Stream.of(lhs);
  Stream<Edge> rightParts =
      (rhs instanceof Concatenation) ? ((Concatenation) rhs).edges.stream() : Stream.of(rhs);
  List<Edge> flattened = Stream.concat(leftParts, rightParts).collect(Collectors.toList());
  return new Concatenation(flattened);
}
/**
 * Returns the disjunction of the given edges. If any edge is already a disjunction, its
 * alternatives are used in its place, so that the resulting edge contains only simple edges or
 * concatenations (i.e. "(x|y|z)" rather than "(x|(y|z))"). An epsilon edge among the
 * alternatives is removed and instead makes the whole disjunction optional.
 */
public static Edge disjunction(Collection<Edge> edges) {
  // Flatten nested disjunctions, then order the alternatives and drop repeats.
  List<Edge> flattened =
      edges.stream()
          .flatMap(
              e -> (e instanceof Disjunction) ? ((Disjunction) e).edges.stream() : Stream.of(e))
          .sorted()
          .distinct()
          .collect(Collectors.toList());
  // There should only ever be one epsilon when we make a disjunction (disjunctions are made when
  // subgraphs collapse and each subgraph should only have one epsilon to make it optional), and
  // since epsilon sorts to-the-left of everything it can only be the first edge.
  boolean hasEpsilon = flattened.get(0) == EPSILON;
  List<Edge> alternatives = hasEpsilon ? flattened.subList(1, flattened.size()) : flattened;
  Preconditions.checkState(!alternatives.contains(EPSILON));
  return new Disjunction(alternatives, hasEpsilon);
}
/**
 * An edge optionally matching a single input token, or the epsilon transition.
 *
 * <p>NOTE(review): {@link #equals} and {@link #hashCode} look only at the digit mask, whereas
 * {@link #compareTo} also distinguishes optional from non-optional edges, so the ordering is not
 * consistent with equals ("x" equals "x?" but does not compare as zero). Confirm whether callers
 * rely on this before changing either.
 */
public static final class SimpleEdge extends Edge {
  private final int digitMask;
  private final boolean isOptional;

  // Constructor for singleton epsilon edge, which accepts no digits and is never optional (an
  // optional epsilon makes no real sense).
  private SimpleEdge() {
    this.digitMask = 0;
    this.isOptional = false;
  }

  // Constructor for edges accepting at least one digit; the mask may only use bits 0 to 9.
  private SimpleEdge(int digitMask, boolean isOptional) {
    checkArgument(digitMask > 0 && digitMask < (1 << 10), "invalid bit mask %s", digitMask);
    this.digitMask = digitMask;
    this.isOptional = isOptional;
  }

  /** Returns the mask of digits accepted by this edge (zero for the epsilon edge). */
  public int getDigitMask() {
    return digitMask;
  }

  /** Returns whether this edge is optional. */
  public boolean isOptional() {
    return isOptional;
  }

  /**
   * Returns an optional version of this, non-optional edge.
   *
   * @throws IllegalStateException if this is the epsilon edge or is already optional
   */
  public SimpleEdge optional() {
    Preconditions.checkState(digitMask != 0, "cannot make epsilon optional");
    Preconditions.checkState(!isOptional, "edge already optional");
    return new SimpleEdge(digitMask, true);
  }

  @Override
  public void accept(Visitor visitor) {
    visitor.visit(this);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof SimpleEdge)) {
      return false;
    }
    return ((SimpleEdge) obj).digitMask == digitMask;
  }

  @Override
  public int hashCode() {
    return digitMask;
  }

  @Override
  public int compareTo(Edge rhs) {
    // Composite types know how to compare themselves to SimpleEdges, so delegate to them but
    // remember to invert the result since we are reversing the comparison order.
    return (rhs instanceof SimpleEdge) ? compare((SimpleEdge) rhs) : -rhs.compareTo(this);
  }

  // Orders simple edges: non-optional before optional, then epsilon first, then by accepted
  // digits so that edges accepting lower digits come first.
  private int compare(SimpleEdge rhs) {
    if (isOptional != rhs.isOptional) {
      // Optional edges sort to-the-right of non-optional things.
      return isOptional ? 1 : -1;
    }
    if (digitMask == rhs.digitMask) {
      return 0;
    }
    // Epsilon sorts to-the-left of everything (the masks differ here, so at most one is epsilon).
    if (digitMask == 0) {
      return -1;
    }
    if (rhs.digitMask == 0) {
      return 1;
    }
    // Unlike many other places where range specifications are used, we cannot guarantee the
    // ranges are disjoint here, so we sort on the reversed bitmask to favour the lowest set bit.
    // This sorts 'x' ([0-9]) to the left of every other digit-accepting edge.
    // I.e. "x" < "0", "0" < "1", "[0-3]" < "[0-2]".
    //
    // Compare in the opposite order, so the largest reversed value is ordered "to the left".
    return Integer.compare(reverseDigitBits(rhs.digitMask), reverseDigitBits(digitMask));
  }

  // Mirrors the low 10 bits of the mask (bit 0 swaps with bit 9, and so on). The logical shift
  // brings the reversed bits back down and avoids negative values.
  private static int reverseDigitBits(int mask) {
    return Integer.reverse(mask) >>> 22;
  }
}
// An ordered sequence of edges (each element being a disjunction or a simple edge).
private static final class Concatenation extends Edge {
  // The edges of the sequence, in order.
  private final ImmutableList<Edge> edges;

  private Concatenation(Collection<Edge> edges) {
    this.edges = ImmutableList.copyOf(edges);
  }

  @Override
  public void accept(Visitor visitor) {
    visitor.visitSequence(edges);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof Concatenation)) {
      return false;
    }
    Concatenation that = (Concatenation) obj;
    return edges.equals(that.edges);
  }

  @Override
  public int hashCode() {
    return edges.hashCode();
  }

  @Override
  public int compareTo(Edge rhs) {
    if (rhs instanceof Concatenation) {
      // Two sequences are ordered element by element.
      return compareEdges(edges, ((Concatenation) rhs).edges);
    }
    // Otherwise order by our first edge against the non-concatenation (the other edge does the
    // comparison, so the result is inverted).
    int byFirstEdge = -rhs.compareTo(edges.get(0));
    if (byFirstEdge != 0) {
      return byFirstEdge;
    }
    // Never report a concatenation and a non-concatenation as equal; break the tie by placing
    // the concatenation after simple edges but before disjunctions.
    return (rhs instanceof SimpleEdge) ? 1 : -1;
  }
}
// A group of alternative edges (each element being a sequence or a simple edge).
// NOTE(review): the optional flag takes no part in equals(), hashCode() or compareTo(), so an
// optional group and a non-optional group with the same edges are treated as equal. Confirm this
// is intended.
private static final class Disjunction extends Edge {
  // The alternatives, held in their natural (compareTo) order.
  private final ImmutableSortedSet<Edge> edges;
  // Whether the group as a whole is optional; passed through to visitors.
  private final boolean isOptional;

  private Disjunction(Collection<Edge> edges, boolean isOptional) {
    checkArgument(!edges.isEmpty());
    this.edges = ImmutableSortedSet.copyOf(edges);
    this.isOptional = isOptional;
  }

  @Override
  public void accept(Visitor visitor) {
    visitor.visitGroup(edges, isOptional);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof Disjunction)) {
      return false;
    }
    Disjunction that = (Disjunction) obj;
    return edges.equals(that.edges);
  }

  @Override
  public int hashCode() {
    // Bits are inverted so that the hash differs from that of a Concatenation.
    return ~edges.hashCode();
  }

  @Override
  public int compareTo(Edge rhs) {
    if (rhs instanceof Disjunction) {
      // Two groups are ordered element by element over their sorted edges.
      return compareEdges(edges.asList(), ((Disjunction) rhs).edges.asList());
    }
    // Otherwise order by our lowest edge against the non-disjunction (the other edge does the
    // comparison, so the result is inverted).
    int byFirstEdge = -rhs.compareTo(edges.first());
    // Never report a disjunction and a non-disjunction as equal; on a tie the disjunction is
    // placed to the right of the other edge.
    return byFirstEdge != 0 ? byFirstEdge : 1;
  }
}
/**
 * Accepts a visitor on this edge, visiting any sub-edges from which it is composed. This is a
 * double-dispatch visitor to avoid anyone processing edges needing to know about specific types.
 * Only the immediate edge is visited and the visitor is then responsible for visiting child
 * edges.
 *
 * @param visitor the visitor whose callback matching this edge's type is invoked.
 */
public abstract void accept(Visitor visitor);
// Lexicographical ordering of two edge lists: the first pair of elements which differ decides
// the result, and if one list is a prefix of the other then the shorter list sorts first.
private static int compareEdges(ImmutableList<Edge> lhs, ImmutableList<Edge> rhs) {
  int lhsSize = lhs.size();
  int rhsSize = rhs.size();
  for (int i = 0; i < lhsSize && i < rhsSize; i++) {
    int result = lhs.get(i).compareTo(rhs.get(i));
    if (result != 0) {
      return result;
    }
  }
  // All shared positions compare as equal, so tie break on length.
  return Integer.compare(lhsSize, rhsSize);
}
// Writes a compact representation of this edge: "e" for epsilon, "x" for an edge accepting all
// digits, a range specification for other simple edges (with a trailing "?" when optional), and
// "(a|b)" or "(a|b)?" for groups. Sequences are written as their edges with no separator.
@Override
public String toString() {
  StringBuilder out = new StringBuilder();
  Visitor printer = new Visitor() {
    @Override
    public void visit(SimpleEdge e) {
      if (e.equals(Edge.epsilon())) {
        // Epsilon cannot be optional, so there is never a marker to add.
        out.append("e");
        return;
      }
      int mask = e.getDigitMask();
      String token = (mask == ALL_DIGITS_MASK) ? "x" : RangeSpecification.toString(mask);
      out.append(token);
      if (e.isOptional()) {
        out.append('?');
      }
    }

    @Override
    public void visitSequence(List<Edge> edges) {
      for (Edge child : edges) {
        child.accept(this);
      }
    }

    @Override
    public void visitGroup(Set<Edge> edges, boolean isOptional) {
      out.append("(");
      for (Edge child : edges) {
        child.accept(this);
        out.append("|");
      }
      // Drop the separator which was appended after the final edge.
      out.setLength(out.length() - 1);
      out.append(isOptional ? ")?" : ")");
    }
  };
  accept(printer);
  return out.toString();
}
}

+ 343
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriter.java View File

@ -0,0 +1,343 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
import com.google.common.collect.Iterables;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import com.google.i18n.phonenumbers.metadata.regex.Edge.Visitor;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import javax.annotation.Nullable;
/** Writes an NFA graph edge instance as a regular expression. */
final class EdgeWriter implements Visitor {
  // Regex constant strings pulled out for some degree of readability.
  private static final String DOT_MATCH = ".";
  private static final String DIGIT_MATCH = "\\d";
  private static final String OPTIONAL_MARKER = "?";
  private static final String GROUP_START = "(?:";
  private static final String GROUP_DISJUNCTION = "|";
  private static final String GROUP_END = ")";
  private static final String OPTIONAL_GROUP_END = GROUP_END + OPTIONAL_MARKER;

  /**
   * Returns a regular expression corresponding to the structure of the given edge. This method
   * does not apply any specific optimizations to the edge it is given and any optimizations which
   * affect the output must have already been applied to the graph from which the input edge was
   * derived.
   *
   * @param edge A collapsed edge typically derived from serializing an NFA graph.
   * @param useDotMatch true if {@code '.'} should be used to "match any digit" (instead of
   *     {@code '\\d'}) which results in shorter output.
   * @return the regular expression accumulated while visiting the given edge.
   */
  public static String toRegex(Edge edge, boolean useDotMatch) {
    EdgeWriter writer = new EdgeWriter(useDotMatch);
    edge.accept(writer);
    return writer.out.toString();
  }

  // The token to match any input digit (e.g. "\\d" or ".").
  private final String anyToken;
  // Accumulated regular expression appended to during visitation.
  private final StringBuilder out = new StringBuilder();
  // Flag to determine when the top-level edge visited is a group, because if it is we can often
  // omit the explicit grouping tokens and save some space.
  private boolean isTopLevelGroup = true;

  private EdgeWriter(boolean useDotMatch) {
    this.anyToken = useDotMatch ? DOT_MATCH : DIGIT_MATCH;
  }

  /** Appends the regular expression for a single, non-epsilon simple edge. */
  @Override
  public void visit(SimpleEdge e) {
    checkArgument(!e.equals(Edge.epsilon()), "unexpected bare epsilon");
    isTopLevelGroup = false;
    // It's easier to just attempt to extract an "any" edge as that code already has to work for
    // simple edges when they are inside other composite edges. Optionality is encoded into the
    // resulting AnyPath and handled by appendRegex(), so we don't need to handle it again here.
    Optional<AnyPath> any = AnyPathVisitor.extractAnyPath(e);
    if (any.isPresent()) {
      appendRegex(out, any.get().mask());
    } else {
      // Not an "any" edge so append the usual range representation (e.g. "6" or "[014-9]").
      out.append(RangeSpecification.toString(e.getDigitMask()));
      if (e.isOptional()) {
        out.append(OPTIONAL_MARKER);
      }
    }
  }

  /** Appends the regular expression for a sequence, merging adjacent runs of "any" edges. */
  @Override
  public void visitSequence(List<Edge> edges) {
    checkArgument(!edges.isEmpty(), "sequences must have at least one edge");
    isTopLevelGroup = false;
    // At this level a sequence might be a mix of normal and "any" edges (e.g. "123xxxx"). To
    // cope with this, track and accumulate the un-written "any" edge, and emit it just before
    // any other output (or at the end).
    AnyPath any = AnyPath.EMPTY;
    for (Edge e : edges) {
      Optional<AnyPath> next = AnyPathVisitor.extractAnyPath(e);
      if (next.isPresent()) {
        any = any.join(next.get());
        continue;
      }
      // Here we have a "normal" edge, but we still might need to emit a collected "any" edge.
      if (!any.isEmpty()) {
        appendRegex(out, any.mask());
        any = AnyPath.EMPTY;
      }
      // This recursion only happens when this was not an "any" edge (though it may still be a
      // composite that contains other "any" edges).
      e.accept(this);
    }
    // If the last thing we saw in this sequence was an "any" edge, don't forget to emit it.
    if (!any.isEmpty()) {
      appendRegex(out, any.mask());
    }
  }

  /** Appends the regular expression for a (possibly optional) group of alternatives. */
  @Override
  public void visitGroup(Set<Edge> edges, boolean isOptional) {
    checkArgument(!edges.isEmpty(), "groups must have at least one edge");
    // The very top-level group is almost always non-optional and can be omitted for length
    // (ie. "(?:a|b|c)" can just be "a|b|c").
    boolean canSkipParens = isTopLevelGroup && !isOptional;
    // Unset this before recursing.
    isTopLevelGroup = false;
    // We have exactly one case where an "any" edge needs to be handled for groups, and that's
    // when there's an optional any group that's not part of an enclosing sequence (e.g. "(xx)?").
    if (edges.size() == 1 && isOptional) {
      Optional<AnyPath> any = AnyPathVisitor.extractAnyPath(Iterables.getOnlyElement(edges));
      if (any.isPresent()) {
        // Remember to account for the optionality of the outer group.
        appendRegex(out, any.get().makeOptional().mask());
        return;
      }
    }
    if (!canSkipParens) {
      out.append(GROUP_START);
    }
    for (Edge e : edges) {
      e.accept(this);
      out.append(GROUP_DISJUNCTION);
    }
    // Easier to just remove the disjunction we know was added last than track state in the loop.
    out.setLength(out.length() - GROUP_DISJUNCTION.length());
    if (!canSkipParens) {
      out.append(isOptional ? OPTIONAL_GROUP_END : GROUP_END);
    }
  }

  /**
   * Recursive visitor to extract "any" sequences from edges (simple or composite). A sequence of
   * edges is an "any path" if all edges accept any digit. Composite edges already enforce the
   * requirement that epsilon edges don't exist directly (they are represented via optionality).
   */
  private static final class AnyPathVisitor implements Visitor {
    /**
     * Returns the longest "any" sequence represented by the given edge (if the edge represents an
     * any sequence). If present, the result is non-empty. The returned {@code Optional} itself is
     * never null; it is empty when the edge is not an "any" sequence.
     */
    public static Optional<AnyPath> extractAnyPath(Edge e) {
      AnyPathVisitor visitor = new AnyPathVisitor();
      e.accept(visitor);
      return Optional.ofNullable(visitor.path);
    }

    // Accumulate value during visitation and set to null to abort.
    @Nullable
    private AnyPath path = AnyPath.EMPTY;

    @Override
    public void visit(SimpleEdge edge) {
      checkState(path != null, "path should never be null at start of recursion");
      if (edge.getDigitMask() == ALL_DIGITS_MASK) {
        path = path.join(edge.isOptional() ? AnyPath.OPTIONAL : AnyPath.SINGLE);
      } else {
        path = null;
      }
    }

    @Override
    public void visitSequence(List<Edge> edges) {
      checkState(path != null, "path should never be null at start of recursion");
      // Looking for a complete sequence of "any edges" (partial sequences in a concatenation are
      // taken care of by the caller).
      for (Edge e : edges) {
        Optional<AnyPath> next = AnyPathVisitor.extractAnyPath(e);
        if (next.isPresent()) {
          path = path.join(next.get());
        } else {
          path = null;
          break;
        }
      }
    }

    @Override
    public void visitGroup(Set<Edge> edges, boolean isOptional) {
      checkState(path != null, "path should never be null at start of recursion");
      // Looking for a group like (xxx(xx)?)? which contains one edge only. We just recurse into
      // that edge and then make the result optional (a disjunction with only one edge must be
      // optional or else it should have been a concatenation).
      if (edges.size() > 1) {
        path = null;
        return;
      }
      checkState(isOptional, "single edge disjunctions should be optional");
      Edge e = Iterables.getOnlyElement(edges);
      e.accept(this);
      if (path != null) {
        path = path.makeOptional();
      }
    }
  }

  // The code below here is really a bit squiffy and relies on a whole bunch of bit fiddling to
  // do what it does. The good news is that it's easy to unit-test the heck out of, so that's
  // what I've done. Don't look too hard at what's going on unless you're a bit of a masochist.

  /**
   * Appends the regular expression corresponding to the given AnyPath mask value. This is a
   * bit-mask where the Nth bit corresponds to accepting an any digit sequence of length N.
   *
   * <p>For example:
   * <ul>
   *   <li> {@code 00000010} accepts only length 1 (e.g. "\d")
   *   <li> {@code 00000011} accepts lengths 0 or 1 (e.g. "\d?")
   *   <li> {@code 00001000} accepts only length 3 (e.g. "\d{3}")
   *   <li> {@code 00011100} accepts lengths 2-4 (e.g. "\d{2,4}")
   *   <li> {@code 11101100} accepts lengths 2,3,5,6,7 (e.g. "\d\d(?:\d(?:\d{2,4})?)?")
   *   <li> {@code 11101101} accepts lengths 0,2,3,5,6,7 (e.g. "(?:\d\d(?:\d(?:\d{2,4})?)?)?")
   * </ul>
   *
   * @param out the buffer to append to.
   * @param mask the length mask, which must have at least one bit above bit-0 set and must not
   *     be negative.
   */
  private void appendRegex(StringBuilder out, int mask) {
    checkArgument(mask > 1, "unexpected mask value %s", mask);
    // Deal with optionality separately.
    boolean allOptional = (mask & 0x1) != 0;
    mask &= ~0x1;
    // We are looking for bit patterns like '1111000' for contiguous ranges (e.g. {3,7}).
    // Find the lo/hi size of the next contiguous range (inclusive).
    int lo = Integer.numberOfTrailingZeros(mask);
    int hi = Integer.numberOfTrailingZeros(~(mask >>> lo)) + (lo - 1);
    // If all the bits are accounted for (nothing above the "hi" bit) then this was the last
    // contiguous range and we don't need to recurse (so no more groups need to be opened).
    //
    // The mask is shifted down rather than compared against (1 << (hi + 1)), since that value
    // overflows to a negative number when hi is 30 and the comparison would then always fail.
    if ((mask >>> (hi + 1)) == 0) {
      // Writes a contiguous range as a single token with optionality (e.g. "\d", "(?:\d{2,4})?").
      appendAnyRange(out, lo, hi, allOptional);
      return;
    }
    // This is about the entire group, not the subgroup we are about to recurse into.
    if (allOptional) {
      out.append(GROUP_START);
    }
    // IMPORTANT: If we are recursing, we must not attempt to emit the entire group here, only the
    // shortest matching length.
    //
    // Mask "11101100" does NOT represent "\d{2,3}(?:\d{2,4})?" as that can match 4-digits too.
    // Instead it should generate "\d\d(?:\d(?:\d{2,4})?)?", where the 3 digit match is part of an
    // optional group.
    appendRequiredAnyRange(out, lo);
    // Recurse using the mask that's had the match we just emitted "factored out". This is always
    // optional because bit-0 is what was the lowest set bit in our mask.
    appendRegex(out, mask >>> lo);
    if (allOptional) {
      out.append(OPTIONAL_GROUP_END);
    }
  }

  /**
   * Appends regular expression tokens that accept any digits for a single length.
   *
   * <p>For example:
   * <ol>
   *   <li>{@code n=1}: {@code "\d"}
   *   <li>{@code n=2}: {@code "\d\d"} (this could be extended if using '.')
   *   <li>{@code otherwise}: {@code "\d{n}"}
   * </ol>
   *
   * @param out the buffer to append to.
   * @param n the exact number of digits to match, which must be at least 1.
   */
  private void appendRequiredAnyRange(StringBuilder out, int n) {
    checkArgument(n >= 1, "bad any length %s", n);
    out.append(anyToken);
    if (n == 2) {
      // Only safe to do this if the group is not optional ("\d\d?" != "(?:\d{2})?").
      out.append(anyToken);
    } else if (n > 2) {
      out.append('{').append(n).append('}');
    }
  }

  /**
   * Appends regular expression tokens that accept any digits in a contiguous range of lengths.
   *
   * <p>For example:
   * <ol>
   *   <li>{@code lo=1, hi=1, optional=false}: {@code "\d"}
   *   <li>{@code lo=1, hi=1, optional=true}: {@code "\d?"}
   *   <li>{@code lo=2, hi=2, optional=true}: {@code "(?:\d{2})?"}
   *   <li>{@code lo=3, hi=6, optional=false}: {@code "\d{3,6}"}
   *   <li>{@code lo=3, hi=6, optional=true}: {@code "(?:\d{3,6})?"}
   *   <li>{@code lo=1, hi=4, optional=true}: {@code "\d{0,4}"} (not {@code (?:\d{1,4})?})
   *   <li>{@code lo=2, hi=2, optional=false}: {@code "\d\d"} (special case for size)
   *   <li>{@code lo=1, hi=2, optional=false}: {@code "\d\d?"} (special case for size)
   * </ol>
   *
   * @param out the buffer to append to.
   * @param lo the shortest length to match (at least 1).
   * @param hi the longest length to match (at least {@code lo}).
   * @param optional whether matching nothing at all is also accepted.
   */
  private void appendAnyRange(StringBuilder out, int lo, int hi, boolean optional) {
    checkArgument(lo >= 1 && hi >= lo, "bad range arguments %s, %s", lo, hi);
    if (lo == hi) {
      if (!optional) {
        // Required single length.
        appendRequiredAnyRange(out, lo);
      } else {
        // Optional single length.
        if (lo > 1) {
          out.append(GROUP_START).append(anyToken);
          out.append('{').append(lo).append('}');
          out.append(OPTIONAL_GROUP_END);
        } else {
          out.append(anyToken).append(OPTIONAL_MARKER);
        }
      }
    } else if (lo == 1 && hi == 2 && !optional) {
      // Special case for "\d\d?" as it's shorter than "\d{1,2}" (and even shorter with '.').
      // Even though we append the "optional marker" (i.e. '?') here it's got nothing to do
      // with the entire group being optional. That would be "(?:\d{1,2})?" which is "\d{0,2}".
      out.append(anyToken).append(anyToken).append(OPTIONAL_MARKER);
    } else if (lo == 1 && optional) {
      // Special case to write "\d{0,N}" instead of "(?:\d{1,N})?"
      out.append(anyToken).append("{0,").append(hi).append('}');
    } else {
      if (optional) {
        out.append(GROUP_START);
      }
      // General case.
      out.append(anyToken).append('{').append(lo).append(',').append(hi).append('}');
      if (optional) {
        out.append(OPTIONAL_GROUP_END);
      }
    }
  }
}

+ 195
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattener.java View File

@ -0,0 +1,195 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.graph.ValueGraph;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.function.Function;
/**
 * Flattens an NFA graph of simple edges into a composite edge which represents all the same
 * transitions in a strict tree structure (i.e. nestable sub-groups). This can entail some
 * duplication of edges, but this should be kept to a minimum and favours duplicating trailing
 * paths to avoid introducing additional non-determinism.
 */
final class NfaFlattener {
  /**
   * Flattens the given NFA graph into a single composite edge composed of concatenation and
   * disjunction. The resulting edge can be visited using the {@code Edge.Visitor} class.
   */
  public static Edge flatten(ValueGraph<Node, SimpleEdge> graph) {
    return new NfaFlattener(graph).flatten();
  }

  /*
   * A simple pair of edge value and target node which represents the current state along any path
   * in the NFA graph. Path followers may be joined (if they point at the same node) but can only
   * be split by recursion into the new subgraph.
   */
  @AutoValue
  abstract static class PathFollower {
    private static PathFollower of(Node node, Edge edge) {
      return new AutoValue_NfaFlattener_PathFollower(node, edge);
    }

    /** The target node that this follower points to. */
    abstract Node node();

    /** A composite edge representing everything up to the target node in the current sub-graph. */
    abstract Edge edge();
  }

  // The graph being flattened.
  private final ValueGraph<Node, SimpleEdge> graph;
  // An ordering for the work queue which ensures that followers with the same node are adjacent.
  private final Comparator<PathFollower> queueOrder;

  private NfaFlattener(ValueGraph<Node, SimpleEdge> graph) {
    this.graph = graph;
    // Order primarily by target node and use the edge only to break ties between followers
    // which share a target.
    this.queueOrder = Comparator
        .comparing(PathFollower::node, nodeOrdering(graph))
        .thenComparing(PathFollower::edge);
  }

  // Collapses the whole graph, from the initial to the terminal node, into a single edge.
  private Edge flatten() {
    // Sub-graph visitation only works for graphs which branch from and collapse to a single node.
    // An NFA graph could be multiple sequential edges or a sequence of edges and sub-graphs.
    // Handle that in this outer loop rather than complicate the visitor (already quite complex).
    PathFollower out = visitSubgraph(Node.INITIAL);
    // Nodes are value types, so compare them by value (as is done when joining followers) rather
    // than relying on the terminal node being a unique instance.
    while (!out.node().equals(Node.TERMINAL)) {
      PathFollower subgraph = visitSubgraph(out.node());
      out = PathFollower.of(subgraph.node(), Edge.concatenation(out.edge(), subgraph.edge()));
    }
    return out.edge();
  }

  /**
   * Visits the sub-graph rooted at the given node, following all out-edges until they eventually
   * re-join. Because the given graph has only one terminal node and no cycles, all sub-graphs must
   * eventually rejoin at some point. If during visitation of a sub-graph, a node with multiple
   * out-edges is reached, then the sub-graph it starts is recursively visited. Note that as "inner"
   * sub-graphs must terminate at or before their parent graph, nesting is assured.
   *
   * <p>The key to the implementation of this algorithm is that visitation occurs in breadth-first
   * order defined according to the reachability of the nodes in the graph. This ensures that when
   * an edge follower which reaches a node at which other edges join together is processed (i.e.
   * when it gets to the head of the queue) all the other followers that can also reach that node
   * must also be present in a contiguous sequence at the front of the queue.
   *
   * @param node the node at which the sub-graph starts, which must have at least one out-edge.
   * @return a follower holding the node at which the sub-graph ends and the composite edge which
   *     represents the whole sub-graph.
   */
  private PathFollower visitSubgraph(Node node) {
    Preconditions.checkArgument(graph.outDegree(node) > 0, "cannot recurse from the terminal node");
    if (graph.outDegree(node) == 1) {
      // Visit the trivial "subgraph" that's really just a single edge. Note that this code could
      // loop and concatenate all sequential single edges, but it also works fine to rely on the
      // recursion of the caller (the advantage of doing it this, simpler, way means that this code
      // doesn't have to know about termination due to reaching the terminal node).
      Node target = Iterables.getOnlyElement(graph.successors(node));
      return PathFollower.of(target, graph.edgeValue(node, target).get());
    }
    // A work-queue of the path followers, ordered primarily by the node they target. This results
    // in the followers at any "point of collapse" being adjacent in the queue.
    PriorityQueue<PathFollower> followerQueue = new PriorityQueue<>(queueOrder);
    for (Node t : graph.successors(node)) {
      followerQueue.add(PathFollower.of(t, graph.edgeValue(node, t).get()));
    }
    while (true) {
      // Get the set of followers that share the same target node at the head of the queue. The
      // ordering in the queue ensures that followers for the same target are always adjacent.
      PathFollower follower = followerQueue.remove();
      Node target = follower.node();
      List<Edge> joiningEdges = collectJoiningEdges(followerQueue, target);
      if (joiningEdges != null) {
        // Replace any joined followers with their disjunction (they all have the same target).
        joiningEdges.add(follower.edge());
        follower = PathFollower.of(target, Edge.disjunction(joiningEdges));
      }
      if (followerQueue.isEmpty()) {
        // If we just processed the last "joining" paths then this sub-graph has been collapsed
        // into a single edge and we just return the current follower. Note that we can join edges
        // without ending recursion (when 3 followers become 2) but we can only end recursion after
        // joining at least 2 edges at the terminal sub-graph node.
        return follower;
      }
      // Recurse into the next sub-graph (possibly just a single edge) which is just concatenated
      // onto the current follower.
      PathFollower subgraph = visitSubgraph(target);
      followerQueue.add(
          PathFollower.of(subgraph.node(), Edge.concatenation(follower.edge(), subgraph.edge())));
    }
  }

  // Collects the edges of any followers at the front of the queue which share the same target node
  // as the given follower. If the node is not a target of any other followers then return null.
  // Every follower whose edge is collected is removed from the queue.
  private static List<Edge> collectJoiningEdges(PriorityQueue<PathFollower> queue, Node target) {
    // It's really common for edges not to join, so avoid making the list unless necessary.
    if (!nextFollowerJoinsTarget(queue, target)) {
      return null;
    }
    List<Edge> joiningEdges = new ArrayList<>();
    do {
      joiningEdges.add(queue.remove().edge());
    } while (nextFollowerJoinsTarget(queue, target));
    return joiningEdges;
  }

  // Checks if the head of the queue is a follower with the same target node.
  private static boolean nextFollowerJoinsTarget(PriorityQueue<PathFollower> queue, Node target) {
    return !queue.isEmpty() && queue.peek().node().equals(target);
  }

  /**
   * Returns a total ordering of nodes in this graph based on the maximum path length from the
   * initial node. If path lengths are equal for two nodes, then the node ID is used to tie break.
   *
   * <p>The property of this ordering that is critical to the node flattening algorithm is that if
   * {@code a < b}, then no path exists in the graph where {@code b} precedes {@code a}. This
   * ensures that path followers are processed consistently with the "node reachability" and if
   * several path followers target the same node, then they are adjacent in the follower queue.
   *
   * <p>Using the node ID as a tie-break is safe, because while node IDs are assigned arbitrarily,
   * they only apply between nodes in the same path length "bucket", so it cannot violate the total
   * ordering requirement, since any order within a "bucket" is equally good.
   */
  // Note: If there are graph cycles this will not terminate, but that implies bad bugs elsewhere.
  @VisibleForTesting
  static Comparator<Node> nodeOrdering(ValueGraph<Node, ?> graph) {
    Map<Node, Integer> map = new HashMap<>();
    recursivelySetMaxPathLength(Node.INITIAL, 0, graph, map);
    // We have to cast the "get" method since it accepts "Object", not "Node" on a map.
    return Comparator.comparing((Function<Node, Integer>) map::get).thenComparing(Node::id);
  }

  // Records in the map the longest path length found so far from the initial node to each node
  // reachable from the given node, where the given node is reached by a path of the given length.
  private static void recursivelySetMaxPathLength(
      Node node, int length, ValueGraph<Node, ?> graph, Map<Node, Integer> map) {
    // Only continue if at least some paths can be lengthened from here onwards.
    if (length > map.getOrDefault(node, -1)) {
      map.put(node, length);
      for (Node target : graph.successors(node)) {
        recursivelySetMaxPathLength(target, length + 1, graph, map);
      }
    }
  }
}

+ 51
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Node.java View File

@ -0,0 +1,51 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import com.google.auto.value.AutoValue;
/**
 * Value type for nodes in NFA graphs of phone number regular expressions. A node is little more
 * than a wrapper for an {@code int} ID, but having a dedicated type keeps a lot of other code
 * type safe. Outside this package, this type is mainly used for examining NFA graphs which
 * represent a regular expression, generated via {@link RangeTreeConverter#toNfaGraph}.
 */
@AutoValue
public abstract class Node implements Comparable<Node> {
  /** The unique initial node in an NFA graph (ID 0), which has an in-degree of zero. */
  public static final Node INITIAL = new AutoValue_Node(0);

  /** The unique terminal node in an NFA graph (ID 1), which has an out-degree of zero. */
  public static final Node TERMINAL = new AutoValue_Node(1);

  /**
   * Returns the node whose ID is one greater than this node. For the initial node this is the
   * shared {@link #TERMINAL} instance, otherwise a new node is created.
   */
  public Node createNext() {
    if (id() == 0) {
      return TERMINAL;
    }
    return new AutoValue_Node(id() + 1);
  }

  /** Returns the numeric ID of this node, which must be unique within an NFA graph. */
  abstract int id();

  /** Orders nodes by their numeric ID. */
  @Override
  public int compareTo(Node o) {
    return Integer.compare(this.id(), o.id());
  }

  /** Returns the decimal representation of this node's ID. */
  @Override
  public final String toString() {
    return String.valueOf(id());
  }
}

+ 123
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverter.java View File

@ -0,0 +1,123 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.base.Preconditions.checkState;
import com.google.common.graph.ElementOrder;
import com.google.common.graph.MutableValueGraph;
import com.google.common.graph.ValueGraph;
import com.google.common.graph.ValueGraphBuilder;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import java.util.HashMap;
import java.util.Map;
/**
 * Converts DFA {@link RangeTree}s to NFA {@link ValueGraph}s. The resulting graph has almost
 * exactly the same node and edge structure as the original DFA, with the following exceptions:
 * <ol>
 *   <li>Nodes which could optionally terminate now have 'epsilon' edges connecting them to the
 *       terminal node.
 *   <li>If an optionally terminating node connects directly to the terminal node, then a special
 *       "optional edge" is used (this is because the {@link ValueGraph} structure allows only one
 *       value for each edge, so you can't have an epsilon edge that goes between the same source
 *       and target as another edge).
 * </ol>
 */
public final class RangeTreeConverter {
  /**
   * Returns the directed NFA graph representation of a {@link RangeTree}. The returned graph is
   * not a DFA and may contain epsilon transitions. Nodes are assigned in visitation order, except
   * for the initial and terminal nodes which are always present in the graph.
   */
  public static ValueGraph<Node, SimpleEdge> toNfaGraph(RangeTree ranges) {
    NfaVisitor nfaBuilder = new NfaVisitor(ranges.getInitial());
    ranges.accept(nfaBuilder);
    return nfaBuilder.graph;
  }

  // Builds up the NFA graph while visiting the edges of the DFA.
  private static class NfaVisitor implements DfaVisitor {
    private final MutableValueGraph<Node, SimpleEdge> graph = ValueGraphBuilder
        .directed()
        .allowsSelfLoops(false)
        // Stable ordering should help keep any generated structures (regex, graph files) stable.
        .nodeOrder(ElementOrder.<Node>natural())
        .build();
    // Map of nodes added to the new graph (keyed by the corresponding DFA node).
    private final Map<DfaNode, Node> nodeMap = new HashMap<>();
    // The most recently created node, from which the next new node is derived.
    private Node lastAdded;

    private NfaVisitor(DfaNode initial) {
      // The initial and terminal nodes go in first (there's always exactly one of each).
      graph.addNode(Node.INITIAL);
      graph.addNode(Node.TERMINAL);
      // Visitation only checks target nodes when adding epsilon edges, so a DFA which can match
      // the empty input needs its epsilon from the very top added here.
      if (initial.canTerminate()) {
        graph.putEdgeValue(Node.INITIAL, Node.TERMINAL, Edge.epsilon());
      }
      nodeMap.put(initial, Node.INITIAL);
      nodeMap.put(RangeTree.getTerminal(), Node.TERMINAL);
      lastAdded = Node.TERMINAL;
    }

    @Override
    public void visit(DfaNode dfaSource, DfaEdge dfaEdge, DfaNode dfaTarget) {
      SimpleEdge simpleEdge = Edge.fromMask(dfaEdge.getDigitMask());
      Node source = nodeMap.get(dfaSource);
      Node target = getTarget(dfaTarget);
      boolean wasNewNode = graph.addNode(target);
      // An edge can only already exist here if an epsilon was added immediately before visiting
      // this edge, and that can only happen when the target is the terminal node.
      SimpleEdge replaced = graph.putEdgeValue(source, target, simpleEdge);
      if (replaced != null) {
        checkState(target.equals(Node.TERMINAL) && replaced.equals(Edge.epsilon()),
            "unexpected edge during visitation: %s -- %s --> %s", source, replaced, target);
        // Put the edge back, but as an optional edge (because that's what the epsilon meant).
        graph.putEdgeValue(source, target, simpleEdge.optional());
      }
      // Nodes which were already in the graph have been (or are being) visited, so stop here.
      if (!wasNewNode) {
        return;
      }
      // The TERMINAL node is always in the graph so (target != TERMINAL) here, which means there
      // is no risk of adding a loop. The epsilon may end up being swapped out for an optional
      // edge when we visit the dfaTarget, but that's fine.
      if (dfaTarget.canTerminate()) {
        graph.putEdgeValue(target, Node.TERMINAL, Edge.epsilon());
      }
      dfaTarget.accept(this);
    }

    // Gets or creates a new target node, adding it to the node map (but not to the graph itself).
    private Node getTarget(DfaNode gnode) {
      Node known = nodeMap.get(gnode);
      if (known == null) {
        lastAdded = lastAdded.createNext();
        nodeMap.put(gnode, lastAdded);
        known = lastAdded;
      }
      return known;
    }
  }

  private RangeTreeConverter() {}
}

+ 118
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatter.java View File

@ -0,0 +1,118 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import com.google.common.base.CharMatcher;
import com.google.common.base.Preconditions;
/**
* Simple indenting formatter for regular expressions and other similar nested syntax. Obviously
* the results are not the same from a match perspective as the new string contains whitespace.
*/
public final class RegexFormatter {
  /** Option for how to handle formatting of groups. */
  public enum FormatOption {
    PRESERVE_CAPTURING_GROUPS,
    FORCE_NON_CAPTURING_GROUPS,
    FORCE_CAPTURING_GROUPS,
  }

  // We only care about 3 specific tokens, so this code can be used to print strings which look
  // similar (nested, disjunctive groups) such as the toString() of the Edge class.
  private static final CharMatcher tokens = CharMatcher.anyOf("()|");

  /**
   * Formats a regular expression (or similar nested group syntax) using the following rules:
   * <ol>
   *   <li>Newline after an opening '(' (including any '?:' marker) and increase indent.
   *   <li>Newline after '|'
   *   <li>Decrease indent and add newline before closing ')'
   * </ol>
   *
   * <p>All whitespace in the input is removed before formatting.
   *
   * @param regex the expression to format.
   * @param formatOption whether the '?:' marker of groups is kept as-is, always written or never
   *     written.
   * @return the indented, multi-line representation of {@code regex}.
   */
  public static String format(String regex, FormatOption formatOption) {
    return new RegexFormatter(regex, formatOption).format();
  }

  private final StringBuilder out = new StringBuilder();
  private final String regex;
  private final FormatOption formatOption;

  private RegexFormatter(String regex, FormatOption formatOption) {
    this.regex = CharMatcher.whitespace().removeFrom(regex);
    this.formatOption = Preconditions.checkNotNull(formatOption);
  }

  private String format() {
    recurse(0, 0);
    return out.toString();
  }

  /**
   * Writes the expression starting at {@code pos} using the given indent level, recursing for
   * each nested group. Assumes the output is at the start of a line.
   *
   * @return the position at which processing stopped: either the index of the ')' which closes
   *     the current group (left for the caller to write), or the end of the input.
   */
  private int recurse(int pos, int level) {
    while (pos < regex.length()) {
      indent(level);
      // Optionally printing closing group from previous recursion.
      if (regex.charAt(pos) == ')') {
        out.append(')');
        pos++;
      }
      int nextToken = tokens.indexIn(regex, pos);
      if (nextToken == -1) {
        // No more tokens, so the rest of the input is copied verbatim (without an intermediate
        // substring copy).
        out.append(regex, pos, regex.length());
        return regex.length();
      }
      out.append(regex, pos, nextToken);
      pos = nextToken;
      switch (regex.charAt(pos)) {
        case '(':
          out.append("(");
          pos++;
          // startsWith() only inspects the two characters at 'pos', whereas indexOf() would scan
          // the whole remainder of the input whenever the group is not marked with "?:".
          if (regex.startsWith("?:", pos)) {
            if (formatOption != FormatOption.FORCE_CAPTURING_GROUPS) {
              out.append("?:");
            }
            pos += 2;
          } else if (formatOption == FormatOption.FORCE_NON_CAPTURING_GROUPS) {
            out.append("?:");
          }
          out.append('\n');
          pos = recurse(pos, level + 1);
          break;

        case '|':
          out.append("|\n");
          pos++;
          break;

        case ')':
          // Just exit recursion and let the parent write the ')', so don't update our position.
          out.append("\n");
          return pos;

        default:
          // The token matcher only accepts the three characters handled above.
          throw new AssertionError();
      }
    }
    return pos;
  }

  /** Appends one indent unit to the output for each level. */
  private void indent(int level) {
    while (level-- > 0) {
      out.append(" ");
    }
  }
}

+ 171
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexGenerator.java View File

@ -0,0 +1,171 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.ALLOW_EDGE_SPLITTING;
import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES;
import static java.util.stream.Collectors.joining;
import com.google.common.base.Preconditions;
import com.google.common.graph.ValueGraph;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer;
import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import java.util.Optional;
/** Produces partially optimized regular expressions from {@code RangeTree}s. */
public final class RegexGenerator {
  private static final RegexGenerator BASIC = new RegexGenerator(false, false, false, false);

  // NOTE: Tail optimization is deliberately left disabled here because it appears to undo some of
  // the benefits of subgroup optimization (the code for it could probably be removed one day).
  private static final RegexGenerator DEFAULT_XML =
      BASIC.withDfaFactorization().withSubgroupOptimization();

  /**
   * Returns the generator which has none of the optional optimizations enabled. Its output has a
   * simpler structure than that of the other generators, but is almost always longer.
   */
  public static RegexGenerator basic() {
    return BASIC;
  }

  /**
   * Returns the default generator for XML data, for use by any tool which needs the same regular
   * expressions as the legacy XML data. Which optimizations it enables is deliberately left
   * unspecified.
   */
  public static RegexGenerator defaultXmlGenerator() {
    return DEFAULT_XML;
  }

  /**
   * Returns a copy of this generator which matches any digit with the {@code '.'} token instead
   * of {@code '\d'}. Output gets shorter, possibly at the cost of performance on some platforms
   * (and of some readability).
   *
   * @throws IllegalStateException if dot-matching is already enabled.
   */
  public RegexGenerator withDotMatch() {
    Preconditions.checkState(!useDotMatch, "Dot-matching already enabled");
    return new RegexGenerator(true, factorizeDfa, optimizeSubgroups, optimizeTail);
  }

  /**
   * Returns a copy of this generator which additionally applies length-based factorization of the
   * DFA graph to try and reduce the number of problematic terminating states. The resulting
   * expressions have additional non-determinism but can be much smaller.
   *
   * @throws IllegalStateException if DFA factorization is already enabled.
   */
  public RegexGenerator withDfaFactorization() {
    Preconditions.checkState(!factorizeDfa, "Length based factorizing already enabled");
    return new RegexGenerator(useDotMatch, true, optimizeSubgroups, optimizeTail);
  }

  /**
   * Returns a copy of this generator which additionally applies an experimental factorization of
   * the DFA graph, trying to identify and separately handle subgroups which would cause
   * repetition. The resulting expressions have additional non-determinism but can be much
   * smaller.
   *
   * @throws IllegalStateException if subgroup optimization is already enabled.
   */
  public RegexGenerator withSubgroupOptimization() {
    Preconditions.checkState(!optimizeSubgroups, "Subgroup optimization already enabled");
    return new RegexGenerator(useDotMatch, factorizeDfa, true, optimizeTail);
  }

  /**
   * Returns a copy of this generator which additionally applies tail optimization to the
   * intermediate NFA graph in order to factor out common trailing paths. This gives a small size
   * improvement in many cases without hurting readability.
   *
   * @throws IllegalStateException if tail optimization is already enabled.
   */
  public RegexGenerator withTailOptimization() {
    Preconditions.checkState(!optimizeTail, "Tail optimization already enabled");
    return new RegexGenerator(useDotMatch, factorizeDfa, optimizeSubgroups, true);
  }

  private final boolean useDotMatch;
  private final boolean factorizeDfa;
  private final boolean optimizeSubgroups;
  private final boolean optimizeTail;

  private RegexGenerator(
      boolean useDotMatch, boolean factorizeDfa, boolean optimizeSubgroups, boolean optimizeTail) {
    this.useDotMatch = useDotMatch;
    this.factorizeDfa = factorizeDfa;
    this.optimizeSubgroups = optimizeSubgroups;
    this.optimizeTail = optimizeTail;
  }

  /**
   * Generates the regular expression for a range tree according to the options of this
   * generator.
   *
   * @throws IllegalArgumentException if the range tree is empty, or if its initial node is the
   *     terminal node (i.e. there are no state transitions to generate an expression from).
   */
  public String toRegex(RangeTree ranges) {
    // An empty range would need "a regex that matches nothing", which is meaningless.
    checkArgument(!ranges.isEmpty(),
        "cannot generate regular expression from empty ranges");
    // Without any explicit state transitions all we could produce is "the regex that always
    // immediately terminates after no input" ("(?:<re>)?" needs a non-empty "<re>"), which is
    // meaningless as well.
    checkArgument(!ranges.getInitial().equals(RangeTree.getTerminal()),
        "range tree must not contain only the empty digit sequence: %s", ranges);
    String factorized = generateFactorizedRegex(ranges);
    return optimizeSubgroups ? recursivelyOptimizeSubgroups(ranges, factorized) : factorized;
  }

  // Repeatedly splits a repeating subgraph out of the ranges, keeping the split form only when
  // the combined expression is strictly shorter than the given one.
  private String recursivelyOptimizeSubgroups(RangeTree ranges, String regex) {
    Optional<RangeTree> repeating = SubgroupOptimizer.extractRepeatingSubgraph(ranges);
    if (!repeating.isPresent()) {
      return regex;
    }
    RangeTree remainder = ranges.subtract(repeating.get());
    String remainderRegex =
        recursivelyOptimizeSubgroups(remainder, generateFactorizedRegex(remainder));
    String candidate = remainderRegex + "|" + generateFactorizedRegex(repeating.get());
    if (candidate.length() < regex.length()) {
      return candidate;
    }
    return regex;
  }

  // Returns the shortest of the unfactorized expression and (if factorization is enabled) the
  // expressions obtained by each of the two merge strategies.
  private String generateFactorizedRegex(RangeTree ranges) {
    String shortest = regexOf(ranges);
    if (!factorizeDfa) {
      return shortest;
    }
    shortest = generateFactorizedRegex(ranges, shortest, REQUIRE_EQUAL_EDGES);
    return generateFactorizedRegex(ranges, shortest, ALLOW_EDGE_SPLITTING);
  }

  // Factors the DFA with the given strategy and joins the expressions of the factors with '|'.
  // The current best expression wins unless the new one is strictly shorter.
  private String generateFactorizedRegex(RangeTree dfa, String bestRegex, MergeStrategy strategy) {
    String joined =
        RangeTreeFactorizer.factor(dfa, strategy).stream()
            .map(this::regexOf)
            .collect(joining("|"));
    if (joined.length() < bestRegex.length()) {
      return joined;
    }
    return bestRegex;
  }

  // Converts the ranges to an NFA graph (tail-optimized if enabled), flattens it and writes it
  // out as a regular expression.
  private String regexOf(RangeTree ranges) {
    ValueGraph<Node, SimpleEdge> nfa = RangeTreeConverter.toNfaGraph(ranges);
    ValueGraph<Node, SimpleEdge> toFlatten = optimizeTail ? TrailingPathOptimizer.optimize(nfa) : nfa;
    return EdgeWriter.toRegex(NfaFlattener.flatten(toFlatten), useDotMatch);
  }
}

+ 190
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/SubgroupOptimizer.java View File

@ -0,0 +1,190 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.ImmutableList.toImmutableList;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.LinkedHashMultiset;
import com.google.common.collect.Multiset;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.IntStream;
import javax.annotation.Nullable;
/**
* An optimization for RangeTree DFAs which attempts to isolate and extract subgraphs which would
* otherwise cause a lot of repetition in the generated regular expression.
*/
public final class SubgroupOptimizer {
  /**
   * Returns the subgraph which is likely to be responsible for the most repetition in the regular
   * expression of the given DFA. Subtracting the result from the original range tree and
   * generating two separate regular expressions is likely to give shorter output than the
   * regular expression of the original range.
   *
   * @return the extracted subgraph, or empty if no bridging node was found or the subgraph would
   *     not be a proper subgraph of {@code ranges}.
   */
  public static Optional<RangeTree> extractRepeatingSubgraph(RangeTree ranges) {
    Optional<DfaNode> bridgingNode = LinkNodeVisitor.findBridgingNode(ranges);
    if (!bridgingNode.isPresent()) {
      return Optional.empty();
    }
    return SubgraphExtractionVisitor.extractSubgraph(ranges, bridgingNode.get());
  }

  /**
   * A visitor which collects two kinds of weight for the nodes of a DFA.
   * <ul>
   *   <li>The number of incoming edges of a node.
   *   <li>The combined weight of all edges in the subgraph rooted at a node.
   * </ul>
   * These are combined by the cost function:
   * <pre>cost(n) = subgraph-weight(n) * (in-order(n) - 1)</pre>
   * to get a proxy for the cost of the additional duplicates likely to be created by that node.
   */
  static class LinkNodeVisitor implements DfaVisitor {
    // The length of the range specification of an edge is a reasonable approximation of its cost
    // in a subgraph (not so good for repeated edges such as 'xxxxxxxx' --> "\d{8}", but useful
    // for breaking ties in the cost function). Entry N holds the weight of digit mask N + 1.
    private static final ImmutableList<Integer> EDGE_WEIGHTS =
        IntStream.rangeClosed(1, 0x3FF)
            .map(mask -> RangeSpecification.toString(mask).length())
            .boxed()
            .collect(toImmutableList());

    // "Linked" multisets matter here (at least for the one which gets iterated), because regular
    // expression generation would otherwise become non-deterministic.
    private final Multiset<DfaNode> inOrder = LinkedHashMultiset.create();
    private final Multiset<DfaNode> subgraphWeight = LinkedHashMultiset.create();

    /**
     * Returns the node whose subgraph is likely to cause the most repetition in the regular
     * expression of the given DFA, or empty if no node has a cost above zero.
     *
     * @throws IllegalArgumentException if {@code ranges} is empty.
     */
    static Optional<DfaNode> findBridgingNode(RangeTree ranges) {
      checkArgument(!ranges.isEmpty(), "cannot visit empty ranges");
      LinkNodeVisitor visitor = new LinkNodeVisitor();
      ranges.accept(visitor);
      return Optional.ofNullable(visitor.getHighestCostNode());
    }

    private static int getEdgeWeight(DfaEdge edge) {
      // Digit masks start at 1 (a zero edge mask is not legal) but list indices start at 0.
      int index = edge.getDigitMask() - 1;
      return EDGE_WEIGHTS.get(index);
    }

    @VisibleForTesting
    int getSubgraphWeight(DfaNode n) {
      return subgraphWeight.count(n);
    }

    @VisibleForTesting
    int getInOrder(DfaNode n) {
      return inOrder.count(n);
    }

    // The result is null if no node has a cost greater than zero. Because the cost function uses
    // (in-order(n) - 1), that is trivially the case for graphs in which every interior node has
    // just one in-edge (the terminal node may have several in-edges but its weight is zero, and
    // the initial node is never a candidate).
    @VisibleForTesting
    @Nullable
    DfaNode getHighestCostNode() {
      DfaNode best = null;
      int bestCost = 0;
      for (DfaNode candidate : inOrder.elementSet()) {
        int cost = getSubgraphWeight(candidate) * (getInOrder(candidate) - 1);
        // Strictly greater, so the first node seen wins any ties.
        if (cost <= bestCost) {
          continue;
        }
        bestCost = cost;
        best = candidate;
      }
      return best;
    }

    @Override
    public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
      // A zero weight means that the target was not visited before (or that it is the terminal).
      int weightBelow = subgraphWeight.count(target);
      boolean isTerminal = target.equals(RangeTree.getTerminal());
      if (weightBelow == 0 && !isTerminal) {
        target.accept(this);
        weightBelow = subgraphWeight.count(target);
      }
      // The source gains the target's subgraph plus the edge being processed now, and the target
      // gains one more in-edge.
      subgraphWeight.add(source, weightBelow + getEdgeWeight(edge));
      inOrder.add(target);
    }
  }

  /**
   * A visitor which extracts the subgraph of a DFA which passes through a given interior
   * "bridging" node.
   */
  private static class SubgraphExtractionVisitor implements DfaVisitor {
    private final DfaNode bridgingNode;
    // Paths collected so far which terminate below the bridging node.
    private final List<RangeSpecification> paths = new ArrayList<>();
    // The path from the initial node to the edge currently being visited.
    private RangeSpecification path = RangeSpecification.empty();
    // True while visiting the part of the graph below the bridging node.
    private boolean sawBridgingNode = false;
    // True once a terminating path outside the extracted subgraph was seen.
    private boolean splitHappens = false;

    /** Returns the subgraph which passes through the specified node. */
    static Optional<RangeTree> extractSubgraph(RangeTree ranges, DfaNode node) {
      SubgraphExtractionVisitor visitor = new SubgraphExtractionVisitor(node);
      ranges.accept(visitor);
      // Only proper subgraphs are of any use to the caller.
      if (!visitor.splitHappens) {
        return Optional.empty();
      }
      return Optional.of(RangeTree.from(visitor.paths));
    }

    private SubgraphExtractionVisitor(DfaNode bridgingNode) {
      this.bridgingNode = checkNotNull(bridgingNode);
    }

    @Override
    public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
      RangeSpecification pathToSource = path;
      path = pathToSource.extendByMask(edge.getDigitMask());

      // Any terminating node (not only the end of the graph) may produce a path, since the whole
      // subgraph _after_ the bridging node must be extracted, terminating nodes included.
      if (target.canTerminate()) {
        if (!sawBridgingNode) {
          // Remember that there are paths outside the subgroup (only a DFA which is a proper
          // subgraph of the original graph should be returned).
          splitHappens = true;
        } else {
          // We are "below" the bridging node, so this path is part of the subgraph.
          paths.add(path);
        }
      }

      // Paths are emitted only while below the bridging node, so the flag is raised just for the
      // duration of the recursion through it (the bridging node cannot be the terminal node).
      // Every other target is recursed into with the flag left as it is.
      boolean throughBridge = target.equals(bridgingNode);
      if (throughBridge) {
        sawBridgingNode = true;
      }
      target.accept(this);
      if (throughBridge) {
        sawBridgingNode = false;
      }

      path = pathToSource;
    }
  }
}

+ 206
- 0
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizer.java View File

@ -0,0 +1,206 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
import static java.util.Comparator.naturalOrder;
import static java.util.stream.Collectors.toList;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.graph.Graphs;
import com.google.common.graph.MutableValueGraph;
import com.google.common.graph.ValueGraph;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
/**
* Optimizer for NFA graphs which attempts to restructure the trailing paths to maximize sharing
* and hopefully minimize the amount of duplication in the resulting regular expression.
*/
public final class TrailingPathOptimizer {
  /**
   * Optimizes an NFA graph to make trailing "any digit" sequences common where possible. In many
   * cases this will result in no change to the structure of the NFA (common trailing paths are
   * not a feature of every NFA), but in some cases a substantial reduction in duplication can
   * occur.
   *
   * <p>This is equivalent to recognizing that {@code "12\d{2}\d{2}?|34\d{2}|56\d{3}"} can be
   * written as {@code "(?:12\d{2}?|34|56\d)\d{2}"}.
   *
   * @param graph the NFA graph to optimize (it is copied, never modified).
   * @return the given graph itself if no trailing paths could be detached, otherwise a modified
   *     copy of it.
   */
  public static ValueGraph<Node, SimpleEdge> optimize(ValueGraph<Node, SimpleEdge> graph) {
    MutableValueGraph<Node, SimpleEdge> out = Graphs.copyOf(graph);
    // Build a map of trailing "any digit" sequences (key is the node it starts from).
    Map<Node, AnyPath> anyPaths = new HashMap<>();
    recursivelyDetachTrailingPaths(Node.TERMINAL, AnyPath.EMPTY, out, anyPaths);
    // If the terminal node has no "any digit" sequences leading to it, there's nothing we can do
    // (well not in this simplistic algorithm anyway). This should almost never happen for phone
    // number matching graphs as it implies a match expression that can terminate at a precise
    // digit, rather than any digit. The only time this might occur is for short-codes, but due to
    // their size it's likely to be fine if we don't try to aggressively optimize them.
    if (anyPaths.size() == 1 && anyPaths.containsKey(Node.TERMINAL)) {
      return graph;
    }
    // This is just a way to find a node from which we can start generating new nodes.
    Node lastAddedNode = out.nodes().stream().max(naturalOrder()).get();
    // Process paths from short to long (since some paths are sub-paths of longer ones).
    List<Node> sortedPathNodes = anyPaths.entrySet().stream()
        .sorted(Comparator.comparing(Entry::getValue))
        .map(Entry::getKey)
        .collect(toList());
    // Elements are removed from this list below, but Collectors.toList() makes no guarantee that
    // the list it returns is mutable, so copy it into a list which definitely is.
    List<Node> shortestPathsFirst = new java.util.ArrayList<>(sortedPathNodes);
    Node pathEnd = Node.TERMINAL;
    while (true) {
      // Start with the next path that might be a factor of all the remaining paths.
      Node shortestPathNode = shortestPathsFirst.get(0);
      AnyPath shortestPath = anyPaths.get(shortestPathNode);
      int pathsToFactor = shortestPathsFirst.size() - 1;
      if (pathsToFactor == 0) {
        // If all paths are factored, we're done.
        break;
      }
      // Factor all the remaining paths by the shortest path (where a missing result means it
      // cannot be factored).
      ImmutableList<AnyPath> factored = shortestPathsFirst.stream()
          .skip(1)
          .map(n -> anyPaths.get(n).factor(shortestPath))
          .filter(Optional::isPresent)
          .map(Optional::get)
          .collect(toImmutableList());
      // If not all the remaining paths have the shortest path as a common factor, we're done (in
      // this simplistic algorithm we don't consider cases where an AnyPath is the factor of some,
      // but not all, other paths; we could but it's far less likely to reduce regex size).
      if (factored.size() < pathsToFactor) {
        break;
      }
      // Shortest path is a factor of all remaining paths, so add a new path to the graph for it.
      lastAddedNode = addPath(shortestPathNode, pathEnd, shortestPath, lastAddedNode, out);
      // We're done with this path, but might still be able to find more factors of remaining paths.
      anyPaths.remove(shortestPathNode);
      shortestPathsFirst.remove(0);  // index, not value.
      // The newly factored edges now replace the original factors in the map.
      for (int n = 0; n < factored.size(); n++) {
        Preconditions.checkState(anyPaths.containsKey(shortestPathsFirst.get(n)));
        anyPaths.put(shortestPathsFirst.get(n), factored.get(n));
      }
      // We now connect any new factored edges to the node we just added (not the terminal node).
      pathEnd = shortestPathNode;
    }
    // If we exit, we must still reconnect any remaining, unfactored, paths to the graph.
    for (Entry<Node, AnyPath> e : anyPaths.entrySet()) {
      lastAddedNode = addPath(e.getKey(), pathEnd, e.getValue(), lastAddedNode, out);
    }
    return out;
  }

  /**
   * Recursively build up a map of trailing "any digit" sequences (AnyPath), starting from some
   * current node (initially the terminal node) and working backwards. The key in the map is the
   * node at which the AnyPath value starts from. Edges and nodes are removed from the graph,
   * leaving "ragged" paths which will need to be reconnected later (the keys in the map are the
   * set of nodes that need to be reconnected).
   *
   * @param node the node to work backwards from.
   * @param path the "any digit" sequence leading from {@code node} to the terminal node.
   * @param g the graph from which trailing edges and nodes are removed.
   * @param anyPaths the map to which detached paths are added.
   * @return whether the given node is the start of an AnyPath (i.e. if it immediately follows any
   *     edges which are not "any digit" sequences).
   */
  private static boolean recursivelyDetachTrailingPaths(
      Node node, AnyPath path, MutableValueGraph<Node, SimpleEdge> g, Map<Node, AnyPath> anyPaths) {
    if (beginsAnAnyPath(node, g)) {
      anyPaths.put(node, path);
      return true;
    }
    // All incoming edges accept all digits, so we can recurse (but don't traverse epsilons). The
    // predecessors are copied into a list first since the graph is modified in the loop below.
    List<Node> sources = g.predecessors(node).stream()
        .filter(s -> !g.edgeValue(s, node).get().equals(Edge.epsilon()))
        .collect(toList());
    for (Node source : sources) {
      AnyPath newPath = path.extend(canTerminate(source, g));
      // Recurse to remove trailing paths higher in the tree and keep this source node only if
      // recursion stopped here.
      boolean keepSourceNode = recursivelyDetachTrailingPaths(source, newPath, g, anyPaths);
      g.removeEdge(source, node);
      // This removes the epsilon if it exists (and does nothing otherwise). This is safe since we
      // know the other out-edge of this node accepts all digits, so the only remaining type of
      // edge that could exist is an epsilon. After removing both we expect not to find any others.
      g.removeEdge(source, Node.TERMINAL);
      Preconditions.checkState(g.outDegree(source) == 0, "unexpected out edges in trailing graph");
      // If we were able to recurse past this node, it can be removed.
      if (!keepSourceNode) {
        g.removeNode(source);
      }
    }
    return false;
  }

  /**
   * Returns whether the given node has incoming edges that do not just accept "any digit". This is
   * the point at which recursion must stop since AnyPath can only represent "any digit" sequences.
   */
  private static boolean beginsAnAnyPath(Node target, ValueGraph<Node, SimpleEdge> g) {
    // Obviously we cannot recurse past the initial node. Nodes are compared by value (as is done
    // for the terminal node elsewhere) rather than by reference.
    if (Node.INITIAL.equals(target)) {
      return true;
    }
    return g.predecessors(target).stream()
        .map(s -> g.edgeValue(s, target).get())
        .filter(e -> !e.equals(Edge.epsilon()))
        .anyMatch(e -> e.getDigitMask() != ALL_DIGITS_MASK);
  }

  /**
   * Returns whether this node can terminate. This logic relies on the input graph not having had
   * its epsilon edges moved (i.e. if an epsilon edge exists it must point to the terminal node).
   * This also looks for special "optional" edges which exist when a non-epsilon edge already
   * exists from this node to the terminal node.
   */
  private static boolean canTerminate(Node node, ValueGraph<Node, SimpleEdge> g) {
    return g.successors(node).stream()
        .map(t -> g.edgeValue(node, t).get())
        .anyMatch(e -> e.isOptional() || e.equals(Edge.epsilon()));
  }

  /**
   * Adds the given "AnyPath" into the graph, generating new nodes and edges as necessary.
   *
   * @param node the node at which the path starts.
   * @param end the node at which the path ends.
   * @param path the "any digit" sequence to add between {@code node} and {@code end}.
   * @param lastAdded the node from which new nodes are generated.
   * @param out the graph to add the path to.
   * @return the last node generated for the path, or {@code lastAdded} if none were needed.
   */
  private static Node addPath(
      Node node, Node end, AnyPath path, Node lastAdded, MutableValueGraph<Node, SimpleEdge> out) {
    // Path length is always at least 1 for an AnyPath.
    int pathLength = path.maxLength();
    for (int n = 0; n < pathLength - 1; n++) {
      if (path.acceptsLength(n)) {
        out.putEdgeValue(node, end, Edge.epsilon());
      }
      lastAdded = lastAdded.createNext();
      out.addNode(lastAdded);
      out.putEdgeValue(node, lastAdded, Edge.any());
      node = lastAdded;
    }
    // For the last edge we cannot add a parallel epsilon path if we need to skip to the end,
    // so add the special "optional any" edge instead.
    out.putEdgeValue(
        node, end, path.acceptsLength(pathLength - 1) ? Edge.optionalAny() : Edge.any());
    return lastAdded;
  }

  private TrailingPathOptimizer() {}
}

+ 17
- 13
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java View File

@ -73,19 +73,23 @@ public final class CsvParser {
} else {
ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
// Not a pure lambda due to the need to index columns.
row.forEach(new Consumer<String>() {
private int i = 0;
@Override
public void accept(String v) {
checkArgument(i < header.size(),
"too many columns (expected %s): %s", header.size(), map);
if (!v.isEmpty()) {
map.put(header.get(i++), v);
}
}
});
handler.accept(map.build());
row.forEach(
new Consumer<String>() {
private int i = 0;
@Override
public void accept(String v) {
checkArgument(
i < header.size(),
"too many columns (expected %s): %s",
header.size(),
map);
if (!v.isEmpty()) {
map.put(header.get(i++), v);
}
}
});
handler.accept(map.buildOrThrow());
}
}
};


+ 1
- 1
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java View File

@ -582,7 +582,7 @@ public abstract class CsvTable<K> {
.put('r', '\r')
.put('t', '\t')
.put('\\', '\\')
.build();
.buildOrThrow();
// Visible for AutoValue only.
CsvTable() {}


+ 1
- 1
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java View File

@ -628,7 +628,7 @@ public final class RangeTable {
RangeTree include = getRanges(column, value);
map.put(value, PrefixTree.minimal(include, allRanges.subtract(include), minPrefixLength));
}
return map.build();
return map.buildOrThrow();
}
// Constants for the simplification routine below.


+ 1
- 1
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java View File

@ -55,7 +55,7 @@ public abstract class Schema {
}
public Schema build() {
return new AutoValue_Schema(names.build(), columns.build(), groups.build());
return new AutoValue_Schema(names.build(), columns.buildOrThrow(), groups.buildOrThrow());
}
}


+ 76
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/LengthsParserTest.java View File

@ -0,0 +1,76 @@
/*
* Copyright (C) 2022 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata;
import static com.google.common.truth.Truth.assertThat;
import static org.junit.Assert.assertThrows;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public final class LengthsParserTest {
  // Asserts that parsing the given lengths string fails with an IllegalArgumentException.
  private static void assertRejected(String lengths) {
    assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths(lengths));
  }

  /** Letters, empty list elements and '+' signs must be rejected. */
  @Test
  public void shouldThrowIfStringContainsForbiddenCharacters() {
    assertRejected("a-6,7");
    assertRejected("8, B, C");
    assertRejected("8, ,10");
    assertRejected("4, +7-9, +11");
  }

  /** Descending ranges, descending elements and overlapping ranges must be rejected. */
  @Test
  public void shouldThrowIfNumbersAreOutOfOrder() {
    assertRejected("9-7");
    assertRejected("8,12-11");
    assertRejected("5,4,7-8");
    assertRejected("6-8, 7-9");
  }

  /** Chained ranges, open-ended ranges and space separated numbers must be rejected. */
  @Test
  public void shouldThrowIfFormatIsWrong() {
    assertRejected("4-6-8");
    assertRejected("7-");
    assertRejected("3, -7");
    assertRejected("1 2-3 4, 5 6");
  }

  /** A single number gives exactly that length. */
  @Test
  public void testParseSingletons() {
    assertThat(LengthsParser.parseLengths("8")).containsExactly(8);
    assertThat(LengthsParser.parseLengths("14")).containsExactly(14);
  }

  /** Comma separated numbers (with or without spaces) give each listed length. */
  @Test
  public void testParseCommaSeparatedNumbers() {
    assertThat(LengthsParser.parseLengths("6,8,9")).containsExactly(6, 8, 9);
    assertThat(LengthsParser.parseLengths("13, 14")).containsExactly(13, 14);
  }

  /** A range gives every length between its bounds, inclusive. */
  @Test
  public void testParseRanges() {
    assertThat(LengthsParser.parseLengths("6-8")).containsExactly(6, 7, 8);
    assertThat(LengthsParser.parseLengths("13 - 14")).containsExactly(13, 14);
  }

  /** Single numbers and ranges can be mixed in one list. */
  @Test
  public void testParseComplex() {
    assertThat(LengthsParser.parseLengths("4,7,9-12")).containsExactly(4, 7, 9, 10, 11, 12);
    assertThat(LengthsParser.parseLengths("4-6, 8, 10-12"))
        .containsExactly(4, 5, 6, 8, 10, 11, 12);
  }
}

+ 1
- 1
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java View File

@ -20,8 +20,8 @@ import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.DigitSequence.domain;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.parse;
import static java.util.Arrays.asList;
import static com.google.i18n.phonenumbers.metadata.testing.AssertUtil.assertThrows;
import static java.util.Arrays.asList;
import com.google.common.collect.ImmutableRangeSet;
import com.google.common.collect.Range;


+ 210
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcherTest.java View File

@ -0,0 +1,210 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.INVALID;
import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.MATCHED;
import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.TOO_LONG;
import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.TOO_SHORT;
import com.google.common.base.CharMatcher;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler;
import com.google.i18n.phonenumbers.metadata.regex.RegexGenerator;
import java.util.Arrays;
import java.util.regex.Pattern;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class DigitSequenceMatcherTest {
  /** Checks that {@code digitsFromString} yields the digits of the string in order, then ends. */
  @Test public void testStringDigits() {
    DigitSequence digits = DigitSequenceMatcher.digitsFromString("1234");
    Assert.assertTrue(digits.hasNext());
    Assert.assertEquals(1, digits.next());
    Assert.assertTrue(digits.hasNext());
    Assert.assertEquals(2, digits.next());
    Assert.assertTrue(digits.hasNext());
    Assert.assertEquals(3, digits.next());
    Assert.assertTrue(digits.hasNext());
    Assert.assertEquals(4, digits.next());
    Assert.assertFalse(digits.hasNext());
  }

  @Test public void testSingleDigitMatching() {
    assertNotMatches(ranges("0"), INVALID, "1", "9");
    assertNotMatches(ranges("0"), TOO_LONG, "00");

    assertMatches(ranges("x"), "0", "5", "9");
    assertNotMatches(ranges("x"), TOO_SHORT, "");
    assertNotMatches(ranges("x"), TOO_LONG, "00");

    assertMatches(ranges("[2-6]"), "2", "3", "4", "5", "6");
    assertNotMatches(ranges("[2-6]"), INVALID, "0", "1", "7", "8", "9");
    assertNotMatches(ranges("[2-6]"), TOO_LONG, "26");
  }

  @Test public void testOptional() {
    RangeTree dfa = ranges("12", "123");
    // Reuse the tree built above rather than constructing an identical second one.
    assertMatches(dfa, "12", "123");
    assertNotMatches(dfa, TOO_SHORT, "1");
    assertNotMatches(dfa, INVALID, "13");
    assertNotMatches(dfa, TOO_LONG, "1233");
  }

  @Test public void testRepetition() {
    assertMatches(ranges("12xx", "12xxx", "12xxxx"), "1234", "12345", "123456");
  }

  @Test public void testOr() {
    RangeTree dfa = ranges("01", "23");
    assertMatches(dfa, "01", "23");
    assertNotMatches(dfa, INVALID, "03", "12");
    assertNotMatches(dfa, TOO_SHORT, "0", "2");
    assertNotMatches(dfa, TOO_LONG, "011", "233");
    assertMatches(ranges("01", "23", "45", "6789"), "01", "23", "45", "6789");
  }

  @Test public void testRealRegexShort() {
    RangeTree dfa = ranges(
        "11[2-7]xxxxxxx",
        "2[02][2-7]xxxxxxx",
        "33[2-7]xxxxxxx",
        "4[04][2-7]xxxxxxx",
        "79[2-7]xxxxxxx",
        "80[2-467]xxxxxxx");
    // Spaces in the test numbers are for readability only (see noSpace()).
    assertMatches(dfa, "112 1234567", "797 1234567", "807 1234567");
    assertNotMatches(dfa, TOO_SHORT, "112 123", "797 12345", "807 123456");
    assertNotMatches(dfa, TOO_LONG, "112 12345678", "797 123456789");
    assertNotMatches(dfa, INVALID, "122 1234567", "799 1234567", "805 1234567");
  }

  @Test public void testRealRegexLong() {
    RangeTree dfa = ranges(
        "12[0-249][2-7]xxxxxx",
        "13[0-25][2-7]xxxxxx",
        "14[145][2-7]xxxxxx",
        "1[59][14][2-7]xxxxxx",
        "16[014][2-7]xxxxxx",
        "17[1257][2-7]xxxxxx",
        "18[01346][2-7]xxxxxx",
        "21[257][2-7]xxxxxx",
        "23[013][2-7]xxxxxx",
        "24[01][2-7]xxxxxx",
        "25[0137][2-7]xxxxxx",
        "26[0158][2-7]xxxxxx",
        "278[2-7]xxxxxx",
        "28[1568][2-7]xxxxxx",
        "29[14][2-7]xxxxxx",
        "326[2-7]xxxxxx",
        "34[1-3][2-7]xxxxxx",
        "35[34][2-7]xxxxxx",
        "36[01489][2-7]xxxxxx",
        "37[02-46][2-7]xxxxxx",
        "38[159][2-7]xxxxxx",
        "41[36][2-7]xxxxxx",
        "42[1-47][2-7]xxxxxx",
        "43[15][2-7]xxxxxx",
        "45[12][2-7]xxxxxx",
        "46[126-9][2-7]xxxxxx",
        "47[0-24-9][2-7]xxxxxx",
        "48[013-57][2-7]xxxxxx",
        "49[014-7][2-7]xxxxxx",
        "5[136][25][2-7]xxxxxx",
        "522[2-7]xxxxxx",
        "54[28][2-7]xxxxxx",
        "55[12][2-7]xxxxxx",
        "5[78]1[2-7]xxxxxx",
        "59[15][2-7]xxxxxx",
        "612[2-7]xxxxxx",
        "6[2-4]1[2-7]xxxxxx",
        "65[17][2-7]xxxxxx",
        "66[13][2-7]xxxxxx",
        "67[14][2-7]xxxxxx",
        "680[2-7]xxxxxx",
        "712[2-7]xxxxxx",
        "72[14][2-7]xxxxxx",
        "73[134][2-7]xxxxxx",
        "74[47][2-7]xxxxxx",
        "75[15][2-7]xxxxxx",
        "7[67]1[2-7]xxxxxx",
        "788[2-7]xxxxxx",
        "816[2-7]xxxxxx",
        "82[014][2-7]xxxxxx",
        "83[126][2-7]xxxxxx",
        "86[136][2-7]xxxxxx",
        "87[078][2-7]xxxxxx",
        "88[34][2-7]xxxxxx",
        "891[2-7]xxxxxx");
    assertMatches(dfa, "364 2 123456", "674 4 123456", "883 7 123456");
    assertNotMatches(dfa, TOO_SHORT, "364 2 123", "674 4 1234", "883 7 12345");
    assertNotMatches(dfa, TOO_LONG, "364 2 1234567", "674 4 12345678");
    assertNotMatches(dfa, INVALID,
        "365 2 123456", "364 8 123456", "670 4 123456", "670 5 123456", "892 2 123456");
  }

  /** Builds a RangeTree from the given range specification strings (e.g. "12[3-5]xx"). */
  private static RangeTree ranges(String... lines) {
    return RangeTree.from(Arrays.stream(lines).map(RangeSpecification::parse));
  }

  /** Compiles the given ranges to matcher bytes and wraps them in a matcher instance. */
  private static DigitSequenceMatcher compileMatcher(RangeTree dfa) {
    return DigitSequenceMatcher.create(MatcherCompiler.compile(dfa));
  }

  /**
   * Asserts that every given number is matched by the compiled matcher for {@code dfa}. The
   * numbers are first sanity checked against the regular expression generated for the same ranges.
   */
  private static void assertMatches(RangeTree dfa, String... numbers) {
    checkRegex(dfa, true, numbers);
    assertMatcher(compileMatcher(dfa), MATCHED, numbers);
  }

  /**
   * Asserts that every given number fails to match the compiled matcher for {@code dfa} with the
   * given (non-MATCHED) result. The numbers are first sanity checked against the regular
   * expression generated for the same ranges.
   */
  private static void assertNotMatches(RangeTree dfa, Result error, String... numbers) {
    checkArgument(error != MATCHED);
    checkRegex(dfa, false, numbers);
    assertMatcher(compileMatcher(dfa), error, numbers);
  }

  /**
   * Sanity checks the test inputs themselves: each number must (or must not, according to
   * {@code expectMatch}) match the regular expression generated from {@code dfa}.
   *
   * @throws IllegalArgumentException if a number disagrees with the generated regex, which
   *     indicates a badly written test (or a regex generation bug) rather than a matcher failure.
   */
  private static void checkRegex(RangeTree dfa, boolean expectMatch, String... numbers) {
    Pattern pattern = Pattern.compile(RegexGenerator.basic().toRegex(dfa));
    for (String number : numbers) {
      // The message names the expectation, since this check is used in both directions.
      checkArgument(
          expectMatch == pattern.matcher(noSpace(number)).matches(),
          "regex for %s was expected to %s input %s",
          dfa.asRangeSpecifications(), expectMatch ? "match" : "reject", number);
    }
  }

  /** Asserts that the matcher returns the expected result for each number (ignoring spaces). */
  private static void assertMatcher(
      DigitSequenceMatcher matcher, Result expected, String... numbers) {
    for (final String number : numbers) {
      Assert.assertEquals(expected,
          matcher.match(DigitSequenceMatcher.digitsFromString(noSpace(number))));
    }
  }

  /** Strips all whitespace, allowing test numbers to be written in readable groups. */
  private static String noSpace(String input) {
    return CharMatcher.whitespace().removeFrom(input);
  }
}

+ 317
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/CompilerRegressionTest.java View File

@ -0,0 +1,317 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.truth.Truth.assertWithMessage;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
import static java.lang.Integer.bitCount;
import static java.lang.Integer.lowestOneBit;
import static java.lang.Integer.numberOfTrailingZeros;
import com.google.common.collect.Multimap;
import com.google.common.collect.MultimapBuilder;
import com.google.common.collect.SetMultimap;
import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto;
import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto.TestCase;
import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto.Tests;
import com.google.i18n.phonenumbers.metadata.DigitSequence;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result;
import com.google.protobuf.ByteString;
import com.google.protobuf.TextFormat;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class CompilerRegressionTest {
  // Name of the text-format proto resource (resolved relative to this class) holding the
  // regression test cases shared by both tests below.
  private static final String TEST_DATA_RESOURCE = "regression_test_data.textpb";

  // Tests that the compiler produces the expected output, byte-for-byte.
  @Test
  public void testCompiledBytesEqualExpectedMatcherBytes() throws IOException {
    // Failures are accumulated so that a single run reports every broken test case.
    StringWriter buffer = new StringWriter();
    PrintWriter errors = new PrintWriter(buffer);
    for (TestCase tc : loadTests().getTestCaseList()) {
      byte[] actual = MatcherCompiler.compile(ranges(tc.getRangeList()));
      byte[] expected = combine(tc.getExpectedList());
      int diffIndex = indexOfDiff(actual, expected);
      if (!tc.getShouldFail()) {
        if (diffIndex != -1) {
          errors.format("FAILED [%s]: First difference at index %d\n", tc.getName(), diffIndex);
          errors.format("Actual  : %s\n", formatPbSnippet(actual, diffIndex, 20));
          errors.format("Expected: %s\n", formatPbSnippet(expected, diffIndex, 20));
          writeGoldenPbOutput(actual, errors);
        }
      } else {
        if (diffIndex == -1) {
          errors.format("FAILED [%s]: Expected difference, but got none\n", tc.getName());
        }
      }
    }
    errors.flush();
    String errorMessage = buffer.toString();
    if (!errorMessage.isEmpty()) {
      assertWithMessage(errorMessage).fail();
    }
  }

  // Test that the matcher behaves correctly with respect to the input ranges using the expected
  // byte sequences. If this test fails, then the matcher implementation is doing something wrong,
  // or the expected bytes were generated incorrectly (either by hand or from the compiler).
  //
  // IMPORTANT: This test tests that the expected bytes (rather than the compiled bytes) match the
  // numbers in the ranges. This avoids the risk of any bugs in both the matcher and compiler
  // somehow cancelling each other out. However this also means that this test depends on the
  // equality test above for validity (i.e. this test can pass even if the matcher compiler is
  // broken, so it should not be run in isolation when debugging).
  @Test
  public void testExpectedMatcherBytesMatchRanges() throws IOException {
    for (TestCase tc : loadTests().getTestCaseList()) {
      RangeTree ranges = ranges(tc.getRangeList());
      // If we compiled the ranges here, we could risk a situation where the compiled bytes were
      // broken but the compiler had a corresponding bug that cancelled it out. This test only
      // tests the matcher behaviour, whereas the test above only tests the compiler behaviour.
      DigitSequenceMatcher matcher = DigitSequenceMatcher.create(combine(tc.getExpectedList()));
      Multimap<Result, DigitSequence> numbers = buildTestNumbers(ranges);
      if (!tc.getShouldFail()) {
        testExpectedMatch(tc.getName(), matcher, numbers);
      } else {
        testExpectedFailure(tc.getName(), matcher, numbers);
      }
    }
  }

  // Parses the regression test cases from the class-relative text proto resource. Fails with an
  // explicit message if the resource is missing, since getResourceAsStream() signals that by
  // returning null (which would otherwise surface as an opaque NullPointerException).
  private static Tests loadTests() throws IOException {
    try (InputStream data =
        CompilerRegressionTest.class.getResourceAsStream(TEST_DATA_RESOURCE)) {
      assertWithMessage("test resource not found: %s", TEST_DATA_RESOURCE)
          .that(data).isNotNull();
      Tests.Builder tests = RegressionTestProto.Tests.newBuilder();
      TextFormat.merge(new InputStreamReader(data, StandardCharsets.UTF_8), tests);
      // buildPartial() performs no extra validation beyond what parsing already did.
      return tests.buildPartial();
    }
  }

  // Asserts that the matcher returns the expected result for every generated test number.
  private static void testExpectedMatch(String testName, DigitSequenceMatcher matcher,
      Multimap<Result, DigitSequence> numbers) {
    for (Result expectedResult : Result.values()) {
      for (DigitSequence s : numbers.get(expectedResult)) {
        Result result = matcher.match(new Sequence(s));
        assertWithMessage("FAILED [%s]: Sequence %s", testName, s)
            .that(result).isEqualTo(expectedResult);
      }
    }
  }

  // Asserts that the matcher returns an unexpected result for at least one generated test number
  // (used for test cases which are marked as "should fail").
  private static void testExpectedFailure(String testName, DigitSequenceMatcher matcher,
      Multimap<Result, DigitSequence> numbers) {
    for (Result expectedResult : Result.values()) {
      for (DigitSequence s : numbers.get(expectedResult)) {
        Result result = matcher.match(new Sequence(s));
        if (result != expectedResult) {
          return;
        }
      }
    }
    assertWithMessage("FAILED [%s]: Expected at least one failure", testName).fail();
  }

  // Magic number: DigitSequences cannot be longer than 18 digits at the moment, so a check is
  // needed to prevent us trying to make longer-than-allowed sequences in tests. This only
  // happens in the case of a terminal node, since non-terminal paths must be < 17 digits long.
  // If the allowed digits increases, this value can be modified or left as-is.
  private static final int MAX_SEQUENCE_LENGTH = 18;

  // Trivial adapter from the metadata DigitSequence to the matcher's lightweight sequence.
  private static final class Sequence implements DigitSequenceMatcher.DigitSequence {
    private final DigitSequence seq;
    private int index = 0;

    Sequence(DigitSequence seq) {
      this.seq = seq;
    }

    @Override
    public boolean hasNext() {
      return index < seq.length();
    }

    @Override
    public int next() {
      return seq.getDigit(index++);
    }
  }

  // Returns a RangeTree for the list of RangeSpecification strings.
  static RangeTree ranges(List<String> specs) {
    return RangeTree.from(specs.stream().map(RangeSpecification::parse).collect(toImmutableList()));
  }

  // Builds a map of numbers for the given RangeTree to test every branching point in the DFA.
  // All path combinations are generated exactly once to give coverage. This does use pseudo
  // random numbers to pick random digits from masks, but it should not be flaky. If it _ever_
  // fails then it implies a serious problem with the matcher compiler or matcher implementation.
  private static Multimap<Result, DigitSequence> buildTestNumbers(RangeTree ranges) {
    SetMultimap<Result, DigitSequence> numbers =
        MultimapBuilder.enumKeys(Result.class).treeSetValues().build();
    Set<DfaNode> visited = new HashSet<>();
    ranges.accept(new Visitor(RangeSpecification.empty(), numbers, visited));
    return numbers;
  }

  /**
   * Visitor to generate a targeted set of test numbers from a range tree DFA, which should
   * exercise every instruction in the corresponding matcher data. These numbers should ensure
   * that every "branch" (including early terminations) is taken at least once. Where digits
   * should be equivalent (i.e. both x & y have the same effect) they are chosen randomly, since
   * otherwise you would need to generate billions of numbers to cover every possible combination.
   */
  private static final class Visitor implements DfaVisitor {
    private final RangeSpecification sourcePath;
    private final SetMultimap<Result, DigitSequence> numbers;
    private final Set<DfaNode> visited;
    // Union of the digit masks of all edges visited from the node this visitor was applied to.
    private int outEdgesMask = 0;

    Visitor(RangeSpecification sourcePath,
        SetMultimap<Result, DigitSequence> numbers,
        Set<DfaNode> visited) {
      this.sourcePath = sourcePath;
      this.numbers = numbers;
      this.visited = visited;
    }

    @Override
    public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
      // Record the current outgoing edge mask.
      int mask = edge.getDigitMask();
      outEdgesMask |= mask;
      // Get the current path and add a test number for it.
      RangeSpecification path = sourcePath.extendByMask(mask);
      numbers.put(target.canTerminate() ? Result.MATCHED : Result.TOO_SHORT, sequenceIn(path));
      // Avoid recursing into nodes we've already visited. This avoids generating many (hundreds)
      // of test numbers for nodes which are reachable in many ways (via many path prefixes). This
      // is an optional check and could be removed, but for testing larger ranges it seems to make
      // a difference in test time. DFA node/instruction coverage should be unaffected by this.
      if (visited.contains(target)) {
        return;
      }
      visited.add(target);
      // Recurse into the next level with a new visitor starting from our path (it's okay to visit
      // the terminal node here since it does nothing and leaves the out edges mask zero).
      Visitor childVisitor = new Visitor(path, numbers, visited);
      target.accept(childVisitor);
      // After recursion, find out which of our target's out-edges cannot be reached.
      int unreachableMask = ~childVisitor.outEdgesMask & ALL_DIGITS_MASK;
      if (unreachableMask != 0 && path.length() < MAX_SEQUENCE_LENGTH) {
        // Create a path which cannot be reached directly from our target node. If this is the
        // terminal node then we create a path that's too long, otherwise it's just invalid.
        Result expected = target.equals(RangeTree.getTerminal()) ? Result.TOO_LONG : Result.INVALID;
        numbers.put(expected, sequenceIn(path.extendByMask(unreachableMask)));
      }
    }
  }

  // Returns a pseudo randomly chosen sequence from the given path.
  private static DigitSequence sequenceIn(RangeSpecification path) {
    DigitSequence seq = DigitSequence.empty();
    for (int n = 0; n < path.length(); n++) {
      int mask = path.getBitmask(n);
      // A random number M in [0..BitCount), not the bit itself.
      // E.g. mask = 0011010011 ==> (0 <= maskBit < 5) (allowed digits are {0,1,4,6,7})
      int maskBit = (int) (bitCount(mask) * Math.random());
      // Mask out the M lower bits which come before the randomly selected one.
      // E.g. maskBit = 3 ==> mask = 0011000000 (3 lower bits cleared)
      while (maskBit > 0) {
        mask &= ~lowestOneBit(mask);
        maskBit--;
      }
      // Extend the sequence by the digit value of the randomly selected bit.
      // E.g. mask = 0011000000 ==> digit = 6 (randomly chosen from the allowed digits).
      seq = seq.extendBy(numberOfTrailingZeros(mask));
    }
    return seq;
  }

  // Combines multiple ByteStrings into a single byte[] (we allow splitting in the regression test
  // file for readability).
  private static byte[] combine(List<ByteString> bytes) {
    int size = bytes.stream().mapToInt(ByteString::size).sum();
    byte[] out = new byte[size];
    int offset = 0;
    for (ByteString b : bytes) {
      b.copyTo(out, offset);
      offset += b.size();
    }
    return out;
  }

  // Returns the index of the first difference, or -1 if the byte arrays are the same. If one
  // array is a strict prefix of the other, the length of the shorter array is returned.
  private static int indexOfDiff(byte[] a, byte[] b) {
    int length = Math.min(a.length, b.length);
    for (int n = 0; n < length; n++) {
      if (a[n] != b[n]) {
        return n;
      }
    }
    return (a.length == length && b.length == length) ? -1 : length;
  }

  // Formats a subset of the bytes as a human readable snippet using C-style hex escaping (which
  // is compatible with the regression test data). An ellipsis marks omitted leading/trailing data.
  private static String formatPbSnippet(byte[] bytes, int start, int length) {
    StringBuilder out = new StringBuilder();
    if (start > 0) {
      out.append("...");
    }
    appendBytes(out, bytes, start, length);
    if (start + length < bytes.length) {
      out.append("...");
    }
    return out.toString();
  }

  // Writes bytes such that they can be cut & pasted into a regression test file as new golden data.
  private static void writeGoldenPbOutput(byte[] bytes, PrintWriter errors) {
    errors.println("Golden Data:");
    StringBuilder out = new StringBuilder();
    for (int start = 0; start < bytes.length; start += 20) {
      errors.format("  expected: \"%s\"\n", appendBytes(out, bytes, start, 20));
      // Reuse the same builder for each 20 byte line.
      out.setLength(0);
    }
  }

  // Appends a set of bytes in C-style hex format (e.g. \xHH). At most 'length' bytes are
  // appended, and fewer if the end of the array is reached first.
  private static StringBuilder appendBytes(StringBuilder out, byte[] bytes, int start, int length) {
    int end = Math.min(start + length, bytes.length);
    for (int n = start; n < end; n++) {
      out.append(String.format("\\x%02x", bytes[n] & 0xFF));
    }
    return out;
  }
}

+ 144
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompilerTest.java View File

@ -0,0 +1,144 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.primitives.Bytes.asList;
import static com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler.compile;
import com.google.common.truth.Truth;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode;
import java.util.Arrays;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
// Tests the exact bytes emitted by MatcherCompiler.compile() for small, hand-written ranges.
@RunWith(JUnit4.class)
public class MatcherCompilerTest {
// The zero byte which ends every exact byte sequence asserted via assertCompile() in this class.
private static final Byte TERMINATOR = (byte) 0;
// Literal digits are expected to compile to one "single" instruction per digit.
@Test public void testSingleOperation() {
byte digit0 = single(0);
byte digit5 = single(5);
byte digit9 = single(9);
assertCompile(ranges("0"), digit0, TERMINATOR);
assertCompile(ranges("5"), digit5, TERMINATOR);
assertCompile(ranges("9"), digit9, TERMINATOR);
assertCompile(ranges("0559"), digit0, digit5, digit5, digit9, TERMINATOR);
// Bit 4 is expected to be set on the instruction reached after "05", the point at which the
// shorter of the two ranges is complete.
byte digit5Terminating = (byte) (digit5 | (1 << 4));
assertCompile(ranges("05", "0559"),
digit0, digit5, digit5Terminating, digit9, TERMINATOR);
}
// Runs of 'x' are expected to compile to "any" instructions holding a repeat count of up to 16;
// the 17 digit case below is expected to need a second instruction.
@Test public void testAnyOperation() {
byte anyDigit = any(1);
byte anyDigit16Times = any(16);
assertCompile(ranges("x"), anyDigit, TERMINATOR);
assertCompile(ranges("xxxx_xxxx_xxxx_xxxx"), anyDigit16Times, TERMINATOR);
assertCompile(ranges("xxxx_xxxx_xxxx_xxxx_x"),
anyDigit16Times, anyDigit, TERMINATOR);
// As for single digits, bit 4 is expected on the instruction reached once the shorter range
// is complete.
byte anyDigitTerminating = (byte) (anyDigit | (1 << 4));
assertCompile(ranges("x", "xx"), anyDigit, anyDigitTerminating, TERMINATOR);
assertCompile(ranges("xxxx_xxxx_xxxx_xxxx", "xxxx_xxxx_xxxx_xxxx_x"),
anyDigit16Times, anyDigitTerminating, TERMINATOR);
}
// Digit sets are expected to compile to a 2-byte instruction, written high byte first.
@Test public void testRangeOperation() {
int range09 = range(0, 9);
int range123 = range(1, 2, 3);
int range789 = range(7, 8, 9);
assertCompile(ranges("[09]"), hi(range09), lo(range09), TERMINATOR);
assertCompile(ranges("[123][789]"),
hi(range123), lo(range123), hi(range789), lo(range789), TERMINATOR);
}
@Test public void testMapOperation() {
// Force all 10 possible branches to be taken.
byte[] data = compile(ranges("00", "11", "22", "33", "44", "55", "66", "77", "88", "99"));
// Check only the first 4 bytes for exact values.
Assert.assertEquals(
asList((byte) 0x95, (byte) 0x31, (byte) 0xF5, (byte) 0x9D),
asList(data).subList(0, 4));
// Each branch should jump to a 2 byte sequence between 10 and 28 bytes away (inclusive).
List<Byte> jumpTable = asList(data).subList(4, 14);
List<Byte> remainder = asList(data).subList(14, data.length);
// TODO: Now that ordering should be consistent, tighten up this test to ensure
// consistency and remove the shorter consistency test below.
for (byte jump : new byte[] {0xA, 0xC, 0xE, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C}) {
Assert.assertTrue(jumpTable.contains(jump));
// The position of the offset within the jump table is the digit expected at its target.
int index = jumpTable.indexOf(jump);
// Subtract the length of the jump table to get relative offset in remaining code.
jump = (byte) (jump - 10);
// Each jump should end in 2 single-byte instructions (match corresponding digit, terminate).
Assert.assertEquals(single(index), remainder.get(jump));
Assert.assertEquals(TERMINATOR, remainder.get(jump + 1));
}
}
@Test public void testConsistentSorting() {
// Ensure that the MatcherCompiler output is consistent, otherwise it can result in a
// non-deterministic build, because the generated file changes with each execution.
byte[] expected = new byte[] {-128, 0, 0, 29, 3, 5, 7, 32, 0, 33, 0, 34, 0};
assertCompile(ranges("00", "11", "22"), expected);
}
/** Returns the 1-byte instruction representing matching a single digit once. */
private static Byte single(int value) {
checkArgument(value >= 0 && value < 10);
// Opcode in the top 3 bits, digit value in the low bits.
return (byte) ((OpCode.SINGLE.ordinal() << 5) | value);
}
/** Returns the 1-byte instruction representing matching any digit a specified number of times. */
private static Byte any(int count) {
checkArgument(count > 0 && count <= 16);
// The count is stored minus one, so that 1..16 fits in the low 4 bits.
return (byte) ((OpCode.ANY.ordinal() << 5) | (count - 1));
}
/** Returns the 2-byte instruction representing matching a range of digits. */
private static int range(int... digits) {
int mask = 0;
for (int d : digits) {
checkArgument(0 <= d && d <= 9);
// Bit N of the mask is set if digit N is allowed.
mask |= 1 << d;
}
// Opcode in the top 3 bits of the 16-bit value, digit mask in the low 10 bits.
return (OpCode.RANGE.ordinal() << 13) | mask;
}
/** Returns the high (first) byte of a 16-bit instruction. */
private static Byte hi(int shortInstruction) {
return (byte) (shortInstruction >> 8);
}
/** Returns the low (second) byte of a 16-bit instruction. */
private static Byte lo(int shortInstruction) {
return (byte) (shortInstruction & 0xFF);
}
/** Asserts that compiling the given ranges produces exactly the expected bytes. */
private void assertCompile(RangeTree dfa, byte... expected) {
Truth.assertThat(compile(dfa)).isEqualTo(expected);
}
/** Builds a RangeTree from the given range specification strings. */
private static RangeTree ranges(String... lines) {
return RangeTree.from(Arrays.stream(lines).map(RangeSpecification::parse));
}
}

+ 60
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/OperationTest.java View File

@ -0,0 +1,60 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
import static com.google.common.primitives.Bytes.asList;
import com.google.common.collect.ImmutableList;
import com.google.common.io.ByteArrayDataOutput;
import com.google.common.io.ByteStreams;
import junit.framework.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class OperationTest {
@Test public void testWriteJumpTableNoExtraBranches() {
ByteArrayDataOutput outBytes = ByteStreams.newDataOutput();
Operation.writeJumpTable(outBytes, ImmutableList.of(0x10, 0x80, 0xFC), Statistics.NO_OP);
// The jump table size is added to the offsets.
Assert.assertEquals(
asList(new byte[] {(byte) 0x13, (byte) 0x83, (byte) 0xFF}),
asList(outBytes.toByteArray()));
}
// An easy way to reason about what the offsets for the branches should be is to consider
// that the last branch must always have the original offset (it jumps from the very end of
// the jump table, which is exactly what the original offset specified. The branch before it
// is the same except that it must jump over the final branch (ie, +2 bytes) and so on.
// Direct offsets are relative to the start of the jump table however and must be adjusted.
@Test public void testWriteJumpTableExtraBranches() {
ByteArrayDataOutput outBytes = ByteStreams.newDataOutput();
// Two extra branches needed (0x200 and 0xF7). Worst case adjustment is 9 bytes.
// Total adjustment is 7 bytes (jump table size + 2 * branch)
Operation.writeJumpTable(outBytes, ImmutableList.of(0xF7, 0xF6, 0x200), Statistics.NO_OP);
Assert.assertEquals(asList(new byte[] {
// Jump table: (offset-to-branch, direct-adjusted-offset, offset-to-branch)
(byte) 0x03, (byte) 0xFD, (byte) 0x05,
// Extra branch: offset = 0xF7 + 2 (jumps over last branch)
(byte) 0x10, (byte) 0xF9,
// Extra branch: offset = 0x200 (last branch always has original offset)
(byte) 0x12, (byte) 0x00}),
asList(outBytes.toByteArray()));
}
}

+ 295
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/regression_test_data.textpb View File

@ -0,0 +1,295 @@
# Copyright (C) 2017 The Libphonenumber Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ---- Manually crafted "unit" tests ----
test_case {
name: "Simple Range"
range: "1234xxx"
# 4 single byte, single value instructions: 0x20 + value
# 1 single byte, "ANY" instruction: 0x40 + (count-1)
expected: "\x21\x22\x23\x24\x42\x00"
}
test_case {
# NOTE: When the ANY instruction is marked as terminating, it applies when the instruction is
# reached, not after it's executed (i.e. \x50... is "(\d...)?", and not "\d(...)?").
# Match 3 x ANY (0x42), then "terminate or ANY" (0x50), then 2 x ANY
name: "Variable Any Match #1"
range: "1xxx"
range: "1xxxxxx"
expected: "\x21\x42\x50\x41\x00"
}
test_case {
name: "Variable Any Match #2"
range: "1xxx"
range: "1xxxx"
range: "1xxxxx"
range: "1xxxxxx"
# A repeated terminating ANY match applies on every repeat, not just the first time.
# Match 3 x ANY (0x42 = \d{3}), then 3 x "terminate or ANY" (0x52 = \d{0,3}).
expected: "\x21\x42\x52\x00"
}
test_case {
name: "Overflow Any Match"
range: "xxxxxxxxxxxxxxxxxx"
# 18 'any' digits can't fit in one instruction, so write 2 separate opcodes to match 16 (0x4F)
# and then 2 (0x41). This will almost never occur since DigitSequence is limited to 18 digits.
expected: "\x4F\x41\x00"
}
test_case {
name: "Range Matching"
range: "[0-4]12"
# First 2 bytes are a "branch" operation (opcode = 0x60 plus mask), but there are no offsets
# after it (since one "branch" is just to continue matching, while the other is failure).
expected: "\x60\x1F\x21\x22\x00"
}
test_case {
name: "Range Matching (2-way branch)"
# Requires a 2-way branch in the DFA where both paths cover all input digits [0-9].
range: "[0-4]12"
range: "[5-9]34"
# First 2 bytes are a 2-way branch operation (opcode = 0x68 plus mask), then 2 jump offsets
# from the end of the branch instruction.
expected: "\x68\x1F\x02\x05\x21\x22\x00\x23\x24\x00"
}
# ---- Deliberate failure cases ----
test_case {
name: "Modified Single Match Bytecode"
should_fail: true
range: "123xxxx"
range: "123xxxxx"
range: "123xxxxxx"
# Expected bytes have been tweaked to accept 4 (\x24), rather than 3 (\x23).
expected: "\x21\x22\x24\x43\x51\x00"
}
test_case {
name: "Modified Range Bytecode"
should_fail: true
range: "1[2-5]xxxx"
# Expected bytes have been tweaked to accept [7-9] (\x63\x80), rather than [2-5] (\x60\x3C)
expected: "\x21\x63\x80\x43\x00"
}
test_case {
name: "Modified Any Match Bytecode"
should_fail: true
range: "1xxxx"
# Expected bytes have been tweaked to accept xxx (\x42), rather than xxxx (\x43)
expected: "\x21\x42\x00"
}
# ---- Auto-generated "stress tests" ----
test_case {
name: "GB Mobile"
range: "7[1-3]xxxxxxxx"
range: "74[0-46-9]xxxxxxx"
range: "745[0-689]xxxxxx"
range: "7457[0-57-9]xxxxx"
range: "750[0-8]xxxxxx"
range: "75[13-9]xxxxxxx"
range: "752[0-35-9]xxxxxx"
range: "7700[01]xxxxx"
range: "770[1-9]xxxxxx"
range: "77[1-7]xxxxxxx"
range: "778[02-9]xxxxxx"
range: "779[0-689]xxxxxx"
range: "78[014-9]xxxxxxx"
range: "78[23][0-8]xxxxxx"
range: "79[024-9]xxxxxxx"
range: "791[02-9]xxxxxx"
range: "7911[028]xxxxx"
range: "793[0-689]xxxxxx"
# Not much insight here - other than it starts by matching a '7' and terminates in one place
# after matching "any digit" 5 times (which is the shortest trailing match in the ranges).
expected: "\x27\x8c\xa8\x1a\x2a\x06\x09\x0d\x14\x1c\x20\x40\x10\x1e\x6b\xdf\x1c\x1f\x84\x44"
expected: "\x92\x5d\x1d\x16\x21\x88\x64\x92\x55\x1d\x0f\x21\x24\x6b\xf3\x09\x10\x82\x22\x49"
expected: "\x6d\x03\x1b\x18\x40\x10\x19\x6b\x7f\x17\x19\x61\xff\x10\x11\x63\xef\x0e\x68\x01"
expected: "\x11\x0c\x63\xfd\x07\x63\x7f\x04\x6b\xfd\x02\x0a\x40\x08\x63\xbf\x05\x60\x03\x02"
expected: "\x61\x05\x44\x00"
}
test_case {
name: "India Fixed Line"
range: "11[2-7]xxxxxxx"
range: "12[0-249][2-7]xxxxxx"
range: "12[35-8]x[2-7]xxxxx"
range: "13[0-25][2-7]xxxxxx"
range: "13[346-9]x[2-7]xxxxx"
range: "14[145][2-7]xxxxxx"
range: "14[236-9]x[2-7]xxxxx"
range: "1[59][0235-9]x[2-7]xxxxx"
range: "1[59][14][2-7]xxxxxx"
range: "16[014][2-7]xxxxxx"
range: "16[235-9]x[2-7]xxxxx"
range: "17[1257][2-7]xxxxxx"
range: "17[34689]x[2-7]xxxxx"
range: "18[01346][2-7]xxxxxx"
range: "18[257-9]x[2-7]xxxxx"
range: "2[02][2-7]xxxxxxx"
range: "21[134689]x[2-7]xxxxx"
range: "21[257][2-7]xxxxxx"
range: "23[013][2-7]xxxxxx"
range: "23[24-8]x[2-7]xxxxx"
range: "24[01][2-7]xxxxxx"
range: "24[2-8]x[2-7]xxxxx"
range: "25[0137][2-7]xxxxxx"
range: "25[25689]x[2-7]xxxxx"
range: "26[0158][2-7]xxxxxx"
range: "26[2-4679]x[2-7]xxxxx"
range: "27[13-79]x[2-7]xxxxx"
range: "278[2-7]xxxxxx"
range: "28[1568][2-7]xxxxxx"
range: "28[2-479]x[2-7]xxxxx"
range: "29[14][2-7]xxxxxx"
range: "29[235-9]x[2-7]xxxxx"
range: "301x[2-7]xxxxx"
range: "31[79]x[2-7]xxxxx"
range: "32[1-5]x[2-7]xxxxx"
range: "326[2-7]xxxxxx"
range: "33[2-7]xxxxxxx"
range: "34[13][2-7]xxxxxx"
range: "342[0189][2-7]xxxxx"
range: "342[2-7]xxxxxx"
range: "34[5-8]x[2-7]xxxxx"
range: "35[125689]x[2-7]xxxxx"
range: "35[34][2-7]xxxxxx"
range: "36[01489][2-7]xxxxxx"
range: "36[235-7]x[2-7]xxxxx"
range: "37[02-46][2-7]xxxxxx"
range: "37[157-9]x[2-7]xxxxx"
range: "38[159][2-7]xxxxxx"
range: "38[2-467]x[2-7]xxxxx"
range: "4[04][2-7]xxxxxxx"
range: "41[14578]x[2-7]xxxxx"
range: "41[36][2-7]xxxxxx"
range: "42[1-47][2-7]xxxxxx"
range: "42[5689]x[2-7]xxxxx"
range: "43[15][2-7]xxxxxx"
range: "43[2-467]x[2-7]xxxxx"
range: "45[12][2-7]xxxxxx"
range: "45[4-7]x[2-7]xxxxx"
range: "46[0-26-9][2-7]xxxxxx"
range: "46[35]x[2-7]xxxxx"
range: "47[0-24-9][2-7]xxxxxx"
range: "473x[2-7]xxxxx"
range: "48[013-57][2-7]xxxxxx"
range: "48[2689]x[2-7]xxxxx"
range: "49[014-7][2-7]xxxxxx"
range: "49[2389]x[2-7]xxxxx"
range: "51[025][2-7]xxxxxx"
range: "51[146-9]x[2-7]xxxxx"
range: "52[14-8]x[2-7]xxxxx"
range: "522[2-7]xxxxxx"
range: "53[1346]x[2-7]xxxxx"
range: "53[25][2-7]xxxxxx"
range: "54[14-69]x[2-7]xxxxx"
range: "54[28][2-7]xxxxxx"
range: "55[12][2-7]xxxxxx"
range: "55[46]x[2-7]xxxxx"
range: "56[146-9]x[2-7]xxxxx"
range: "56[25][2-7]xxxxxx"
range: "571[2-7]xxxxxx"
range: "57[2-4]x[2-7]xxxxx"
range: "581[2-7]xxxxxx"
range: "58[2-8]x[2-7]xxxxx"
range: "59[15][2-7]xxxxxx"
range: "59[246]x[2-7]xxxxx"
range: "61[1358]x[2-7]xxxxx"
range: "612[2-7]xxxxxx"
range: "621[2-7]xxxxxx"
range: "62[2457]x[2-7]xxxxx"
range: "631[2-7]xxxxxx"
range: "63[2-4]x[2-7]xxxxx"
range: "641[2-7]xxxxxx"
range: "64[235-7]x[2-7]xxxxx"
range: "65[17][2-7]xxxxxx"
range: "65[2-689]x[2-7]xxxxx"
range: "66[13][2-7]xxxxxx"
range: "66[24578]x[2-7]xxxxx"
range: "671[2-7]xxxxxx"
range: "67[235689]x[2-7]xxxxx"
range: "674[0189][2-7]xxxxx"
range: "674[2-7]xxxxxx"
range: "680[2-7]xxxxxx"
range: "68[1-6]x[2-7]xxxxx"
range: "71[013-9]x[2-7]xxxxx"
range: "712[2-7]xxxxxx"
range: "72[0235-9]x[2-7]xxxxx"
range: "72[14][2-7]xxxxxx"
range: "73[134][2-7]xxxxxx"
range: "73[2679]x[2-7]xxxxx"
range: "74[1-35689]x[2-7]xxxxx"
range: "74[47][2-7]xxxxxx"
range: "75[15][2-7]xxxxxx"
range: "75[2-46-9]x[2-7]xxxxx"
range: "7[67][02-9]x[2-7]xxxxx"
range: "7[67]1[2-7]xxxxxx"
range: "78[013-7]x[2-7]xxxxx"
range: "782[0-6][2-7]xxxxx"
range: "788[0189][2-7]xxxxx"
range: "788[2-7]xxxxxx"
range: "79[0189]x[2-7]xxxxx"
range: "79[2-7]xxxxxxx"
range: "80[2-467]xxxxxxx"
range: "81[1357-9]x[2-7]xxxxx"
range: "816[2-7]xxxxxx"
range: "82[014][2-7]xxxxxx"
range: "82[235-8]x[2-7]xxxxx"
range: "83[03-57-9]x[2-7]xxxxx"
range: "83[126][2-7]xxxxxx"
range: "84[0-24-9]x[2-7]xxxxx"
range: "85xx[2-7]xxxxx"
range: "86[136][2-7]xxxxxx"
range: "86[2457-9]x[2-7]xxxxx"
range: "87[078][2-7]xxxxxx"
range: "87[1-6]x[2-7]xxxxx"
range: "88[1256]x[2-7]xxxxx"
range: "88[34][2-7]xxxxxx"
range: "891[2-7]xxxxxx"
range: "89[2-4]x[2-7]xxxxx"
expected: "\x81\x0f\xac\x72\x08\x1e\x3b\x58\xad\xcc\x75\x8d\x8b\x0f\xac\x72\xdc\xec\xf4\x08"
expected: "\x0a\x0c\x0e\x10\x10\xf2\x10\xfa\x11\x00\x11\x06\x11\x0e\x93\x0f\xac\x6d\xc6\x09"
expected: "\x0b\x0d\x0f\x11\x13\x15\x17\x11\x07\x11\x0f\x11\x17\x11\x1f\x11\x27\x11\x2d\x11"
expected: "\x35\x11\x3d\x81\x31\xf5\x9d\x09\x0b\x0d\xa9\x0f\x11\x13\x15\x17\x12\x27\x12\x28"
expected: "\x11\x34\x11\x38\x11\x3d\x11\x41\x11\x43\x11\x45\x93\x0f\xa9\x9d\x8c\x09\x0b\x0d"
expected: "\x0f\x11\x13\x15\x17\x11\x3c\x11\x40\x11\x44\x11\x48\x11\x4c\x11\x50\x11\x52\x11"
expected: "\x54\x90\xed\xac\x72\x08\x99\x0a\x0c\x0e\x10\x12\x73\x11\xab\x11\xad\x11\xb1\x11"
expected: "\xb5\x11\xb9\x11\xdd\x95\x31\xf5\x9d\x63\x0a\x0c\x0e\x10\x12\x14\x16\x18\x1a\x11"
expected: "\xab\x11\xaf\x11\xb3\x11\xd4\x11\xd5\x11\xb1\x11\xb5\x11\xb9\x11\x44\x93\x0f\xac"
expected: "\x72\x09\x0b\x0d\x0f\x11\x13\x15\x17\x19\x11\x11\x11\x15\x11\x19\x11\x1d\x11\x21"
expected: "\x11\x25\x11\x29\x11\x2d\x11\x31\x81\x0f\xac\x72\x08\x0a\x0c\x0e\x10\x12\x14\x16"
expected: "\x11\x29\x11\x2d\x11\x13\x11\x2f\x11\x33\x11\x37\x11\x3b\x11\x40\x60\xfc\x11\x90"
expected: "\x6b\x03\x02\x04\x11\x93\x11\x88\x60\xdc\x11\x84\x6a\x17\x02\x04\x11\x80\x11\x85"
expected: "\x68\x27\x02\x04\x11\x78\x11\x7d\x84\x44\x89\x52\x02\x04\x11\x6e\x11\x73\x6b\xed"
expected: "\x02\x04\x11\x6d\x11\x64\x68\x13\x02\x04\x11\x5e\x11\x63\x84\x42\x8a\x4a\x02\x04"
expected: "\x11\x54\x11\x59\x68\x5b\x02\x04\x11\x4c\x11\x51\x82\x24\x51\x32\x02\x04\x11\x49"
expected: "\x11\x40\x80\x44\x92\x33\x02\x04\x11\x38\x11\x3d\x80\x44\x92\x53\x02\x04\x11\x2e"
expected: "\x11\x33\x84\x42\x90\x33\x02\x04\x11\x24\x11\x29\x69\x23\x02\x04\x11\x1c\x11\x21"
expected: "\x82\x42\x49\x22\x02\x04\x11\x19\x11\x10\x84\x24\x4a\x52\x02\x04\x11\x08\x11\x0d"
expected: "\x84\x44\x91\x52\x02\x04\x10\xfe\x11\x03\x80\x00\x89\x2a\xff\xf8\x80\x66\xd8\x32"
expected: "\xf2\xf5\xf9\x82\x20\x4a\x4a\xf2\xeb\x6b\x13\xe7\xee\x68\x5d\xe3\xea\x82\x04\x8a"
expected: "\x52\xdd\xe4\x80\x22\x89\x42\xde\xd7\x84\x42\x91\x2a\xd1\xd8\x80\x04\x8a\x52\xcb"
expected: "\xd2\x80\x04\x92\x0a\xc5\xcc\x82\x22\x50\x4b\xbf\xc6\x6b\xf7\xbb\xc2\x68\xbb\xb7"
expected: "\xbe\x68\xf3\xb3\xba\x84\x44\x8a\x0d\xad\xb4\x80\x22\x49\x12\xae\xa7\x80\x00\x51"
expected: "\x32\xa8\xa1\x82\x40\x49\x12\xa2\x9b\x80\x00\x82\x0a\x95\x9c\x82\x22\x51\x12\x96"
expected: "\x8f\x80\x00\x02\x52\x89\x90\x80\x44\x92\x52\x83\x8a\x80\x00\x8a\x12\x7d\x84\x80"
expected: "\x20\x08\x32\x7e\x77\x80\x04\x12\x12\x71\x78\x80\x04\x90\x52\x6b\x72\x84\x42\x92"
expected: "\x52\x65\x6c\x80\x44\x12\x32\x5f\x66\x84\x40\x93\x52\x59\x60\x5c\x80\x00\x92\x55"
expected: "\x52\x59\x6b\xfb\x55\x4e\x84\x04\x81\x32\x48\x4f\x82\x24\x4a\x2a\x49\x42\x84\x44"
expected: "\x8a\x52\x3c\x43\x6b\xfd\x3f\x38\x82\x22\x88\x22\x39\x32\x80\x44\x91\x53\x2c\x33"
expected: "\x6b\xb9\x2f\x28\x84\x44\x52\x32\x22\x29\x80\x22\x92\x55\x1c\x23\x80\x00\x4a\x4a"
expected: "\x1d\x16\x80\x62\x49\x33\x17\x19\x13\x21\x10\x11\x62\x80\x0e\x63\xf7\x0b\x40\x09"
expected: "\x40\x0c\x60\xfc\x09\x6b\x03\x09\x07\x40\x05\x60\x7f\x02\x40\x02\x60\xfc\x44\x00"
}

+ 106
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/AnyPathTest.java View File

@ -0,0 +1,106 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.EMPTY;
import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.OPTIONAL;
import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.SINGLE;
import com.google.common.collect.ImmutableSortedSet;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/**
 * Tests for {@link AnyPath}. In the comments below, bit masks are written in binary, where a set
 * bit N means that a sequence of N "any digit" edges is accepted.
 */
@RunWith(JUnit4.class)
public class AnyPathTest {
  @Test
  public void testConstants() {
    assertPath(EMPTY, 0);
    assertPath(SINGLE, 1);
    assertPath(OPTIONAL, 0, 1);
  }

  @Test
  public void testExtend() {
    assertThat(EMPTY.extend(false)).isEqualTo(SINGLE);
    assertThat(EMPTY.extend(true)).isEqualTo(OPTIONAL);
    // A non-optional extension gives the same result as joining with SINGLE.
    assertPath(SINGLE.extend(false), 2);
    // An optional extension is NOT the same as SINGLE.join(OPTIONAL).
    assertPath(SINGLE.extend(true), 0, 2);
    // 100 extends to 1000, or to 1001 if the extension is optional.
    AnyPath exactlyTwo = AnyPath.of(0x4);
    assertPath(exactlyTwo.extend(false), 3);
    assertPath(exactlyTwo.extend(true), 0, 3);
  }

  @Test
  public void testJoin() {
    assertThat(EMPTY.join(SINGLE)).isEqualTo(SINGLE);
    assertThat(EMPTY.join(OPTIONAL)).isEqualTo(OPTIONAL);
    assertPath(SINGLE.join(SINGLE), 2);
    assertPath(SINGLE.join(OPTIONAL), 1, 2);
    assertPath(OPTIONAL.join(OPTIONAL), 0, 1, 2);

    // "(x(x)?)?" == 111 and matches 0 to 2.
    // Joining it with itself gives "(x(x(x(x)?)?)?)?" == 11111, which matches 0 to 4.
    assertThat(AnyPath.of(0x7).join(AnyPath.of(0x7))).isEqualTo(AnyPath.of(0x1F));

    // "xx(x)?" == 1100 and matches 2 or 3.
    // "(xx)?" == 0101 and matches 0 or 2.
    // Joined they give "xx(x(x(x)?)?)?" == 111100, which matches 2 to 5.
    assertThat(AnyPath.of(0xC).join(AnyPath.of(0x5))).isEqualTo(AnyPath.of(0x3C));
  }

  @Test
  public void testMakeOptional() {
    assertThat(OPTIONAL.makeOptional()).isEqualTo(OPTIONAL);
    assertThat(SINGLE.makeOptional()).isEqualTo(OPTIONAL);
    assertPath(AnyPath.of(0x4).makeOptional(), 0, 2);
  }

  @Test
  public void testToString() {
    assertThat(SINGLE.toString()).isEqualTo("x");
    assertThat(OPTIONAL.toString()).isEqualTo("(x)?");
    // 1000 = exactly 3 digits.
    assertThat(AnyPath.of(0x8).toString()).isEqualTo("xxx");
    // 1010 = 1 or 3 digits.
    assertThat(AnyPath.of(0xA).toString()).isEqualTo("x(xx)?");
    // 1111 = 0 to 3 digits.
    assertThat(AnyPath.of(0xF).toString()).isEqualTo("(x(x(x)?)?)?");
  }

  // Ordering matters because the shortest path has to be found at certain times.
  @Test
  public void testOrdering() {
    assertThat(SINGLE).isGreaterThan(EMPTY);
    assertThat(OPTIONAL).isGreaterThan(SINGLE);
    assertThat(AnyPath.of(0x8)).isGreaterThan(AnyPath.of(0x4));
    // Equal maximum length, so the second longest accepted length acts as the tie break. Since
    // set bits are lengths, this strategy coincides exactly with numeric comparison of the masks.
    assertThat(AnyPath.of(0xA)).isGreaterThan(AnyPath.of(0x9));
  }

  /**
   * Asserts that {@code path} has the largest given length as its maximum length and that, for
   * every length from zero up to that maximum, it accepts the length if and only if it is listed.
   */
  private static void assertPath(AnyPath path, Integer... acceptedLengths) {
    ImmutableSortedSet<Integer> expected = ImmutableSortedSet.copyOf(acceptedLengths);
    int longest = expected.last();
    assertThat(path.maxLength()).isEqualTo(longest);
    for (int length = 0; length <= longest; length++) {
      assertThat(path.acceptsLength(length)).isEqualTo(expected.contains(length));
    }
  }
}

+ 224
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeTest.java View File

@ -0,0 +1,224 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
import static org.junit.Assert.fail;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import com.google.i18n.phonenumbers.metadata.regex.Edge.Visitor;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class EdgeTest {
@Test
public void testSimple() {
assertThat(Edge.fromMask(0x6).getDigitMask()).isEqualTo(0x6);
assertThat(Edge.fromMask(0x6).isOptional()).isFalse();
assertThat(Edge.fromMask(0x3).toString()).isEqualTo("[01]"); // 0000000011
assertThat(Edge.fromMask(0x300).toString()).isEqualTo("[89]"); // 1100000000
assertThat(Edge.fromMask(0x1FE).toString()).isEqualTo("[1-8]"); // 0111111110
assertThat(Edge.fromMask(ALL_DIGITS_MASK).toString()).isEqualTo("x"); // any digit
}
@Test
public void testAny() {
assertThat(Edge.fromMask(ALL_DIGITS_MASK)).isEqualTo(Edge.any());
assertThat(Edge.any().optional()).isEqualTo(Edge.optionalAny());
assertThat(Edge.any().toString()).isEqualTo("x");
// Unlike AnyPath, simple edges are not sequences, so don't need parens for optional.
assertThat(Edge.optionalAny().toString()).isEqualTo("x?");
}
@Test
public void testEpsilon() {
// Epsilon isn't optional, it represents a path that non-optionally accepts no input.
assertThat(Edge.epsilon().isOptional()).isFalse();
assertThat(Edge.epsilon().toString()).isEqualTo("e");
}
@Test
public void testConcatenation() {
Edge concatenated = Edge.concatenation(Edge.fromMask(0x3), Edge.any());
assertThat(concatenated.toString()).isEqualTo("[01]x");
TestingVisitor v = new TestingVisitor() {
@Override
public void visitSequence(List<Edge> edges) {
assertThat(edges).containsExactly(Edge.fromMask(0x3), Edge.any()).inOrder();
wasTested = true;
}
};
concatenated.accept(v);
assertThat(v.wasTested).isTrue();
}
@Test
public void testGroup() {
Edge group = Edge.disjunction(ImmutableSet.of(Edge.fromMask(0x3), Edge.any()));
TestingVisitor v = new TestingVisitor() {
@Override
public void visitGroup(Set<Edge> edges, boolean isOptional) {
assertThat(edges).containsExactly(Edge.any(), Edge.fromMask(0x3)).inOrder();
assertThat(isOptional).isFalse();
wasTested = true;
}
};
group.accept(v);
assertThat(group.toString()).isEqualTo("(x|[01])");
assertThat(v.wasTested).isTrue();
}
@Test
public void testOptionalGroup() {
Edge group = Edge.disjunction(ImmutableSet.of(Edge.fromMask(0x3), Edge.epsilon(), Edge.any()));
TestingVisitor v = new TestingVisitor() {
@Override
public void visitGroup(Set<Edge> edges, boolean isOptional) {
// Reordered and epsilon removed.
assertThat(edges).containsExactly(Edge.any(), Edge.fromMask(0x3)).inOrder();
assertThat(isOptional).isTrue();
wasTested = true;
}
};
group.accept(v);
assertThat(group.toString()).isEqualTo("(x|[01])?");
assertThat(v.wasTested).isTrue();
}
@Test
public void testOrdering() {
// Testing ordering is important because when generating regular expressions, the edge order
// defines a lot about the visual order of the final regular expression. This order should be
// as close to "what a person would consider reasonable" as possible. In fact some of the cases
// tested here will never occur in real situations (e.g. sequences compared with groups)
// because of the way composite edges are created. However it seems sensible to test the
// behaviour nevertheless.
// Simple Edges
assertSameOrder(e("0"), e("0"));
// "0" < "1" - lowest bit set wins
assertOrdered(e("0"), e("1"));
// "[01]" < "1" - lowest bit set wins
assertOrdered(e("[01]"), e("1"));
// "x" < "9" - lowest bit set wins
assertOrdered(X, e("9"));
// Sequences
// ("0x" < "1") and ("0" < "1x") - first edge in sequence is compared to single edge.
assertOrdered(seq(e("0"), X), e("1"));
assertOrdered(e("0"), seq(e("1"), X));
// "[01]" < "[01]x" - single edges are "smaller" than sequences of edges if all else is equal.
assertOrdered(e("[01]"), seq(e("[01]"), X));
// "[01]x" == "[01]x"
assertSameOrder(seq(e("[01]"), X), seq(e("[01]"), X));
// "x1" < "x2" - comparing 2 sequences compares all edges.
assertOrdered(seq(X, e("1")), seq(X, e("2")));
// "[01]x" < "[01]xx" - shortest sequence wins in tie break (similar to how "[01]" < "[01]x")
assertOrdered(seq(e("[01]"), X), seq(e("[01]"), X, X));
// Disjunctions
// "(1|2)" == "(2|1)" - edges are sorted when creating disjunctions
assertSameOrder(or(e("1"), e("2")), or(e("2"), e("1")));
// "(1|2|3)" < "(1|2|4)" - comparing 2 disjunctions compares all edges.
assertOrdered(or(e("1"), e("2"), e("3")), or(e("1"), e("2"), e("4")));
// "(1|2)" < "(1|2|3)" - shortest sequence wins in tie break
assertOrdered(or(e("1"), e("2")), or(e("1"), e("2"), e("3")));
// Miscellaneous
// "1" < "(1|2)" - if first edge matches, single edges sort before groups.
assertOrdered(e("1"), or(e("1"), e("2")));
// "(1|x)" < "1x" - because "(1|x)" is actually "(x|1)" and "x" < "1".
assertOrdered(or(e("1"), X), seq(e("1"), X));
}
private static void assertSameOrder(Edge lhs, Edge rhs) {
assertThat(lhs).isEquivalentAccordingToCompareTo(rhs);
assertThat(lhs).isEqualTo(rhs);
}
private static void assertOrdered(Edge lhs, Edge rhs) {
assertThat(lhs).isNotEqualTo(rhs);
assertThat(lhs).isLessThan(rhs);
assertThat(rhs).isGreaterThan(lhs);
}
// A bit like a mock, but not really "mocking" existing behaviour.
private static class TestingVisitor implements Visitor {
// Set this in overridden method(s).
protected boolean wasTested = false;
@Override
public void visit(SimpleEdge edge) {
fail("unexpected call");
}
@Override
public void visitSequence(List<Edge> edges) {
fail("unexpected call");
}
@Override
public void visitGroup(Set<Edge> edges, boolean isOptional) {
fail("unexpected call");
}
}
// The 'any digit' edge.
private static final Edge X = e("x");
// Creates a simple edge from a range specification string for testing.
private static SimpleEdge e(String s) {
RangeSpecification spec = RangeSpecification.parse(s);
Preconditions.checkArgument(spec.length() == 1, "only specify single digit ranges");
return SimpleEdge.fromMask(spec.getBitmask(0));
}
// Creates sequence of edges (wrapping for convenience).
private static Edge seq(Edge first, Edge second, Edge... rest) {
// This already rejects epsilon edges.
Edge edge = Edge.concatenation(first, second);
for (Edge e : rest) {
edge = Edge.concatenation(edge, e);
}
return edge;
}
// Creates a non-optional disjunction of edges.
private static Edge or(Edge... edges) {
List<Edge> e = Arrays.asList(edges);
Preconditions.checkArgument(!e.contains(Edge.epsilon()), "use 'opt()' for optional groups");
return Edge.disjunction(e);
}
}

+ 154
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriterTest.java View File

@ -0,0 +1,154 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import com.google.common.base.Preconditions;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
/**
 * Tests for {@link EdgeWriter}.
 *
 * <p>This code is also tested very thoroughly by any "round-tripping" of regular expressions in
 * the metadata (i.e. generating regular expressions from DFAs and then re-parsing them to ensure
 * that the same DFA is produced). That is part of any acceptance test for generating regular
 * expressions and is a far more comprehensive stress test. The tests here are therefore limited
 * to simpler cases and to highlighting interesting behaviour.
 */
@RunWith(JUnit4.class)
public class EdgeWriterTest {
  // The 'any digit' edge.
  private static final Edge X = e("x");

  @Test
  public void testSimple() {
    assertThat(regex(e("0"))).isEqualTo("0");
    assertThat(regex(e("[0-7]"))).isEqualTo("[0-7]");
    assertThat(regex(e("[0-9]"))).isEqualTo("\\d");
    assertThat(regex(X)).isEqualTo("\\d");
  }

  @Test
  public void testSequences() {
    assertThat(regex(seq(e("0"), e("1"), e("2")))).isEqualTo("012");
  }

  @Test
  public void testGroups() {
    // A non-optional group spanning the top level needs no parentheses.
    assertThat(regex(or(e("0"), e("1"), e("2")))).isEqualTo("0|1|2");
    // An optional group always needs parentheses.
    assertThat(regex(opt(e("0"), e("1"), e("2")))).isEqualTo("(?:0|1|2)?");
    // As soon as a group has a prefix or suffix, parentheses are needed.
    Edge groupWithSuffix = seq(or(e("0"), e("1")), e("2"));
    assertThat(regex(groupWithSuffix)).isEqualTo("(?:0|1)2");
  }

  @Test
  public void testNesting() {
    // Basic nesting is handled by a very straightforward edge visitor, so a single non-trivial
    // test covers all the basic cases ("any digit" sequences are a different matter however).
    Edge nested =
        seq(
            e("0"),
            or(
                e("1"),
                seq(
                    e("2"),
                    opt(e("3"), e("4")))),
            e("5"), e("6"));
    assertThat(regex(nested)).isEqualTo("0(?:1|2(?:3|4)?)56");
  }

  @Test
  public void testAnyDigitSequences() {
    // This is the complex part of efficient regular expression generation.
    assertThat(regex(seq(e("0"), e("1"), X))).isEqualTo("01\\d");
    // "\d\d" is shorter than "\d{2}".
    assertThat(regex(seq(X, X))).isEqualTo("\\d\\d");
    assertThat(regex(seq(X, X, X))).isEqualTo("\\d{3}");
    // Top level optional groups are supported.
    assertThat(regex(opt(seq(X, X)))).isEqualTo("(?:\\d{2})?");
    // Optional parts go at the end.
    Edge optionalFirst = seq(opt(seq(X, X)), X, X);
    assertThat(regex(optionalFirst)).isEqualTo("\\d\\d(?:\\d{2})?");

    // "(x(x(x)?)?)?"
    Edge upToThree =
        opt(seq(
            X,
            opt(seq(
                X,
                opt(X)))));
    // A group on its own and a group inside a sequence are handled separately by the writer, so
    // both cases must be tested.
    assertThat(regex(upToThree)).isEqualTo("\\d{0,3}");
    assertThat(regex(seq(e("1"), e("2"), upToThree))).isEqualTo("12\\d{0,3}");
    // "xx(x(x(x)?)?)?"
    assertThat(regex(seq(X, X, upToThree))).isEqualTo("\\d{2,5}");
    // Combining "any digit" groups produces the minimal representation.
    assertThat(regex(seq(upToThree, upToThree))).isEqualTo("\\d{0,6}");
  }

  /** Calls the standard version of the regex generator (not using 'dot' for matching). */
  private String regex(Edge edge) {
    return EdgeWriter.toRegex(edge, false /* use dot match */);
  }

  /** Builds a simple edge from a single digit range specification string such as "[0-7]". */
  private static SimpleEdge e(String rangeSpec) {
    RangeSpecification spec = RangeSpecification.parse(rangeSpec);
    Preconditions.checkArgument(spec.length() == 1, "only specify single digit ranges");
    return SimpleEdge.fromMask(spec.getBitmask(0));
  }

  /** Concatenates the given edges, in order, into a sequence (a convenience wrapper). */
  private static Edge seq(Edge first, Edge second, Edge... rest) {
    // Concatenation already rejects epsilon edges.
    Edge sequence = Edge.concatenation(first, second);
    for (Edge next : rest) {
      sequence = Edge.concatenation(sequence, next);
    }
    return sequence;
  }

  /** Builds a non-optional disjunction of the given edges (epsilon is not permitted). */
  private static Edge or(Edge... edges) {
    List<Edge> alternatives = Arrays.asList(edges);
    Preconditions.checkArgument(
        !alternatives.contains(Edge.epsilon()), "use 'opt()' for optional groups");
    return Edge.disjunction(alternatives);
  }

  /** Builds an optional disjunction by adding an epsilon edge to the given (non-epsilon) edges. */
  private static Edge opt(Edge... edges) {
    List<Edge> alternatives = new ArrayList<>(Arrays.asList(edges));
    Preconditions.checkArgument(
        !alternatives.contains(Edge.epsilon()), "don't pass epsilon directly");
    alternatives.add(Edge.epsilon());
    return Edge.disjunction(alternatives);
  }
}

+ 98
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaBuilder.java View File

@ -0,0 +1,98 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL;
import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL;
import com.google.common.graph.MutableValueGraph;
import com.google.common.graph.ValueGraph;
import com.google.common.graph.ValueGraphBuilder;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
/**
 * Simple fluent API for constructing graphs for testing.
 *
 * <p>The graph always contains the {@code INITIAL} and {@code TERMINAL} nodes. Every other node
 * is created on demand by calling {@code createNext()} on the most recently created node
 * (starting from {@code TERMINAL}).
 */
final class NfaBuilder {
  private final MutableValueGraph<Node, SimpleEdge> graph =
      ValueGraphBuilder.directed().allowsSelfLoops(false).build();
  // The last node added to the graph.
  private Node lastNode;

  /** Creates a new mutable NFA graph. */
  public NfaBuilder() {
    graph.addNode(INITIAL);
    graph.addNode(TERMINAL);
    lastNode = TERMINAL;
  }

  /**
   * Returns an unmodifiable view of the underlying graph (not a snapshot). If the builder is
   * modified after this method is called, it will affect what was returned.
   */
  public ValueGraph<Node, SimpleEdge> graph() {
    return graph;
  }

  /**
   * Adds a new path from the given source node, returning the newly created target node.
   *
   * @param source the node the path starts from
   * @param path a range specification string; one new node and one edge are added per digit
   * @return the last node created for the path
   */
  public Node addPath(Node source, String path) {
    RangeSpecification spec = RangeSpecification.parse(path);
    for (int n = 0; n < spec.length(); n++) {
      lastNode = lastNode.createNext();
      addEdge(source, lastNode, SimpleEdge.fromMask(spec.getBitmask(n)));
      source = lastNode;
    }
    return lastNode;
  }

  /**
   * Adds a new path between the given source and target (all intermediate nodes are new).
   *
   * @param source the node the path starts from
   * @param target the node the final edge of the path leads to
   * @param path a range specification string; its last digit becomes the edge into {@code target}
   */
  public void addPath(Node source, Node target, String path) {
    RangeSpecification spec = RangeSpecification.parse(path);
    // All digits except the last one lead to freshly created intermediate nodes.
    for (int n = 0; n < spec.length() - 1; n++) {
      lastNode = lastNode.createNext();
      addEdge(source, lastNode, SimpleEdge.fromMask(spec.getBitmask(n)));
      source = lastNode;
    }
    addEdge(source, target, SimpleEdge.fromMask(spec.getBitmask(spec.length() - 1)));
  }

  /**
   * Adds a new path between the given source and target nodes, along with an epsilon edge from the
   * source to the target.
   */
  public void addOptionalPath(Node source, Node target, String path) {
    addPath(source, target, path);
    addEpsilon(source, target);
  }

  /**
   * Adds an epsilon edge between two existing nodes. If a non-optional, non-epsilon edge already
   * connects them, that edge is made optional instead.
   *
   * @throws IllegalArgumentException if either node is missing from the graph, or if the existing
   *     edge is already an epsilon or already optional
   */
  private void addEpsilon(Node s, Node t) {
    checkArgument(graph.nodes().contains(s), "missing source node");
    // Validate the target (the original re-checked the source here, so it was never validated).
    checkArgument(graph.nodes().contains(t), "missing target node");
    // The previous edge value (or null if there was none) is returned when the edge is put.
    SimpleEdge e = graph.putEdgeValue(s, t, Edge.epsilon());
    if (e != null) {
      // Edge already exists; if not an epsilon, make it optional.
      checkArgument(!e.equals(Edge.epsilon()) && !e.isOptional(), "epsilon already added");
      graph.putEdgeValue(s, t, e.optional());
    }
  }

  /**
   * Adds both nodes to the graph and connects them with the given edge.
   *
   * @throws IllegalArgumentException if an edge from {@code s} to {@code t} already exists
   */
  private void addEdge(Node s, Node t, SimpleEdge e) {
    graph.addNode(s);
    graph.addNode(t);
    checkArgument(graph.putEdgeValue(s, t, e) == null, "edge already exists");
  }
}

+ 229
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattenerTest.java View File

@ -0,0 +1,229 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL;
import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL;
import com.google.common.base.Preconditions;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.TreeSet;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class NfaFlattenerTest {
  // Edge which accepts any single digit.
  private static final Edge X = e("x");

  /** One path flattens to a plain sequence; adding a parallel path yields a disjunction. */
  @Test
  public void testSimple() {
    NfaBuilder builder = new NfaBuilder();
    builder.addPath(INITIAL, TERMINAL, "12");
    Edge flattened = NfaFlattener.flatten(builder.graph());
    Edge expected = seq(e("1"), e("2"));
    assertThat(flattened).isEqualTo(expected);
    assertThat(flattened.toString()).isEqualTo("12");

    builder.addPath(INITIAL, TERMINAL, "34");
    flattened = NfaFlattener.flatten(builder.graph());
    expected = or(seq(e("1"), e("2")), seq(e("3"), e("4")));
    assertThat(flattened).isEqualTo(expected);
    assertThat(flattened.toString()).isEqualTo("(12|34)");
  }

  /** A split/join in the middle of the graph becomes a nested group in the flattened edge. */
  @Test
  public void testSubgroup() {
    NfaBuilder builder = new NfaBuilder();
    Node splitPoint = builder.addPath(INITIAL, "12");
    Node joinPoint = builder.addPath(splitPoint, "34");
    builder.addPath(splitPoint, joinPoint, "56");
    builder.addPath(joinPoint, TERMINAL, "78");

    Edge flattened = NfaFlattener.flatten(builder.graph());

    Edge group = or(seq(e("3"), e("4")), seq(e("5"), e("6")));
    Edge expected = seq(e("1"), e("2"), group, e("7"), e("8"));
    assertThat(flattened).isEqualTo(expected);
    assertThat(flattened.toString()).isEqualTo("12(34|56)78");
  }

  /** Paths which rejoin the main path at successively later points nest inside each other. */
  @Test
  public void testSubgroupWithEarlyJoining() {
    NfaBuilder builder = new NfaBuilder();
    // Create a graph with 4 initial paths branching out which collapses to 3, 2 and then 1.
    Node groupStart = builder.addPath(INITIAL, "0");
    // Use 2 edges to reach the first join point (with only one edge it would clash with the
    // joining edge, which goes directly from groupStart to firstJoin).
    Node afterOne = builder.addPath(groupStart, "1");
    Node firstJoin = builder.addPath(afterOne, "2");
    builder.addPath(groupStart, firstJoin, "3");
    Node secondJoin = builder.addPath(firstJoin, "4");
    builder.addPath(groupStart, secondJoin, "5");
    Node groupEnd = builder.addPath(secondJoin, "6");
    builder.addPath(groupStart, groupEnd, "7");
    builder.addPath(groupEnd, TERMINAL, "8");

    Edge flattened = NfaFlattener.flatten(builder.graph());

    // Build the expected nesting from the innermost group outwards.
    Edge innermost = or(seq(e("1"), e("2")), e("3"));
    Edge middle = or(seq(innermost, e("4")), e("5"));
    Edge outermost = or(seq(middle, e("6")), e("7"));
    Edge expected = seq(e("0"), outermost, e("8"));
    assertThat(flattened).isEqualTo(expected);
    assertThat(flattened.toString()).isEqualTo("0(((12|3)4|5)6|7)8");
  }

  /** A path crossing between two branches can only be flattened by duplicating an edge. */
  @Test
  public void testPathDuplication() {
    NfaBuilder builder = new NfaBuilder();
    Node groupStart = builder.addPath(INITIAL, "0");
    Node lhsMid = builder.addPath(groupStart, "1");
    Node groupEnd = builder.addPath(lhsMid, "2");
    Node rhsMid = builder.addPath(groupStart, "3");
    builder.addPath(rhsMid, groupEnd, "4");
    builder.addPath(groupEnd, TERMINAL, "5");
    // So far this is a normal nestable graph:
    //            ,--1-->()--2--v
    // (I)--0-->()              ()--5-->(T)
    //            `--3-->()--4--^
    Edge flattened = NfaFlattener.flatten(builder.graph());
    Edge nestable = seq(e("0"), or(seq(e("1"), e("2")), seq(e("3"), e("4"))), e("5"));
    assertThat(flattened).isEqualTo(nestable);
    assertThat(flattened.toString()).isEqualTo("0(12|34)5");

    // This new path "crosses" the group, creating a non-nestable structure which can only be
    // resolved by duplicating some path (in this case it's the 2nd part of the right-hand-side).
    builder.addPath(lhsMid, rhsMid, "x");
    flattened = NfaFlattener.flatten(builder.graph());
    Edge lhsBranch = seq(e("1"), or(e("2"), seq(X, e("4"))));
    Edge rhsBranch = seq(e("3"), e("4"));
    Edge crossed = seq(e("0"), or(lhsBranch, rhsBranch), e("5"));
    assertThat(flattened).isEqualTo(crossed);
    // Note the duplication of the '4' to make the graph nestable.
    assertThat(flattened.toString()).isEqualTo("0(1(x4|2)|34)5");
  }

  @Test
  public void testNodeOrdering_bug_65250963() {
    //   ,--->(C)----------.
    //   |                 v
    //  (I)-->(D)-->(B)-->(T)
    //   |           ^
    //   `--->(A)----'
    NfaBuilder builder = new NfaBuilder();
    // IMPORTANT: Order of insertion determines the node IDs (A=1, B=2...). The edge index just
    // happens to match node ID for readability, but doesn't affect the test directly.
    Node a = builder.addPath(INITIAL, "1");
    Node b = builder.addPath(a, "2");
    Node c = builder.addPath(INITIAL, "3");
    Node d = builder.addPath(INITIAL, "4");
    // Now join up remaining paths.
    builder.addPath(d, b, "5");
    builder.addPath(b, TERMINAL, "6");
    builder.addPath(c, TERMINAL, "7");

    Comparator<Node> ordering = NfaFlattener.nodeOrdering(builder.graph());

    // In the old ordering code, because (B) and (D) are not reachable to/from (C) we would have
    // had the ordering (D < B), (B < C), (C < D) giving a cycle. In the new code, the longest path
    // length to reach (C) is less than (B), so we get (C < B) and we no longer have a cycle.
    // The node ordering is now: (INITIAL, A, C, D, B, TERMINAL)
    TreeSet<Node> sorted = new TreeSet<>(ordering);
    sorted.addAll(Arrays.asList(INITIAL, TERMINAL, a, b, c, d));
    assertThat(sorted).containsExactly(INITIAL, a, c, d, b, TERMINAL).inOrder();
  }

  @Test
  public void testOptionalTopLevelGroup_bug_69101586() {
    //   ,--->(e)----.
    //   |           v
    //  (I)-->(A)-->(T)
    NfaBuilder builder = new NfaBuilder();
    builder.addOptionalPath(INITIAL, TERMINAL, "xx");

    Edge flattened = NfaFlattener.flatten(builder.graph());

    Edge expected = opt(seq(X, X));
    assertThat(flattened).isEqualTo(expected);
    assertThat(flattened.toString()).isEqualTo("(xx)?");
  }

  // Test helper: turns a one-digit range specification string into a simple edge.
  private static SimpleEdge e(String s) {
    RangeSpecification digit = RangeSpecification.parse(s);
    Preconditions.checkArgument(digit.length() == 1, "only specify single digit ranges");
    return SimpleEdge.fromMask(digit.getBitmask(0));
  }

  // Test helper: concatenates two or more edges, left to right.
  private static Edge seq(Edge first, Edge second, Edge... rest) {
    // The initial concatenation already rejects epsilon edges.
    Edge result = Edge.concatenation(first, second);
    for (int i = 0; i < rest.length; i++) {
      result = Edge.concatenation(result, rest[i]);
    }
    return result;
  }

  // Test helper: disjunction of the given edges plus an epsilon, i.e. an optional group.
  private static Edge opt(Edge... edges) {
    List<Edge> alternatives = new ArrayList<>(Arrays.asList(edges));
    boolean hasEpsilon = alternatives.contains(Edge.epsilon());
    Preconditions.checkArgument(!hasEpsilon, "don't pass epsilon directly");
    alternatives.add(Edge.epsilon());
    return Edge.disjunction(alternatives);
  }

  // Test helper: disjunction of the given edges with no epsilon, i.e. a non-optional group.
  private static Edge or(Edge... edges) {
    List<Edge> alternatives = Arrays.asList(edges);
    boolean hasEpsilon = alternatives.contains(Edge.epsilon());
    Preconditions.checkArgument(!hasEpsilon, "use 'opt()' for optional groups");
    return Edge.disjunction(alternatives);
  }
}

+ 68
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NodeTest.java View File

@ -0,0 +1,68 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL;
import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class NodeTest {
  /** The two well-known nodes have fixed IDs and are distinct from each other. */
  @Test
  public void testConstants() {
    assertThat(INITIAL.id()).isEqualTo(0);
    assertThat(TERMINAL.id()).isEqualTo(1);
    assertThat(TERMINAL).isNotEqualTo(INITIAL);
  }

  /** Each call to createNext() yields the node whose ID is one greater. */
  @Test
  public void testNext() {
    assertThat(INITIAL.createNext()).isSameInstanceAs(TERMINAL);
    assertThat(TERMINAL.createNext()).isNotEqualTo(TERMINAL);
    assertThat(TERMINAL.createNext().id()).isEqualTo(2);

    Node current = INITIAL;
    int expectedId = 0;
    while (expectedId < 10) {
      assertThat(current.id()).isEqualTo(expectedId);
      current = current.createNext();
      expectedId++;
    }
  }

  /** A node's string form is simply its decimal ID. */
  @Test
  public void testToString() {
    Node current = INITIAL;
    int expectedId = 0;
    while (expectedId < 10) {
      assertThat(current.toString()).isEqualTo(Integer.toString(expectedId));
      current = current.createNext();
      expectedId++;
    }
  }

  // Consistent ordering helps ensure regular expressions derived from graphs are deterministic.
  @Test
  public void testOrdering() {
    assertThat(TERMINAL).isGreaterThan(INITIAL);

    Node previous = INITIAL;
    int remaining = 10;
    while (remaining > 0) {
      Node following = previous.createNext();
      assertThat(following).isGreaterThan(previous);
      previous = following;
      remaining--;
    }
  }
}

+ 154
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverterTest.java View File

@ -0,0 +1,154 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.regex.Edge.any;
import static com.google.i18n.phonenumbers.metadata.regex.Edge.epsilon;
import static com.google.i18n.phonenumbers.metadata.regex.Edge.optionalAny;
import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL;
import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL;
import com.google.common.collect.Iterables;
import com.google.common.graph.ValueGraph;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import java.util.List;
import java.util.stream.Stream;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class RangeTreeConverterTest {
  // Simple 4 node DFA.
  // (I) --1--> ( ) --2--> ( ) --x--> (T)
  @Test
  public void testSimple() {
    RangeTree dfa = RangeTree.from(specs("12x"));
    ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(dfa);
    assertThat(graph.nodes()).hasSize(4);

    Node afterOne = assertOutEdge(graph, INITIAL, edge(1));
    Node afterTwo = assertOutEdge(graph, afterOne, edge(2));
    Node end = assertOutEdge(graph, afterTwo, any());
    assertThat(end).isEqualTo(TERMINAL);
  }

  // Simple 4 node DFA with optional termination immediately before the terminal node.
  // (I) --1--> ( ) --2--> (T) --x--> (T)
  @Test
  public void testWithOptionalEdge() {
    RangeTree dfa = RangeTree.from(specs("12x", "12"));
    ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(dfa);
    assertThat(graph.nodes()).hasSize(4);

    Node afterOne = assertOutEdge(graph, INITIAL, edge(1));
    Node afterTwo = assertOutEdge(graph, afterOne, edge(2));
    // The early termination is folded into the final edge by making it optional.
    Node end = assertOutEdge(graph, afterTwo, optionalAny());
    assertThat(end).isEqualTo(TERMINAL);
  }

  // Simple 4 node DFA with optional termination.
  // (I) --1--> (T) --2--> ( ) --x--> (T)
  @Test
  public void testWithEpsilon() {
    RangeTree dfa = RangeTree.from(specs("12x", "1"));
    ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(dfa);
    assertThat(graph.nodes()).hasSize(4);

    Node afterOne = assertOutEdge(graph, INITIAL, edge(1));
    assertOutEdges(graph, afterOne, edge(2), epsilon());
    // Following the epsilon edge should reach the terminal directly.
    assertThat(follow(graph, afterOne, epsilon())).isEqualTo(Node.TERMINAL);
    // The other out edge is the normal path, which reaches the terminal via one more edge.
    Node afterTwo = follow(graph, afterOne, edge(2));
    Node end = follow(graph, afterTwo, any());
    assertThat(end).isEqualTo(TERMINAL);
  }

  // Simple 5 node DFA with 2 paths.
  // (I) --1--> ( ) --2--> ( ) --x--> (T)
  //  `---3--> ( ) --4----^
  @Test
  public void testMultiplePathsWithCommonTail() {
    RangeTree dfa = RangeTree.from(specs("12x", "34x"));
    ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(dfa);
    assertThat(graph.nodes()).hasSize(5);
    assertOutEdges(graph, INITIAL, edge(1), edge(3));

    Node lhsJoin = assertOutEdge(graph, follow(graph, INITIAL, edge(1)), edge(2));
    Node rhsJoin = assertOutEdge(graph, follow(graph, INITIAL, edge(3)), edge(4));
    // Both branches must arrive at the same node before the shared tail.
    assertThat(lhsJoin).isEqualTo(rhsJoin);

    Node end = assertOutEdge(graph, lhsJoin, any());
    assertThat(end).isEqualTo(TERMINAL);
  }

  @Test
  public void testOptionalTopLevelGroup_bug_69101586() {
    // Requires making a top level optional group, which is (deliberately) not easy with the
    // DFA tooling since it's pretty rare. This is a DFA which can terminate immediately and will
    // match the empty input (as well as its normal input).
    RangeTree twoDigits = RangeTree.from(specs("xx"));
    RangeTree emptyInput = RangeTree.from(RangeSpecification.empty());
    RangeTree dfa = twoDigits.union(emptyInput);

    ValueGraph<Node, SimpleEdge> graph = RangeTreeConverter.toNfaGraph(dfa);
    assertThat(graph.nodes()).hasSize(3);
    assertThat(follow(graph, INITIAL, epsilon())).isEqualTo(Node.TERMINAL);

    Node afterFirst = follow(graph, INITIAL, any());
    Node end = assertOutEdge(graph, afterFirst, any());
    assertThat(end).isEqualTo(TERMINAL);
  }

  // Returns the simple edge matching exactly this one digit value.
  SimpleEdge edge(int n) {
    int singleDigitMask = 1 << n;
    return SimpleEdge.fromMask(singleDigitMask);
  }

  // Parses each string as a range specification, preserving the given order.
  List<RangeSpecification> specs(String... s) {
    return Stream.of(s)
        .map(spec -> RangeSpecification.parse(spec))
        .collect(toImmutableList());
  }

  // Asserts that a node has only one out edge and returns that edge's target.
  Node assertOutEdge(ValueGraph<Node, SimpleEdge> nfa, Node node, SimpleEdge edge) {
    assertThat(nfa.successors(node)).hasSize(1);
    Node onlyTarget = Iterables.getOnlyElement(nfa.successors(node));
    SimpleEdge actualEdge = nfa.edgeValue(node, onlyTarget).get();
    assertThat(actualEdge).isEqualTo(edge);
    return onlyTarget;
  }

  // Asserts that a node has all the given edges.
  void assertOutEdges(ValueGraph<Node, SimpleEdge> nfa, Node node, SimpleEdge... edges) {
    assertThat(nfa.successors(node)).hasSize(edges.length);
    List<Edge> outEdges = nfa.successors(node).stream()
        .map(target -> nfa.edgeValue(node, target).get())
        .collect(toImmutableList());
    assertThat(outEdges).containsExactlyElementsIn(edges);
  }

  // Follows the given edge from a node (which must be in the graph), returning the target node
  // (or null if the edge does not exist in the graph).
  Node follow(ValueGraph<Node, SimpleEdge> nfa, Node node, SimpleEdge edge) {
    for (Node target : nfa.successors(node)) {
      if (nfa.edgeValue(node, target).get().equals(edge)) {
        return target;
      }
    }
    return null;
  }
}

+ 107
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatterTest.java View File

@ -0,0 +1,107 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.FORCE_CAPTURING_GROUPS;
import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.FORCE_NON_CAPTURING_GROUPS;
import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.PRESERVE_CAPTURING_GROUPS;
import com.google.common.base.Joiner;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class RegexFormatterTest {
  // Luckily the formatter cares only about 3 special characters, '(', '|' and ')', so we only need
  // to test a few very straightforward cases to cover everything.

  /** Input without any groups is returned unchanged. */
  @Test
  public void testSimple() {
    String formatted = RegexFormatter.format("abcd", PRESERVE_CAPTURING_GROUPS);
    assertThat(formatted).isEqualTo("abcd");
  }

  /** A single group is split over lines, whether it is capturing or non-capturing. */
  @Test
  public void testNested() {
    String capturing = RegexFormatter.format("ab(cd|ef)gh", PRESERVE_CAPTURING_GROUPS);
    assertThat(capturing).isEqualTo(lines(
        "ab(",
        " cd|",
        " ef",
        ")gh"));

    String nonCapturing = RegexFormatter.format("ab(?:cd|ef)gh", PRESERVE_CAPTURING_GROUPS);
    assertThat(nonCapturing).isEqualTo(lines(
        "ab(?:",
        " cd|",
        " ef",
        ")gh"));
  }

  /** A group inside a group is split over lines in the same way as its parent. */
  @Test
  public void testDoubleNested() {
    String capturing = RegexFormatter.format("ab(cd(ef|gh)|ij)", PRESERVE_CAPTURING_GROUPS);
    assertThat(capturing).isEqualTo(lines(
        "ab(",
        " cd(",
        " ef|",
        " gh",
        " )|",
        " ij",
        ")"));

    String mixed = RegexFormatter.format("ab(cd(?:ef|gh)|ij)", PRESERVE_CAPTURING_GROUPS);
    assertThat(mixed).isEqualTo(lines(
        "ab(",
        " cd(?:",
        " ef|",
        " gh",
        " )|",
        " ij",
        ")"));
  }

  /** With FORCE_NON_CAPTURING_GROUPS every group in the output is written as "(?:". */
  @Test
  public void testForceNonCapturingGroups() {
    String formatted = RegexFormatter.format("ab(?:cd(ef|gh)|ij)", FORCE_NON_CAPTURING_GROUPS);
    String expected = lines(
        "ab(?:",
        " cd(?:",
        " ef|",
        " gh",
        " )|",
        " ij",
        ")");
    assertThat(formatted).isEqualTo(expected);
  }

  /** With FORCE_CAPTURING_GROUPS every group in the output is written as "(". */
  @Test
  public void testForceCapturingGroups() {
    String formatted = RegexFormatter.format("ab(?:cd(ef|gh)|ij)", FORCE_CAPTURING_GROUPS);
    String expected = lines(
        "ab(",
        " cd(",
        " ef|",
        " gh",
        " )|",
        " ij",
        ")");
    assertThat(formatted).isEqualTo(expected);
  }

  // Joins the given strings with newlines (no trailing newline).
  private static String lines(String... s) {
    Joiner newlineJoiner = Joiner.on('\n');
    return newlineJoiner.join(s);
  }
}

+ 197
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexGeneratorTest.java View File

@ -0,0 +1,197 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.regex.RegexGenerator.basic;
import static java.util.stream.Collectors.joining;
import com.google.common.collect.ImmutableList;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class RegexGeneratorTest {
@Test
public void testSimple() {
assertRegex(basic(), ranges("123xxx"), "123\\d{3}");
// This could be improved to "..." rather than ".{3}" saving 1 char, probably not worth it.
assertRegex(basic().withDotMatch(), ranges("123xxx"), "123.{3}");
}
@Test
public void testVariableLength() {
assertRegex(basic(), ranges("123xxx", "123xxxx", "123xxxxx", "123xxxxxx"), "123\\d{3,6}");
}
@Test
public void testTailOptimization() {
RangeTree dfa = ranges("123xxx", "123xxxx", "145xxx");
assertRegex(basic(), dfa, "1(?:23\\d{3,4}|45\\d{3})");
assertRegex(basic().withTailOptimization(), dfa, "1(?:23\\d?|45)\\d{3}");
}
@Test
public void testDfaFactorization() {
// Essentially create a "thin" wedge of specific non-determinism with the shorter (5-digit)
// numbers which prevents the larger ranges from being contiguous in the DFA.
RangeTree dfa = ranges("1234x", "1256x", "[0-4]xxxxxx", "[0-4]xxxxxxx");
assertRegex(basic(), dfa,
"[02-4]\\d{6,7}|",
"1(?:[013-9]\\d{5,6}|",
"2(?:[0-246-9]\\d{4,5}|",
"3(?:[0-35-9]\\d{3,4}|4\\d(?:\\d{2,3})?)|",
"5(?:[0-57-9]\\d{3,4}|6\\d(?:\\d{2,3})?)))");
assertRegex(basic().withDfaFactorization(), dfa, "[0-4]\\d{6,7}|12(?:34|56)\\d");
}
@Test
public void testSubgroupOptimization() {
// The subgraph of "everything except 95, 96 and 100" (this appears in China leading digits).
RangeTree postgraph = ranges("[02-8]", "1[1-9]", "10[1-9]", "9[0-47-9]");
RangeTree pregraph = ranges("123", "234", "345", "456", "567");
// Cross product of pre and post paths.
RangeTree subgraph = RangeTree.from(
pregraph.asRangeSpecifications().stream()
.flatMap(a -> postgraph.asRangeSpecifications().stream().map(a::extendBy)));
// Union in other paths to trigger repetition in the "basic" case.
RangeTree rest = ranges("128xx", "238xx", "348xx", "458xx", "568xx");
RangeTree dfa = rest.union(subgraph);
assertRegex(basic(), dfa,
"12(?:3(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|",
"23(?:4(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|",
"34(?:5(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|",
"45(?:6(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|",
"56(?:7(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)");
assertRegex(basic().withSubgroupOptimization(), dfa,
"(?:12|23|34|45|56)8\\d\\d|",
"(?:123|234|345|456|567)(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])");
}
@Test
public void testRegression_bug_65250963() {
RangeTree dfa = ranges(
"1387",
"1697",
"1524",
"1539",
"1768",
"1946");
assertRegex(basic(), dfa,
"1(?:",
" (?:",
" 38|",
" 69",
" )7|",
" 5(?:",
" 24|",
" 39",
" )|",
" 768|",
" 946",
")");
}
@Test
public void testRegression_bug_68929642() {
assertMatches(
"1\\d{6}(?:\\d{2})?",
ImmutableList.of("1234567", "123456789"),
ImmutableList.of("12345678"),
"1xxx_xxx", "1xx_xxx_xxx");
assertMatches(
"1\\d{6}[0-7]?",
ImmutableList.of("1234567", "12345670"),
ImmutableList.of("123456", "123456700"),
"1xxx_xxx", "1x_xxx_xx[0-7]");
assertMatches(
"\\d\\d?",
ImmutableList.of("1", "12"),
ImmutableList.of("", "123"),
"x", "xx");
assertMatches(
"\\d{1,3}",
ImmutableList.of("1", "12", "123"),
ImmutableList.of("", "1234"),
"x", "xx", "xxx");
assertMatches(
"\\d(?:\\d{3}(?:\\d{2})?)?",
ImmutableList.of("1", "1234", "123456"),
ImmutableList.of("", "12", "123", "12345", "1234567"),
"x", "xxxx", "xxx_xxx");
assertMatches(
"(?:\\d\\d(?:\\d(?:\\d{2,4})?)?)?",
ImmutableList.of("", "12", "123", "12345", "123456", "1234567"),
ImmutableList.of("1", "1234", "12345678"),
"", "xx", "xxx", "xx_xxx", "xxx_xxx", "xxxx_xxx");
assertMatches(
"(?:\\d{2})?",
ImmutableList.of("", "12"),
ImmutableList.of("1", "123"),
"", "xx");
assertMatches(
"\\d?",
ImmutableList.of("", "1"),
ImmutableList.of("12"),
"", "x");
}
// This does not check that the generated regex is the same as the input, but it does test some
// positive/negative matching cases against both and verifies that the DFA for both are equal.
private static void assertMatches(
String pattern, List<String> matchNumbers, List<String> noMatchNumbers, String... specs) {
String regex = basic().toRegex(ranges(specs));
assertThat(regex).isEqualTo(pattern);
// Test the given positive/negative match numbers and expect the same behaviour from both.
for (String number : matchNumbers) {
assertThat(number).matches(pattern);
assertThat(number).matches(regex);
}
for (String number : noMatchNumbers) {
assertThat(number).doesNotMatch(pattern);
assertThat(number).doesNotMatch(regex);
}
}
private static void assertRegex(RegexGenerator generator, RangeTree dfa, String... lines) {
String regex = generator.toRegex(dfa);
String expected = Arrays.stream(lines).map(whitespace()::removeFrom).collect(joining());
assertThat(regex).isEqualTo(expected);
}
private static RangeTree ranges(String... specs) {
return RangeTree.from(Arrays.stream(specs).map(RangeSpecification::parse));
}
}

+ 80
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/SubgraphOptimizerTest.java View File

@ -0,0 +1,80 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import static com.google.common.truth.Truth8.assertThat;
import com.google.i18n.phonenumbers.metadata.RangeSpecification;
import com.google.i18n.phonenumbers.metadata.RangeTree;
import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
import com.google.i18n.phonenumbers.metadata.regex.SubgroupOptimizer.LinkNodeVisitor;
import java.util.Arrays;
import java.util.Optional;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class SubgraphOptimizerTest {
  // The subgraph of "everything except 95, 96 and 100" (this appears in China leading digits).
  // Note that unlike China, there's also an early terminating '9' in the subgraph to ensure that
  // the entire subgraph is extracted (including the terminating node).
  private static final RangeTree POSTGRAPH = ranges("[02-9]", "1[1-9]", "10[1-9]", "9[0-47-9]");

  // Some prefixes which come before the subgraph.
  private static final RangeTree PREGRAPH = ranges("123", "234", "345", "456", "567");

  // Cross product of pre and post paths.
  private static final RangeTree SUBGRAPH = RangeTree.from(
      PREGRAPH.asRangeSpecifications().stream()
          .flatMap(prefix ->
              POSTGRAPH.asRangeSpecifications().stream().map(prefix::extendBy)));

  // Additional paths which share edges in the subgraph and will cause repetition in regular
  // expressions. Also add a couple of early terminating paths "on the way to" the subgroup.
  // Note however that a terminating path that reaches the root of the subgraph (e.g. "123") will
  // cause a split in the DFA at the root node (one terminating, one not terminating).
  private static final RangeTree TEST_RANGES =
      SUBGRAPH.union(ranges("128xx", "238xx", "348xx", "458xx", "568xx", "12", "34"));

  @Test
  public void testSubgraphWeightAndInOrder() {
    LinkNodeVisitor visitor = new LinkNodeVisitor();
    TEST_RANGES.accept(visitor);

    DfaNode highestCost = visitor.getHighestCostNode();
    assertThat(highestCost).isNotNull();
    // 5 paths in PREGRAPH which reach the root of POSTGRAPH.
    assertThat(visitor.getInOrder(highestCost)).isEqualTo(5);
    // 7 edges in POSTGRAPH with a total weight of 27:
    // "[02-8]" = 6, "1", "0", "9" = 3, 2 x "[1-9]" = 10, "[0-47-9]" = 8
    assertThat(visitor.getSubgraphWeight(highestCost)).isEqualTo(27);
  }

  @Test
  public void testSubgraphExtraction() {
    Optional<RangeTree> fromTestRanges = SubgroupOptimizer.extractRepeatingSubgraph(TEST_RANGES);
    assertThat(fromTestRanges).hasValue(SUBGRAPH);

    // The "bridge" node is the same, so we extract the whole graph (so we return nothing).
    assertThat(SubgroupOptimizer.extractRepeatingSubgraph(SUBGRAPH)).isEmpty();

    // There's no repetition in this graph, so return nothing.
    RangeTree noRepetition = ranges("123", "234", "345");
    assertThat(SubgroupOptimizer.extractRepeatingSubgraph(noRepetition)).isEmpty();
  }

  // Builds a range tree from range specification strings.
  private static RangeTree ranges(String... specs) {
    return RangeTree.from(Arrays.stream(specs).map(spec -> RangeSpecification.parse(spec)));
  }
}

+ 122
- 0
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizerTest.java View File

@ -0,0 +1,122 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.i18n.phonenumbers.metadata.regex;
import static com.google.common.truth.Truth.assertThat;
import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL;
import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL;
import com.google.common.graph.ValueGraph;
import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@RunWith(JUnit4.class)
public class TrailingPathOptimizerTest {
  @Test
  public void testSimple() {
    NfaBuilder input = new NfaBuilder();
    input.addPath(INITIAL, TERMINAL, "12xx");
    input.addPath(INITIAL, TERMINAL, "34xxxx");

    ValueGraph<Node, SimpleEdge> optimized = TrailingPathOptimizer.optimize(input.graph());

    // Expect the common trailing "xx" to be factored out at some new join point.
    NfaBuilder expected = new NfaBuilder();
    Node joinPoint = expected.addPath(INITIAL, "12");
    expected.addPath(INITIAL, joinPoint, "34xx");
    expected.addPath(joinPoint, TERMINAL, "xx");
    assertEquivalent(optimized, expected);
  }

  @Test
  public void testTrailingOptionalGroup() {
    NfaBuilder input = new NfaBuilder();
    input.addPath(INITIAL, TERMINAL, "12xx");
    // Add path "34xx(xx)?"
    Node inputOptStart = input.addPath(INITIAL, "34xx");
    input.addOptionalPath(inputOptStart, TERMINAL, "xx");

    ValueGraph<Node, SimpleEdge> optimized = TrailingPathOptimizer.optimize(input.graph());

    // Expect the common trailing "xx" to be factored out at some new join point.
    NfaBuilder expected = new NfaBuilder();
    Node joinPoint = expected.addPath(INITIAL, "12");
    // Add "34(xx)?" up to the joining node.
    Node expectedOptStart = expected.addPath(INITIAL, "34");
    expected.addOptionalPath(expectedOptStart, joinPoint, "xx");
    // Add the trailing "xx".
    expected.addPath(joinPoint, TERMINAL, "xx");
    assertEquivalent(optimized, expected);
  }

  @Test
  public void testDoubleRecursion() {
    NfaBuilder input = new NfaBuilder();
    input.addPath(INITIAL, TERMINAL, "12xx");
    input.addPath(INITIAL, TERMINAL, "34xxxx");
    // Add path "56xxxx(xx)?"
    Node inputOptStart = input.addPath(INITIAL, "56xxxx");
    input.addOptionalPath(inputOptStart, TERMINAL, "xx");

    ValueGraph<Node, SimpleEdge> optimized = TrailingPathOptimizer.optimize(input.graph());

    // Factoring should be applied twice to pull out 2 lots of "xx".
    // How I wish we had a way to embed proper graphs in JavaDoc!
    //
    //    ,-----------12-----------v
    //  (I)------34----->(1)--xx-->(2)--xx-->(T)
    //    `-56-->()--xx--^
    //           `--e---^
    //
    NfaBuilder expected = new NfaBuilder();
    Node secondJoin = expected.addPath(INITIAL, "12");
    expected.addPath(secondJoin, TERMINAL, "xx");
    Node firstJoin = expected.addPath(INITIAL, "34");
    expected.addPath(firstJoin, secondJoin, "xx");
    Node expectedOptStart = expected.addPath(INITIAL, "56");
    expected.addOptionalPath(expectedOptStart, firstJoin, "xx");
    assertEquivalent(optimized, expected);
  }

  @Test
  public void testNoChangeIfNoCommonFactor() {
    NfaBuilder input = new NfaBuilder();
    input.addPath(INITIAL, TERMINAL, "12xxxxxx");
    // Add path "34xxx(xx)?" which, while it shares 'xxx' with '12xxxxxx', will not be factored
    // because splitting out 'xxx' would make the resulting regular expression longer
    // (e.g. "(?:34\d{2}?|12\d{3})\d{3}" is longer than "34\d{2}?\d{3}|12\d{6}").
    //
    // Note that there are some cases in which this isn't true (shorter sequences like 'x' might be
    // splittable without cost), but they are unlikely to ever make the expression shorter,
    // especially if they result in adding new parentheses for grouping.
    Node optStart = input.addPath(INITIAL, "34xxx");
    input.addOptionalPath(optStart, TERMINAL, "xx");

    ValueGraph<Node, SimpleEdge> optimized = TrailingPathOptimizer.optimize(input.graph());

    // The optimizer's output should be equivalent to its (unchanged) input.
    assertEquivalent(optimized, input);
  }

  // Asserts that two graphs flatten to equal edges.
  private static void assertEquivalent(ValueGraph<Node, SimpleEdge> actual, NfaBuilder expected) {
    // This is a somewhat cheeky way to test graph isomorphism and relies on the fact that graph
    // flattening is deterministic according to how edges sort and doesn't care about node values.
    // It also, obviously, relies on the flattening code to be vaguely well tested.
    Edge actualFlat = NfaFlattener.flatten(actual);
    Edge expectedFlat = NfaFlattener.flatten(expected.graph());
    assertThat(actualFlat).isEqualTo(expectedFlat);
  }
}

+ 49
- 0
metadata/src/test/proto/regression_test.proto View File

@ -0,0 +1,49 @@
/*
* Copyright (C) 2017 The Libphonenumber Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Schema for the regression test data (see the Tests and TestCase messages below).
syntax = "proto3";
package i18n.phonenumbers.internal.finitestatematcher.compiler;
// Java package into which the classes generated from this file are placed.
option java_package = "com.google.i18n.phonenumbers.internal.finitestatematcher.compiler";
// Name of the generated Java outer class; java_multiple_files is not set in this file, so the
// generated message classes are nested inside it.
option java_outer_classname = "RegressionTestProto";
// A collection of regression tests.
message Tests {
  // The individual regression test entries.
  repeated TestCase test_case = 1;
}
// One entry in a set of regression tests.
message TestCase {
  // Name of the test; ideally unique.
  string name = 1;

  // When true, the test is expected to fail 100% of the time. This is useful
  // for checking that the test numbers have enough coverage to force a
  // failure. It is typically achieved by modifying an input range after
  // generating a passing test (or by carefully modifying the output
  // bytecodes).
  //
  // Not every change makes a test fail 100% of the time, so take care to avoid
  // creating a flaky test. For example, don't change a "[0-3]" to "[0-5]",
  // since that only fails if the test number contains a 4 or 5 at the
  // corresponding index; change it to "[4-6]" instead, so there is no overlap
  // and at least one test number that's valid for that range will not be
  // accepted by the matcher.
  bool should_fail = 2;

  // Input ranges, written as range specifications (e.g. "1[2-5]678xxxxx"
  // etc...), which form the DFA to be tested.
  repeated string range = 3;

  // Expected output bytes, encoded in test files using C-style hex notation
  // (i.e. \xHH). The value can be split over multiple lines for readability.
  repeated bytes expected = 4;
}

Loading…
Cancel
Save