From 54ba70db861b571af6bd9ceae94bb81fa17a0488 Mon Sep 17 00:00:00 2001 From: David Beaumont Date: Tue, 7 Jul 2020 12:56:14 +0200 Subject: [PATCH] Project import generated by Copybara. (#2494) PiperOrigin-RevId: 319856719 Co-authored-by: The libphonenumber Team --- metadata/README.md | 3 +- .../phonenumbers/metadata/DigitSequence.java | 311 ++++ .../phonenumbers/metadata/MetadataKey.java | 65 + .../phonenumbers/metadata/PrefixTree.java | 351 +++++ .../metadata/RangeSpecification.java | 752 +++++++++ .../i18n/phonenumbers/metadata/RangeTree.java | 1342 +++++++++++++++++ .../metadata/RangeTreeFactorizer.java | 194 +++ .../i18n/phonenumbers/metadata/Types.java | 112 ++ .../metadata/i18n/PhoneRegion.java | 99 ++ .../metadata/i18n/SimpleLanguageTag.java | 60 + .../metadata/model/AltFormatSpec.java | 94 ++ .../metadata/model/AltFormatsSchema.java | 146 ++ .../metadata/model/CommentsSchema.java | 132 ++ .../phonenumbers/metadata/model/CsvData.java | 236 +++ .../metadata/model/ExamplesTableSchema.java | 126 ++ .../metadata/model/FileBasedCsvLoader.java | 68 + .../metadata/model/FormatSpec.java | 637 ++++++++ .../metadata/model/FormatsTableSchema.java | 96 ++ .../metadata/model/MetadataException.java | 36 + .../metadata/model/MetadataTableSchema.java | 168 +++ .../metadata/model/NumberingScheme.java | 750 +++++++++ .../metadata/model/NumberingSchemes.java | 63 + .../metadata/model/OperatorsTableSchema.java | 88 ++ .../metadata/model/RangesTableSchema.java | 396 +++++ .../metadata/model/ShortcodesTableSchema.java | 228 +++ .../metadata/model/XmlRangesSchema.java | 154 ++ .../metadata/table/Assignment.java | 92 ++ .../phonenumbers/metadata/table/Change.java | 131 ++ .../phonenumbers/metadata/table/Column.java | 217 +++ .../metadata/table/ColumnGroup.java | 100 ++ .../metadata/table/CsvKeyMarshaller.java | 74 + .../metadata/table/CsvParser.java | 241 +++ .../metadata/table/CsvSchema.java | 108 ++ .../phonenumbers/metadata/table/CsvTable.java | 589 ++++++++ 
.../metadata/table/CsvTableCollector.java | 99 ++ .../phonenumbers/metadata/table/DiffKey.java | 100 ++ .../metadata/table/DisjointRangeMap.java | 186 +++ .../metadata/table/MultiValue.java | 116 ++ .../metadata/table/RangeException.java | 74 + .../phonenumbers/metadata/table/RangeKey.java | 215 +++ .../metadata/table/RangeTable.java | 951 ++++++++++++ .../phonenumbers/metadata/table/Schema.java | 169 +++ metadata/src/main/proto/enums.proto | 69 + metadata/src/main/proto/types.proto | 82 + .../metadata/DigitSequenceTest.java | 134 ++ .../phonenumbers/metadata/PrefixTreeTest.java | 213 +++ .../metadata/RangeSpecificationTest.java | 308 ++++ .../metadata/RangeTreeFactorizerTest.java | 101 ++ .../phonenumbers/metadata/RangeTreeTest.java | 555 +++++++ .../metadata/i18n/PhoneRegionTest.java | 57 + .../metadata/i18n/SimpleLanguageTagTest.java | 42 + .../metadata/model/AltFormatSpecTest.java | 82 + .../metadata/model/AltFormatsSchemaTest.java | 111 ++ .../metadata/model/CommentsSchemaTest.java | 156 ++ .../metadata/model/FormatSpecTest.java | 160 ++ .../metadata/table/AssignmentTest.java | 70 + .../metadata/table/ChangeTest.java | 71 + .../metadata/table/ColumnGroupTest.java | 58 + .../metadata/table/ColumnTest.java | 93 ++ .../metadata/table/CsvParserTest.java | 177 +++ .../metadata/table/CsvTableTest.java | 275 ++++ .../metadata/table/RangeKeyTest.java | 132 ++ .../metadata/table/RangeTableTest.java | 412 +++++ .../metadata/table/SchemaTest.java | 71 + .../metadata/testing/RangeTableSubject.java | 132 ++ .../metadata/testing/RangeTreeSubject.java | 118 ++ .../metadata/testing/TestNumberingScheme.java | 477 ++++++ 67 files changed, 14323 insertions(+), 2 deletions(-) create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/DigitSequence.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/MetadataKey.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/PrefixTree.java create mode 
100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeSpecification.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTree.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizer.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegion.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTag.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpec.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CsvData.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatSpec.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataException.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataTableSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingScheme.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingSchemes.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/OperatorsTableSchema.java create 
mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ShortcodesTableSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/XmlRangesSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Assignment.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Change.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Column.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroup.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvKeyMarshaller.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvSchema.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTableCollector.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DiffKey.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DisjointRangeMap.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/MultiValue.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeException.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeKey.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java create mode 100644 
metadata/src/main/proto/enums.proto create mode 100644 metadata/src/main/proto/types.proto create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/DigitSequenceTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/PrefixTreeTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizerTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegionTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTagTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpecTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchemaTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchemaTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/FormatSpecTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/AssignmentTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ChangeTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroupTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvParserTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvTableTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeKeyTest.java 
create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeTableTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/SchemaTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTableSubject.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTreeSubject.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/TestNumberingScheme.java diff --git a/metadata/README.md b/metadata/README.md index 17bbf4c72..593200756 100644 --- a/metadata/README.md +++ b/metadata/README.md @@ -24,6 +24,5 @@ inevitable. Patches and pull requests cannot be accepted directly on this codebase, so if you find an issue with these libraries, please open a new issue for it. However -we do not accept feature requests, or provide answeres or technical support for +we do not accept feature requests, or provide answers or technical support for anything in this directory at this time. - diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/DigitSequence.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/DigitSequence.java new file mode 100644 index 000000000..4ead887d0 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/DigitSequence.java @@ -0,0 +1,311 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata; + +import com.google.common.base.CharMatcher; +import com.google.common.base.Preconditions; +import com.google.common.collect.DiscreteDomain; +import com.google.errorprone.annotations.Immutable; +import com.google.errorprone.annotations.concurrent.LazyInit; + +/** + * A small, fast, immutable representation of a phone number digit sequence. This class represents + * contiguous sequences of digits in phone numbers, such as "123" or "000". It does not encode + * semantic information such as the region code to which a number belongs or perform any semantic + * validation. It can be thought of as equivalent to a String containing only the ASCII digits + * {@code '0'} to {@code '9'}. + */ +@Immutable +public final class DigitSequence implements Comparable { + + private static final CharMatcher ASCII_DIGITS = CharMatcher.inRange('0', '9'); + + // IMPORTANT + // This cannot be more than 18 to avoid overflowing a signed long (it must be signed due to the + // calculation of the "distance" metric which can be +ve or -ve). + // + // If it does need to be raised, this whole class probably needs to be rethought. ITU recommends + // a limit of 15 digits (not including country calling code) but there are currently 2 examples + // in the metadata XML file which exceed this (Japan) where some non-international toll free + // numbers (those starting with 0037 and 0036) can be up to 17 digits (still okay) in the current + // metadata but there's a note saying that they may even extend to 21 digits!! + // + // An appropriate way to split this class would be to make a closed type hierarchy with 2 + // separate implementations, one using a long to encode the numbers and one using BigInteger (or + // maybe just encoding digits in a string directly). 
+ // The good thing about this approach is that instances of the different implementations could + // never be equal to each other. This is likely not a difficult refactoring, although the Domain + // class will also need to be considered carefully and details like the "index()" value will have + // to change completely between the classes. + // + /** The maximum number of digits which can be held in a digit sequence. */ + public static final int MAX_DIGITS = 18; + + // Simple lookup of powers-of-10 for all valid sequence lengths (0 - MAX_DIGITS). + private static final long[] POWERS_OF_TEN = new long[MAX_DIGITS + 1]; + static { + // 1, 10, 100, 1000, 10000 ... + POWERS_OF_TEN[0] = 1; + for (int n = 1; n < POWERS_OF_TEN.length; n++) { + POWERS_OF_TEN[n] = 10 * POWERS_OF_TEN[n - 1]; + } + } + + // A table of adjustment values to convert a digit sequence into an absolute index in the + // integer domain, to impose a true lexicographical ordering. The value of a digit sequence is + // adjusted by the number of additional elements in the phone number domain which cannot be + // represented as integers (the empty sequence or anything with leading zeros). This results in + // an absolute ordering of all digit sequences. For example the digit sequence "0123" is length + // 4, and there are 111 additional additional elements that come before 4-length sequences + // ("", "00"-"09", "000"-"099"), so its index is {@code 123 + 111 = 234}. + // To calculate this value dynamically for any length N, offset=floor(10^N / 9). + private static final long[] DOMAIN_OFFSET = new long[MAX_DIGITS + 1]; + static { + // 0, 1, 11, 111, 1111 ... 
+ for (int n = 1; n < DOMAIN_OFFSET.length; n++) { + DOMAIN_OFFSET[n] = 10 * DOMAIN_OFFSET[n - 1] + 1; + } + } + + private static final DigitSequence EMPTY = new DigitSequence(0, 0L); + private static final DigitSequence[] SINGLETON_DIGITS = new DigitSequence[] { + new DigitSequence(1, 0L), + new DigitSequence(1, 1L), + new DigitSequence(1, 2L), + new DigitSequence(1, 3L), + new DigitSequence(1, 4L), + new DigitSequence(1, 5L), + new DigitSequence(1, 6L), + new DigitSequence(1, 7L), + new DigitSequence(1, 8L), + new DigitSequence(1, 9L), + }; + + // Simple helper to return {@code 10^n} for all valid sequence lengths. + private static long pow10(int n) { + return POWERS_OF_TEN[n]; + } + + /** + * Returns the domain in which phone number digit sequences exist. This is needed when creating + * canonical {@link com.google.common.collect.Range Ranges} of digit-sequences. + */ + public static DiscreteDomain domain() { + return Domain.INSTANCE; + } + + private static final class Domain extends DiscreteDomain { + private static final Domain INSTANCE = new Domain(); + private static final DigitSequence MIN = EMPTY; + private static final DigitSequence MAX = DigitSequence.of("999999999999999999"); + + @Override + public DigitSequence next(DigitSequence num) { + long next = num.value + 1; + if (next < pow10(num.length)) { + return new DigitSequence(num.length, next); + } else { + int len = num.length + 1; + return (len <= MAX_DIGITS) ? new DigitSequence(len, 0) : null; + } + } + + @Override + public DigitSequence previous(DigitSequence num) { + long prev = num.value - 1; + if (prev >= 0) { + return new DigitSequence(num.length, prev); + } else { + int len = num.length - 1; + return (len >= 0) ? 
new DigitSequence(len, pow10(len) - 1) : null; + } + } + + @Override + public long distance(DigitSequence start, DigitSequence end) { + // The indices get up to 19 digits but can't overflow Long.MAX_VALUE, so they can be safely + // subtracted to get a signed long "distance" without risk of over-/under- flow. + return end.index() - start.index(); + } + + @Override + public DigitSequence minValue() { + return MIN; + } + + @Override + public DigitSequence maxValue() { + return MAX; + } + } + + /** Returns the digit sequence of length one representing the given digit value. */ + public static DigitSequence singleton(int digit) { + Preconditions.checkArgument(0 <= digit && digit <= 9, "invalid digit value: %s", digit); + return SINGLETON_DIGITS[digit]; + } + + /** + * Returns the empty digit sequence. This is useful in special cases where you need to build up + * a digit sequence starting from nothing). + */ + public static DigitSequence empty() { + return EMPTY; + } + + /** Returns a digit sequence for the given string (e.g. "012345"). */ + public static DigitSequence of(String digits) { + Preconditions.checkArgument(digits.length() <= MAX_DIGITS, + "Digit string too long: '%s'", digits); + Preconditions.checkArgument(ASCII_DIGITS.matchesAllOf(digits), + "Digit string contains non-digit characters: '%s'", digits); + return digits.isEmpty() ? empty() : new DigitSequence(digits.length(), Long.parseLong(digits)); + } + + /** + * Returns a digit sequence of {@code length} containing only the digit '0'. This is useful when + * performing range calculations to determine the smallest digit sequence in a block. + */ + public static DigitSequence zeros(int length) { + return new DigitSequence(length, 0L); + } + + /** + * Returns a digit sequence of {@code length} containing only the digit '9'. This is useful when + * performing range calculations to determine the largest digit sequence in a block. 
+ */ + public static DigitSequence nines(int length) { + return new DigitSequence(length, pow10(length) - 1); + } + + // The overall length of the digit sequence, including any leading zeros. + private final int length; + // The decimal value of the digit sequence (excluding leading zeros, obviously). + private final long value; + // Cached toString() representation (toString() of DigitSequence is used in comparisons for + // sorting to achieve lexicographical ordering, which means it gets churned a lot). + @LazyInit + private String toString; + + // Called directly from RangeSpecification. + DigitSequence(int length, long value) { + // Don't check for -ve length as this should never happen and will blow up in pow10() anyway. + Preconditions.checkArgument(length <= MAX_DIGITS, + "Digit sequence too long [%s digits]", length); + // This should not happen unless there's a code error, so nice user messages aren't needed. + Preconditions.checkArgument(value >= 0 && value < pow10(length)); + this.length = length; + this.value = value; + } + + /** Returns if this sequence is empty (i.e. length == 0). */ + public boolean isEmpty() { + return length == 0; + } + + /** Returns the length of this digit sequence. */ + public int length() { + return length; + } + + /** + * Returns the digit at index {@code n} in this digit sequence, starting from the most + * significant digit. + */ + public int getDigit(int n) { + Preconditions.checkElementIndex(n, length); + return (int) (value / pow10(((length - 1) - n)) % 10); + } + + /** + * Returns the sub-sequence representing only the first {@code n} digits in this sequence. For + * example, {@code "01234".first(3) == "012"}. + */ + public DigitSequence first(int n) { + Preconditions.checkElementIndex(n, length); + return new DigitSequence(n, value / pow10(length - n)); + } + + /** + * Returns the sub-sequence representing only the last {@code n} digits in this sequence. For + * example, {@code "01234".last(3) == "234"}. 
+ */ + public DigitSequence last(int n) { + Preconditions.checkElementIndex(n, length); + return new DigitSequence(n, value % pow10(n)); + } + + /** + * Returns a new sequence which extends this sequence by a single digit ({@code 0 <= digit <= 9}). + */ + public DigitSequence extendBy(int digit) { + Preconditions.checkArgument(0 <= digit && digit <= 9); + return new DigitSequence(length + 1, (10 * value) + digit); + } + + /** Returns a new sequence which extends this sequence by the given value. */ + public DigitSequence extendBy(DigitSequence n) { + Preconditions.checkNotNull(n); + return new DigitSequence(length + n.length, (pow10(n.length) * value) + n.value); + } + + /** + * Returns the digit sequence immediately after this one, or {@code null} if this is the + * maximum value. + */ + public DigitSequence next() { + return domain().next(this); + } + + /** + * Returns the digit sequence immediately before this one, or {@code null} if this is the + * minimum value. + */ + public DigitSequence previous() { + return domain().previous(this); + } + + /** Returns the absolute index of this digit sequence within the integer domain. */ + private long index() { + return value + DOMAIN_OFFSET[length]; + } + + @Override + public int compareTo(DigitSequence other) { + return Long.signum(index() - other.index()); + } + + @Override + public boolean equals(Object o) { + return (o instanceof DigitSequence) && index() == ((DigitSequence) o).index(); + } + + @Override + public int hashCode() { + return Long.hashCode(index()); + } + + @Override + public String toString() { + // This little dance is required (according to the docs for the LazyInit annotation) for lazy + // initialization of non-volatile fields (yes, that's a double init in a single statement). + String localVar = toString; + if (localVar == null) { + toString = localVar = (length > 0 ? 
String.format("%0" + length + "d", value) : ""); + } + return localVar; + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/MetadataKey.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/MetadataKey.java new file mode 100644 index 000000000..32f656295 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/MetadataKey.java @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata; + +import com.google.auto.value.AutoValue; +import com.google.common.base.Preconditions; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import java.util.Comparator; + +/** + * A key for uniquely identifying number metadata for a region. For "geographical" regions, the + * region code suffices to identify the range information, but for "non geographical" regions, the + * calling code is required and the region is set to "UN001" (world). + */ +@AutoValue +public abstract class MetadataKey implements Comparable { + private static final Comparator ORDERING = + Comparator.comparing(MetadataKey::region).thenComparing(MetadataKey::callingCode); + + /** + * Returns a key to identify phone number data in the given region with the specified calling + * code. Care must be taken when creating keys because it is possible to create invalid keys that + * would not match any data (e.g. 
region="US", calling code="44"). + */ + public static MetadataKey create(PhoneRegion region, DigitSequence callingCode) { + // Null checks and semantic checks. + Preconditions.checkArgument(region.equals(PhoneRegion.getWorld()) + || (region.toString().length() == 2 && !region.equals(PhoneRegion.getUnknown()))); + Preconditions.checkArgument(!callingCode.isEmpty()); + return new AutoValue_MetadataKey(region, callingCode); + } + + /** + * Returns the region for this key (this is {@link PhoneRegion#getWorld()} for non-geographical + * regions). + */ + public abstract PhoneRegion region(); + + /** Returns the calling code for this key. */ + public abstract DigitSequence callingCode(); + + @Override + public int compareTo(MetadataKey other) { + return ORDERING.compare(this, other); + } + + // Used in human readable formatting during presubmit checks; be careful if you change it. + @Override + public final String toString() { + return String.format("region=%s, calling code=+%s", region(), callingCode()); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/PrefixTree.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/PrefixTree.java new file mode 100644 index 000000000..64f3faf7c --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/PrefixTree.java @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; + +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import com.google.i18n.phonenumbers.metadata.RangeTree.SetOperations; +import java.util.ArrayList; +import java.util.List; + +/** + * A variation of a {@link RangeTree} which represents a set of prefixes (as opposed to a set of + * ranges). While this implementation is backed by a {@code RangeTree} and has a similar serialized + * representation, it is a deliberately distinct type and should not be thought of as a subset of + * {@code RangeTree}. In particular, set operations are defined to work differently for + * {@code PrefixTree} due to its differing semantics and some set operations (e.g. subtraction) are + * not even well defined. + */ +public final class PrefixTree { + private static final PrefixTree EMPTY = new PrefixTree(RangeTree.empty()); + + /** Returns the "empty" prefix tree, which matches no ranges. */ + public static PrefixTree empty() { + return EMPTY; + } + + /** + * Returns a prefix tree with the paths of the given ranges, trimmed to the earliest point of + * termination. For example, the ranges {@code {"1[0-3]", "1234", "56x"}} will result in the + * prefixes {@code {"1[0-3]", "56x"}}, since {@code "1[0-3]"} contains {@code "12"}, which is a + * prefix of {@code "1234"}. + */ + public static PrefixTree from(RangeTree ranges) { + return !ranges.isEmpty() + ? new PrefixTree(removeTrailingAnyDigitPaths(TrimmingVisitor.trim(ranges))) + : empty(); + } + + /** + * Returns a prefix tree containing all digit sequences in the given range specification. 
A + * single range specification cannot overlap in the way that general range trees can, so unlike + * {@link #from(RangeTree)}, this method will never throw {@code IllegalArgumentException}. + */ + public static PrefixTree from(RangeSpecification spec) { + // Range specifications define ranges of a single length, so must always be a valid prefix. + return from(RangeTree.from(spec)); + } + + /** + * Returns the minimal prefix tree which includes all the paths in "include", and none of the + * paths in "exclude". For example: + *
 {@code
+   *   minimal({ "123x", "456x" }, { "13xx", "459x" }, 0) == { "12", "456" }
+   *   minimal({ "123x", "456x" }, {}, 0) == { "" }
+   *   minimal({ "123x", "456x" }, {}, 1) == { "[14]" }
+   * }
+ * + *

A minimal length can be specified to avoid creating prefixes that are "too short" for some + * circumstances. + * + *

Caveat: In cases where the {@code include} and {@code exclude} ranges overlap, the shortest + * possible prefix is chosen. For example: + *

 {@code
+   *   minimal({ "12", "1234", "56" }, { "123", "5678" }) == { "12", "56" }
+   * }
+ * This means that it may not always be true that {@code minimal(A, B).intersect(minimal(B, A))} + * is empty. + */ + public static PrefixTree minimal(RangeTree include, RangeTree exclude, int minLength) { + checkArgument(include.intersect(exclude).isEmpty(), "ranges must be disjoint"); + checkArgument(minLength >= 0, "invalid minimum prefix length: %s", minLength); + PrefixTree prefix = PrefixTree.from(include); + if (prefix.isEmpty()) { + // This matches no input, not all input. + return prefix; + } + // Ignore anything that the prefix already captures, since there's no point avoiding shortening + // the prefix to avoid what's already overlapping. + exclude = exclude.subtract(prefix.retainFrom(exclude)); + + // This can contain only the empty sequence (i.e. match all input) if the original include set + // was something like "xxxxx". In that case the initial node is just the terminal. + RangeTree minimal; + DfaNode root = prefix.asRangeTree().getInitial(); + if (prefix.isIdentity() || exclude.isEmpty()) { + // Either we already accept anything, or there is nothing to exclude. + minimal = emit(root, RangeSpecification.empty(), RangeTree.empty(), minLength); + } else { + minimal = recursivelyMinimize( + root, RangeSpecification.empty(), exclude.getInitial(), RangeTree.empty(), minLength); + } + // No need to go via the static factory here, since that does a bunch of work we know cannot + // be necessary. The range tree here is a subset of an already valid prefix tree, so cannot + // contain "early terminating nodes" or "trailing any digit sequences". + return new PrefixTree(minimal); + } + + private final RangeTree ranges; + + private PrefixTree(RangeTree ranges) { + // Caller is responsible for ensuring that the ranges conform to expectations of a prefix tree. + this.ranges = ranges; + } + + /** + * Returns a {@link RangeTree} containing the same digit sequences as this prefix tree. 
Prefix + * trees and range trees do not have the same semantics, but they do have the same serialized + * form (i.e. to serialize a prefix tree, you can just serialize the corresponding range tree). + */ + public RangeTree asRangeTree() { + return ranges; + } + + /** + * Returns whether this prefix tree is empty. Filtering a {@link RangeTree} by the empty prefix + * tree always returns the empty range tree. The result of filtering a range tree is defined as + * containing only digit sequences which are prefixed by some digit sequence in the prefix tree. + * If the prefix tree is empty, no digit sequence can ever satisfy that requirement. + */ + public boolean isEmpty() { + return ranges.isEmpty(); + } + + /** + * Returns whether this prefix tree matches any digit sequence. Filtering a {@link RangeTree} by + * the identity prefix returns the original range tree. The result of filtering a range tree is + * defined as containing only digit sequences which are prefixed by some digit sequence in the + * prefix tree. The identity prefix tree contains the empty digit sequence, which is a prefix of + * every digit sequence. + */ + public boolean isIdentity() { + return !ranges.isEmpty() && ranges.getInitial().equals(RangeTree.getTerminal()); + } + + /** Returns whether the given sequence would be retained by this prefix tree. */ + public boolean prefixes(DigitSequence digits) { + DfaNode node = ranges.getInitial(); + for (int n = 0; n < digits.length(); n++) { + DfaEdge e = node.find(digits.getDigit(n)); + if (e == null) { + break; + } + node = e.getTarget(); + } + return node.equals(RangeTree.getTerminal()); + } + + /** + * Returns a subset of the given ranges, containing only ranges which are prefixed by an + * element in this prefix tree. For example: + *
 {@code
+   *   RangeTree r = { "12xx", "1234x" }
+   *   PrefixTree p = { "12[0-5]" }
+   *   p.retainFrom(r) = { "12[0-5]x", "1234x"}
+   * }
+ * Note that if the prefix tree is empty, this method returns the empty range tree. + */ + public RangeTree retainFrom(RangeTree ranges) { + return SetOperations.INSTANCE.retainFrom(this, ranges); + } + + /** + * Returns the union of two prefix trees. For prefix trees {@code p1}, {@code p2} and any range + * tree {@code R}, the union {@code P = p1.union(p2)} is defined such that: + *
 {@code
+   *   P.retainFrom(R) = p1.retainFrom(R).union(p2.retainFrom(R))
+   * }
+ * If prefixes are the same length this is equivalent to {@link RangeTree#union(RangeTree)}, + * but when prefixes overlap, only the more general (shorter) prefix is retained. + */ + public PrefixTree union(PrefixTree other) { + return SetOperations.INSTANCE.union(this, other); + } + + /** + * Returns the intersection of two prefix trees. For prefix trees {@code p1}, {@code p2} and any + * range tree {@code R}, the intersection {@code P = p1.intersect(p2)} is defined such that: + *
 {@code
+   *   P.retainFrom(R) = p1.retainFrom(R).intersect(p2.retainFrom(R))
+   * }
+ * If prefixes are the same length this is equivalent to {@link RangeTree#intersect(RangeTree)}, + * but when prefixes overlap, only the more specific (longer) prefix is retained. + */ + public PrefixTree intersect(PrefixTree other) { + return SetOperations.INSTANCE.intersect(this, other); + } + + /** + * Returns a prefix tree trimmed to at most {@code maxLength} digits. The returned value may be + * shorter if, in the process of trimming, trailing edges are collapsed to "any digit" sequences. + * For example: + *
 {@code
+   * { "12[0-4]5", "12[5-9]" }.trim(3) == "12"
+   * { "7001", "70[1-9]", "7[1-9]" }.trim(3) == "7"
+   * }
+ */ + public PrefixTree trim(int maxLength) { + return PrefixTree.from( + RangeTree.from( + ranges.asRangeSpecifications().stream() + .map(s -> s.first(maxLength)) + .collect(toImmutableList()))); + } + + @Override + public int hashCode() { + return ranges.hashCode(); + } + + @Override + public boolean equals(Object o) { + return (o instanceof PrefixTree) && ranges.equals(((PrefixTree) o).ranges); + } + + @Override + public String toString() { + return ranges.toString(); + } + + private static final class TrimmingVisitor implements DfaVisitor { + static RangeTree trim(RangeTree ranges) { + if (ranges.isEmpty()) { + return ranges; + } + if (ranges.getInitial().canTerminate()) { + // Not the "empty range tree" (which matches no input), but the range tree containing the + // empty range specification (which matches only the empty digit sequence). + return RangeTree.from(RangeSpecification.empty()); + } + TrimmingVisitor v = new TrimmingVisitor(); + ranges.accept(v); + return RangeTree.from(v.paths); + } + + private final List paths = new ArrayList<>(); + private RangeSpecification path = RangeSpecification.empty(); + + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + RangeSpecification oldPath = path; + path = path.extendByMask(edge.getDigitMask()); + if (target.canTerminate()) { + paths.add(path); + } else { + target.accept(this); + } + path = oldPath; + } + } + + // Note: This is NOT as simple as just calling "getPrefix()" on each range specification because + // ranges that are too short become problematic. Consider { "7[1-9]", "70x" } which should result + // in "7". If we just call "getPrefix()" and merge, we end up with "7x". + // + // One way to fix this is by repeatedly creating prefix trees (removing trailing "any digit" + // sequences) until it becomes stable. + // + // The other way (simpler) is to extend the length of any shorter range specifications to bring + // them up to the max length before merging them. 
  // In the above example, we extend the length of
  // "7[1-9]" to "7[1-9]x" and merge it with "70x" to get "7xx", which can then have its prefix
  // extracted.
  private static RangeTree removeTrailingAnyDigitPaths(RangeTree ranges) {
    if (ranges.isEmpty()) {
      return ranges;
    }
    // Skip this if "ranges" matches only one length (since it would be a no-op).
    if (ranges.getLengths().size() > 1) {
      // Pad every shorter specification with trailing "any digit" masks up to the longest length.
      int length = ranges.getLengths().last();
      ranges = ranges.map(s -> s.length() < length ? s.extendByLength(length - s.length()) : s);
    }
    // Having merged everything, we can now extract the correct prefixes as the final step.
    return ranges.map(RangeSpecification::getPrefix);
  }

  /**
   * Recursively determines the next level of prefix minimization. The algorithm follows as much
   * of the "included" path as possible (node), potentially splitting into several sub-recursive
   * steps if the current included edge overlaps with multiple "excluded" paths. Once a path no
   * longer overlaps with the exclude paths, it is added to the result. Paths are also added to
   * the result if they terminate while still overlapping the excluded paths.
   *
   * @param node the current node in the "include" (prefix tree) DFA
   * @param path the range specification followed to reach {@code node}
   * @param exclude the node in the "exclude" DFA reached by following the same path
   * @param minimal the result accumulated so far
   * @param minLength the minimum length of any emitted path (where the include tree permits it)
   * @return {@code minimal} with the paths found below {@code node} added to it
   */
  private static RangeTree recursivelyMinimize(
      DfaNode node, RangeSpecification path, DfaNode exclude, RangeTree minimal, int minLength) {
    for (DfaEdge edge : node.getEdges()) {
      int mask = edge.getDigitMask();
      DfaNode target = edge.getTarget();
      // This algorithm only operates on the DFA of a prefix tree (not a general range tree). As
      // such the only terminating node we can reach is the terminal node itself. If we hit that
      // from the current edge, just emit it and continue on to the next edge.
      if (target.equals(RangeTree.getTerminal())) {
        minimal = minimal.union(RangeTree.from(path.extendByMask(mask)));
        continue;
      }
      checkState(!target.canTerminate(), "invalid DFA state for prefix tree at: %s", path);
      // Otherwise recurse on every "exclude" path, using the intersection of the "include" and
      // "exclude" masks. Anything left on the include mask which didn't overlap any of the
      // excluded edges can be emitted. This also works at the end of the exclude paths
      // (exclude == TERMINAL) since that has no outgoing edges (so the entire include path is
      // emitted).
      for (DfaEdge ex : exclude.getEdges()) {
        int m = ex.getDigitMask() & mask;
        if (m != 0) {
          // Remove the overlapping digits from the include mask; they are handled by recursion.
          mask &= ~m;
          minimal =
              recursivelyMinimize(target, path.extendByMask(m), ex.getTarget(), minimal, minLength);
        }
      }
      // The mask identifies edges which are now outside the exclude tree, and thus safe to emit.
      if (mask != 0) {
        // Emitting an included path may involve emitting some of the sub-tree below it in order
        // to make up the minimal length (we can't do this for the terminating case above).
        minimal = emit(target, path.extendByMask(mask), minimal, minLength);
      }
    }
    return minimal;
  }

  /**
   * Recursively visits the sub-tree under the given node, extending the path until it reaches the
   * minimum length before emitting it.
+ */ + private static RangeTree emit( + DfaNode node, RangeSpecification path, RangeTree minimal, int minLength) { + if (path.length() >= minLength || node.equals(RangeTree.getTerminal())) { + minimal = minimal.union(RangeTree.from(path)); + } else { + for (DfaEdge e : node.getEdges()) { + minimal = minimal.union( + emit(e.getTarget(), path.extendByMask(e.getDigitMask()), minimal, minLength)); + } + } + return minimal; + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeSpecification.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeSpecification.java new file mode 100644 index 000000000..f3f833cee --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeSpecification.java @@ -0,0 +1,752 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.DigitSequence.domain; +import static java.lang.Integer.numberOfLeadingZeros; +import static java.lang.Integer.numberOfTrailingZeros; + +import com.google.common.collect.ContiguousSet; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.collect.Range; +import com.google.common.collect.RangeSet; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +/** + * A compact representation of a disjoint set of ranges of digit sequences. This is a compact way + * to represent one or many ranges of digit sequences which share the same length. Examples include: + *
{@code
+ * "01234" --> the singleton range containing only the digit sequence "01234"
+ * "012xx" --> the contiguous digit sequence range ["01200".."01299"]
+ * "012[3-5]6xx" --> the disjoint set of contiguous digit sequence ranges
+ *     ["0123600".."0123699"], ["0124600".."0124699"], ["0125600".."0125699"]
+ * }
+ * Note that the sets of contiguous ranges defined by a {@code RangeSpecification} are always + * mutually disjoint. + * + *

Range specifications have a natural prefix based lexicographical ordering (based on the + * most-significant point at which a difference appears), but if you are comparing a disjoint set + * of range specifications (e.g. from a {@link RangeTree}) then it can be more intuitive to use an + * ordering based on the minimum digit sequence, but note this approach fails if the range + * specifications can overlap (e.g. comparing "1xx" and "100"). + */ +public final class RangeSpecification implements Comparable { + /** The mask of all possible digits. */ + public static final char ALL_DIGITS_MASK = (1 << 10) - 1; + + private static final RangeSpecification EMPTY = new RangeSpecification(""); + + /** Returns the empty range specification, which matches only the empty digit sequence. */ + public static RangeSpecification empty() { + return EMPTY; + } + + /** Returns the range specification of length one which matches any of the given digits. */ + public static RangeSpecification singleton(Iterable digits) { + int mask = 0; + for (int digit : digits) { + checkArgument(0 <= digit && digit <= 9, "bad digit value '%s'", digit); + mask |= (1 << digit); + } + return new RangeSpecification(String.valueOf((char) mask)); + } + + /** Returns a new range specification which matches only the given non-empty digit sequence. */ + public static RangeSpecification from(DigitSequence s) { + if (s.length() == 0) { + return RangeSpecification.empty(); + } + char[] masks = new char[s.length()]; + for (int n = 0; n < masks.length; n++) { + masks[n] = (char) (1 << s.getDigit(n)); + } + return new RangeSpecification(new String(masks)); + } + + /** Returns a new range specification which matches any digit sequence of the specified length. 
*/ + public static RangeSpecification any(int length) { + checkArgument(length >= 0); + if (length == 0) { + return RangeSpecification.empty(); + } + char[] masks = new char[length]; + Arrays.fill(masks, ALL_DIGITS_MASK); + return new RangeSpecification(new String(masks)); + } + + /** + * Parses the string form of a range specification (e.g. "1234[57-9]xxx"). This must be + * correctly formed, including having all ranges be well formed (e.g. not "[33]", "[3-3]" or + * "[6-4]"). + * + *

Note that non-canonical ranges are permitted if the digits are in order (e.g. "[1234]", + * "[4-5]" or "[0-9]" but not "[4321]"). The returned range specification is canonical (e.g. + * {@code parse("12[34569]").toString() == "12[3-69]"}). + * + *

The empty string is parsed as the empty range specification. + * + *

The use of single ASCII underscores ("_") to group ranges and aid readability is supported + * during parsing but is not retained in the parsed result (e.g. + * {@code parse("12_34[5-8]_xxx_xxx").toString() == "1234[5-8]xxxxxx"}). Note that underscore may + * not be present inside ranges (e.g. "1_4") or at the ends of the range (e.g. "123xxx_"). + */ + public static RangeSpecification parse(String s) { + if (s.isEmpty()) { + return empty(); + } + checkArgument(!s.startsWith("_") && !s.endsWith("_"), "cannot start/end with '_': %s", s); + StringBuilder bitmasks = new StringBuilder(); + boolean lastCharWasUnderscore = false; + for (int n = 0; n < s.length(); n++) { + char c = s.charAt(n); + switch (c) { + case '_': + checkArgument(!lastCharWasUnderscore, "cannot have multiple '_' in a row: %s", s); + lastCharWasUnderscore = true; + // Continue the for-loop rather than breaking out the switch to avoid resetting the flag. + continue; + case 'x': + bitmasks.append(ALL_DIGITS_MASK); + break; + case '[': + n += 1; + int end = s.indexOf(']', n); + checkArgument(end != -1, "unclosed range in specification: %s", s); + checkArgument(end > n, "empty range in specification: %s", s); + bitmasks.append(parseRange(s, n, end)); + n = end; + break; + default: + checkArgument('0' <= c && c <= '9', + "bad digit value '%s' in range specification: %s", c, s); + bitmasks.append((char) (1 << (c - '0'))); + break; + } + lastCharWasUnderscore = false; + } + return new RangeSpecification(bitmasks.toString()); + } + + private static char parseRange(String s, int start, int end) { + int mask = 0; + for (int n = start; n < end;) { + char c = s.charAt(n++); + checkArgument('0' <= c && c <= '9', + "bad digit value '%s' in range specification: %s", c, s); + int shift = (c - '0'); + // check that this bit and all above it are zero (to ensure correct ordering). 
+ checkArgument(mask >> shift == 0, "unordered range in specification: %s", s); + if (n == end || s.charAt(n) != '-') { + // Single digit not in a range. + mask |= 1 << shift; + continue; + } + n++; + checkArgument(n < end, "unclosed range in specification: %s", s); + c = s.charAt(n++); + checkArgument('0' <= c && c <= '9', + "bad digit value '%s' in range specification: %s", c, s); + int rshift = (c - '0'); + checkArgument(rshift > shift, "unordered range in specification: %s", s); + // Set bits from shift to rshift inclusive (e.g. 11111 & ~11 = 11100). + mask |= ((1 << (rshift + 1)) - 1) & ~((1 << shift) - 1); + } + return (char) mask; + } + + /** + * Returns the canonical representation of the given ranges. The number of range specifications + * in the returned instance may be higher or lower than the number of given ranges. + *

+ * NOTE: This is only used by RangeTree for generating a RangeTree from a RangeSet, and is not + * suitable as a public API (one day we might generate the RangeTree directly and be able to + * delete this code). + */ + static ImmutableList from(RangeSet ranges) { + List specs = new ArrayList<>(); + Set> s = ranges.asRanges(); + checkArgument(!s.isEmpty(), "empty range set not permitted"); + // Make sure are ranges we use are canonicalized over the domain of DigitSequences (so Range + // operations (e.g. isConnected()) work as expected. See Range for more on why this matters. + Range cur = s.iterator().next().canonical(domain()); + checkArgument(!cur.contains(DigitSequence.empty()), + "empty digit sequence not permitted in range set"); + for (Range next : Iterables.skip(ranges.asRanges(), 1)) { + next = next.canonical(domain()); + if (cur.isConnected(next)) { + // Even though 'cur' and 'next' are both canonicalized, it's not guaranteed that they are + // closed-open (singleton ranges are fully closed and any range containing the maximum + // value must be closed. To "union" the two ranges we must also preserve the bound types. + cur = Range.range( + cur.lowerEndpoint(), cur.lowerBoundType(), + next.upperEndpoint(), next.upperBoundType()) + .canonical(domain()); + continue; + } + addRangeSpecsOf(cur, specs); + cur = next; + } + addRangeSpecsOf(cur, specs); + return ImmutableList.sortedCopyOf(Comparator.comparing(RangeSpecification::min), specs); + } + + /** Adds the canonical minimal range specifications for a single range to the given list. */ + private static void addRangeSpecsOf(Range r, List specs) { + // Given range is already canonical but may span multiple lengths. It's easier to view this + // as a contiguous set when finding first/last elements however to avoid worrying about bound + // types. A contiguous set is not an expensive class to create. 
+ ContiguousSet s = ContiguousSet.create(r, domain()); + DigitSequence start = s.first(); + DigitSequence end = s.last(); + while (start.length() < end.length()) { + // Add to "999..." for the current block length (the max domain value is all 9's). + DigitSequence blockEnd = DigitSequence.nines(start.length()); + addRangeSpecs(start, blockEnd, specs); + // Reset the start to the next length up (i.e. the "000..." sequence that's one longer). + start = blockEnd.next(); + } + // Finally and the range specs up to (and including) the end value. + addRangeSpecs(start, end, specs); + } + + // Adds canonical minimal range specifications for the range of same-length digit sequences. + private static void addRangeSpecs( + DigitSequence start, DigitSequence end, List specs) { + int length = start.length(); + checkArgument(end.length() == length); + + // Masks contains a running total of the bitmasks we want to convert to RangeSpecifications. + // As processing proceeds, the mask array is reused. This is because the prefix used for + // successive range specifications is always a subset of the previous specifications and the + // trailing part of the array always fills up with the range mask for 'x' (i.e. [0-9]). + int[] masks = new int[length]; + + // Stage 1: + // Starting from the last digit in the 'start' sequence, work up until we find something that + // is not a '0'. This is the first digit that needs to be adjusted to create a range + // specification covering it and the digits 'below' it. For example, the first specification + // for the range ["1200".."9999"] is "1[2-9]xx". + // Once a specification is emitted, the start value is adjusted to the next digit sequence + // immediately above the end of the emitted range, so after emitting "1[2-9]xx", start="2000". + // Once each range specification is emitted, we continue working 'up' the digit sequence until + // the next calculated start value exceeds the 'end' of our range. 
This specification cannot + // be emitted and signals the end of stage 1. + setBitmasks(masks, start); + for (int n = previousNon(0, start, length); n != -1; n = previousNon(0, start, n)) { + int loDigit = start.getDigit(n); + DigitSequence prefix = start.first(n); + DigitSequence blockEnd = prefix.extendBy(DigitSequence.nines(length - n)); + if (blockEnd.compareTo(end) > 0) { + // The end of this block would exceed the end of the main range, so we must stop. + break; + } + // The bitmasks we want is: + // [loDigit..9] + masks[n] = bitmaskUpFrom(loDigit); + fillBitmasksAfter(masks, n); + specs.add(RangeSpecification.fromBitmasks(masks)); + // Adjust the range start now we have emitted the range specification. + start = blockEnd.next(); + } + + // Stage 2: + // Very similar to stage 1, but work up from the last digit in the 'end' sequence. The + // difference now is that we look for the first digit that's not '9' and generate ranges that + // go down to the start of the range, not up to the end. Thus for ["0000", "1299"] the first + // specification generated is "1[0-2]xx", which is emitted at the end of the list. + int midIdx = specs.size(); + setBitmasks(masks, end); + for (int n = previousNon(9, end, length); n != -1; n = previousNon(9, end, n)) { + int hiDigit = end.getDigit(n); + DigitSequence prefix = end.first(n); + DigitSequence blockStart = prefix.extendBy(DigitSequence.zeros(length - n)); + if (blockStart.compareTo(start) < 0) { + // The start of this block would precede the start of the main range, so we must stop. + break; + } + // The bitmasks we want is: + // [0..hiDigit] + masks[n] = bitmaskDownFrom(hiDigit); + fillBitmasksAfter(masks, n); + specs.add(midIdx, RangeSpecification.fromBitmasks(masks)); + // Adjust the range end now we have emitted the range specification. 
+ end = blockStart.previous(); + } + + // Stage 3: Having emitted the first and last set of range specifications, it only remains to + // emit the "center" specification in the middle of the list. This is special as neither bound + // is the end of a block. In previous stages, all partial ranges are either "up to 9" or + // "down to zero". For example: ["1234".."1789"] has the center range "1[3-6]xx", and + // ["1234".."1345"] has no center range at all. + if (start.compareTo(end) < 0) { + // Find the last digit before start and end combine (ie, 1200, 1299 --> 12xx --> n=1). We + // know that 'start' and 'end' are the same length and bound a range like: + // [X..Y] [000..999] + // but X or Y could be 0 or 9 respectively (just not both). + // + // Note that we don't even both to test the first digit in the sequences because if 'start' + // and 'end' span a full range (e.g. [000.999]) we can just use the same code to fill the + // masks correctly anyway. + int n = start.length(); + while (--n > 0 && start.getDigit(n) == 0 && end.getDigit(n) == 9) {} + // Bitwise AND the masks for [X..9] and [0..Y] to get the mask for [X..Y]. + // Note that the "masks" array already contains the correct prefix digits up to (n-1). + masks[n] = bitmaskUpFrom(start.getDigit(n)) & bitmaskDownFrom(end.getDigit(n)); + fillBitmasksAfter(masks, n); + specs.add(midIdx, RangeSpecification.fromBitmasks(masks)); + } + } + + // Sets the values in the given array to correspond to the digits in the given sequence. If a + // range specification were made from the resulting array it would match only that digit sequence. + private static void setBitmasks(int[] masks, DigitSequence s) { + for (int n = 0; n < s.length(); n++) { + masks[n] = 1 << s.getDigit(n); + } + } + + /** + * Creates a range specification from a given array of integer masks. The Nth element of the + * array corresponds to the Nth element in the range specification, and mask values must be + * non-zero and have only bits 0 to 9 set. 
+ */ + private static RangeSpecification fromBitmasks(int[] bitmasks) { + checkArgument(bitmasks.length <= DigitSequence.MAX_DIGITS, + "range specification too large"); + StringBuilder s = new StringBuilder(bitmasks.length); + s.setLength(bitmasks.length); + for (int n = 0; n < bitmasks.length; n++) { + int mask = bitmasks[n]; + checkArgument(mask > 0 && mask <= ALL_DIGITS_MASK, "invalid bitmask: %s", mask); + s.setCharAt(n, (char) mask); + } + return new RangeSpecification(s.toString()); + } + + // Fills the bitmasks after the given index with the "all digits" mask (i.e. matching [0-9]). + // This can accept -1 as the index since it always pre-increments before using it. + private static void fillBitmasksAfter(int[] masks, int n) { + // Because of the iterative way the mask array is handled, we can stop filling when we hit + // ALL_DIGITS_MASK because everything past that must already be filled. + while (++n < masks.length && masks[n] != ALL_DIGITS_MASK) { + masks[n] = ALL_DIGITS_MASK; + } + } + + // Starting at digit-N, returns the index of the nearest preceding digit that's not equal to the + // given value (or -1 if no such digit exists). + private static int previousNon(int digit, DigitSequence s, int n) { + while (--n >= 0 && s.getDigit(n) == digit) {} + return n; + } + + /** Returns the bitmask for the range {@code [n-9]}. */ + private static int bitmaskUpFrom(int n) { + return (-1 << n) & ALL_DIGITS_MASK; + } + + /** Returns the bitmask for the range {@code [0-n]}. */ + private static int bitmaskDownFrom(int n) { + return ALL_DIGITS_MASK >>> (9 - n); + } + + + // String containing one bitmasks per character (bits 0..9). + private final String bitmasks; + // Minimum and maximum sequences (inclusive) which span the ranges defined by this specification. + // Caching this is deliberate, since we sort disjoint ranges using the minimum value. It might + // not be so useful to cache the maximum value though. 
+ private final DigitSequence min; + private final DigitSequence max; + // Total number of sequences matched by this specification. + private final long sequenceCount; + + private RangeSpecification(String bitmasks) { + int length = bitmasks.length(); + checkArgument(length <= DigitSequence.MAX_DIGITS, + "Range specification too long (%s digits)", length); + this.bitmasks = bitmasks; + long minValue = 0; + long maxValue = 0; + long sequenceCount = 1; + for (int n = 0; n < length; n++) { + int mask = bitmasks.charAt(n); + checkArgument(mask > 0 && mask <= ALL_DIGITS_MASK, "invalid bitmask: %s", mask); + minValue = (minValue * 10) + numberOfTrailingZeros(mask); + maxValue = (maxValue * 10) + (31 - numberOfLeadingZeros(mask)); + sequenceCount *= Integer.bitCount(mask); + } + this.min = new DigitSequence(length, minValue); + this.max = new DigitSequence(length, maxValue); + this.sequenceCount = sequenceCount; + } + + /** + * Returns the number of digits that this specification can match. This is the length of all + * digit sequences which can match this specification. + */ + public int length() { + return bitmasks.length(); + } + + /** Returns the smallest digit sequence matched by this range. */ + public DigitSequence min() { + return min; + } + + /** Returns the largest digit sequence matched by this range. */ + public DigitSequence max() { + return max; + } + + /** Returns the total number of digit sequences matched by (contained in) this specification. */ + public long getSequenceCount() { + return sequenceCount; + } + + /** + * Returns the bitmask of the Nth range in this specification. Bit-X (0<= X <= 9) corresponds to + * the digit with value X. As every range in a specification must match at least one digit, this + * mask can never be zero. + */ + public int getBitmask(int n) { + return bitmasks.charAt(n); + } + + /** + * Returns whether the given digit sequence is in one of the ranges specified by this instance. 
+ * This is more efficient that obtaining the associated {@code RangeSet} and checking that. + */ + public boolean matches(DigitSequence digits) { + if (digits.length() != length()) { + return false; + } + for (int n = 0; n < length(); n++) { + if ((bitmasks.charAt(n) & (1 << digits.getDigit(n))) == 0) { + return false; + } + } + return true; + } + + // Returns the next sequence in forward order which is contained by a range defined by this + // range specification, or null if none exists. The given sequence must not be matched by this + // specification. + private DigitSequence nextRangeStart(DigitSequence s) { + // Easy length based checks (this is where the fact that range specification only define ranges + // of the same length really simplifies things). + if (s.length() < length()) { + return min(); + } else if (s.length() > length()) { + return null; + } + // Algorithm: + // 1) Find the highest digit that isn't in the corresponding bitmask for the range. + // 2) Try and increase the digit value until it's inside the next available range. + // 3) If that fails, move back up the sequence and increment the next digit up. + // 4) Repeat until a digit can be adjusted to start a new range, or all digits are exhausted. + // If all digits exhausted, the sequence was above all ranges in this specification. + // Otherwise return a new sequence using the unchanged prefix of the original sequence, the + // newly adjusted digit and the trailing digits of the minimal sequence. + for (int n = 0; n < length(); n++) { + int d = s.getDigit(n); + int mask = bitmasks.charAt(n); + if ((mask & (1 << d)) != 0) { + continue; + } + while (true) { + // Digit 'd' is either outside the range mask (first time though the loop) or inside a + // range. Either way we want to find the next digit above it which is inside a range. + // First increment 'd', and then find the next set bit in the mask at or above that point. 
+ // Not extra check is needed at the end of ranges because numberOfTrailingZeros(0)==32 + // which neatly ensures that the new value of 'd' must be out-of-range. + // If mask=[3-58]: d=1-->d'=3, d=4-->d'=5, d=5-->d'=8, d=8-->d'>9 + d++; + d += numberOfTrailingZeros(mask >>> d); + if (d <= 9) { + // Found the value of the largest digit which can be adjusted to start the next range. + // Everything higher than this digit is the same as the original sequence and everything + // lower that this digit is the same as the corresponding digit in the minimal value. + return s.first(n).extendBy(d).extendBy(min.last((length() - n) - 1)); + } + // No more bits available in this range, so go back up to the previous range. + if (--n < 0) { + // The sequence was above the last element in the set. + // Example: Range Spec: 1[2-8][3-8]456, Sequence: 188457 + return null; + } + d = s.getDigit(n); + mask = bitmasks.charAt(n); + } + } + // If we finish the outer loop the given sequence was in a range (which is an error). + throw new IllegalArgumentException( + "Digit sequence '" + s + "' is in the range specified by: " + this); + } + + // Given a sequence inside a range defined by this specification, return the highest sequence + // in the current range (possibly just the given sequence). + private DigitSequence currentRangeEnd(DigitSequence s) { + // Build up a value representing the trailing digits (which must always be 9's). + long nines = 0; + for (int n = length() - 1; n >= 0; n--, nines = (10 * nines) + 9) { + int mask = bitmasks.charAt(n); + if (mask == ALL_DIGITS_MASK) { + continue; + } + // The new digit is the top of the current range that the current sequence digit is in. + int d = nextUnsetBit(mask, s.getDigit(n)) - 1; + DigitSequence end = + s.first(n).extendBy(d).extendBy(new DigitSequence((length() - n) - 1, nines)); + // Edge case for cases like "12[34][09]x" where "1239x" and "1240x" abut. 
This adjustment + // will happen at most once because the second range cannot also include an upper bound + // ending at '9', since otherwise (mask == ALL_DIGITS_MASK) at this position. The next + // sequence must be terminated with zeros starting at the current position having "rolled + // over" on the digit above. + if (d == 9) { + DigitSequence next = end.next(); + if (matches(next)) { + d = nextUnsetBit(mask, 0) - 1; + end = next.first(n).extendBy(d).extendBy(new DigitSequence((length() - n) - 1, nines)); + } + } + return end; + } + // The range specification is entirely 'x', which means it's a single range. + return max; + } + + /** + * Returns a generating iterator which iterates in forward order over the disjoint ranges defined + * by this specification. This is not actually as useful as you might expect because in a lot of + * cases you would be dealing with a sequence of range specifications and it's not true that all + * ranges from multiple specifications are disjoint. + */ + Iterable> asRanges() { + return () -> new Iterator>() { + // Start is always in a range. + private DigitSequence start = min; + + @Override + public boolean hasNext() { + return start != null; + } + + @Override + public Range next() { + DigitSequence end = currentRangeEnd(start); + Range r = Range.closed(start, end).canonical(DigitSequence.domain()); + start = nextRangeStart(end.next()); + return r; + } + }; + } + + /** + * Returns a new range specification which is extended by the given mask value. For example: + *

{@code
+   * "0123[4-6]".extendByMask(7) == "0123[4-6][0-2]"
+   * }
+ */ + public RangeSpecification extendByMask(int mask) { + checkArgument(mask > 0 && mask <= ALL_DIGITS_MASK, "bad mask value '%s'", mask); + return new RangeSpecification(bitmasks + ((char) mask)); + } + + /** + * Returns a new range specification which is extended by the given specification. For example: + *
{@code
+   * "0123[4-6]".extendBy("7[89]") == "0123[4-6]7[89]"
+   * }
+ */ + public RangeSpecification extendBy(RangeSpecification extra) { + return new RangeSpecification(bitmasks + extra.bitmasks); + } + + /** + * Returns a new range specification which is extended by a sequence of any digits of the given + * length. For example: + *
{@code
+   * "012".extendByLength(4) == "012xxxx"
+   * }
+ */ + public RangeSpecification extendByLength(int length) { + return this.extendBy(any(length)); + } + + /** + * Returns a range specification containing only the first {@code n} digits. If the given length + * is the same or greater than the specification's length, this specification is returned. + * For example: + *
{@code
+   * "01[2-4]xx".first(8) == "01[2-4]xx" (same instance)
+   * "01[2-4]xx".first(5) == "01[2-4]xx" (same instance)
+   * "01[2-4]xx".first(3) == "01[2-4]"
+   * "01[2-4]xx".first(0) == "" (the empty specification)
+   * }
+ */ + public RangeSpecification first(int n) { + checkArgument(n >= 0); + if (n == 0) { + return empty(); + } + return n < length() ? new RangeSpecification(bitmasks.substring(0, n)) : this; + } + + /** + * Returns a range specification containing only the last {@code n} digits. If the given length + * is the same or greater than the specification's length, this specification is returned. + * For example: + *
{@code
+   * "01[2-4]xx".last(8) == "01[2-4]xx" (same instance)
+   * "01[2-4]xx".last(5) == "01[2-4]xx" (same instance)
+   * "01[2-4]xx".last(3) == "[2-4]xx"
+   * "01[2-4]xx".last(0) == "" (the empty specification)
+   * }
+ */ + public RangeSpecification last(int n) { + checkArgument(n >= 0); + if (n == 0) { + return empty(); + } + return n < length() ? new RangeSpecification(bitmasks.substring(length() - n)) : this; + } + + /** + * Returns a range specification with any trailing "any digit" sequence removed. For example: + *
{@code
+   * "0123".getPrefix() == "0123" (same instance)
+   * "0123xx".getPrefix() == "0123"
+   * "xxx".getPrefix() == "" (the empty specification)
+   * }
+ */ + public RangeSpecification getPrefix() { + int length = length(); + while (length > 0 && getBitmask(length - 1) == ALL_DIGITS_MASK) { + length--; + } + return first(length); + } + + @Override + public int compareTo(RangeSpecification other) { + int length = Math.min(length(), other.length()); + for (int i = 0; i < length; i++) { + int mask = getBitmask(i); + int otherMask = other.getBitmask(i); + if (mask == otherMask) { + continue; + } + int commonBits = mask & otherMask; + mask -= commonBits; + otherMask -= commonBits; + // At least one mask is still non-zero and they don't overlap. + // + // The mask with the lowest set bit is the smaller mask in the ordering, since that bit + // distinguishes a smaller prefix than can never exist in the other specification. + // Testing the number of trailing zeros is equivalent to finding the lowest set bit. + return Integer.compare(numberOfTrailingZeros(mask), numberOfTrailingZeros(otherMask)); + } + return Integer.compare(length(), other.length()); + } + + @Override + public boolean equals(Object o) { + return (o instanceof RangeSpecification) && bitmasks.equals(((RangeSpecification) o).bitmasks); + } + + @Override + public int hashCode() { + return bitmasks.hashCode(); + } + + /** + * If you want lexicographical ordering of range specifications, don't use this method, use the + * {@code min().toString()}. This works assuming the ranges being compared are disjoint. + */ + @Override + public String toString() { + // Consider caching if it turns out that we are serializing a lot of these. + StringBuilder s = new StringBuilder(); + for (int n = 0; n < bitmasks.length(); n++) { + appendMask(bitmasks.charAt(n), s); + } + return s.toString(); + } + + /** Returns the string representation of a single bit-mask. 
*/ + public static String toString(int bitMask) { + checkArgument(bitMask > 0 && bitMask < (1 << 10), "bad mask value: %s", bitMask); + return appendMask(bitMask, new StringBuilder()).toString(); + } + + static StringBuilder appendMask(int mask, StringBuilder out) { + if (mask == ALL_DIGITS_MASK) { + out.append('x'); + } else if (hasOneBit(mask)) { + out.append(asChar(numberOfTrailingZeros(mask))); + } else { + out.append('['); + for (int loBit = numberOfTrailingZeros(mask); + loBit != 32; + loBit = numberOfTrailingZeros(mask)) { + // Always append the loBit digit into the range. + out.append(asChar(loBit)); + int hiBit = nextUnsetBit(mask, loBit); + int numBits = hiBit - loBit; + if (numBits > 1) { + // Stylistically prefer "[34]" to "[3-4]" for compactness. + if (numBits > 2) { + out.append('-'); + } + out.append(asChar(hiBit - 1)); + } + // Clear the bits we've just processed before going back round the loop. + mask &= ~((1 << hiBit) - 1); + } + out.append(']'); + } + return out; + } + + // Turns a value in the range [0-9] into the corresponding ASCII character. + private static char asChar(int digit) { + return (char) ('0' + digit); + } + + // Determines if the given bit-mask has only one bit set. 
+ private static boolean hasOneBit(int mask) { + return (mask & (mask - 1)) == 0; + } + + private static int nextUnsetBit(int mask, int bit) { + // Example mask transform for [013-589] if bit=3: + // v-- bit=3 + // 01100111011 + // 00000000111 (1 << 3) - 1 + // 01100111111 OR with mask + // 10011000000 Bitwise NOT + // ^-- return=6 + return numberOfTrailingZeros(~(mask | ((1 << bit) - 1))); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTree.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTree.java new file mode 100644 index 000000000..b83f6b06c --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTree.java @@ -0,0 +1,1342 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; +import static java.lang.Integer.numberOfTrailingZeros; + +import com.google.common.base.Preconditions; +import com.google.common.base.Supplier; +import com.google.common.base.Suppliers; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableRangeSet; +import com.google.common.collect.ImmutableSortedSet; +import com.google.common.collect.RangeSet; +import java.lang.ref.Reference; +import java.lang.ref.ReferenceQueue; +import java.lang.ref.WeakReference; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Stream; +import javax.annotation.Nullable; + +/** + * Minimal decision tree for matching digit sequences. A range tree represents an arbitrary set of + * digit sequences typically grouped as a set of disjoint ranges. A range tree can be thought of + * as equivalent to either {@code RangeSet} or a canonical + * {@code List}. Range trees can have set-like operations performed on them, + * such as union and intersection. + */ +public final class RangeTree { + /** + * Simple API for representing nodes in the DFA during visitation. See also {@link DfaVisitor} + * and {@link #accept(DfaVisitor)}. + */ + public interface DfaNode { + /** Returns whether this node can terminate. */ + boolean canTerminate(); + + /** Accepts the given visitor on this node, visiting its immediate outgoing (child) edges. 
*/ + void accept(DfaVisitor visitor); + + /** Finds the outgoing edge from this node which contains the given digit. */ + @Nullable + DfaEdge find(int digit); + + /** Returns the list of edges leading out from this node.*/ + List getEdges(); + + /** + * Returns a bit-mask of possible lengths from this node. Bit-N is set if the sub-tree rooted + * at this node can terminate (in any branch) at depth N. A corollary to this is that bit-0 is + * set if this node can terminate. + */ + int getLengthMask(); + } + + /** + * Simple API for representing edges in the DFA during visitation. See also {@link DfaVisitor} + * and {@link #accept(DfaVisitor)}. + */ + public interface DfaEdge { + /** Returns a bit-mask of the digits accepted by this edge. */ + int getDigitMask(); + + /** Returns the target node of this edge. */ + DfaNode getTarget(); + } + + /** + * Visitor API for traversing edges in {@code RangeTrees}. When a node accepts a visitor, it + * visits only the immediate outgoing edges of that node. If recursive visitation is required, it + * is up to the visitor to call {@link DfaNode#accept(DfaVisitor)} during visitation. + * + *

Graph nodes and edges obey {@code Object} equality and this can be used by visitor + * implementations to track which nodes have been reached if they only wish to visit each edge + * once (e.g. storing visited nodes in a set or map). + */ + public interface DfaVisitor { + /** + * Visits an edge in the DFA graph of a {@code RangeTree} as the result of a call to + * {@link DfaNode#accept(DfaVisitor)} on the source node. + */ + void visit(DfaNode source, DfaEdge edge, DfaNode target); + } + + /** + * A single node within a range tree. Nodes are really just a specialized implementation of a + * node in a deterministic finite state automaton (DFA), and {@link RangeTree} instances are just + * wrappers around a single "root" node. + *

+ * Nodes have outgoing {@link Edge}s but may optionally allow termination of matching operations + * when they are reached (this is the same as in DFAs). Unlike DFAs however, the out-edges of a + * node are grouped according to a mask of all digits which can reach the same target node. For + * node {@code A}, which can reach a target node {@code B} via digits {@code {1, 2, 3, 7, 9 }}, + * there is a single edge labeled with the bitmask {@code 0x28E} (binary {@code 1010001110}) as + * opposed to 5 separate edges, each marked with a single digit. + *

+ * This approach more closely matches the data representations in classes like {@link + * RangeSpecification} and affords additional efficiencies compared to the {@code JAutomata} + * library, which has performance issues when processing large trees. + */ + private static final class Node implements DfaNode { + /** The unique "terminal" node which must be the final node in all paths in all RangeTrees. */ + private static final Node TERMINAL = new Node(); + + /** + * The list of edges, ordered by the lowest bit in each mask. The masks for each edge must be + * mutually disjoint. Only the terminal node is permitted to have zero outbound edges. + */ + private final ImmutableList edges; + + /** + * Derived bit-packed table of digit-number to edge-index. The edge index for digit {@code 'd'} + * is stored in 4-bits, starting at bit {@code 4 * d}. This is useful as it makes finding an + * outbound edge a constant time operation (instead of having to search through the list of + * edges). + */ + private final long jumpTable; + + /** A cached value of the total number of unique digit sequences matched by this DFA. */ + private final long matchCount; + + /** + * Mask of all possible lengths of digit sequences this node can match. If bit-N is set then + * this node can match at least one input sequence of length N. Note that this includes bit-0, + * which is matched if the node itself can terminate. It is even possible to have a range tree + * containing only the terminal node which matches only the empty digit sequence (this is + * distinct from the "empty" tree which matches no sequences at all). + */ + private final int lengthMask; + + /** Nodes are used as keys during graph interning so we cache the hashcode. */ + private final int hashcode; + + // Only for the terminal node. + private Node() { + this.edges = ImmutableList.of(); + this.jumpTable = -1L; + // Unlike the empty tree, the terminal node matches one input sequence, the empty sequence. 
+ // The empty tree on the other hand doesn't even reference a terminal node, so there is no + // possible sequence it can match. + this.matchCount = 1L; + this.lengthMask = 1; + this.hashcode = -1; + } + + // A node is defined entirely from its set of edges and whether it can terminate. + private Node(ImmutableList edges, boolean canTerminate) { + checkArgument(!edges.isEmpty()); + this.edges = edges; + // Everything below here is derived information from the edges and termination flag. + int lengthMask = 0; + // Set all bits in the jump table (so each 4-bit entry is '-1' unless otherwise overwritten). + long jumpTable = -1L; + int outMask = 0; + int lastLowBit = -1; + // If we can terminate we get an additional match count for the sequence that reaches us, but + // we may get more for longer sequences matched by nodes we link to. + long matchCount = canTerminate ? 1 : 0; + for (int n = 0; n < edges.size(); n++) { + Edge e = edges.get(n); + // Make sure edges are disjoint (edges masks are already known to be valid individually). + checkArgument((outMask & e.mask) == 0, "edge masks not disjoint: %s", e); + outMask |= e.mask; + // Make sure edges are ordered as expected. + int lowBit = numberOfTrailingZeros(e.mask); + checkArgument(lowBit > lastLowBit, "edge masks not ordered: %s", e); + lastLowBit = lowBit; + // Work out what the match count is based on the counts of everything we link to (the sum + // of all the counts of our target nodes weighted by how many times we link to them). + matchCount += Integer.bitCount(e.mask) * e.target.matchCount; + // Build up a mask of all the lengths that our target node can contain. + lengthMask |= e.target.lengthMask; + // For each bit in the edge mask, set the 4-bit nibble in the jump table to the edge index. + for (int d = 0; d <= 9; d++) { + if ((e.mask & (1 << d)) != 0) { + // We rely on the jump table entry having all its bits set here (true from above). + // n = 1010 (9). + // (n ^ 1111) << (4d) == 0000.0101.0000... 
+ // xxxx.1111.yyyy... ^ 0000.0101.0000... == xxxx.1010.yyyy + jumpTable ^= (n ^ 0xFL) << (4 * d); + } + } + } + this.jumpTable = jumpTable; + this.matchCount = matchCount; + // Our length set is one more than all our targets (including bit zero if we can terminate). + this.lengthMask = (lengthMask << 1) | (canTerminate ? 1 : 0); + // Caching the hashcode makes interning faster (note that this is not recursive because the + // hashcode of an Edge relies on the identity hashcode of the target nodes). + this.hashcode = edges.hashCode() ^ Boolean.hashCode(canTerminate); + } + + /** + * Returns the target node for the given input digit, or {@code null} if there is no out-edge + * for that digit. + */ + Node findTarget(int digit) { + checkArgument(0 <= digit && digit <= 9); + return targetFromJumpTableIndex((int) (jumpTable >>> (4 * digit)) & 0xF); + } + + /** Helper to get the target node from an edge index (rather than a digit value). */ + private Node targetFromJumpTableIndex(int n) { + return (n != 0xF) ? edges.get(n).target : null; + } + + @Nullable + @Override + public DfaEdge find(int digit) { + checkArgument(0 <= digit && digit <= 9); + int jumpTableIndex = (int) (jumpTable >>> (4 * digit)) & 0xF; + return (jumpTableIndex != 0xF) ? edges.get(jumpTableIndex) : null; + } + + @Override + public boolean canTerminate() { + return (lengthMask & 1) != 0; + } + + @Override + public void accept(DfaVisitor visitor) { + for (Edge e : edges) { + visitor.visit(this, e, e.target); + } + } + + @Override + public List getEdges() { + // NOTE: This DOES NOT make a copy (or any allocations), since ImmutableList is clever + // enough to know that a list of is also a List if they are + // unmodifiable. It's a clever cast essentially! + return ImmutableList.copyOf(edges); + } + + @Override + public int getLengthMask() { + return lengthMask; + } + + /** + * Returns whether this node is interchangeable with the given instance. 
Equality of + * {@code Node} instances is not "deep" equality and is carefully designed to make constructing + * minimal range trees easier. Two nodes are equal if they have the same set of edges and + * termination flag; however edges are equal only if they point to exactly the same target + * instances. This is carefully designed to make "interning" work efficiently and avoid + * unwanted recursion during various operations. + */ + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (!(o instanceof Node)) { + return false; + } + Node tree = (Node) o; + return edges.equals(tree.edges) && canTerminate() == tree.canTerminate(); + } + + @Override + public int hashCode() { + return hashcode; + } + } + + /** + * A directed edge to a target {@link Node}. Note that edge equality is based on instance identity + * as part of the interning semantics of range trees. + */ + private static final class Edge implements DfaEdge { + /** Bit mask of digit values this edge accepts (bit-N set implies this edge accepts digit N). */ + private final int mask; + /** Target node (or node function) this edge points at. */ + private final Node target; + + private Edge(int mask, Node target) { + checkArgument(mask > 0 && mask <= RangeSpecification.ALL_DIGITS_MASK); + this.mask = mask; + this.target = checkNotNull(target); + } + + /** Returns a new edge with the same target whose mask is OR'ed with the given value. */ + Edge merge(int m) { + return new Edge(mask | m, target); + } + + @Override + public int getDigitMask() { + return mask; + } + + @Override + public DfaNode getTarget() { + return target; + } + + /** + * Edges are equal only if they point to exactly the same targets. This is important to avoid + * expensive recursive equality checking during common operations. 
+ */ + @Override + @SuppressWarnings("ReferenceEquality") + public boolean equals(Object o) { + if (!(o instanceof Edge)) { + return false; + } + Edge other = (Edge) o; + return mask == other.mask && target == other.target; + } + + @Override + public int hashCode() { + return mask ^ System.identityHashCode(target); + } + + /** The natural string representation of an edge is the set of digits it accepts. */ + @Override + public String toString() { + return RangeSpecification.toString(mask); + } + } + + /** + * Implementation of set-like operations for range trees. As well as having well defined set-like + * operations (union, intersection etc...), range trees are also minimal DFAs, ensuring that no + * duplication of sub-trees occurs. This class implements set-like operations efficiently using + * recursive interning of nodes and always produces minimal results by construction. This + * approach is similar to (but not the same as) dynamic programming. + *

+ * Note that the terms "interning" and "minimizing" of range trees are related but not the same. + * Interning only makes sense in relation to some sequence of operations (and is a pure + * implementation detail for efficiency reasons). A minimal sub-tree exists outside of logical + * operations and may be the result of a logical operation, or something else. Minimization of + * the DFA represented by a range tree is a user concept. + *

+ * Currently all logical operations produce minimal DFAs, but a minimal DFA can come from other + * sources (the DFA generated for a single range specification is minimal by construction). + */ + public static final class SetOperations { + // A weak reference wrapper around a Node allowing it to be garbage collected when not needed. + private static final class WeakNodeRef extends WeakReference { + // This MUST be cached since it cannot be calculated once the Node is garbage collected, and + // it's required that keys in maps do not change their hashcode. + private final int hashCode; + + public WeakNodeRef(Node referent, ReferenceQueue q) { + super(checkNotNull(referent), q); + this.hashCode = referent.hashCode(); + } + + @Override + public int hashCode() { + return hashCode; + } + + /* + * This is very subtle. To avoid multiple "cleared" references becoming equal to each other + * (which violates the expectations of a map) we consider other nodes equal only if either: + * 1) they are the same instance + * 2) they have a non-null referent that's equal() to our referent + * + * Two distinct cleared references with the same hashcode must not compare as equal. + */ + @Override + public boolean equals(Object obj) { + if (obj == this) { + return true; + } + // Don't worry about checking "instanceof" since this is a private type. + Node referent = get(); + return referent != null && referent.equals(((WeakNodeRef) obj).get()); + } + } + + /** Minimal API for any logical operation which can be applied to two range trees. */ + interface LogicalOperation extends BiFunction { } + + /** + * Implementation of the "union" operation for two sub-trees. The union of two sub-trees is a + * sub-tree which matches digit sequences if-and-only-if they are matched by either of the + * original sub-trees. 
+ */ + private final class Union implements LogicalOperation { + @Override + @SuppressWarnings("ReferenceEquality") + public Node apply(Node lhs, Node rhs) { + // Assert that inputs are always interned. + // NOTE: It might be worth doing checks for "TERMINAL" here as well. + if (lhs == rhs || rhs == null) { + // (A ∪ A) = A and (A ∪ ∅) = A + return lhs; + } else if (lhs == null) { + // (∅ ∪ B) = B + return rhs; + } + return recurse(this, lhs, rhs, lhs.canTerminate() || rhs.canTerminate()); + } + } + + /** + * Implementation of the "intersection" operation for two sub-trees. The intersection of two + * sub-trees is a sub-tree which matches digit sequences if-and-only-if they are matched by + * both the original sub-trees. + */ + private final class Intersection implements LogicalOperation { + @Override + @SuppressWarnings("ReferenceEquality") + public Node apply(Node lhs, Node rhs) { + // Assert that inputs are always interned. + // NOTE: It might be worth doing checks for "TERMINAL" here as well. + if (lhs == rhs) { + // (A ∩ A) = A + return lhs; + } else if (lhs == null || rhs == null) { + // (∅ ∩ X) = ∅ for any X + return null; + } + return recurse(this, lhs, rhs, lhs.canTerminate() && rhs.canTerminate()); + } + } + + /** + * Implementation of the "subtraction" operation for two sub-trees. The subtraction of two + * sub-trees {@code A} and {@code B}, is a sub-tree which matches digit sequences if-and-only-if + * they are matched by {@code A} but not {@code B}. This is not a symmetrical operation. + */ + private final class Subtraction implements LogicalOperation { + @Override + @SuppressWarnings("ReferenceEquality") + public Node apply(Node lhs, Node rhs) { + // Assert that inputs are always interned. + // NOTE: It might be worth doing checks for "TERMINAL" here as well. 
+ if (lhs == rhs || lhs == null) { + // (A ∖ A) = ∅ and (∅ ∖ B) = ∅ + return null; + } else if (rhs == null) { + // (A ∖ ∅) = A + return lhs; + } + return recurse(this, lhs, rhs, lhs.canTerminate() && !rhs.canTerminate()); + } + } + + private final class Filter implements LogicalOperation { + // IMPORTANT: The prefix is neither returned, nor tested directly against instances in the + // range tree being filtered (other than the singleton TERMINAL node) which means it need + // not be interned before calling this function. If this method were ever changed to return + // nodes from the prefix tree or test instance equality with (interned) nodes in the range + // tree, then the prefix tree must also be interned before this method is called. + @Override + @SuppressWarnings("ReferenceEquality") + public Node apply(Node prefix, Node range) { + // Assert that ranges are always interned (prefixes don't need to be since we never return + // nodes in the prefix tree to form part of the filtered range). + if (prefix == null || range == null) { + return null; + } + // If we get to the end of the prefix, just return whatever's left in the range. + if (prefix == Node.TERMINAL) { + return range; + } + // Still "in" the prefix but we hit the end of the range (e.g. "123".filter("12") == ∅ + if (range == Node.TERMINAL) { + return null; + } + // Since we only recurse while still "in" the prefix we are never terminating (e.g. + // "123".filter({"12", "1234"}) == {"1234"} and does not contain "12"). + return recurse(this, prefix, range, false); + } + } + + // Singleton set operations instance to handle interning of nodes. + static final SetOperations INSTANCE = new SetOperations(); + + /** + * Weak-referenced interning map. This cannot be a standard Guava {@code Interner} because it + * must recursively intern the targets of any nodes to ensure that once a Node is interned, all + * the nodes reachable from it are also interned. 
+ */ + private final Map interningMap = new ConcurrentHashMap<>(); + + /** + * Referent queue onto which node references clear by GC will be put. The elements in this + * queue should correspond to unused entries in the map which need to be tidied up. + */ + private final ReferenceQueue tidyUpQueue = new ReferenceQueue<>(); + + private final LogicalOperation unionFn = new Union(); + private final LogicalOperation intersectionFn = new Intersection(); + private final LogicalOperation subtractionFn = new Subtraction(); + private final LogicalOperation retainFromFn = new Filter(); + + private SetOperations() { + intern(Node.TERMINAL); + } + + /** + * Interns the target of an edge (this does not make the edge itself interned, but it does + * allow edges to be efficiently compared via their targets). If the target of the given edge + * was already interned then it is just returned. + */ + @SuppressWarnings("ReferenceEquality") + private Edge internTarget(Edge edge) { + Node target = intern(edge.target); + return (target == edge.target) ? edge : new Edge(edge.mask, target); + } + + /** + * Recursively interns a node and all nodes reachable from it. Note that if the given nodes do + * not represent a minimal DFA, then the interning process itself won't necessarily produce a + * minimal result. The minimal DFA property of range trees exists by induction and assumes that + * all trees are constructed minimally and that logical operations produce minimal trees. Note + * that if necessary, the interning operation could ensure minimization but at the cost of some + * efficiency (you would use the EDGE_COLLECTOR to squash duplicate edges). + */ + private Node intern(Node node) { + WeakNodeRef ref = new WeakNodeRef(node, tidyUpQueue); + WeakNodeRef existingRef = interningMap.get(ref); + if (existingRef != null) { + // Claim strong reference once into a local variable. 
+ Node interned = existingRef.get(); + if (interned != null) { + // Clear "ref" to prevent it going in the tidy-up queue (it wasn't added to the map). + ref.clear(); + return interned; + } + } + // In the vast majority of cases, the edges of the node being interned already reference + // interned targets. The returned list contains edges to (recursively) interned nodes. + // If the edges we get back are the edges of our node, we just need to add ourselves to the + // intern map. If our edges were not interned (and we aren't in the map yet) then we must + // make a duplicate node that has only the interned edges before adding it to the map. This + // preserves the property that interned nodes only ever connect to other interned nodes. + ImmutableList edges = + node.edges.stream().map(this::internTarget).collect(toImmutableList()); + if (!node.edges.equals(edges)) { + // Clear the original reference before overwriting the node to avoid it being put on the + // tidy-up queue (otherwise as soon as "node" is overwritten, GC could enqueue "ref"). + ref.clear(); + // Create a new node with interned edges and a corresponding weak reference. + node = new Node(edges, node.canTerminate()); + ref = new WeakNodeRef(node, tidyUpQueue); + } + // Consider the race condition where another thread added this node in the meantime. We + // cannot obtain a strong reference until after the WeakNode is returned from the map, which + // means there's always a race condition under which the referenced node will be collected. + while (true) { + existingRef = interningMap.putIfAbsent(ref, ref); + if (existingRef == null) { + // Easy case: We succeeded in putting our new reference into the map, so return our node. + return node; + } + // There's still a risk that the reference became null after being found in the map. + Node interned = existingRef.get(); + if (interned != null) { + // Clear "ref" to prevent it going in the tidy-up queue (it wasn't added to the map). 
+ ref.clear(); + return interned; + } + // The reference must have been garbage collected after the weak node was found. This is + // very rare but possible, and the only real strategy is to try again. We can't really end + // up in a loop here unless we're continuously garbage collecting (i.e. bigger problems). + // We can't find the same reference again (it was cleared and is no longer equal-to "ref"). + } + } + + // Remove all WeakNodeRefs that have been cleared by the garbage collector. This should + // precisely account for all weak nodes in the interning map which have been cleared by the + // garbage collector (weak nodes that were never added to the map are cleared manually and + // should not appear in the tidy-up queue). + private void tidyUpInterningMap() { + Reference ref; + while ((ref = tidyUpQueue.poll()) != null) { + interningMap.remove(ref); + } + } + + /** + * Applies the given operation recursively to a pair of interned nodes. The resulting node is + * interned and (if the input nodes were both minimal) minimal. + */ + @SuppressWarnings("ReferenceEquality") + Node recurse(LogicalOperation logicalOp, Node lhs, Node rhs, boolean canTerminate) { + // Stage 1: Use the jump tables of target nodes to make a lookup of input edge index and mask. + // + // Each entry in the 'inputMap' array is a coded integer containing: + // [ lhs edge index | rhs edge index | bitmask of edges ] + // [ bits 20-24 | bits 16-20 | bits 0-10 ] + // + // Basically the top 16 bits are the indices for the inputs to the logical operation and the + // lower 16 bits are the mask of edge indices to which that result will apply. Because the + // map is constructed to avoid any duplication of the indices (the 'inputKey') we ensure that + // the logical operation is applied to the minimal number of unique inputs (no duplication). 
+ long lhsJumpTable = lhs.jumpTable; + long rhsJumpTable = rhs.jumpTable; + // Note: Could reuse from a field (no longer thread safe, but that might be fine) + int[] inputMap = new int[10]; + int mapSize = 0; + // The digit mask runs from bit-0 (1) to bit-9. + for (int digitMask = 1; digitMask <= (1 << 9); digitMask <<= 1) { + int inputKey = (int) (((lhsJumpTable & 0xF) << 4) | (rhsJumpTable & 0xF)); + int n; + for (n = 0; n < mapSize; n++) { + if ((inputMap[n] >> 16) == inputKey) { + // Add this digit to an existing entry in the input map (the inputs are the same). + inputMap[n] |= digitMask; + break; + } + } + if (n == mapSize) { + // Add this digit to a new entry in the input map (and increase the map size). + inputMap[n] = (inputKey << 16) | digitMask; + mapSize++; + } + lhsJumpTable >>= 4; + rhsJumpTable >>= 4; + } + // Stage 2: Given a minimal set in inputs, perform the minimal number of logical operations. + // Note however that two operations can often return the same value (especially null or the + // TERMINAL node) so we have to minimize the results again and merge identical targets and + // masks. + List out = new ArrayList<>(); + for (int n = 0; n < mapSize; n++) { + int mask = inputMap[n]; + // If lhs and rhs nodes are interned, then every target they reference is interned. + // We also assert that any nodes returned by logical operations are also interned. + Node node = logicalOp.apply( + lhs.targetFromJumpTableIndex((mask >> 20) & 0xF), + rhs.targetFromJumpTableIndex((mask >> 16) & 0xF)); + if (node == null) { + continue; + } + // Mask out the upper bits that are no longer needed. + mask &= RangeSpecification.ALL_DIGITS_MASK; + // Find if the result of the logical operation matches an existing result in the edge list. 
+ int idx; + for (idx = 0; idx < out.size(); idx++) { + Edge e = out.get(idx); + if (e.target == node) { + // We matched an existing result, so replace the existing entry with a new edge which + // points to the same target but includes the new digits that also share this result. + out.set(idx, e.merge(mask)); + break; + } + } + if (idx == out.size()) { + // This is the first time this result was seen, so add it in a new entry. + out.add(new Edge(mask, node)); + } + } + // Stage 3: Given a minimal list of final edges (and after checking for degenerate cases + // (empty or terminating nodes)), create and intern a new minimal node for the edges. + if (out.isEmpty()) { + return canTerminate ? Node.TERMINAL : null; + } else { + return intern(new Node(ImmutableList.copyOf(out), canTerminate)); + } + } + + /** + * Returns the (minimal) logical union {@code '∪'} of two (minimal) sub-trees rooted at the + * given nodes. The sub-trees are interned before recursively applying the "union" function. + */ + private Node unionImpl(Node lhs, Node rhs) { + if (lhs == null) { + return rhs; + } else if (rhs == null) { + return lhs; + } else { + return unionFn.apply(intern(lhs), intern(rhs)); + } + } + + /** + * Returns the (minimal) logical intersection {@code '∩'} of two (minimal) sub-trees rooted at + * the given nodes. The sub-trees are interned before recursively applying the "intersection" function. + */ + private Node intersectImpl(Node lhs, Node rhs) { + if (lhs == null || rhs == null) { + return null; + } else { + return intersectionFn.apply(intern(lhs), intern(rhs)); + } + } + + /** + * Returns the (minimal) logical subtraction {@code '∖'} of two (minimal) sub-trees rooted at + * the given nodes. The sub-trees are interned before recursively applying the "subtraction" function. 
+ */ + private Node subtractImpl(Node lhs, Node rhs) { + if (lhs == null) { + return null; + } else if (rhs == null) { + return lhs; + } else { + return subtractionFn.apply(intern(lhs), intern(rhs)); + } + } + + private Node retainFromImpl(Node prefix, Node range) { + if (prefix == null || range == null) { + return null; + } + // As this operation never returns nodes that were in the prefix tree, or tests if prefix + // nodes are the same as instances in the range tree, there's no need to intern it. + return retainFromFn.apply(prefix, intern(range)); + } + + /** Returns the union of one or more range trees. */ + private RangeTree union(RangeTree first, RangeTree... rest) { + Node node = first.root; + for (RangeTree t : rest) { + node = unionImpl(node, t.root); + } + tidyUpInterningMap(); + return newOrEmptyTree(node); + } + + /** + * Returns the union of two prefix trees. For prefix trees {@code p1} and {@code p2}, the union + * {@code P = p1.union(p2)} is defined such that: + *

{@code
+     *   P.filter(R) = p1.filter(R).union(p2.filter(R))
+     * }
+ * If prefixes are the same length this is equivalent to {@link RangeTree#union(RangeTree)}, + * but when prefixes overlap, only the more general (shorter) prefix is retained. + */ + PrefixTree union(PrefixTree lhs, PrefixTree rhs) { + // Using one prefix tree (A) to filter another (B), gives you the set of ranges in B which + // are, at least, also contained in A. The union of two prefix trees need only contain the + // more general (shorter) prefix and the more specific (longer) prefixes must be removed + // since they overlap with the more general ones. + // + // For example "12".retainFrom("1234") == "1234", but we don't want to retain "1234" in the + // final range tree (since it will already contain "12" anyway). + // + // If the same prefix exists in both inputs however, just doing this subtraction would remove + // it from the result (which is not what we want), so we also include the intersection of the + // prefix ranges. + RangeTree ltree = lhs.asRangeTree(); + RangeTree rtree = rhs.asRangeTree(); + return PrefixTree.from(union( + // Prefixes in both inputs (which would otherwise be removed by the subtractions below). + intersect(ltree, rtree), + // Prefixes in "lhs" which are strictly more general than any prefix in "rhs" + subtract(ltree, retainFrom(rhs, ltree)), + // Prefixes in "rhs" which are strictly more general than any prefix in "lhs" + subtract(rtree, retainFrom(lhs, rtree)))); + } + + /** Returns the intersection of one or more range trees. */ + private RangeTree intersect(RangeTree first, RangeTree... rest) { + Node node = first.root; + for (RangeTree t : rest) { + node = intersectImpl(node, t.root); + } + tidyUpInterningMap(); + return newOrEmptyTree(node); + } + + /** + * Returns the intersection of two prefix trees. For prefix trees {@code p1} and {@code p2}, + * the intersection {@code P = p1.intersect(p2)} is defined such that: + *
{@code
+     *   P.filter(R) = p1.filter(R).intersect(p2.filter(R))
+     * }
+ * If prefixes are the same length this is equivalent to {@link RangeTree#intersect(RangeTree)}, + * but when prefixes overlap, only the more specific (longer) prefix is retained. + */ + PrefixTree intersect(PrefixTree lhs, PrefixTree rhs) { + return PrefixTree.from(union( + // Prefixes in "lhs" which are the same or more specific as any prefix in "rhs" + retainFrom(rhs, lhs.asRangeTree()), + // Prefixes in "rhs" which are the same or more specific as any prefix in "lhs" + retainFrom(lhs, rhs.asRangeTree()))); + } + + /** Returns the difference of two range trees, {@code lhs - rhs}. */ + private RangeTree subtract(RangeTree lhs, RangeTree rhs) { + Node node = subtractImpl(lhs.root, rhs.root); + tidyUpInterningMap(); + return newOrEmptyTree(node); + } + + /** + * Returns a subset of the given ranges, containing only ranges which are prefixed by an + * element in the given prefix tree. For example: + *
 {@code
+     *   RangeTree r = { "12xx", "1234x" }
+     *   PrefixTree p = { "12[0-5]" }
+     *   retainFrom(p, r) = { "12[0-5]x", "1234x"}
+     * }
+ * Note that if the prefix tree is empty, this method returns the empty range tree. + */ + RangeTree retainFrom(PrefixTree prefixes, RangeTree ranges) { + Node node = retainFromImpl(prefixes.asRangeTree().root, ranges.root); + tidyUpInterningMap(); + return newOrEmptyTree(node); + } + } + + /** + * Returns a minimal range tree for the given specification. The tree has only one path and only + * matches digit sequences of the same length as the given specification. + */ + public static RangeTree from(RangeSpecification s) { + Node node = Node.TERMINAL; + for (int n = s.length() - 1; n >= 0; n--) { + node = new Node(ImmutableList.of(new Edge(s.getBitmask(n), node)), false); + } + return newOrEmptyTree(node); + } + + /** + * Returns a minimal range tree for the given specifications. This tree is formed as the logical + * union of the trees for all given specifications. + */ + public static RangeTree from(Iterable specs) { + SetOperations setOps = SetOperations.INSTANCE; + Node node = null; + for (RangeSpecification s : specs) { + node = setOps.unionImpl(node, from(s).root); + } + setOps.tidyUpInterningMap(); + return newOrEmptyTree(node); + } + + /** + * Returns a minimal range tree for the given specifications. This tree is formed as the logical + * union of the trees for all given specifications. + */ + public static RangeTree from(Stream specs) { + return from(specs.collect(toImmutableList())); + } + + /** + * Returns a minimal range tree for the given digit sequence ranges. This tree is formed as the + * logical union of all the range specifications derived from the given ranges. + */ + public static RangeTree from(RangeSet ranges) { + // Currently we don't accept an empty range set in RangeSpecification.from(). + return !ranges.isEmpty() ? from(RangeSpecification.from(ranges)) : RangeTree.empty(); + } + + /** + * Returns a range tree whose root is the given DfaNode. The given node must have been found by + * visiting an existing range tree. 
This method is useful for efficiently implementing "sub tree" + * logic in some cases. + * + * @throws IllegalArgumentException if the given node did not come from a valid range tree. + */ + @SuppressWarnings("ReferenceEquality") + public static RangeTree from(DfaNode root) { + checkNotNull(root, "root node cannot be null"); + checkArgument(root instanceof Node, + "invalid root node (wrong type='%s'): %s", root.getClass(), root); + Node node = (Node) root; + // Reference equality is correct since this is testing for interning. + checkArgument(node == SetOperations.INSTANCE.intern(node), + "invalid root node (not from valid RangeTree): %s", node); + return new RangeTree(node); + } + + private static final RangeTree EMPTY = new RangeTree(); + + /** Returns the empty range tree, which matches no digit sequences (not even the empty one). */ + public static RangeTree empty() { + return EMPTY; + } + + private static RangeTree newOrEmptyTree(Node node) { + return node != null ? new RangeTree(node) : EMPTY; + } + + /** + * The root node, possibly null to signify the "empty" tree which matches no possible digit + * sequences (this is distinct from a tree that matches only the empty digit sequence). + */ + private final Node root; + private final long matchCount; + private final Supplier> lengths; + // Cached on demand. + private Integer hashCode = null; + + /** Constructor for the singleton empty tree. */ + private RangeTree() { + this.root = null; + // Unlike the terminal node (which matches the empty sequence), the empty tree matches nothing. + this.matchCount = 0L; + this.lengths = Suppliers.ofInstance(ImmutableSortedSet.of()); + } + + /** Constructor for a non-empty tree. */ + private RangeTree(Node root) { + this.root = Preconditions.checkNotNull(root); + this.matchCount = root.matchCount; + this.lengths = Suppliers.memoize(() -> calculateLengths(root)); + } + + /** + * Returns whether this range tree accepts no input sequences. 
Note that in theory a range tree + * could accept the empty digit sequence (but in that case it would not be empty). An empty range + * tree cannot match any possible sequence. + */ + public boolean isEmpty() { + return root == null; + } + + private static ImmutableSortedSet calculateLengths(Node root) { + // Length mask cannot be 0 as it must match (at least) sequences of length 0. + int lengthMask = root.lengthMask; + ImmutableSortedSet.Builder lengths = ImmutableSortedSet.naturalOrder(); + do { + int length = numberOfTrailingZeros(lengthMask); + lengths.add(length); + // Clear each bit as we go. + lengthMask &= ~(1 << length); + } while (lengthMask != 0); + return lengths.build(); + } + + /** Returns the set of digit sequence lengths which could be matched by this range tree. */ + public ImmutableSortedSet getLengths() { + return lengths.get(); + } + + /** + * Returns the smallest digit sequence which will be accepted by this range tree, in + * {@link DigitSequence} order. Note that this is not the same as calling {@code sample(0)}, + * since {@link #sample(long)} does not use {@code DigitSequence} order. + * + * @return the smallest digit sequence accepted by this tree. + * @throws IllegalStateException if the tree is empty. + */ + public DigitSequence first() { + checkState(!isEmpty(), "cannot get minimum sequence for an empty range tree"); + DigitSequence first = DigitSequence.empty(); + Node node = root; + int minLength = Integer.numberOfTrailingZeros(root.lengthMask); + if (minLength > 0) { + // Length mask is the mask for checking against the target node's length(s), so we pre-shift + // it by one (i.e. not "1 << minLength"). This is also why there needs to be a zero check + // around this loop since otherwise we would not correctly detect when the empty sequence was + // in the tree (we don't check the root node in this loop). 
+ for (int lengthMask = 1 << (minLength - 1); lengthMask > 0; lengthMask >>>= 1) { + for (Edge e : node.edges) { + // Exit when we find the first edge for which the minimum length path can be reached. + // This is only possible for first() because edges are ordered by their minimum digit + // (you could not use a similar trick to implement a last() method). This break must be + // reached since at least one edge _must_ have the expected length bit set. + if ((e.target.lengthMask & lengthMask) != 0) { + first = first.extendBy(DigitSequence.singleton(Integer.numberOfTrailingZeros(e.mask))); + node = e.target; + break; + } + } + } + } + return first; + } + + /** + * Returns a digit sequence in the range tree for a given sampling index (in the range + * {@code 0 <= index < size()}). Note that this method makes no promises about the specific + * ordering used since it is dependent on the internal tree structure. + * + *

However the mapping from index to sequence is guaranteed to be a bijection, so while it is + * not true that {@code sample(n).next().equals(sample(n+1))}, it is true that + * {@code sample(n).equals(sample(m))} if-and-only-if {@code n == m}. Thus a pseudo random sample + * of N distinct indices will result in N distinct sequences. + * + *

This method is not recommended for general iteration over a tree, since there can be + * trillions of digit sequences. + * + * @throws IndexOutOfBoundsException if the index is invalid. + */ + public DigitSequence sample(long index) { + if (index < 0 || index >= size()) { + throw new IndexOutOfBoundsException( + String.format("index (%d) out of bounds [0...%d]", index, size())); + } + return recursiveGet(root, index); + } + + @SuppressWarnings("ReferenceEquality") // Nodes are interned. + private static DigitSequence recursiveGet(Node node, long index) { + // We can assert that 0 <= index < node.matchCount by inspection (checked initially and true + // by code inspection below). + if (node.canTerminate()) { + // Every recursion should end here (since at some point we traverse the final edge where the + // index has been reduced to zero). However we also get here while still in the tree and must + // decide whether to terminate if we see a terminating node. + if (index == 0) { + return DigitSequence.empty(); + } + // Subtract 1 to account for this early terminating digit sequence (it is reflected in the + // match count so we must adjust our index before moving on). + index -= 1; + } + // Should always have at least one out edge here so the mask isn't empty. + checkState(node != Node.TERMINAL, "!!! Bad RangeTree !!!"); + for (Edge e : node.edges) { + long weightedCount = e.target.matchCount * Integer.bitCount(e.mask); + if (index >= weightedCount) { + // We are not following this edge, so adjust index and continue. + index -= weightedCount; + continue; + } + // Find which digit of the edge we are traversing. If we are in the Nth copy of the match + // count we want the digit corresponding to the Nth bit in the edge mask. Achieve this by + // repeatedly removing the lowest set bit each time around the loop (getting the lowest set + // bit as a mask is way faster than getting its bit position). 
+ int mask = e.mask; + while (index >= e.target.matchCount) { + index -= e.target.matchCount; + mask &= ~Integer.lowestOneBit(mask); + } + return DigitSequence.singleton(Integer.numberOfTrailingZeros(mask)) + .extendBy(recursiveGet(e.target, index)); + } + // Should be impossible since we should always find an edge for the current index. If we get + // here something is very messed up with either this code or the internal data structure. + throw new IllegalStateException("!!! Bad RangeTree !!!"); + } + + /** Returns the number of unique digit sequences contained in this range tree. */ + public long size() { + return matchCount; + } + + // -------- Set-like operations -------- + + /** Returns the minimal logical union of this instance and the given tree. */ + public RangeTree union(RangeTree tree) { + return SetOperations.INSTANCE.union(this, tree); + } + + /** Returns the minimal logical intersection of this instance and the given tree. */ + public RangeTree intersect(RangeTree tree) { + return SetOperations.INSTANCE.intersect(this, tree); + } + + /** Returns the minimal logical subtraction of the given tree from this instance. */ + public RangeTree subtract(RangeTree tree) { + return SetOperations.INSTANCE.subtract(this, tree); + } + + /** Returns whether a given digit sequence is in the set of sequences defined by this tree. */ + public boolean contains(DigitSequence digits) { + Node node = root; + if (node == null) { + return false; + } + for (int n = 0; n < digits.length(); n++) { + node = node.findTarget(digits.getDigit(n)); + if (node == null) { + return false; + } + } + return node.canTerminate(); + } + + /** + * Returns true if the given tree is a subset of this instance. This is functionally equivalent + * to {@code tree.subtract(this).isEmpty()}, but much more efficient in cases where it returns + * false. 
+ */ + public boolean containsAll(RangeTree tree) { + if (tree.isEmpty()) { + // Everything contains all the contents of the empty set (even the empty set). + return true; + } + if (isEmpty()) { + // Nothing is contained by the empty set. + return false; + } + ContainsAllVisitor v = new ContainsAllVisitor(getInitial()); + tree.getInitial().accept(v); + return v.containsAll; + } + + // A very efficient test of tree containment (faster than doing "b.subtract(a).isEmpty()"). + private static final class ContainsAllVisitor implements DfaVisitor { + private boolean containsAll = true; + private DfaNode current; + private ContainsAllVisitor(DfaNode node) { + current = node; + } + + @SuppressWarnings("ReferenceEquality") + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + // Since nodes are interned once in a tree, '==' is sufficient and not potentially slow. + if (current == source || !containsAll) { + // An identical subtree means we can shortcut everything (also if we know we've failed). + return; + } + // No containment if the "subset" tree has lengths not in the current tree. + // This also effectively checks for termination at this node (via bit-0). + if ((~current.getLengthMask() & source.getLengthMask()) != 0) { + containsAll = false; + return; + } + // Recursively check that the sub-tree of the target node is contained within one or more + // edges of the current tree. + int subMask = edge.getDigitMask(); + for (DfaEdge e : current.getEdges()) { + // Look at paths which are in both trees. + int m = (e.getDigitMask() & subMask); + if (m != 0) { + DfaNode oldCurrent = current; + current = e.getTarget(); + edge.getTarget().accept(this); + current = oldCurrent; + if (!containsAll) { + // Containment failure in some sub-tree. + return; + } + // Clear bits we're accounted for. + subMask &= ~m; + } + } + // If not all edges were accounted for, this was not a valid sub-tree. 
+ if (subMask != 0) { + containsAll = false; + } + } + } + + // -------- Non set-like operations (transforming a RangeTree in non set-like ways) -------- + + /** A general mapping function for transforming a range tree via its specifications. */ + public RangeTree map(Function fn) { + return from(asRangeSpecifications().stream().map(fn)); + } + + /** + * Returns a range tree which matches the same digit sequences as this instance down to the first + * {@code n} digits. The returned tree is a super-set of this instance. + */ + public RangeTree significantDigits(int n) { + checkArgument(n >= 0, "invalid significant digits"); + return map(s -> s.first(n).extendByLength(Math.max(s.length() - n, 0))); + } + + /** Returns a range tree with the given "path" prefixed to the front. */ + public RangeTree prefixWith(RangeSpecification prefix) { + checkArgument(isEmpty() || getLengths().last() + prefix.length() <= DigitSequence.MAX_DIGITS, + "cannot extend range tree (prefix '%s' too long): %s", prefix, this); + return (prefix.length() > 0) ? map(prefix::extendBy) : this; + } + + /** + * Slices a range tree at a single length. This is equivalent to {@code slice(length, length)}. + */ + public RangeTree slice(int length) { + return slice(length, length); + } + + /** + * Slices a range tree within the specified length bounds. A path exists in the returned tree if + * its length is in the (inclusive) range {@code [minLength, maxLength]} or it was longer than + * {@code maxLength} but has been truncated. Importantly the returned range tree is not a subset + * of the original tree. + * + *

This method can be thought of as returning the "complete or partially complete digit + * sequences up to the specified maximum length". It is useful for calculating prefixes which + * match partial digit sequences (i.e. "as you type" formatting). + * + *

For example: + *

 {@code
+   * slice({ 12345, 67xxx, 89 }, 0, 3) == { 123, 67x, 89 }
+   * slice({ 12345, 67xxx, 89 }, 3, 3) == { 123, 67x }
+   * slice({ 12, 34, 5 }, 2, 3) == { 12, 34 }
+   * slice({ 12, 34 }, 3, 3) == { }
+   * }
+ */ + public RangeTree slice(int minLength, int maxLength) { + return from( + asRangeSpecifications().stream() + .filter(s -> s.length() >= minLength) + .map(s -> s.first(maxLength))); + } + + // -------- Transformation APIs (converting a RangeTree to another representation) -------- + + /** Returns the minimal, ordered list of range specifications represented by this tree. */ + public ImmutableList asRangeSpecifications() { + if (root == null) { + return ImmutableList.of(); + } + List out = new ArrayList<>(); + int lenMask = root.lengthMask; + + if ((lenMask & (lenMask - 1)) == 0) { + // If this tree only matches one length of sequences, we can just serialize it directly. + addSpecs(this.root, RangeSpecification.empty(), out); + } else { + // When a tree matches more than one length, we cannot just serialize it in one go, because + // the tree for ["123", "####"] would serialize as: + // ["123", "[02-9]###", "1[013-9]##", "12[0-24-9]#", "123#"] + // and while the union of those 4-digit sequences is the same as "####", it's hardly minimal + // or user friendly. In order to get a minimal serialization for a given length N, it is + // sufficient to intersect the tree with the "allMatch" tree of length N (e.g. "####...") + // and then serialize the result. + SetOperations setOps = SetOperations.INSTANCE; + for (Integer length : getLengths()) { + // This can't be empty because we know there's at least one branch that matches digits + // of the current length (or we would not have returned the length from getLengths()). + addSpecs( + setOps.intersectImpl(this.root, allMatch(length)), RangeSpecification.empty(), out); + } + setOps.tidyUpInterningMap(); + } + return ImmutableList.sortedCopyOf(Comparator.comparing(RangeSpecification::min), out); + } + + /** + * Recursively adds the range specifications generated from the given sub-tree to the output + * list. 
+ */ + private static void addSpecs(Node node, RangeSpecification spec, List out) { + if (node.canTerminate()) { + out.add(spec); + } + for (Edge e : node.edges) { + addSpecs(e.target, spec.extendByMask(e.mask), out); + } + } + + /** Returns a node which accepts any digit sequences of the given length. */ + private static Node allMatch(int length) { + Node node = Node.TERMINAL; + for (int n = 0; n < length; n++) { + node = new Node(ImmutableList.of(new Edge(ALL_DIGITS_MASK, node)), false); + } + return node; + } + + /** Returns the minimal covering of ranges specified by this tree. */ + public ImmutableRangeSet asRangeSet() { + ImmutableRangeSet.Builder out = ImmutableRangeSet.builder(); + // Not all ranges created from different range specifications are disjoint, and this will + // merge them into the minimal set. + for (RangeSpecification s : asRangeSpecifications()) { + out.addAll(s.asRanges()); + } + return out.build(); + } + + // -------- DFA visitor API -------- + + /** + * Accepts the given visitor on the root of this non-empty tree. + * + * @throws IllegalStateException if the tree is empty. + */ + public void accept(DfaVisitor visitor) { + checkState(root != null, "cannot accept a visitor on an empty range tree"); + root.accept(visitor); + } + + /** + * Returns the initial node of this non-empty tree. + * + * @throws IllegalStateException if the tree is empty. + */ + public DfaNode getInitial() { + checkState(root != null, "cannot get the initial node from an empty range tree"); + return root; + } + + /** Returns the singleton terminal node of any range tree. */ + public static DfaNode getTerminal() { + return Node.TERMINAL; + } + + // -------- Miscellaneous Object APIs -------- + + @SuppressWarnings("ReferenceEquality") + @Override + public boolean equals(Object o) { + // This could also just convert both to range specifications and use that, but that's likely + // a lot more work (not that this is trivial). 
If you really want fast equality of trees then + // the interning map needs to be global (but also switched to a weak map). + if (this == o) { + return true; + } + if (!(o instanceof RangeTree)) { + return false; + } + RangeTree other = (RangeTree) o; + if (root == null && other.root == null) { + // Empty trees are equal. + return true; + } + if (root == null || other.root == null) { + // An empty tree is never equal to a non-empty tree. + return false; + } + // Intern both trees and see if their roots are now identical. + SetOperations setOps = SetOperations.INSTANCE; + return setOps.intern(root) == setOps.intern(((RangeTree) o).root); + } + + @Override + public int hashCode() { + if (hashCode == null) { + hashCode = asRangeSpecifications().hashCode(); + } + return hashCode; + } + + /** Debugging only. */ + @Override + public String toString() { + return asRangeSpecifications().toString(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizer.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizer.java new file mode 100644 index 000000000..23d785679 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizer.java @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES; + +import com.google.common.collect.ImmutableList; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import java.util.ArrayList; +import java.util.List; + +/** + * Factor a range tree into a sequence of trees which attempts to minimize overall complexity in + * the face of non-determinism. This can be used to reduce the size of any generated regular + * expressions. + */ +public final class RangeTreeFactorizer { + /** Strategies to control how merging is achieved when building factors.*/ + public enum MergeStrategy { + /** + * Edges are only merged if they accept exactly the same set of digits. If the existing factor + * contains "[0-5]" it will not be merged with the candidate edge "[0-8]". + */ + REQUIRE_EQUAL_EDGES, + /** + * Edges can be merged if the candidate edge accepts more digits than the existing edge. If the + * existing factor contains "[0-5]" and the candidate edge is "[0-8]", the candidate edge is + * split so that "[0-5]" is merged as normal and an additional edge "[6-8]" is branched off. + */ + ALLOW_EDGE_SPLITTING, + } + + /** + * Factors the given range tree. + *

+ * Paths are processed longest-first, and a path belongs in particular "factor" if it can be + * added without "causing a split" in the existing factor. For example, given an existing factor + * {@code {"12[3-6]x", "45xx"}}: + *

    + *
  • The path "12[3-6]" can be added, since it is a prefix of one of the existing paths in + * the DFA. + *
  • The path "13xx" can be added since it forms a new branch in the DFA, which does not + * affect any existing branches ("13..." is disjoint with "12..."). + *
  • The path "12[34]" cannot be added since it would "split" the existing path + * "12[3-6]x" in the DFA ("[34]" is a subset of "[3-6]"). + *
  • Depending on the merge strategy, the path "12[0-6]x" might be added ("[0-6]" is a + * superset of "[3-6]"). See {@link MergeStrategy} for more information. + *
+ */ + public static ImmutableList factor(RangeTree ranges, MergeStrategy strategy) { + // If only one length on all paths, the DFA is already "factored". + if (ranges.getLengths().size() == 1) { + return ImmutableList.of(ranges); + } + List factors = new ArrayList<>(); + // Start with the "naive" factors (splitting by length) from longest to shortest. + for (int n : ranges.getLengths().descendingSet()) { + factors.add(ranges.intersect(RangeTree.from(RangeSpecification.any(n)))); + } + // Now attempt to merge as much of each of the shorter factors as possible into the longer ones. + // In each loop we subsume a candidate factor into previous factors, either in whole or in part. + int index = 1; + while (index < factors.size()) { + // Merge (as much as possible) each "naive" factor into earlier factors. + RangeTree r = factors.get(index); + for (int n = 0; n < index && !r.isEmpty(); n++) { + RangeTree merged = new RangeTreeFactorizer(factors.get(n), strategy).mergeFrom(r); + factors.set(n, merged); + // Calculate the ranges which haven't yet been merged into any earlier factor. + r = r.subtract(merged); + } + if (r.isEmpty()) { + // All ranges merged, so remove the original factor (index now references the next factor). + factors.remove(index); + } else { + // We have some un-factorable ranges which are kept to start a new factor. + factors.set(index, r); + index++; + } + } + return ImmutableList.copyOf(factors); + } + + // This is modified as paths are added. 
+ private RangeTree factor; + private final MergeStrategy strategy; + + RangeTreeFactorizer(RangeTree factor, MergeStrategy strategy) { + this.factor = checkNotNull(factor); + this.strategy = strategy; + } + + RangeTree mergeFrom(RangeTree ranges) { + recursivelyMerge(ranges.getInitial(), factor.getInitial(), RangeSpecification.empty()); + return factor; + } + + void recursivelyMerge(DfaNode srcNode, DfaNode dstNode, RangeSpecification path) { + if (srcNode.canTerminate()) { + factor = factor.union(RangeTree.from(path)); + } else { + srcNode.accept(new FactoringVisitor(dstNode, path)); + } + } + + private final class FactoringVisitor implements DfaVisitor { + private final RangeSpecification path; + private final DfaNode dstNode; + + // True if we encountered a situation when an edge we are merging (srcMask) has a partial + // overlap with the existing edge (dstMask) (e.g. merging "[0-6]" into "[4-9]"). This is + // distinct from the case where the existing edge is a subset of the edge being merged (e.g. + // merging "[0-6]" into "[2-4]", where the edge being merged can be split into "[0156]" and + // "[2-4]"). In either strategy, a partial overlap will prevent merging. + private boolean partialOverlap = false; + + // Records the union of all edge ranges visited for the current node. This is used to determine + // the remaining edges that must be added after visiting the existing factor (especially in the + // case of ALLOW_EDGE_SPLITTING). + private int allDstMask = 0; + + FactoringVisitor(DfaNode dstNode, RangeSpecification path) { + this.dstNode = dstNode; + this.path = path; + } + + @Override + public void visit(DfaNode source, DfaEdge srcEdge, DfaNode srcTarget) { + int srcMask = srcEdge.getDigitMask(); + dstNode.accept((s, dstEdge, dstTarget) -> { + int dstMask = dstEdge.getDigitMask(); + if ((strategy == REQUIRE_EQUAL_EDGES) ? 
(dstMask == srcMask) : (dstMask & ~srcMask) == 0) { + // The set of digits accepted by the edge being merged (mask) is equal-to or a superset + // of the digits of the edge in the factor we are merging into. The path is extended by + // the destination edge because during recursion we only follow paths already in the + // factor. + recursivelyMerge(srcTarget, dstTarget, path.extendByMask(dstMask)); + } else { + partialOverlap |= (dstMask & srcMask) != 0; + } + allDstMask |= dstMask; + }); + if (!partialOverlap) { + // Work out the digits that weren't in any of the edges of the factor we were processing + // and merge the sub-tree under that edge into the current factor. For REQUIRE_EQUAL_EDGES + // the extraMask is always either srcMask or 0 (since the edge was either added in full, + // or disjoint with all the existing edges). For ALLOW_EDGE_SPLITTING it's the remaining + // range that wasn't merged with any of the existing paths. + int extraMask = srcMask & ~allDstMask; + if (extraMask != 0) { + new MergingVisitor(path).recurse(srcTarget, extraMask); + } + } + } + } + + private final class MergingVisitor implements DfaVisitor { + private final RangeSpecification path; + + MergingVisitor(RangeSpecification path) { + this.path = checkNotNull(path); + } + + void recurse(DfaNode node, int mask) { + RangeSpecification newPath = path.extendByMask(mask); + if (node.canTerminate()) { + factor = factor.union(RangeTree.from(newPath)); + } else { + node.accept(new MergingVisitor(newPath)); + } + } + + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + recurse(target, edge.getDigitMask()); + } + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java new file mode 100644 index 000000000..392e62db4 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2017 The 
Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.base.CaseFormat.LOWER_CAMEL; +import static com.google.common.base.CaseFormat.UPPER_UNDERSCORE; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableBiMap.toImmutableBiMap; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.FIXED_LINE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.MOBILE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.PAGER; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.PERSONAL_NUMBER; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.PREMIUM_RATE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.SHARED_COST; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.TOLL_FREE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.UAN; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.VOICEMAIL; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.VOIP; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_FIXED_LINE; +import static 
com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_MOBILE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_PAGER; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_PERSONAL_NUMBER; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_PREMIUM_RATE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_SHARED_COST; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_TOLL_FREE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_UAN; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_UNKNOWN; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_VOICEMAIL; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_VOIP; +import static java.util.function.Function.identity; + +import com.google.common.collect.ImmutableBiMap; +import com.google.common.collect.ImmutableSet; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType; +import com.google.i18n.phonenumbers.metadata.proto.Types.XmlShortcodeType; +import java.util.Optional; +import java.util.stream.Stream; + +/** Static utility for conversion of number types. */ +public final class Types { + private static final ImmutableBiMap XML_TYPE_MAP = + Stream.of(XmlNumberType.values()) + .filter(t -> t != XML_UNKNOWN && t != XmlNumberType.UNRECOGNIZED) + .collect(toImmutableBiMap(Types::toXmlName, identity())); + + // Map the subset of XmlNumberType values which correspond to valid number types. Note that while + // FIXED_LINE and MOBILE exist in both types, and can be converted, their semantics change. 
+ private static final ImmutableBiMap XML_TO_SCHEMA_TYPE_MAP = + ImmutableBiMap.builder() + .put(XML_FIXED_LINE, FIXED_LINE) + .put(XML_MOBILE, MOBILE) + .put(XML_PAGER, PAGER) + .put(XML_TOLL_FREE, TOLL_FREE) + .put(XML_PREMIUM_RATE, PREMIUM_RATE) + .put(XML_SHARED_COST, SHARED_COST) + .put(XML_PERSONAL_NUMBER, PERSONAL_NUMBER) + .put(XML_VOIP, VOIP) + .put(XML_UAN, UAN) + .put(XML_VOICEMAIL, VOICEMAIL) + .build(); + + /** Returns the set of valid XML type names. */ + public static ImmutableSet getXmlNames() { + return XML_TYPE_MAP.keySet(); + } + + /** Returns the XML element name based on the given XML range type. */ + public static String toXmlName(XmlNumberType type) { + checkState(type.name().startsWith("XML_"), "Bad type: %s", type); + return UPPER_UNDERSCORE.to(LOWER_CAMEL, type.name().substring(4)); + } + + /** Returns the XML element name based on the given XML shortcode type. */ + public static String toXmlName(XmlShortcodeType type) { + checkState(type.name().startsWith("SC_"), "Bad type: %s", type); + return UPPER_UNDERSCORE.to(LOWER_CAMEL, type.name().substring(3)); + } + + /** + * Returns the XML range type based on the given case-sensitive XML element name (e.g. + * "fixedLine"). + */ + public static Optional forXmlName(String xmlName) { + return Optional.ofNullable(XML_TYPE_MAP.get(xmlName)); + } + + /** Returns the {@code ValidNumberType} equivalent of the given XML range type (if it exists). */ + public static Optional toSchemaType(XmlNumberType rangeType) { + return Optional.ofNullable(XML_TO_SCHEMA_TYPE_MAP.get(rangeType)); + } + + /** Returns the {@code XmlNumberType} equivalent of the given schema range type (if it exists). 
*/ + public static Optional toXmlType(ValidNumberType schemaType) { + return Optional.ofNullable(XML_TO_SCHEMA_TYPE_MAP.inverse().get(schemaType)); + } + + private Types() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegion.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegion.java new file mode 100644 index 000000000..0fe9d3dcd --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegion.java @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.i18n; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static java.util.Comparator.comparing; +import static java.util.Comparator.naturalOrder; + +import com.google.auto.value.AutoValue; +import com.ibm.icu.util.ULocale; +import java.util.Comparator; +import java.util.regex.Pattern; + +/** + * A simple type-safe identifier for CLDR regions for phone numbers. Only basic checking of regions + * is performed, but this should be fine since the set of input regions is tightly controlled. + * + *

The metadata tooling makes only minimal use of the semantics of region codes, relying on + * them mainly as key values, and never tries to canonicalize or modify them. + */ +@AutoValue +public abstract class PhoneRegion implements Comparable { + // We limit the non XX region codes to just "world" for this project. + private static final Pattern VALID_CODE = Pattern.compile("[A-Z]{2}|001"); + // Since we want "ZZ" < "001" in the ordering. + private static Comparator ORDERING = + comparing(r -> r.locale().getCountry(), + comparing(String::length).thenComparing(naturalOrder())); + + private static final PhoneRegion UNKNOWN = of("ZZ"); + private static final PhoneRegion WORLD = of("001"); + + /** Returns the "world" region (001). */ + public static PhoneRegion getWorld() { + return PhoneRegion.WORLD; + } + + /** Returns the "unknown" region (ZZ). */ + public static PhoneRegion getUnknown() { + return PhoneRegion.UNKNOWN; + } + + /** + * Returns the region identified by the given case-sensitive (upper-case) CLDR String + * representation. + * + * @throws IllegalArgumentException if there is no region for {@code cldrCode} + */ + public static PhoneRegion of(String cldrCode) { + checkArgument(VALID_CODE.matcher(cldrCode).matches(), "invalid region code: %s", cldrCode); + return new AutoValue_PhoneRegion(new ULocale.Builder().setRegion(cldrCode).build()); + } + + @Override + public int compareTo(PhoneRegion other) { + return ORDERING.compare(this, other); + } + + /** Returns the string representation for the region (either a two-letter or three-digit code). */ + @Override public final String toString() { + String s = locale().getCountry(); + checkArgument(!s.isEmpty(), "invalid (empty) country: %s", locale()); + return s; + } + + // Visible for AutoValue only. + abstract ULocale locale(); + + /** + * Return an English identifier for the region in the form + * {@code "<English name> (<region code>)"}. If the English name is not available, then + * {@code "Region: <region code>"} is returned. 
This + * string is only suitable for use in comments. + * + * @throws IllegalStateException if this method is called on the "world" region. + */ + public String getEnglishNameForXmlComments() { + checkState(!equals(getWorld()), "cannot ask for display name of 'world' region"); + String regionStr = locale().getCountry(); + // Use "US" so we get "en_US", and not just "en", since the policy is to use the name as it + // would appear in America. + String displayCountry = locale().getDisplayCountry(ULocale.US); + return !displayCountry.isEmpty() && !displayCountry.equals(regionStr) + ? String.format("%s (%s)", displayCountry, regionStr) + : String.format("Region: %s", regionStr); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTag.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTag.java new file mode 100644 index 000000000..b029e6cfa --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTag.java @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.i18n; + +import static com.google.common.base.Preconditions.checkArgument; + +import com.google.auto.value.AutoValue; +import java.util.regex.Pattern; + +/** + * A simple type-safe identifier for BCP 47 language tags containing only language code and an + * optional script (e.g. "en" or "zh-Hant"). This class does no canonicalization on the values + * it's given, apart from normalizing the separator to a hyphen. + * + *

We can't really use {@code Locale} here because there's an issue whereby the JDK deliberately + * uses deprecated language tags and would, for example, convert "id" (Indonesian) to "in", which + * is at odds with BCP 47. See {@link java.util.Locale#forLanguageTag(String) forLanguageTag()} for + * more information. + * + *

The metadata tooling makes only minimal use of the semantics of language codes, relying on + * them mainly as key values, and never tries to canonicalize or modify them (i.e. it is possible + * that a language code used for this data may end up being non-canonical). It is up to any library + * which loads the metadata at runtime to ensure that its mappings to the data account for current + * canonicalization. + */ +@AutoValue +public abstract class SimpleLanguageTag { + // This can be extended or modified to use Locale as necessary. + private static final Pattern SIMPLE_TAG = Pattern.compile("[a-z]{2,3}(?:[-_][A-Z][a-z]{3})?"); + + /** + * Returns a language tag instance for the given string with minimal structural checking. If the + * given tag uses {@code '_'} for separating language and script it's converted into {@code '-'}. + */ + public static SimpleLanguageTag of(String lang) { + checkArgument(SIMPLE_TAG.matcher(lang).matches(), "invalid language tag: %s", lang); + return new AutoValue_SimpleLanguageTag(lang.replace('_', '-')); + } + + // Visible for AutoValue only. + abstract String lang(); + + @Override + public final String toString() { + return lang(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpec.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpec.java new file mode 100644 index 000000000..4fc607063 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpec.java @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkArgument; + +import com.google.auto.value.AutoValue; +import com.google.auto.value.extension.memoized.Memoized; +import com.google.common.base.Ascii; +import com.google.common.base.CharMatcher; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.model.FormatSpec.FormatGroup; +import com.google.i18n.phonenumbers.metadata.model.FormatSpec.FormatTemplate; +import java.util.Optional; + +/** + * An alternate format, used to describe less common ways we believe a phone number can be + * formatted in a region. These can be derived from an "alias" in the formats table, or as + * "historical" formats which are not associated with any specific current format. + * + *

Note that alternate formats can be defined with the same template, and they are merged + * together to produce a canonical map in which the format template is the key. + */ +@AutoValue +public abstract class AltFormatSpec { + private static final CharMatcher OPT_DIGIT = CharMatcher.is('*'); + private static final CharMatcher ANY_DIGIT = CharMatcher.is('X'); + private static final CharMatcher ALLOWED_TEMPLATE_CHARS = CharMatcher.anyOf("X* "); + + public static AltFormatSpec create( + FormatTemplate template, RangeSpecification prefix, String parent, Optional comment) { + // As only a limited set of chars is allowed, we know things like national prefix or carrier + // codes cannot be present. We're just interested in basic grouping like "XXX XXX**". + String spec = template.getSpecifier(); + checkArgument(ALLOWED_TEMPLATE_CHARS.matchesAllOf(spec) && !template.getXmlPrefix().isPresent(), + "invalid alternate format template: %s", template); + // Prefix must not exceed the template's minimum length or contain any trailing 'x'. + checkArgument(prefix.length() <= template.minLength() && prefix.equals(prefix.getPrefix()), + "invalid prefix '%s' for alternate format template: %s", prefix, template); + // If variable length, the spec must have room for the prefix before the '*' characters. + checkArgument( + OPT_DIGIT.matchesNoneOf(spec) + || prefix.length() <= ANY_DIGIT.countIn(spec.substring(0, OPT_DIGIT.indexIn(spec))), + "invalid prefix '%s' for alternate format template: %s", prefix, template); + return new AutoValue_AltFormatSpec(template, prefix, parent, comment); + } + + /** Return the alternate format template containing only simple grouping (e.g. "XXX XXX**"). */ + public abstract FormatTemplate template(); + + /** + * Returns the prefix for this alternate format which (along with the template length) defines + * the bounds over which this format can apply. 
+ */ + public abstract RangeSpecification prefix(); + + /** Returns the ID of the format for which this specifier is an alternative. */ + public abstract String parentFormatId(); + + /** Returns the arbitrary comment, possibly containing newlines, for this format. */ + public abstract Optional comment(); + + /** Returns the format specifier as used in the CSV representation (e.g. "20 XXX XXX"). */ + @Memoized + public String specifier() { + RangeSpecification prefix = prefix(); + int digitIdx = 0; + StringBuilder buf = new StringBuilder(); + for (FormatGroup g : template().getGroups()) { + for (int i = 0; i < g.maxLength(); i++, digitIdx++) { + // Uppercasing is so that 'x' --> 'X' + buf.append(digitIdx < prefix.length() + ? Ascii.toUpperCase(RangeSpecification.toString(prefix.getBitmask(digitIdx))) + : (i < g.minLength() ? "X" : "*")); + } + buf.append(" "); + } + buf.setLength(buf.length() - 1); + return buf.toString(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchema.java new file mode 100644 index 000000000..9005990a2 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchema.java @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.CharMatcher.whitespace; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.table.CsvParser.rowMapper; +import static java.util.function.Function.identity; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Ascii; +import com.google.common.collect.ImmutableList; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.model.FormatSpec.FormatTemplate; +import com.google.i18n.phonenumbers.metadata.table.CsvParser; +import com.google.i18n.phonenumbers.metadata.table.CsvParser.RowMapper; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import com.google.i18n.phonenumbers.metadata.table.CsvTableCollector; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.io.Writer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Stream; +import javax.annotation.Nullable; + +/** + * The schema of the "AltFormats" table with rows identified by an "alternate format specifier": + *

    + *
  1. {@link #PARENT}: The ID of the "main" format that this is an alternate of. + *
  2. {@link #COMMENT}: Freeform comment text. + *
+ * + *

Row keys are serialized via the marshaller and produce the leading column: + *

    + *
  1. {@code Format}: The alternate format specifier including prefix and grouping information + * (e.g. "20 XXXX XXXX"). + *
+ */ +public final class AltFormatsSchema { + private static final String FORMAT = "Format"; + private static final String PARENT = "Parent Format"; + private static final String COMMENT = "Comment"; + + public static final ImmutableList HEADER = ImmutableList.of(FORMAT, PARENT, COMMENT); + + private static final CsvParser CSV_PARSER = CsvParser.withSeparator(';').trimWhitespace(); + private static final RowMapper ROW_MAPPER = + rowMapper(h -> checkArgument(h.equals(HEADER), "unexpected alt-format header: %s", h)); + + /** Loads the alternate formats from a given file path. */ + public static ImmutableList loadAltFormats(Path path) { + if (!Files.exists(path)) { + return ImmutableList.of(); + } + try (Reader csv = Files.newBufferedReader(path)) { + return importAltFormats(csv); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @VisibleForTesting + static ImmutableList importAltFormats(Reader csv) throws IOException { + List altFormats = new ArrayList<>(); + Consumer> rowCallback = getRowCallback(altFormats); + try (BufferedReader r = new BufferedReader(csv)) { + CSV_PARSER.parse(r.lines(), + row -> rowCallback.accept(row.map(CsvTable::unescapeSingleLineCsvText))); + } + return ImmutableList.copyOf(altFormats); + } + + public static ImmutableList importAltFormats(Supplier> rows) { + List altFormats = new ArrayList<>(); + Consumer> rowCallback = getRowCallback(altFormats); + // Expect header row always. 
+ rowCallback.accept(rows.get().stream()); + List row; + while ((row = rows.get()) != null) { + rowCallback.accept(row.stream()); + } + return ImmutableList.copyOf(altFormats); + } + + private static Consumer> getRowCallback(List altFormats) { + return ROW_MAPPER.mapTo( + row -> altFormats.add(parseAltFormat(row.get(FORMAT), row.get(PARENT), row.get(COMMENT)))); + } + + public static AltFormatSpec parseAltFormat( + String altId, String parent, @Nullable String comment) { + // "1X [2-8]XXX** XXX" --> "XX XXXX** XXX" + FormatTemplate template = FormatTemplate.parse(altId.replaceAll("[0-9]|\\[[-0-9]+\\]", "X")); + + // "1X [2-8]XXX** XXX" --> "1X [2-8]" --> "1X[2-8]" --> "1x[2-8]" + // The prefix here can (and often will be) the empty string. + // This fails if '*' is ever left in the specification, but that really should not happen. + RangeSpecification prefix = RangeSpecification.parse( + Ascii.toLowerCase(whitespace().removeFrom(altId.replaceAll("[X* ]*$", "")))); + return AltFormatSpec.create(template, prefix, parent, Optional.ofNullable(comment)); + } + + /** Exports alternate formats to a collector (potentially escaping fields for CSV). */ + public static void export( + List altFormats, Consumer> collector, boolean toCsv) { + collector.accept(HEADER.stream()); + Function escapeFn = toCsv ? CsvTable::escapeForSingleLineCsv : identity(); + altFormats.forEach( + f -> collector.accept( + Stream.of(f.specifier(), f.parentFormatId(), f.comment().map(escapeFn).orElse("")))); + } + + /** Helper method to write alternate formats in same CSV format as CsvTable. 
*/ + public static boolean exportCsv(Writer csv, List altFormats) { + if (altFormats.isEmpty()) { + return false; + } + CsvTableCollector collector = new CsvTableCollector(true); + export(altFormats, collector, true); + collector.writeCsv(csv); + return true; + } + + private AltFormatsSchema() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchema.java new file mode 100644 index 000000000..0d9191c2b --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchema.java @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.table.CsvParser.rowMapper; +import static java.util.Comparator.comparing; +import static java.util.function.Function.identity; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment.Anchor; +import com.google.i18n.phonenumbers.metadata.table.CsvParser; +import com.google.i18n.phonenumbers.metadata.table.CsvParser.RowMapper; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import com.google.i18n.phonenumbers.metadata.table.CsvTableCollector; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.io.Writer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Stream; + +/** + * The data schema for handling XML comments. Note that, unlike other "table" schemas, this does + * not represent comments in the form of a CsvTable. This is because comment anchors can appear + * multiple times in the CSV file (so there's no unique key). This is not an issue since the + * internal data representation handles this, but it just means that code cannot be reused as much. 
+ */ +public class CommentsSchema { + private static final String REGION = "Region"; + private static final String LABEL = "Label"; + private static final String COMMENT = "Comment"; + + public static final ImmutableList HEADER = ImmutableList.of(REGION, LABEL, COMMENT); + + private static final Comparator ORDERING = comparing(Comment::getAnchor); + + private static final CsvParser CSV_PARSER = CsvParser.withSeparator(';').trimWhitespace(); + private static final RowMapper ROW_MAPPER = + rowMapper(h -> checkArgument(h.equals(HEADER), "unexpected comment header: %s", h)); + + /** Loads the comments from a given file path. */ + public static ImmutableList loadComments(Path path) { + if (!Files.exists(path)) { + return ImmutableList.of(); + } + try (Reader csv = Files.newBufferedReader(path)) { + return importComments(csv); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @VisibleForTesting + static ImmutableList importComments(Reader csv) throws IOException { + List comments = new ArrayList<>(); + Consumer> rowCallback = getRowCallback(comments); + try (BufferedReader r = new BufferedReader(csv)) { + CSV_PARSER.parse(r.lines(), + row -> rowCallback.accept(row.map(CsvTable::unescapeSingleLineCsvText))); + } + return ImmutableList.sortedCopyOf(ORDERING, comments); + } + + public static ImmutableList importComments(Supplier> rows) { + List comments = new ArrayList<>(); + Consumer> rowCallback = getRowCallback(comments); + // Expect header row always. 
+ rowCallback.accept(rows.get().stream()); + List row; + while ((row = rows.get()) != null) { + rowCallback.accept(row.stream()); + } + return ImmutableList.sortedCopyOf(ORDERING, comments); + } + + private static Consumer> getRowCallback(List comments) { + return ROW_MAPPER.mapTo(row -> { + if (row.containsKey(COMMENT)) { + comments.add( + Comment.fromText( + Anchor.of(PhoneRegion.of(row.get(REGION)), row.get(LABEL)), + row.get(COMMENT))); + } + }); + } + + /** Exports comments to a collector (potentially escaping fields for CSV). */ + public static void export( + List comments, Consumer> collector, boolean toCsv) { + collector.accept(HEADER.stream()); + Function escapeFn = toCsv ? CsvTable::escapeForSingleLineCsv : identity(); + comments.stream() + .sorted(ORDERING) + .forEach(c -> collector.accept(Stream.of( + c.getAnchor().region().toString(), c.getAnchor().label(), escapeFn.apply(c.toText())))); + } + + /** Helper method to write comments in same CSV format as CsvTable. */ + public static boolean exportCsv(Writer csv, List comments) { + if (comments.isEmpty()) { + return false; + } + CsvTableCollector collector = new CsvTableCollector(true); + export(comments, collector, true); + collector.writeCsv(csv); + return true; + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CsvData.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CsvData.java new file mode 100644 index 000000000..b2d1d3833 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/CsvData.java @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.i18n.phonenumbers.metadata.model.MetadataException.checkMetadata; + +import com.google.auto.value.AutoValue; +import com.google.auto.value.extension.memoized.Memoized; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedMap; +import com.google.common.collect.ImmutableTable; +import com.google.common.collect.Iterables; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.model.ExamplesTableSchema.ExampleNumberKey; +import com.google.i18n.phonenumbers.metadata.model.MetadataTableSchema.Regions; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment; +import com.google.i18n.phonenumbers.metadata.model.ShortcodesTableSchema.ShortcodeKey; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import com.google.i18n.phonenumbers.metadata.table.CsvTable.DiffMode; +import com.google.i18n.phonenumbers.metadata.table.DiffKey; +import com.google.i18n.phonenumbers.metadata.table.DiffKey.Status; +import com.google.i18n.phonenumbers.metadata.table.RangeKey; +import com.google.i18n.phonenumbers.metadata.table.RangeTable; +import 
java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +/** + * All CSV based tables and legacy XML for a single calling code. This is the data from which all + * legacy data can be reconstructed (metadata XML, carrier/geocode/timezone mappings). + * + *

This is loaded at once, possibly from multiple files, since conversion to legacy formats + * often requires more than one of these data structures. + */ +@AutoValue +public abstract class CsvData { + /** CSV data loading API. */ + public interface CsvDataProvider { + /** Loads the top-level metadata table which contains data for all supported calling codes. */ + CsvTable loadMetadata() throws IOException; + /** Loads the CSV data for a single calling code. */ + CsvData loadData(DigitSequence cc) throws IOException; + } + + /** + * Creates a single CsvData instance, either directly or from a provider. Only the single row of + * the given metadata table relating to the specified calling code is kept in the instance, and + * a metadata exception is raised if the table has no row for that calling code. + */ + public static CsvData create( + DigitSequence cc, + CsvTable allMetadata, + CsvTable ranges, + CsvTable shortcodes, + CsvTable examples, + CsvTable formats, + ImmutableList altFormats, + CsvTable operators, + ImmutableList comments) { + // Row keys are unique, so we end up with at most 1 row in the filtered table. 
+ CsvTable ccMetadata = + allMetadata.toBuilder().filterRows(r -> r.equals(cc)).build(); + checkMetadata(!ccMetadata.getKeys().isEmpty(), "no such calling code %s in metadata", cc); + checkRegions(ccMetadata, ranges, shortcodes); + checkNoOverlappingRows(ranges); + checkNoOverlappingShortcodeRows(shortcodes); + return new AutoValue_CsvData( + cc, ccMetadata, ranges, shortcodes, examples, formats, altFormats, operators, comments); + } + + private static void checkNoOverlappingRows(CsvTable csv) { + RangeTree allRanges = RangeTree.empty(); + for (RangeKey key : csv.getKeys()) { + RangeTree ranges = key.asRangeTree(); + checkMetadata(allRanges.intersect(ranges).isEmpty(), "overlapping row in CSV: %s", key); + allRanges = allRanges.union(ranges); + } + } + + private static void checkNoOverlappingShortcodeRows(CsvTable csv) { + Map allRangesMap = new HashMap<>(); + for (ShortcodeKey key : csv.getKeys()) { + RangeTree allRegionRanges = allRangesMap.getOrDefault(key.getRegion(), RangeTree.empty()); + RangeTree ranges = key.getRangeKey().asRangeTree(); + checkMetadata(allRegionRanges.intersect(ranges).isEmpty(), "overlapping row in CSV: %s", key); + allRangesMap.put(key.getRegion(), allRegionRanges.union(ranges)); + } + } + + private static void checkRegions( + CsvTable metadata, + CsvTable ranges, + CsvTable shortcodes) { + DigitSequence cc = Iterables.getOnlyElement(metadata.getKeys()); + PhoneRegion mainRegion = metadata.getOrDefault(cc, MetadataTableSchema.MAIN_REGION); + Regions extraRegions = metadata.getOrDefault(cc, MetadataTableSchema.EXTRA_REGIONS); + + ImmutableSet csvRegions = ranges + .getValues(RangesTableSchema.CSV_REGIONS).stream() + .flatMap(r -> r.getValues().stream()) + .collect(toImmutableSet()); + if (extraRegions.getValues().isEmpty()) { + checkMetadata(csvRegions.size() == 1 && csvRegions.contains(mainRegion), + "inconsistent regions:\nmetadata: %s\nranges table: %s", mainRegion, csvRegions); + } else { + 
checkMetadata(!extraRegions.getValues().contains(mainRegion), + "invalid metadata: main region is duplicated in 'extra regions' column"); + checkMetadata( + csvRegions.contains(mainRegion) + && csvRegions.containsAll(extraRegions.getValues()) + && csvRegions.size() == extraRegions.getValues().size() + 1, + "inconsistent regions:\nmetadata: %s + %s\nranges table: %s", + mainRegion, extraRegions, csvRegions); + } + ImmutableSet shortcodeRegions = + shortcodes.getKeys().stream().map(ShortcodeKey::getRegion).collect(toImmutableSet()); + checkMetadata(csvRegions.containsAll(shortcodeRegions), + "unexpected regions for shortcodes:\nmetadata: %s\nshortcode regions: %s", + csvRegions, shortcodeRegions); + } + + /** The difference between two CSV snapshots captured as a set of CSV tables. */ + @AutoValue + public abstract static class Diff { + /** + * Returns the diff of the two tables, or empty if no row of the diff has a status other than + * {@code UNCHANGED}. + */ + private static Optional>> diff(CsvTable lhs, CsvTable rhs) { + CsvTable> diff = CsvTable.diff(lhs, rhs, DiffMode.CHANGES); + if (diff.getKeys().stream().anyMatch(k -> k.getStatus() != Status.UNCHANGED)) { + return Optional.of(diff); + } + return Optional.empty(); + } + + // Visible for AutoValue + Diff() {} + + /** Returns the contextualized diff of the ranges table. */ + public abstract Optional>> rangesDiff(); + /** Returns the contextualized diff of the shortcodes table. */ + public abstract Optional>> shortcodesDiff(); + /** Returns the contextualized diff of the examples table. */ + public abstract Optional>> examplesDiff(); + /** Returns the contextualized diff of the formats table. */ + public abstract Optional>> formatsDiff(); + /** Returns the contextualized diff of the operators table. */ + public abstract Optional>> operatorsDiff(); + } + + /** Creates the diff between two CSV data snapshots. */ + public static Diff diff(CsvData before, CsvData after) { + // TODO: Add diffing for comments and/or alternate formats. 
+ return new AutoValue_CsvData_Diff( + Diff.diff(before.getRanges(), after.getRanges()), + Diff.diff(before.getShortcodes(), after.getShortcodes()), + Diff.diff(before.getExamples(), after.getExamples()), + Diff.diff(before.getFormats(), after.getFormats()), + Diff.diff(before.getOperators(), after.getOperators())); + } + + // Visible for AutoValue + CsvData() {} + + /** Returns the calling code for this CSV data. */ + public abstract DigitSequence getCallingCode(); + /** + * Returns the single row of the metadata table for the calling code (see + * {@code MetadataTableSchema}). + */ + public abstract CsvTable getMetadata(); + /** Returns the ranges table for the calling code (see {@code RangesTableSchema}) */ + public abstract CsvTable getRanges(); + /** Returns the shortcode table for the calling code (see {@code ShortcodesTableSchema}) */ + public abstract CsvTable getShortcodes(); + /** Returns the examples table for the calling code (see {@code ExamplesTableSchema}). */ + public abstract CsvTable getExamples(); + /** Returns the format table for the calling code (see {@code FormatsTableSchema}). */ + public abstract CsvTable getFormats(); + /** + * Returns the alternate format table for the calling code (see {@code AltFormatsTableSchema}). + */ + public abstract ImmutableList getAltFormats(); + /** Returns the operator table for the calling code (see {@code OperatorsTableSchema}). */ + public abstract CsvTable getOperators(); + /** Returns the set of comments for the calling code. */ + public abstract ImmutableList getComments(); + + @Memoized + public RangeTable getRangesAsTable() { + return RangesTableSchema.toRangeTable(getRanges()); + } + + @Memoized + public ImmutableSortedMap getShortcodesAsTables() { + return ShortcodesTableSchema.toShortcodeTables(getShortcodes()); + } + + @Memoized + public ImmutableTable getExamplesAsTable() { + return ExamplesTableSchema.toExampleTable(getExamples()); + } + + /** Canonicalizes range tables in the CSV data. 
This is potentially slow for large regions. */ + // TODO: Is there any way to reliably detect canonical CSV for sub-regions? + public final CsvData canonicalizeRangeTables() { + CsvTable ranges = RangesTableSchema.toCsv(getRangesAsTable()); + CsvTable shortcodes = ShortcodesTableSchema.toCsv(getShortcodesAsTables()); + return create( + getCallingCode(), + getMetadata(), + ranges, + shortcodes, + getExamples(), + getFormats(), + getAltFormats(), + getOperators(), + getComments() + ); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java new file mode 100644 index 000000000..5c3e312c4 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.i18n.phonenumbers.metadata.model.ExamplesTableSchema.ExampleNumberKey.ORDERING; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.ImmutableTable; +import com.google.common.collect.Table; +import com.google.common.collect.Table.Cell; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.CsvKeyMarshaller; +import com.google.i18n.phonenumbers.metadata.table.CsvSchema; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import com.google.i18n.phonenumbers.metadata.table.Schema; +import java.util.Comparator; +import java.util.List; +import java.util.Optional; +import java.util.stream.Stream; + +/** + * The schema of the "Example Numbers" table with rows keyed by {@link ExampleNumberKey} and + * columns: + *

    + *
  1. {@link #NUMBER}: The national number + *
  2. {@link #COMMENT}: Evidence for why an example number was chosen. + *
+ * + *

Row keys are serialized via the marshaller and produce leading columns: + *

    + *
  1. {@code Region}: The region code of the example number. + *
  2. {@code Type}: The {@link ValidNumberType} of the example number. + *
+ */ +public final class ExamplesTableSchema { + /** A key for rows in the example numbers table. */ + @AutoValue + public abstract static class ExampleNumberKey { + public static final Comparator ORDERING = + Comparator.comparing(ExampleNumberKey::getRegion).thenComparing(ExampleNumberKey::getType); + + public static ExampleNumberKey of(PhoneRegion region, ValidNumberType type) { + return new AutoValue_ExamplesTableSchema_ExampleNumberKey(region, type); + } + + public abstract PhoneRegion getRegion(); + public abstract ValidNumberType getType(); + } + + /** A number column containing the digit sequence of a national number. */ + public static final Column NUMBER = Column.create( + DigitSequence.class, "Number", DigitSequence.empty(), DigitSequence::of); + + /** A general comment field, usually describing how an example number was determined. */ + public static final Column COMMENT = Column.ofString("Comment"); + + private static final CsvKeyMarshaller MARSHALLER = new CsvKeyMarshaller<>( + ExamplesTableSchema::write, + ExamplesTableSchema::read, + Optional.of(ORDERING), + "Region", + "Type"); + + private static final Schema COLUMNS = Schema.builder() + .add(NUMBER) + .add(COMMENT) + .build(); + + /** Schema instance defining the example numbers CSV table. */ + public static final CsvSchema SCHEMA = CsvSchema.of(MARSHALLER, COLUMNS); + + /** + * Converts a {@link Table} of example numbers into a {@link CsvTable}, using + * {@link ExampleNumberKey}s as row keys. Only the {@link #NUMBER} column is written. + */ + public static CsvTable toCsv( + Table table) { + ImmutableTable.Builder, Object> out = ImmutableTable.builder(); + out.orderRowsBy(ORDERING).orderColumnsBy(COLUMNS.ordering()); + for (Cell c : table.cellSet()) { + out.put(ExampleNumberKey.of(c.getRowKey(), c.getColumnKey()), NUMBER, c.getValue()); + } + return CsvTable.from(SCHEMA, out.build()); + } + + /** + * Converts a {@link CsvTable} of example numbers, with {@link ExampleNumberKey}s as row keys, + * into a {@link Table} of numbers keyed by region and number type. 
+ */ + public static ImmutableTable + toExampleTable(CsvTable csv) { + ImmutableTable.Builder out = + ImmutableTable.builder(); + for (ExampleNumberKey k : csv.getKeys()) { + out.put(k.getRegion(), k.getType(), csv.getOrDefault(k, NUMBER)); + } + return out.build(); + } + + private static Stream write(ExampleNumberKey key) { + return Stream.of(key.getRegion().toString(), key.getType().toString()); + } + + private static ExampleNumberKey read(List parts) { + return ExampleNumberKey.of( + PhoneRegion.of(parts.get(0)), ValidNumberType.valueOf(parts.get(1))); + } + + private ExamplesTableSchema() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java new file mode 100644 index 000000000..396f735e4 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkNotNull; + +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.model.CsvData.CsvDataProvider; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import java.io.IOException; +import java.nio.file.Path; + +/** + * A CSV provider which reads files rooted in a given directory. The file layout should match that + * in the CSV metadata directory ({@code googledata/third_party/i18n/phonenumbers/metadata}). + */ +public final class FileBasedCsvLoader implements CsvDataProvider { + /** Returns a CSV loader which reads files from the given base directory. */ + public static FileBasedCsvLoader using(Path dir) throws IOException { + return new FileBasedCsvLoader(dir); + } + + private final Path root; + private final CsvTable metadata; + + private FileBasedCsvLoader(Path root) throws IOException { + this.root = checkNotNull(root); + this.metadata = MetadataTableSchema.SCHEMA.load(root.resolve("metadata.csv")); + } + + @Override + public CsvTable loadMetadata() { + return metadata; + } + + @Override + public CsvData loadData(DigitSequence cc) throws IOException { + Path ccDir = root.resolve(cc.toString()); + return CsvData.create( + cc, + metadata, + RangesTableSchema.SCHEMA.load(csvFile(ccDir, "ranges")), + ShortcodesTableSchema.SCHEMA.load(csvFile(ccDir, "shortcodes")), + ExamplesTableSchema.SCHEMA.load(csvFile(ccDir, "examples")), + FormatsTableSchema.SCHEMA.load(csvFile(ccDir, "formats")), + AltFormatsSchema.loadAltFormats(csvFile(ccDir, "altformats")), + OperatorsTableSchema.SCHEMA.load(csvFile(ccDir, "operators")), + CommentsSchema.loadComments(csvFile(ccDir, "comments")) + ); + } + + private static Path csvFile(Path dir, String name) { + return dir.resolve(name + ".csv"); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatSpec.java 
b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatSpec.java new file mode 100644 index 000000000..d5110ef7e --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatSpec.java @@ -0,0 +1,637 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +import com.google.auto.value.AutoValue; +import com.google.common.base.CharMatcher; +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableList; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.function.ToIntFunction; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * A specifier for the three types of format available in a formatting rule, "national", + * "international" and "carrier specific". Each format is represented by a single string which acts + * as a format template, and from which the necessary XML regular expressions can be recovered. + * + *

The basic syntax of a specifier is something like {@code "XX XXX-XXXX"}, where '{@code X}' + * represents a digit from the phone number being formatted. When converted into the legacy XML + * syntax, a national specifier with this format would represent the "pattern" attribute + * {@code "(\d{2})(\d{3})(\d{4})"} and the "format" element {@code "$1 $2-$3"}. + * + *

By adding the '{@code *}' character, one group of variable length may be defined. Thus + * {@code "XX XXX-XX**"} represents the pattern {@code "(\d{2})(\d{3})(\d{2,4})"}. + * + *

If the national prefix should be present, for either national or carrier specific formatting, + * it is represented by the '{@code #}' symbol. Similarly, for carrier specific formatting, the + * '{@code @}' symbol represents the carrier code placeholder (and must be present exactly once in + * any carrier specific format specifier). + * + *

By analyzing the unique prefixes of both national and carrier specific specifiers, the XML + * syntax can be derived. In a fairly simple example, the format specifiers: + *

    + *
  • national: {@code "(#XX) XXX-XXXX"} + *
  • carrier: {@code "#@ XX XXX-XXXX"} + *
  • international: {@code "XX XXX XXXX"} + *
+ * would result in: + *
    + *
  • pattern: {@code "(\d{2})(\d{3})(\d{4})"} + *
  • national_prefix_formatting_rule: {@code "($NP$FG)"} + *
  • carrier_specific_formatting_rule: {@code "$NP$CC $FG"} + *
  • format: {@code "$1 $2-$3"} + *
  • international_format: {@code "$1 $2 $3"} + *
+ * The derived "pattern" groups must be the same between all specifiers, while the "national" and + * "carrier" specifiers must share a common suffix after the "first group". This is a limitation of + * the XML representation which must be preserved here. + * + *

If no carrier specific format specifier is present, the extraction of a format rule will + * still occur (since the formatting rule also affects "as you type" formatting). Thus: + *

    + *
  • national: {@code "(XX) XXX"} + *
+ * will result in: + *
    + *
  • format: {@code "$1 $2"} + *
  • national_prefix_formatting_rule: {@code "($FG)"} + *
+ * and not: + *
    + *
  • format: {@code "($1) $2"} + *
+ * + *

An international format specifier must exist if international formatting is possible (even if + * it is identical to the national format specifier). If no international specifier exists, then + * the range of phone numbers associated with this format must be a subset of the "no international + * dialling" range, and the derived XML element "intlFormat" will contain the value "NA". + * + *

If literal characters such as "*" are required to be present in the format string, they can + * be escaped via a '{@code \}' (backslash) character. The set of characters that might need + * escaping is '{@code X}', '{@code *}', '{@code #}' and '{@code @}'. Note that the dollar symbol + * '{@code $}' is special, and is prohibited from ever appearing in a format specifier (even though + * it's not strictly part of the syntax). + * + *

A {@code FormatSpec} also defines the ranges of numbers for which this format applies. This + * is a {@link RangeTree}, rather than a {@code PrefixTree}, since length matters (different + * formats are sometimes distinguished purely on the basis of number length). The possible lengths + * of the range tree must match the possible lengths of all defined specifier strings. + */ +@AutoValue +public abstract class FormatSpec { + /** + * Returns a format specifier from the serialized fields. Note that the given non-local + * specifiers must share certain properties (e.g. same number of format groups, same min/max + * length, same trailing group format). Some of this is necessary due to limitations in how + * formats are represented in the legacy XML schema (e.g. between national and carrier specific + * formats). Exceptions are raised when any of these properties are violated. + * + * @param nationalSpec the national format specifier string (can contain \-escaped characters). + * @param carrierSpec the optional carrier format specifier string. + * @param intlSpec the optional international format specifier string. + * @param localSpec additional local format specifier string. + * @param nationalPrefixOptional allows the national prefix to be omitted during parsing even if + * present in the format, or given during parsing when not present in the format. + * @param comment a free-form comment for this specifier. 
+ */ + public static FormatSpec of( + String nationalSpec, + Optional carrierSpec, + Optional intlSpec, + Optional localSpec, + boolean nationalPrefixOptional, + Optional comment) { + FormatTemplate national = FormatTemplate.parse(nationalSpec); + checkArgument(!national.hasCarrierCode(), + "national format specifier must not contain carrier code: %s", nationalSpec); + Optional carrier = carrierSpec.map(s -> parseCarrierSpec(s, national)); + Optional intl = intlSpec.map(s -> parseIntlSpec(s, national)); + Optional local = localSpec.map(s -> parseLocalSpec(s, national)); + int minLength = national.minLength(); + int maxLength = national.maxLength(); + return new AutoValue_FormatSpec( + national, carrier, intl, local, minLength, maxLength, nationalPrefixOptional, comment); + } + + /** + * Returns a local format specifier for the given template. Local specifiers only have a national + * template and national prefix prohibited. + */ + public static FormatSpec localFormat(FormatTemplate local) { + checkArgument(!local.hasNationalPrefix(), + "a local template must not have national prefix: %s", local); + return new AutoValue_FormatSpec( + local, + Optional.empty(), + Optional.empty(), + Optional.empty(), + local.minLength(), + local.maxLength(), + false, + Optional.empty()); + } + + /** Returns the national format template (e.g. "#XX XXX XXXX"). */ + public abstract FormatTemplate national(); + + /** Returns the carrier specific format template (e.g. "(@ #XX) XXX XXXX"). */ + public abstract Optional carrier(); + + /** Returns the international format template (e.g. "XX-XXX-XXXX"). */ + public abstract Optional international(); + + /** + * Returns the local format template (e.g. "XXX-XXXX"). Local formats must correspond to the + * "Area Code Length" values in at least some of the ranges to which they are assigned. + */ + public abstract Optional local(); + + /** Returns the minimum number of digits which this format matches. 
*/ + public abstract int minLength(); + + /** Returns the maximum number of digits which this format matches. */ + public abstract int maxLength(); + + /** + * Returns whether, for formats without a national prefix specified, it is still possible to + * trigger this format by adding a national prefix (even though its is not shown). Formats for + * which this method returns {@code true} are grouped alongside formats with an explicit national + * prefix (since they must be ordered carefully with respect to each other to account for their + * "leading digits"). + */ + public abstract boolean nationalPrefixOptional(); + + /** Returns the free-form comment associated with this format specifier. */ + public abstract Optional comment(); + + /** + * Returns the length based bounds for this format (e.g. all digit sequences between the minimum + * and maximum lengths). + */ + public RangeTree getLengthBasedBounds() { + return RangeTree.from(IntStream.rangeClosed(minLength(), maxLength()) + .mapToObj(RangeSpecification::any)); + } + + @Override + public final String toString() { + StringBuilder out = new StringBuilder("FormatSpec{national=").append(national()); + carrier().ifPresent(t -> out.append(", carrier=").append(t)); + local().ifPresent(t -> out.append(", local=").append(t)); + international().ifPresent(t -> out.append(", international=").append(t)); + out.append(", minLength=").append(minLength()); + out.append(", maxLength=").append(maxLength()); + comment().ifPresent(c -> out.append(", comment='").append(c).append('\'')); + return out.append('}').toString(); + } + + // ---- RULE PARSING AND CONVERSION METHODS ---- + + private static FormatTemplate parseCarrierSpec(String spec, FormatTemplate national) { + FormatTemplate carrier = FormatTemplate.parse(spec); + checkArgument(carrier.hasCarrierCode(), + "carrier format specifier must contain carrier code: %s", spec); + // This verifies the groups have the same lengths, but does not check for same formatting. 
+ checkArgument(carrier.isCompatibleWith(national), + "carrier format specifier must have compatible groups: %s - %s", + national.getSpecifier(), spec); + // This is really ugly, since carrier formats must share the same format in the legacy XML, but + // can have different formatting rules for the first group. The best way to test this is just + // compare the XML output directly instead of trying to reason about groups, since group replace + // also needs to be taken into account. + checkArgument(carrier.getXmlFormat().equals(national.getXmlFormat()), + "carrier format specifier must have equal trailing groups: %s - %s", + national.getSpecifier(), spec); + // Artificial check (currently true everywhere and likely to never be broken). If this is ever + // relaxed, the nationalPrefixForParsing regeneration code will need changing to take account + // of ordering (e.g. generate "()?" instead of "()?"). + checkArgument(!carrier.hasNationalPrefix() || spec.indexOf('#') < spec.indexOf('@'), + "national prefix must precede carrier code in carrier format: %s", spec); + return carrier; + } + + private static FormatTemplate parseIntlSpec(String spec, FormatTemplate national) { + FormatTemplate intl = FormatTemplate.parse(spec); + // In theory this could be relaxed, but then when the spec is written it cannot just call + // getFormat(). For now, it's always true the international formats don't have "fancy" + // formatting around the first group (i.e. never "(XXX) XXX XXX") which makes sense since + // international formats cannot be assumed to be read by people with local knowledge. + + // TODO: To reactivate this check after we are sure that first digit of + // SN of MX is no more 1 and need not to be swallowed when formatting i.e after parsing change. 
+ // Context: We have disabled the following check to fix a MX formatting issue i.e using this + // logic {X>} to remove the mobile token(1) in international format, which is the first digit of + // the mobile subscriber number. More details in b/111967450. In general, international + // format should not have such special formatting. Can be fixed as part of b/138727490. + + // checkArgument(!intl.getXmlPrefix().isPresent(), + // "international format specifier must not have separate prefix: %s", spec); + checkArgument( + !intl.hasNationalPrefix(), + "international format specifier must not contain national prefix: %s", + spec); + checkArgument(!intl.hasCarrierCode(), + "international format specifier must not contain carrier code: %s", spec); + checkArgument(intl.isCompatibleWith(national), + "international format specifier must have compatible groups: %s - %s", + national.getSpecifier(), spec); + return intl; + } + + private static FormatTemplate parseLocalSpec(String spec, FormatTemplate national) { + FormatTemplate local = FormatTemplate.parse(spec); + checkArgument(!local.getXmlPrefix().isPresent(), + "local format specifier must not have separate prefix: %s", spec); + checkArgument(!local.hasNationalPrefix(), + "local format specifier must not contain national prefix: %s", spec); + checkArgument(!local.hasCarrierCode(), + "local format specifier must not contain carrier code: %s", spec); + checkArgument(local.minLength() < national.minLength(), + "local format specifier must be shorter than the national format: %s - %s", + national.getSpecifier(), spec); + return local; + } + + // ---- TEMPLATE CLASSES ---- + + /** + * A single template corresponding to a format specifier such as {@code "(# XXX) XXX-XXXX"}. + * A template represents one of the types of format (national, international, carrier specific) + * and enforces as much structural correctness as possible. + * + *

Templates bridge between the specifier syntax and the XML syntax, with its split prefixes + * and confusing semantics. As such, there's a lot of slightly subtle business logic in the + * parsing of templates that, over time, might need to adapt to real world changes (e.g. suffix + * separators and precise expectations of format structure). + */ + @AutoValue + public abstract static class FormatTemplate { + // This could be extended, but must never overlap with the escape characters used in the + // "skeleton" string. It must also always be limited to the Basic Multilingual Plane (BMP). + // It's really important that '$' is never a meta-character in this syntax, since we escape + // strings like "$FG" which would otherwise be broken. + private static final CharMatcher VALID_TEMPLATE_CHARS = + CharMatcher.ascii().and(CharMatcher.javaIsoControl().negate()).and(CharMatcher.isNot('$')); + + private static final CharMatcher VALID_METACHARS = CharMatcher.anyOf("#@X*{>}\\"); + // Need to include '$' as a separator, since groups can abut. + private static final CharMatcher SUFFIX_SEPARATOR = CharMatcher.anyOf(". 
/-$"); + + private static final char NATIONAL_PREFIX = '#'; + private static final char CARRIER_CODE = '@'; + private static final char REQUIRED_DIGIT = 'X'; + private static final char OPTIONAL_DIGIT = '*'; + private static final char SUBSTITUTION_START = '{'; + private static final char SUBSTITUTION_MAP = '>'; + private static final char SUBSTITUTION_END = '}'; + + private static final String ESCAPED_NATIONAL_PREFIX = "$NP"; + private static final String ESCAPED_CARRIER_CODE = "$CC"; + + static FormatTemplate parse(String spec) { + checkArgument(VALID_TEMPLATE_CHARS.matchesAllOf(spec), + "illegal characters in template: %s", spec); + List groups = new ArrayList<>(); + StringBuilder skeleton = new StringBuilder(); + boolean hasNationalPrefix = false; + boolean hasCarrierCode = false; + boolean hasVariableLengthGroup = false; + // Used to avoid abutting groups (i.e. "XXX**XX"). + boolean canStartGroup = true; + for (int n = 0; n < spec.length(); n++) { + char c = spec.charAt(n); + + if (c == REQUIRED_DIGIT) { + checkArgument(canStartGroup, "illegal group start: %s", spec); + FormatGroup group = extractGroup(spec, n); + checkArgument(!(hasVariableLengthGroup && group.isVariableLength()), + "multiple variable length groups not allowed: %s", spec); + hasVariableLengthGroup = group.isVariableLength(); + + groups.add(group); + skeleton.append(escapeGroupNumber(groups.size())); + + // Move to the last character of the group (since we increment again as we loop). + n += group.maxLength() - 1; + canStartGroup = false; + continue; + } + + if (c == SUBSTITUTION_START) { + // Expect {GROUP>REPLACEMENT} where group can have optional digits (but normally won't). 
+ checkArgument(canStartGroup, "illegal group start: %s", spec); + checkArgument(spec.charAt(n + 1) == REQUIRED_DIGIT, + "illegal group replacement start: %s", spec); + FormatGroup group = extractGroup(spec, n + 1); + checkArgument(!(hasVariableLengthGroup && group.isVariableLength()), + "multiple variable length groups not allowed: %s", spec); + hasVariableLengthGroup = group.isVariableLength(); + + // Now expect mapping character and substitution string. + n += group.maxLength() + 1; + checkArgument(spec.charAt(n) == SUBSTITUTION_MAP, + "illegal group replacement (expected %s): '%s'", SUBSTITUTION_MAP, spec); + int end = spec.indexOf(SUBSTITUTION_END, n + 1); + checkArgument(end != -1, "missing group replacement end: %s", spec); + + groups.add(group.withReplacement(spec.substring(n + 1, end))); + skeleton.append(escapeGroupNumber(groups.size())); + // Unlike the "normal" case above, you can start another group immediately after this + // (since the {,} make it unambiguous). + n = end; + continue; + } + + canStartGroup = true; + + if (c == NATIONAL_PREFIX) { + checkArgument(!hasNationalPrefix, "multiple national prefixes not allowed: %s", spec); + hasNationalPrefix = true; + skeleton.append(ESCAPED_NATIONAL_PREFIX); + continue; + } + + if (c == CARRIER_CODE) { + checkArgument(!hasCarrierCode, "multiple carrier codes not allowed: %s", spec); + hasCarrierCode = true; + skeleton.append(ESCAPED_CARRIER_CODE); + continue; + } + + if (c == '\\') { + // Blows up if trailing '\', but that's fine. + c = spec.charAt(++n); + checkArgument(VALID_METACHARS.matches(c), "invalid escaped character '%s': %s", c, spec); + } else { + checkArgument(c != OPTIONAL_DIGIT, "unexpected optional marker: %s", spec); + } + skeleton.append(c); + } + checkArgument(!groups.isEmpty(), "format specifiers must have at least one group: %s", spec); + // Find the first group which does not have a replacement (one must exist). 
This is important for + // determining where the prefix and suffix should be split when considering hoisting the + // prefix into a format rule (see getSuffixStart() / getXmlPrefix() / getXmlFormat()). + int fgIndex = 0; + while (fgIndex < groups.size() && groups.get(fgIndex).replacement().isPresent()) { + fgIndex++; + } + checkArgument(fgIndex < groups.size(), "cannot replace all groups in a template: %s", spec); + return new AutoValue_FormatSpec_FormatTemplate( + spec, + hasNationalPrefix, + hasCarrierCode, + ImmutableList.copyOf(groups), + fgIndex, + skeleton.toString()); + } + + /** + * Returns the specifier string (e.g. "# XXX-XXXX") which is the serialized form of the + * template. + */ + public abstract String getSpecifier(); + + /** Whether this template formats a national prefix. */ + public abstract boolean hasNationalPrefix(); + + /** Whether this template formats a carrier selection code prefix. */ + public abstract boolean hasCarrierCode(); + + /** Returns the information about the groups in this template. */ + public abstract ImmutableList getGroups(); + + /** + * Returns the index of the first group which does not have a replacement (at least one must). + */ + public abstract int getFirstAvailableGroupIndex(); + + // This is an internal representation of the format string used by the XML. It differs in that + // it isn't split into prefix and suffix (as required in some situations for the XML). As such + // it only contains "$NP", "$CC", "$N", but never "$FG". All valid specifier skeletons must + // contain "$1"..."$N" rather than any replacement strings. + abstract String skeleton(); + + /** Returns the minimum number of digits which can be matched by this template. */ + public int minLength() { + return getLength(this, FormatGroup::minLength); + } + + /** Returns the maximum number of digits which can be matched by this template. 
*/ + public int maxLength() { + return getLength(this, FormatGroup::maxLength); + } + + /** + * Returns the maximum number of digits which can be formatted as a single block by this + * template. If no more than this number of digits are entered, they will be formatted as a + * single block by this template. + * + *

This is useful when calculating the leading digits of a format since it might be + * acceptable to match shortcodes to some formats if they would still format the shortcode + * within the first block. This simplifies the leading digits in some cases. + */ + public int getBlockFormatLength() { + // If only one group everything is a block, otherwise take the minimum length of the first + // group. + return (getGroups().size() == 1) ? maxLength() : getGroups().get(0).minLength(); + } + + /** Returns a regex to capture the groups for this template (e.g. "(\d{3})(\d{4,5})") */ + public String getXmlCapturingPattern() { + return getGroups().stream() + .map(FormatGroup::toRegex) + .collect(Collectors.joining(")(", "(", ")")); + } + + /** + * Returns the format string for use in the XML (e.g. "$1 $2-$3"). + * + *

For example given the following templates: + *

    + *
  • {@code "XXX XXX-XXX"} ==> {@code "$1 $2-$3"} + *
  • {@code "(#XXX) XXX-XXX"} ==> {@code "$1 $2-$3"} (the prefix is hoisted) + *
  • {@code "#{XXX>123} XXX-XXX"} ==> {@code "$2-$3"} ($1 was replaced and hoisted) + *
  • {@code "{X>}XXX-XXX"} ==> {@code "$2-$3"} ($1 was removed) + *
+ */ + public String getXmlFormat() { + int fgIndex = getFirstAvailableGroupIndex(); + // Always replace the prefix with $N (which is what $FG maps to). This might be a no-op. + String format = "$" + (fgIndex + 1) + skeleton().substring(getSuffixStart()); + // Finally do any group replacement from the skeleton after the "first available group". + // + // Note that this code isn't exercised in data at the moment (2018) but is here to avoid + // needing to place artificial limitations on where group replacement can occur. + for (int n = fgIndex + 1; n < getGroups().size(); n++) { + Optional replacement = getGroups().get(n).replacement(); + if (replacement.isPresent()) { + format = format.replace("$" + (n + 1), replacement.get()); + } + } + return format; + } + + /** + * Returns the format prefix for use in the XML formatting rules (e.g. "($NP $FG)"). If the + * calculated prefix is just "$FG" then nothing is returned (since that's a no-op value). + * + *

For example given the following templates: + *

    + *
  • {@code "XXX XXX-XXX"} ==> XML prefix is empty + *
  • {@code "(#XXX) XXX-XXX"} ==> {@code "($NP$FG)"} + *
  • {@code "#{XXX>123} XXX-XXX"} ==> {@code "$NP123 $FG"} + *
  • {@code "{X>}XXX-XXX"} ==> XML prefix is empty (but the format will not contain $1) + *
+ */ + public Optional getXmlPrefix() { + String prefix = skeleton().substring(0, getSuffixStart()); + // We know that "$" (substitutions are 1-indexed) is in the prefix and + // should be replaced with "$FG", and everything before that has a replacement. + int fgIndex = getFirstAvailableGroupIndex(); + for (int n = 0; n < fgIndex; n++) { + // Everything before the "first available group" must have a replacement (by definition). + prefix = prefix.replace("$" + (n + 1), getGroups().get(n).replacement().get()); + } + prefix = prefix.replace("$" + (fgIndex + 1), "$FG"); + checkState(prefix.contains("$FG"), + "XML prefix must always contain '$FG' (this must be a code error): %s", prefix); + // After all this work we could still end up with a no-op substitution! + return prefix.equals("$FG") ? Optional.empty() : Optional.of(prefix); + } + + /** + * Returns whether all groups have the same "structure" (i.e. min/max length). They can + * differ in terms of having replacements however. + */ + boolean isCompatibleWith(FormatTemplate other) { + if (getGroups().size() != other.getGroups().size()) { + return false; + } + for (int n = 0; n < getGroups().size(); n++) { + if (!getGroups().get(n).isCompatibleWith(other.getGroups().get(n))) { + return false; + } + } + return true; + } + + private int getSuffixStart() { + // This is only safe because "\$1" cannot be present ('$' cannot be escaped). + int suffixStart = SUFFIX_SEPARATOR.indexIn(skeleton(), skeleton().indexOf("$1") + 1); + // If no suffix start found, the entire skeleton is the prefix. + if (suffixStart == -1) { + suffixStart = skeleton().length(); + } + // Now account for the fact that the first group (and others) could have replacements, which + // pushes the suffix start to just after the "first available group" (which is what becomes + // $FG). If the first available group is "$1" then we just get suffixStart. 
+ int fgNumber = getFirstAvailableGroupIndex() + 1; + checkState(fgNumber < 10, "invalid first group number: %s", fgNumber); + return Math.max(suffixStart, skeleton().indexOf("$" + fgNumber) + 2); + } + + @Override + public final String toString() { + return getSpecifier(); + } + + private static int getLength(FormatTemplate template, ToIntFunction lengthFn) { + return template.getGroups().stream().mapToInt(lengthFn).sum(); + } + + private static FormatGroup extractGroup(String template, int start) { + // We know that 'start' references a group start (i.e. 'X') so length must be at least 1. + int endRequired = findEndOf(REQUIRED_DIGIT, template, start); + int endGroup = findEndOf(OPTIONAL_DIGIT, template, endRequired); + return FormatGroup.of(endRequired - start, endGroup - start); + } + + private static int findEndOf(char c, String template, int start) { + int endRequired = CharMatcher.isNot(c).indexIn(template, start); + return endRequired != -1 ? endRequired : template.length(); + } + + private static String escapeGroupNumber(int n) { + checkArgument(n >= 1 && n <= 9, "bad group number: %s", n); + return "$" + n; + } + } + + /** Represents contiguous digit groups in a format (e.g. "XXX" or "XXX***"). */ + @AutoValue + public abstract static class FormatGroup { + private static FormatGroup of(int min, int max) { + checkArgument(max >= min, "bad group lengths: %s, %s", min, max); + return new AutoValue_FormatSpec_FormatGroup(min, max, Optional.empty()); + } + + private FormatGroup withReplacement(String s) { + return new AutoValue_FormatSpec_FormatGroup(minLength(), maxLength(), Optional.of(s)); + } + + /** Returns the minimum number of digits in this group. */ + public abstract int minLength(); + + /** Returns the maximum number of digits in this group. */ + public abstract int maxLength(); + + /** Returns the optional, arbitrary (possibly empty) replacement string for this group. 
*/ + abstract Optional replacement(); + + /** + * Returns if this group can match a variable number of digits. Only one group in any format + * specifier can have variable length. + */ + private boolean isVariableLength() { + return maxLength() > minLength(); + } + + /** + * Returns whether two groups have the same "structure" (i.e. min/max lengths), but does not + * compare replacement values. Used only for internal checks. + */ + private boolean isCompatibleWith(FormatGroup other) { + return minLength() == other.minLength() && maxLength() == other.maxLength(); + } + + private String toRegex() { + if (maxLength() > minLength()) { + return String.format("\\d{%d,%d}", minLength(), maxLength()); + } else if (minLength() > 1) { + return String.format("\\d{%d}", minLength()); + } else { + return "\\d"; + } + } + + @Override + public final String toString() { + String group = + Strings.repeat("X", minLength()) + Strings.repeat("*", maxLength() - minLength()); + return replacement().map(r -> String.format("{%s>%s}", group, r)).orElse(group); + } + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java new file mode 100644 index 000000000..a9cbca664 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import com.google.common.collect.ImmutableMap; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.CsvKeyMarshaller; +import com.google.i18n.phonenumbers.metadata.table.CsvSchema; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import com.google.i18n.phonenumbers.metadata.table.Schema; +import java.util.Optional; + +/** + * The schema of the "Formats" table with rows keyed by ID, and columns: + *
    + *
  1. {@link #NATIONAL}: Required national format (may contain '#' for national prefix). + *
  2. {@link #CARRIER}: Optional carrier format (may contain '#' and '@' for carrier + * specifier). Must be compatible with the national format (same suffix). + *
  3. {@link #INTERNATIONAL}: International format (must not contain '#' or '@'). + *
  4. {@link #LOCAL}: Local format (must not contain '#' or '@', and must correspond to assigned + * area code lengths if present). + *
  5. {@link #COMMENT}: Freeform comment text. + *
+ * + *

Row keys are serialized via the marshaller and produce the leading column: + *

    + *
  1. {@code Id}: The format ID. + *
+ */ +public final class FormatsTableSchema { + public static final Column NATIONAL = Column.ofString("National"); + public static final Column CARRIER = Column.ofString("Carrier"); + public static final Column INTERNATIONAL = Column.ofString("International"); + public static final Column LOCAL = Column.ofString("Local"); + + public static final Column NATIONAL_PREFIX_OPTIONAL = + Column.ofBoolean("National Prefix Optional"); + /** An arbitrary optional text comment. */ + public static final Column COMMENT = Column.ofString("Comment"); + + private static final CsvKeyMarshaller MARSHALLER = CsvKeyMarshaller.ofSortedString("Id"); + + private static final Schema COLUMNS = + Schema.builder() + .add(NATIONAL) + .add(CARRIER) + .add(INTERNATIONAL) + .add(LOCAL) + .add(NATIONAL_PREFIX_OPTIONAL) + .add(COMMENT) + .build(); + + /** Schema instance defining the formats CSV table. */ + public static final CsvSchema SCHEMA = CsvSchema.of(MARSHALLER, COLUMNS); + + /** Converts a CSV table into a map of format specifiers. */ + public static ImmutableMap toFormatSpecs(CsvTable formats) { + ImmutableMap.Builder specs = ImmutableMap.builder(); + for (String id : formats.getKeys()) { + specs.put( + id, + FormatSpec.of( + formats.getOrDefault(id, NATIONAL), + toOptional(formats.getOrDefault(id, CARRIER)), + toOptional(formats.getOrDefault(id, INTERNATIONAL)), + toOptional(formats.getOrDefault(id, LOCAL)), + formats.getOrDefault(id, NATIONAL_PREFIX_OPTIONAL), + toComment(formats.getOrDefault(id, COMMENT)))); + } + return specs.build(); + } + + private static Optional toOptional(String s) { + return s.isEmpty() ? Optional.empty() : Optional.of(s); + } + + private static Optional toComment(String s) { + return s.isEmpty() ? 
Optional.empty() : Optional.of(Comment.fromText(s)); + } + + private FormatsTableSchema() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataException.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataException.java new file mode 100644 index 000000000..01f269a15 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataException.java @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import com.google.errorprone.annotations.FormatMethod; + +/** + * Represents an error related to CSV metadata, either structural issues in the CSV or semantic + * errors in the XML representation. MetadataExceptions should only correspond to problems fixable + * by editing the CSV data. + */ +public final class MetadataException extends RuntimeException { + @FormatMethod + public static void checkMetadata(boolean cond, String msg, Object... 
args) { + if (!cond) { + throw new MetadataException(String.format(msg, args)); + } + } + + public MetadataException(String message) { + super(message); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataTableSchema.java new file mode 100644 index 000000000..87c636fda --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/MetadataTableSchema.java @@ -0,0 +1,168 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static java.util.Comparator.naturalOrder; + +import com.google.common.collect.ImmutableSet; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.Timezones; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.CsvKeyMarshaller; +import com.google.i18n.phonenumbers.metadata.table.CsvSchema; +import com.google.i18n.phonenumbers.metadata.table.MultiValue; +import com.google.i18n.phonenumbers.metadata.table.Schema; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Optional; +import java.util.stream.Stream; + +/** + * The schema of the "Metadata" table with rows keyed by {@link DigitSequence} and columns: + * + *
    + *
  1. {@link #MAIN_REGION}: The primary region associated with a calling code. + *
  2. {@link #EXTRA_REGIONS}: A list of additional regions shared by the calling code. + *
  3. {@link #NATIONAL_PREFIX}: The (optional) prefix used when dialling national numbers. + *
  4. {@link #IDD_PREFIX}: The default international dialling (IDD) prefix. + *
  5. {@link #TIMEZONE}: The default timezone name(s) for a calling code. Multiple timezones + * can be specified if separated by {@code '&'}. + *
  6. {@link #MOBILE_PORTABLE_REGIONS}: A list of regions in which mobile numbers are portable + * between operators. + *
  7. {@link #NATIONAL_PREFIX_OPTIONAL}: True if the national prefix is optional throughout the + * numbering plan (e.g. a prefix is defined, but does not have to be present when numbers are + * used). + *
+ * + *

Row keys are serialized via the marshaller and produce the leading column: + *

    + *
  1. {@code Calling Code}: The country calling code. + *
+ */ +public final class MetadataTableSchema { + /** Values in the "REGIONS" column are a sorted list of region codes. */ + public static final class Regions extends MultiValue { + private static final Regions EMPTY = new Regions(ImmutableSet.of()); + + public static Column column(String name) { + return Column.create(Regions.class, name, EMPTY, Regions::new); + } + + public static Regions of(PhoneRegion... regions) { + return new Regions(Arrays.asList(regions)); + } + + public static Regions of(Iterable regions) { + return new Regions(regions); + } + + private Regions(Iterable regions) { + super(regions, ',', naturalOrder(), true); + } + + private Regions(String s) { + super(s, PhoneRegion::of, ',', naturalOrder(), true); + } + } + + /** + * Values in the "NATIONAL_PREFIX" column are an (unsorted) list of prefixes, with the preferred + * prefix first. + */ + public static final class DigitSequences extends MultiValue { + private static final DigitSequences EMPTY = new DigitSequences(ImmutableSet.of()); + + public static Column column(String name) { + return Column.create(DigitSequences.class, name, EMPTY, DigitSequences::new); + } + + public static DigitSequences of(DigitSequence... numbers) { + return new DigitSequences(Arrays.asList(numbers)); + } + + private DigitSequences(Iterable numbers) { + super(numbers, ',', naturalOrder(), false); + } + + private DigitSequences(String s) { + super(s, DigitSequence::of, ',', naturalOrder(), false); + } + } + + /** The primary region associated with a calling code (e.g. "US" for NANPA). */ + public static final Column MAIN_REGION = + Column.create(PhoneRegion.class, "Main Region", PhoneRegion.getUnknown(), PhoneRegion::of); + + /** A comma separated list of expected regions for the calling code. */ + public static final Column EXTRA_REGIONS = Regions.column("Extra Regions"); + + /** + * A list of prefixes used when dialling national numbers (e.g. "0" for "US"). 
If more than one + * prefix is given, the first prefix is assumed to be "preferred" and the others are considered + * alternatives. Having multiple prefixes is useful if a country switches between prefixes and + * a period of "parallel running" is needed. + */ + public static final Column NATIONAL_PREFIX = + DigitSequences.column("National Prefix"); + + /** + * The default international dialling (IDD) prefix. This is a string, rather than a digit + * sequence, because it can optionally contain a single '~' character to indicate a pause while + * dialling (e.g. "8~10" in Russia). This is stripped everywhere except when used to populate + * the "preferredInternationalPrefix" attribute in the libphonenumber XML file. + */ + public static final Column IDD_PREFIX = Column.ofString("IDD Prefix"); + + /** + * The default value for the "Timezone" column in the ranges table (in many regions, this is a + * single constant value). + */ + public static final Column TIMEZONE = RangesTableSchema.TIMEZONE; + + /** A comma separated list of regions in which mobile numbers are portable between carriers. */ + public static final Column MOBILE_PORTABLE_REGIONS = + Regions.column("Mobile Portable Regions"); + + /** Describes whether the "national prefix" is optional when parsing a national number. */ + public static final Column NATIONAL_PREFIX_OPTIONAL = + Column.ofBoolean("National Prefix Optional"); + + /** The preferred prefix for specifying extensions to numbers (e.g. "ext" for "1234 ext 56"). 
*/ + public static final Column EXTENSION_PREFIX = Column.ofString("Extension Prefix"); + + private static final CsvKeyMarshaller MARSHALLER = new CsvKeyMarshaller<>( + k -> Stream.of(k.toString()), + p -> DigitSequence.of(p.get(0)), + Optional.of(Comparator.comparing(Object::toString)), + "Calling Code"); + + private static final Schema COLUMNS = Schema.builder() + .add(MAIN_REGION) + .add(EXTRA_REGIONS) + .add(NATIONAL_PREFIX) + .add(IDD_PREFIX) + .add(TIMEZONE) + .add(MOBILE_PORTABLE_REGIONS) + .add(NATIONAL_PREFIX_OPTIONAL) + .add(EXTENSION_PREFIX) + .build(); + + /** Schema instance defining the metadata CSV table. */ + public static final CsvSchema SCHEMA = CsvSchema.of(MARSHALLER, COLUMNS); + + private MetadataTableSchema() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingScheme.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingScheme.java new file mode 100644 index 000000000..01484b1ab --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingScheme.java @@ -0,0 +1,750 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.i18n.phonenumbers.metadata.model.MetadataException.checkMetadata; +import static com.google.i18n.phonenumbers.metadata.model.XmlRangesSchema.AREA_CODE_LENGTH; +import static com.google.i18n.phonenumbers.metadata.model.XmlRangesSchema.FORMAT; +import static com.google.i18n.phonenumbers.metadata.model.XmlRangesSchema.NATIONAL_ONLY; +import static com.google.i18n.phonenumbers.metadata.model.XmlRangesSchema.PER_REGION_COLUMNS; +import static com.google.i18n.phonenumbers.metadata.model.XmlRangesSchema.REGIONS; +import static java.lang.Boolean.TRUE; +import static java.util.Comparator.comparing; + +import com.google.auto.value.AutoValue; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.ContiguousSet; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedMap; +import com.google.common.collect.ImmutableSortedSet; +import com.google.common.collect.ImmutableTable; +import com.google.common.collect.Ordering; +import com.google.common.collect.Sets; +import com.google.common.collect.Table; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.PrefixTree; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.model.FormatSpec.FormatTemplate; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment.Anchor; +import 
com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType; +import com.google.i18n.phonenumbers.metadata.proto.Types.XmlShortcodeType; +import com.google.i18n.phonenumbers.metadata.table.RangeTable; +import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import com.google.i18n.phonenumbers.metadata.table.Schema; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.TreeSet; + +/** + * An abstraction of all the phone number metadata known about for a single calling code. + *

+ * Note that there is no builder for NumberingScheme. The expectation is that CSV tables and other + * primary sources will be used to build numbering schemes at a single point in the business logic. + * Handling incremental modification of a builder, or partially built schemes just isn't something + * that's expected to be needed (though there is {@code TestNumberingScheme} for use in unit tests). + */ +@AutoValue +public abstract class NumberingScheme { + // Bitmask for [1-9] (bits 1..9 set, bit 0 clear). + private static final int NOT_ZERO_MASK = 0x3FE; + + /** Top level information about a numbering scheme. */ + @AutoValue + public abstract static class Attributes { + /** Returns a new attributes instance for the given data. */ + public static Attributes create( + DigitSequence cc, + PhoneRegion mainRegion, + Set extraRegions, + ImmutableSet nationalPrefix, + RangeTree carrierPrefixes, + String defaultIddPrefix, + RangeTree allIddRanges, + String extensionPrefix, + Set mobilePortableRegions) { + // In theory there could be IDD prefix for a non-geographic region (and this check could be + // removed) but it's not something we've ever seen and don't have any expectation of. 
+ checkMetadata(!mainRegion.equals(PhoneRegion.getWorld()) || allIddRanges.isEmpty(), + "[%s] IDD prefixes must not be present for non-geographic regions", cc); + checkMetadata(mainRegion.equals(PhoneRegion.getWorld()) || !allIddRanges.isEmpty(), + "[%s] IDD prefixes must be present for all geographic regions", cc); + checkMetadata(nationalPrefix.stream().noneMatch(allIddRanges::contains), + "[%s] National prefix %s and IDD prefixes (%s) must be disjoint", + cc, nationalPrefix, allIddRanges); + checkMetadata(nationalPrefix.stream().noneMatch(carrierPrefixes::contains), + "[%s] National prefix %s and carrier prefixes (%s) must be disjoint", + cc, nationalPrefix, carrierPrefixes); + // Allow exactly one '~' to separate the prefix digits to indicate a pause during dialling + // (this check could be relaxed in future, but it's currently true for all data). + checkMetadata(defaultIddPrefix.isEmpty() || defaultIddPrefix.matches("[0-9]+(?:~[0-9]+)?"), + "[%s] Invalid IDD prefix: %s", cc, defaultIddPrefix); + DigitSequence iddPrefix = DigitSequence.of(defaultIddPrefix.replace("~", "")); + checkMetadata(iddPrefix.isEmpty() || allIddRanges.contains(iddPrefix), + "[%s] IDD ranges must contain the default prefix: %s", cc, iddPrefix); + checkMetadata(!extraRegions.contains(mainRegion), + "[%s] duplicated main region '%s' in extra regions: %s", + cc, mainRegion, extraRegions); + // Main region comes first in iteration order, remaining regions are ordered naturally. + ImmutableSet.Builder set = ImmutableSet.builder(); + set.add(mainRegion); + extraRegions.stream().sorted().forEach(set::add); + ImmutableSet allRegions = set.build(); + checkMetadata(allRegions.containsAll(mobilePortableRegions), + "invalid mobile portable regions: %s", mobilePortableRegions); + return new AutoValue_NumberingScheme_Attributes( + cc, + allRegions, + nationalPrefix, + carrierPrefixes, + defaultIddPrefix, + allIddRanges, + !extensionPrefix.isEmpty() ? 
Optional.of(extensionPrefix) : Optional.empty(), + ImmutableSortedSet.copyOf(Ordering.natural(), mobilePortableRegions)); + } + + /** Returns the unique calling code of this numbering scheme. */ + public abstract DigitSequence getCallingCode(); + + /** + * Returns the regions represented by this numbering scheme. The main region is always present + * and listed first, and remaining regions are listed in "natural" order. + */ + public abstract ImmutableSet getRegions(); + + /** + * Returns the "main" region for this numbering scheme. The notion of a main region for a + * country calling code is slightly archaic and mostly comes from the way in which the legacy + * XML data is structured. However there are a few places in the public API where the "main" + * region is returned in situations of ambiguity, so it can be useful to know it. + */ + public final PhoneRegion getMainRegion() { + return getRegions().asList().get(0); + } + + /** + * Returns all possible national prefixes which can be used when dialling national numbers. In + * most cases this set just contains the preferred prefix, but alternate values may be present + * when a region switches between prefixes or for other reasons. Any "non preferred" prefixes + * are recognized only during parsing, and otherwise ignored. + * + *

If there is a preferred prefix, it is listed first, otherwise the set is empty. + */ + public abstract ImmutableSet getNationalPrefixes(); + + /** + * Returns the (possibly empty) prefix used when dialling national numbers (e.g. "0" for "US"). + * Not all regions require a prefix for national dialling. + */ + public DigitSequence getPreferredNationalPrefix() { + ImmutableSet prefixes = getNationalPrefixes(); + return prefixes.isEmpty() ? DigitSequence.empty() : prefixes.iterator().next(); + } + + /** + * Returns all carrier prefixes for national dialling. This range must not contain the national + * prefix. + */ + public abstract RangeTree getCarrierPrefixes(); + + /** + * Returns the (possible empty) default international dialling (IDD) prefix, possibly + * containing a '~' to indicate a pause during dialling (e.g. "8~10" for Russia). + */ + public abstract String getDefaultIddPrefix(); + + /** + * Returns all IDD prefixes which may be used for international dialling. If the default prefix + * is not empty it must be contained in this range. + */ + public abstract RangeTree getIddPrefixes(); + + /** Returns the preferred label to use for indicating extensions for numbers. */ + public abstract Optional getExtensionPrefix(); + + /** Returns the regions in which mobile numbers are portable between carriers. */ + public abstract ImmutableSet getMobilePortableRegions(); + } + + /** + * Creates a numbering scheme from a range table and example numbers. No rules are applied to the + * data in the tables, and they are assumed to be complete. 
+ */ + public static NumberingScheme from( + Attributes attributes, + RangeTable xmlTable, + Map shortcodeMap, + Map formats, + ImmutableList altFormats, + Table exampleNumbers, + List comments) { + checkPossibleRegions(attributes.getRegions(), xmlTable); + checkNationalOnly(attributes, xmlTable); + checkUnambiguousIdd(attributes, xmlTable, formats); + ImmutableSortedMap shortcodes = + checkShortCodeConsistency(shortcodeMap, xmlTable); + return new AutoValue_NumberingScheme( + attributes, + xmlTable, + shortcodes, + checkFormatConsistency(attributes, formats, xmlTable, shortcodes), + checkAltFormatConsistency(altFormats, formats, xmlTable), + checkExampleNumbers(attributes.getRegions(), xmlTable, exampleNumbers), + addSyntheticComments(comments, attributes)); + } + + // Adds the first comments for main and auxiliary regions, giving the English name and detailing + // auxiliary region information if necessary. + private static ImmutableList addSyntheticComments( + List comments, Attributes attributes) { + PhoneRegion mainRegion = attributes.getMainRegion(); + if (!mainRegion.equals(PhoneRegion.getWorld())) { + List modified = new ArrayList<>(getRegionNameComments(mainRegion)); + List auxRegions = + attributes.getRegions().asList().subList(1, attributes.getRegions().size()); + if (!auxRegions.isEmpty()) { + String comment = String.format("Main region for '%s'", Joiner.on(',').join(auxRegions)); + modified.add(Comment.create(Comment.anchor(mainRegion), ImmutableList.of(comment))); + for (PhoneRegion r : auxRegions) { + modified.addAll(getRegionNameComments(r)); + String auxComment = + String.format("Calling code and formatting shared with '%s'", mainRegion); + modified.add(Comment.create(Comment.anchor(r), ImmutableList.of(auxComment))); + } + } + // Do this last, since order matters (because anchors are not unique) and we want the + // synthetic comments to come first. 
+ modified.addAll(comments); + comments = modified; + } + return ImmutableList.copyOf(comments); + } + + private static List getRegionNameComments(PhoneRegion region) { + ImmutableList enName = ImmutableList.of(region.getEnglishNameForXmlComments()); + return ImmutableList.of( + Comment.create(Comment.anchor(region), enName), + Comment.create(Comment.shortcodeAnchor(region), enName)); + } + + private static void checkPossibleRegions(Set regions, RangeTable xmlTable) { + ImmutableSet actual = REGIONS.extractGroupColumns(xmlTable.getColumns()).keySet(); + // Allow no region column in the table if there's only one region (since it's implicit). + checkState((actual.isEmpty() && regions.size() == 1) || actual.equals(regions), + "regions added to range table do not match the expected numbering scheme regions\n" + + "expected: %s\n" + + "actual: %s\n", + regions, actual); + } + + // An assumption has generally been that if a range is "national only" then it either: + // a) belongs to only one region (the one it's national only for) + // b) belongs to at least the main region (since in some schemes ranges mostly just overlap all + // possible regions). + // Thus we preclude the possibility of having a "national only" number that appears in multiple + // regions, but not the main region. + // + // If this check is ever removed (because there is real data where this is not the case), then + // the code which generates the "" patterns will have to be revisited. + private static void checkNationalOnly(Attributes attributes, RangeTable xmlTable) { + RangeTree allNationalOnly = xmlTable.getRanges(NATIONAL_ONLY, true); + if (allNationalOnly.isEmpty()) { + return; + } + ImmutableList regions = attributes.getRegions().asList(); + PhoneRegion main = regions.get(0); + // Anything assigned to the main region can be ignored as we allow it to have multiple regions. + // Now we have to ensure that these ranges are assigned to exactly one auxiliary region. 
+ RangeTree remaining = + allNationalOnly.subtract(xmlTable.getRanges(REGIONS.getColumn(main), true)); + if (remaining.isEmpty()) { + return; + } + DigitSequence cc = attributes.getCallingCode(); + for (PhoneRegion r : regions.subList(1, regions.size())) { + RangeTree auxNationalOnly = + xmlTable.getRanges(REGIONS.getColumn(r), true).intersect(allNationalOnly); + // Anything already removed from "remaining" was already accounted for by another region. + checkMetadata(remaining.containsAll(auxNationalOnly), + "[%s] %s has national-only ranges which overlap other regions: %s", + cc, r, auxNationalOnly.subtract(remaining)); + remaining = remaining.subtract(auxNationalOnly); + } + // This is not data issue since it should have been checked already, this is bug. + checkState(remaining.isEmpty(), "[%s] ranges not assigned to any region: %s", cc, remaining); + } + + /** + * Ensures no national range can start with an IDD (international dialling code of any kind). + * This is slightly more complex than just looking for any IDD prefix at the start of a range + * because of cases like India, where "00800..." is a valid range and does start with IDD. + * + *

We allow this because: + *

    + *
  1. The number is required to have the national prefix in front, so must be dialled as + * {@code 000800...} (according to the Indian numbering plan) + *
  2. and {@code 000...} is not a valid sequence that would lead to dialling into another region, + * because all calling codes start with {@code [1-9]}. + *
+ */ + private static void checkUnambiguousIdd( + Attributes attributes, RangeTable xmlTable, Map formats) { + // It can be empty for non-geographic (world) numbering schemes. + if (attributes.getIddPrefixes().isEmpty()) { + return; + } + + // All IDDs extended by one non-zero digit. These are the prefixes which if dialled may end + // up in another region, so they cannot be allowed at the start of any national number. + RangeTree iddPlusOneDigit = attributes.getIddPrefixes().map(r -> r.extendByMask(NOT_ZERO_MASK)); + // We only care about ranges up to this length, which can speed things up. + int maxPrefixLength = iddPlusOneDigit.getLengths().last(); + + // Now prefix any ranges which could be dialled with a national prefix with all possible + // national prefixes, based on how they are formatted (and assume that no format means no + // national prefix). + RangeTree withNationalPrefix = RangeTree.empty(); + RangeTree withoutNationalPrefix = xmlTable.getRanges(FORMAT, FORMAT.defaultValue()); + for (String fid : formats.keySet()) { + FormatSpec spec = formats.get(fid); + // Only bother with ranges up to the maximum prefix length we care about. + RangeTree r = xmlTable.getRanges(FORMAT, fid).slice(0, maxPrefixLength); + if (spec.nationalPrefixOptional()) { + withNationalPrefix = withNationalPrefix.union(r); + withoutNationalPrefix = withoutNationalPrefix.union(r); + } else if (spec.national().hasNationalPrefix()) { + withNationalPrefix = withNationalPrefix.union(r); + } else { + withoutNationalPrefix = withoutNationalPrefix.union(r); + } + } + // Only here due to lambdas requiring an effectively final field (this makes me sad). 
+ RangeTree withNationalPrefixCopy = withNationalPrefix; + RangeTree allDiallablePrefixes = + withoutNationalPrefix + .union(attributes.getNationalPrefixes().stream() + .map(RangeSpecification::from) + .map(p -> withNationalPrefixCopy.prefixWith(p)) + .reduce(RangeTree.empty(), RangeTree::union)); + // These are prefixes which are claimed to be nationally diallable but overlap with the IDD. + RangeTree iddOverlap = PrefixTree.from(iddPlusOneDigit).retainFrom(allDiallablePrefixes); + checkMetadata(iddOverlap.isEmpty(), + "[%s] ranges cannot start with IDD: %s", attributes.getCallingCode(), iddOverlap); + } + + /** + * Ensures the shortcodes are disjoint from main ranges and consistent with each other by format + * (since format information isn't held separately for each shortcode table). + */ + private static ImmutableSortedMap checkShortCodeConsistency( + Map shortcodeMap, RangeTable table) { + ImmutableSortedMap shortcodes = + ImmutableSortedMap.copyOf(shortcodeMap); + shortcodes.forEach((region, shortcodeTable) -> { + RangeTree overlap = table.getAllRanges().intersect(shortcodeTable.getAllRanges()); + checkMetadata(overlap.isEmpty(), + "Shortcode and national numbers overlap for %s: %s", region, overlap); + }); + return shortcodes; + } + + private static final Schema FORMAT_SCHEMA = + Schema.builder().add(AREA_CODE_LENGTH).add(FORMAT).build(); + + // We actually explicitly permit duplicate formats (for now) since the XML has them. Later, once + // everything is settled, it might be possible to add a check here. 
+ private static ImmutableMap checkFormatConsistency( + Attributes attributes, + Map formatMap, + RangeTable table, + Map shortcodes) { + DigitSequence cc = attributes.getCallingCode(); + RangeTable.Builder allFormats = RangeTable.builder(FORMAT_SCHEMA); + allFormats.copyNonDefaultValues(AREA_CODE_LENGTH, table, OverwriteMode.ALWAYS); + allFormats.copyNonDefaultValues(FORMAT, table, OverwriteMode.ALWAYS); + // Throws a RangeException (IllegalArgumentException) if inconsistent write occurs. + shortcodes.values() + .forEach(t -> allFormats.copyNonDefaultValues(FORMAT, t, OverwriteMode.SAME)); + RangeTable formatTable = allFormats.build(); + ImmutableMap formats = ImmutableMap.copyOf(formatMap); + // TODO: Make this "equals" eventually (since it currently sees "synthetic" IDs). + checkMetadata( + formats.keySet().containsAll(formatTable.getAssignedValues(FORMAT)), + "[%s] mismatched format IDs: %s", + cc, Sets.symmetricDifference(formatTable.getAssignedValues(FORMAT), formats.keySet())); + + // If any of the checks relating to carrier formats are relaxed here, it might be necessary to + // re-evaluate the logic around regeneration of nationalPrefixForParsing (so be careful!). 
+ boolean carrierTemplatesExist = false; + boolean nationalPrefixExistsForFormatting = false; + boolean nationalPrefixSometimesOptional = false; + for (String id : formats.keySet()) { + FormatSpec spec = formats.get(id); + RangeTree assigned = allFormats.getRanges(FORMAT, id); + checkMetadata(!assigned.isEmpty(), + "[%s] format specifier '%s' not assigned to any range: %s", cc, id, spec); + checkFormatLengths(cc, spec, assigned); + checkLocalFormatLengths(cc, formatTable, spec, assigned); + carrierTemplatesExist |= spec.carrier().isPresent(); + nationalPrefixExistsForFormatting |= + spec.national().hasNationalPrefix() + || spec.carrier().map(FormatTemplate::hasNationalPrefix).orElse(false); + nationalPrefixSometimesOptional |= spec.nationalPrefixOptional(); + } + checkMetadata(attributes.getCarrierPrefixes().isEmpty() || carrierTemplatesExist, + "[%s] carrier prefixes exist but no formats have carrier templates: %s", + cc, formats.values()); + checkMetadata(!attributes.getNationalPrefixes().isEmpty() || !nationalPrefixExistsForFormatting, + "[%s] if no national prefix exists, it cannot be specified in any format template: %s", + cc, formats.values()); + checkMetadata(!attributes.getNationalPrefixes().isEmpty() || !nationalPrefixSometimesOptional, + "[%s] if no national prefix exists, it cannot be optional for formatting: %s", + cc, formats.values()); + return formats; + } + + // Checks that the ranges to which formats are assigned don't have lengths outside the possible + // lengths of that format (e.g. we don't have "12xx" assigned to the format "XXX-XXX"). 
+ private static void checkFormatLengths(DigitSequence cc, FormatSpec spec, RangeTree assigned) { + TreeSet unexpected = new TreeSet<>(assigned.getLengths()); + unexpected.removeAll(ContiguousSet.closed(spec.minLength(), spec.maxLength())); + if (!unexpected.isEmpty()) { + RangeTree bad = RangeTree.empty(); + for (int n : unexpected) { + bad = bad.union(assigned.intersect(RangeTree.from(RangeSpecification.any(n)))); + } + throw new IllegalArgumentException(String.format( + "[%s] format %s assigned to ranges of invalid length: %s", cc, spec, bad)); + } + } + + // Checks that the local lengths for ranges (as determined by area code length) is compatible + // with the assigned local format specifier. Note that it is allowed to have an area code length + // of zero and still be assigned a format with a local specifier (the specifier may be shared + // with other ranges which do have an area code length). + private static void checkLocalFormatLengths( + DigitSequence cc, RangeTable formatTable, FormatSpec spec, RangeTree assigned) { + if (!spec.local().isPresent()) { + return; + } + ImmutableSet lengths = + formatTable.subTable(assigned, AREA_CODE_LENGTH).getAssignedValues(AREA_CODE_LENGTH); + FormatTemplate local = spec.local().get(); + // Format specifiers either vary length in the area code or the local number, but not both. + int localLength = local.minLength(); + int localVariance = local.maxLength() - local.minLength(); + if (localVariance == 0) { + // If there's no length variation in the "local" part, it means the area code length can + // be variable. + ContiguousSet acls = + ContiguousSet.closed(spec.minLength() - localLength, spec.maxLength() - localLength); + checkMetadata(acls.containsAll(lengths), + "[%s] area code lengths '%s' not supported by format: %s", cc, acls, spec); + } else { + // If the length variation of the format is in the trailing "local" part, we expect the a + // unique area code length (only one "group" in the format can be variable). 
+ checkMetadata((spec.maxLength() - spec.minLength()) == localVariance, + "[%s] invalid local format (bad length) in format specifier %s", cc, spec); + int acl = spec.minLength() - localLength; + checkMetadata(lengths.size() == 1 && lengths.contains(acl), + "[%s] implied area code length(s) %s does not match expected length (%s) of format: %s", + cc, lengths, acl, spec); + } + } + + private static ImmutableList checkAltFormatConsistency( + ImmutableList altFormats, + Map formats, + RangeTable xmlTable) { + for (AltFormatSpec altFormat : altFormats) { + String parentId = altFormat.parentFormatId(); + FormatSpec parent = formats.get(parentId); + checkMetadata(parent != null, "unknown parent format ID in alternate format: %s", altFormat); + Set altLengths = getLengths(altFormat.template()); + checkMetadata(getLengths(parent.national()).containsAll(altLengths), + "alternate format lengths must be bounded by parent format lengths: %s", altFormat); + + // Only care about the parent ranges which have the same length(s) as the alt format. + RangeTree lengthMask = RangeTree.from(altLengths.stream().map(RangeSpecification::any)); + RangeTree ranges = xmlTable.getRanges(FORMAT, parentId).intersect(lengthMask); + RangeTree captured = PrefixTree.from(altFormat.prefix()).retainFrom(ranges); + checkMetadata(!captured.isEmpty(), + "alternate format must capture some of the parent format ranges: %s", altFormat); + int prefixLength = altFormat.prefix().length(); + if (prefixLength > 0) { + // A really ugly, but useful check to find if there's a better prefix. Specifically, it + // determines if the given prefix is "over-capturing" ranges (e.g. prefix is "1[2-8]" but + // only "1[3-6]" exists in the parent format's assigned ranges). Since this is an odd, non + // set-like operation, it's just done "manually" using bit masks. It's not a union of the + // paths, it's a "squashing" (since it results in the smallest single range specification). 
+ // + // Start with all the paths trimmed to the prefix length (e.g. "123", "145", "247"). All + // range specifications in the slice are the same length as the prefix we started with. + RangeTree slice = captured.slice(prefixLength); + // Now union the digit masks at each depth for all paths in the slice (in theory there + // could be a "squash" operation on RangeSpecification to do all this). + int[] masks = new int[prefixLength]; + slice.asRangeSpecifications().forEach(s -> { + for (int n = 0; n < prefixLength; n++) { + masks[n] |= s.getBitmask(n); + } + }); + // Now reconstruct the single "squashed" range specification (e.g. "[12][24][357]"). + RangeSpecification minSpec = RangeSpecification.empty(); + for (int n = 0; n < prefixLength; n++) { + minSpec = minSpec.extendByMask(masks[n]); + } + checkMetadata(minSpec.equals(altFormat.prefix()), + "alternate format prefix '%s' is too broad, it should be '%s' for: %s", + altFormat.prefix(), minSpec, altFormat); + } + } + return altFormats; + } + + private static Set getLengths(FormatTemplate t) { + return ContiguousSet.closed(t.minLength(), t.maxLength()); + } + + // Checks that example numbers are valid numbers in the ranges for their type. 
+ private static ImmutableTable checkExampleNumbers( + Set regions, + RangeTable table, + Table exampleNumbers) { + for (PhoneRegion r : regions) { + RangeTable regionTable = + table.subTable(table.getRanges(REGIONS.getColumn(r), TRUE), XmlRangesSchema.TYPE); + Map regionExamples = exampleNumbers.row(r); + ImmutableSet types = regionTable.getAssignedValues(XmlRangesSchema.TYPE); + checkMetadata(types.equals(regionExamples.keySet()), + "mismatched types for example numbers in region %s\nExpected: %s\nActual: %s", + r, types, regionExamples); + for (ValidNumberType t : types) { + DigitSequence exampleNumber = regionExamples.get(t); + RangeTree ranges = regionTable.getRanges(XmlRangesSchema.TYPE, t); + // Special case, since we permit example numbers for fixed line/mobile to be valid for the + // combined range as well. + // + // This logic smells, since it reveals information about the XML structure (in which fixed + // line and mobile ranges can overlap). However if we insist that a fixed line examples are + // in the "fixed line only" range, we end up with problems if (mobile == fixed line), since + // there is no "fixed line only" range (but there is an example number in the XML). + if (t == ValidNumberType.MOBILE || t == ValidNumberType.FIXED_LINE) { + ranges = ranges.union( + regionTable.getRanges(XmlRangesSchema.TYPE, ValidNumberType.FIXED_LINE_OR_MOBILE)); + } + checkMetadata(ranges.contains(exampleNumber), + "invalid example number '%s' of type %s in region %s", exampleNumber, t, r); + } + } + return ImmutableTable.copyOf(exampleNumbers); + } + + public abstract Attributes getAttributes(); + + // TODO: Inline the wrapper methods below. + + /** Returns the unique calling code of this numbering scheme. */ + public DigitSequence getCallingCode() { + return getAttributes().getCallingCode(); + } + + /** + * Returns the regions represented by this numbering scheme. 
The main region is always present + * and listed first, and remaining regions are listed in "natural" order. + */ + public ImmutableSet getRegions() { + return getAttributes().getRegions(); + } + + /** + * Returns a range table containing per-range attributes according to + * {@link XmlRangesSchema#COLUMNS}. + */ + public abstract RangeTable getTable(); + + /** + * Returns a RangeTable restricted to the given region, which conforms to the + * {@link XmlRangesSchema} schema, with the exception that no region columns exist. + */ + public final RangeTable getTableFor(PhoneRegion region) { + checkArgument(getRegions().contains(region), + "invalid region '%s' for calling code '%s'", region, getCallingCode()); + return getTable() + .subTable(getTable().getRanges(REGIONS.getColumn(region), TRUE), PER_REGION_COLUMNS); + } + + public abstract ImmutableSortedMap getShortcodes(); + + /** Returns the RangeTable for the shortcodes of the given region. */ + public final Optional getShortcodesFor(PhoneRegion region) { + checkArgument(getRegions().contains(region), + "invalid region '%s' for calling code '%s'", region, getCallingCode()); + return Optional.ofNullable(getShortcodes().get(region)); + } + + /** Returns the map of format ID to format specifier. */ + public abstract ImmutableMap getFormats(); + + /** Returns a list of alternate formats which are also expected for this numbering scheme. */ + public abstract ImmutableList getAlternateFormats(); + + /** Returns a table of example numbers for each region code and number type. */ + public abstract ImmutableTable getExampleNumbers(); + + /** + * Returns all comments known about by this numbering scheme. Internal method, callers should + * always use {@link #getComments(Anchor)} instead. + */ + abstract ImmutableList getAllComments(); + + /** Returns comments with a specified anchor for this numbering scheme. 
*/ + public ImmutableList getComments(Anchor anchor) { + checkArgument(getAttributes().getRegions().contains(anchor.region()), + "invalid region: %s", anchor.region()); + return getAllComments().stream() + .filter(c -> c.getAnchor().equals(anchor)) + .collect(toImmutableList()); + } + + /** + * An encapsulation of a comment to be associated with an element in the XML. Rather than have + * many APIs for setting/getting comments on a {@link NumberingScheme}, the approach taken here + * is to let comments describe for themselves where they go but keep them in one big bucket. + *

+ * This simplifies a lot of the intermediate APIs in the builders, but is less efficient (since + * finding comments is now a linear search). If this is ever an issue, they should be mapped by + * key, using a {@code ListMultimap} (since comments are also ordered by their + * number). + */ + @AutoValue + public abstract static class Comment { + private static final Joiner JOIN_LINES = Joiner.on('\n'); + private static final Splitter SPLIT_LINES = Splitter.on('\n'); + + /** An anchor defining which element, in which territory, a comment should be attached to. */ + @AutoValue + public abstract static class Anchor implements Comparable { + // Special anchor for comments that are not stored in the comment table, but are attached to + // data directly (e.g. formats). + private static final Anchor ANONYMOUS = of(PhoneRegion.getUnknown(), ""); + + private static final Comparator ORDERING = + comparing(Anchor::region).thenComparing(Anchor::label); + + /** Creates a comment anchor from a region and xml type. */ + static Anchor of(PhoneRegion region, String label) { + // TODO: Add check for valid label. + return anchor(region, label); + } + + /** The region of the territory this comment should be attached to. */ + public abstract PhoneRegion region(); + + /** + * The type in the territory this comment should be attached to. If missing, attach this + * comment to the main comment block for the territory. + */ + public abstract String label(); + + @Override + public int compareTo(Anchor that) { + return ORDERING.compare(this, that); + } + } + + // Private since we want to funnel people through type safe factory methods. + private static Anchor anchor(PhoneRegion region, String label) { + return new AutoValue_NumberingScheme_Comment_Anchor(region, label); + } + + /** Returns a key identifying a comment for a region. 
*/ + public static Anchor anchor(PhoneRegion region) { + return anchor(region, "XML"); + } + + /** Returns a key identifying a comment for the validation range of a given type in a region. */ + public static Anchor anchor(PhoneRegion region, XmlNumberType xmlType) { + return anchor(region, xmlType.toString()); + } + + /** + * Returns a key identifying a comment for the validation range of a given shortcode type in + * a region. + */ + public static Anchor shortcodeAnchor(PhoneRegion region) { + return anchor(region, "SC"); + } + + /** + * Returns a key identifying a comment for the validation range of a given shortcode type in + * a region. + */ + public static Anchor shortcodeAnchor(PhoneRegion region, XmlShortcodeType xmlType) { + return anchor(region, xmlType.toString()); + } + + /** Creates a comment the applies to data identified by the specified key. */ + public static Comment create(Anchor anchor, List lines) { + return new AutoValue_NumberingScheme_Comment(anchor, ImmutableList.copyOf(lines)); + } + + /** Creates a comment the applies to data identified by the specified key. */ + public static Comment createAnonymous(List lines) { + return new AutoValue_NumberingScheme_Comment(Anchor.ANONYMOUS, ImmutableList.copyOf(lines)); + } + + public static Comment fromText(Anchor anchor, String text) { + return create(anchor, SPLIT_LINES.splitToList(text)); + } + + public static Comment fromText(String text) { + return createAnonymous(SPLIT_LINES.splitToList(text)); + } + + /** + * Returns the key which defines what this comment relates to (and thus where it should appear + * in the XML file). + */ + public abstract Anchor getAnchor(); + + /** The lines of a single mulit-line comment. */ + // TODO: Switch to a single string (with newlines) which is what's done elsewhere. + public abstract ImmutableList getLines(); + + public String toText() { + return JOIN_LINES.join(getLines()); + } + + // Visible for AutoValue. + Comment() {} + } + + // Visible for AutoValue. 
+ NumberingScheme() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingSchemes.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingSchemes.java new file mode 100644 index 000000000..4c4086664 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/NumberingSchemes.java @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static java.util.function.Function.identity; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.MetadataKey; +import java.util.List; + +/** + * Collection of numbering schemes, mapped primarily by calling code, but available via other + * mappings (e.g. metadata key) for convenience. + */ +// TODO: Delete this (it's hardly used and very little more than a simple collection). 
+@AutoValue +public abstract class NumberingSchemes { + /** + * Aggregates a list of numbering schemes into a single collection which mirrors the structure and + * mapping of the libphonenumber XML metadata file. + */ + public static NumberingSchemes from(List schemes) { + ImmutableMap map = + schemes.stream().collect(toImmutableMap(NumberingScheme::getCallingCode, identity())); + ImmutableSet allKeys = map.values().stream() + .flatMap(s -> s.getRegions().stream().map(r -> MetadataKey.create(r, s.getCallingCode()))) + .collect(toImmutableSet()); + return new AutoValue_NumberingSchemes(map, allKeys); + } + + /** Returns a mapping of top-level numbering schemes by calling code. */ + // TODO: Rename to getSchemeMap() since it's confusing, or add a direct getter. + public abstract ImmutableMap getSchemes(); + + /** Returns the set of all calling codes for top-level schemes in this collection. */ + public ImmutableSet getCallingCodes() { + return getSchemes().keySet(); + } + + /** Returns the set of all metadata keys for regional schemes in this collection. */ + public abstract ImmutableSet getKeys(); + + // Visible for AutoValue. + NumberingSchemes() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/OperatorsTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/OperatorsTableSchema.java new file mode 100644 index 000000000..3d8a66a3c --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/OperatorsTableSchema.java @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import com.google.i18n.phonenumbers.metadata.i18n.SimpleLanguageTag; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.ColumnGroup; +import com.google.i18n.phonenumbers.metadata.table.CsvKeyMarshaller; +import com.google.i18n.phonenumbers.metadata.table.CsvSchema; +import com.google.i18n.phonenumbers.metadata.table.Schema; + +/** + * The schema of the "Operators" table with rows keyed by operator ID and columns: + *

    + *
  1. {@link #SELECTION_CODES}: Operator selection codes for national dialling. + *
  2. {@link #IDD_PREFIXES}: International direct dialling codes. + *
  3. {@link #NAMES}: A group of columns containing the name of the operator, potentially in + * multiple languages. Note that English translations for all operators need not be present. + *
+ * + *

Row keys are serialized via the marshaller and produce the leading column: + *

    + *
  1. {@code Id}: The operator ID. + *
+ * + *

The default IDD prefix should not be in this table, but is instead stored in the top-level + * {@link MetadataTableSchema#IDD_PREFIX} column. + * + *

Note that there is a special case in which we need to store a selection code or IDD code, but + * it does not belong to an operator with an assigned range (e.g. it's a universally available code). + * In these situations, you should ensure that the operator ID starts with "__" (double underscore) + * to prevent consistency checks from complaining about unassigned operators. You can also omit a + * name for the row, but should probably add a comment. + */ +public final class OperatorsTableSchema { + /** + * A comma separated list of "selection codes" (as range specifications) which are added to + * national numbers (not always as a prefix) to select an operator for national dialling. + * This will often contain many of the same values as IDD_PREFIXES but need not be identical. + * + *

Note that while a single operator may have more than one code associated with it, the same + * code cannot appear in more than one row in this table. + */ + public static final Column SELECTION_CODES = Column.ofString("Domestic Selection Codes"); + + /** + * A comma separated list of "International Direct Dialing" codes (as range specifications) which + * are prefixes for international dialling. This will often contain many of the same prefixes as + * SELECTION_CODES but need not be identical. + * + *

Note that while a single operator may have more than one code associated with it, the same + * code cannot appear in more than one row in this table. + */ + public static final Column IDD_PREFIXES = Column.ofString("International Dialling Codes"); + + /** The "Name:XXX" column group in the operator table. */ + public static final ColumnGroup NAMES = + ColumnGroup.byLanguage(Column.ofString("Name")); + + public static final Column COMMENT = RangesTableSchema.COMMENT; + + private static final CsvKeyMarshaller MARSHALLER = CsvKeyMarshaller.ofSortedString("Id"); + + private static final Schema COLUMNS = Schema.builder() + .add(SELECTION_CODES) + .add(IDD_PREFIXES) + .add(NAMES) + .add(COMMENT) + .build(); + + /** Schema instance defining the operators CSV table. */ + public static final CsvSchema SCHEMA = CsvSchema.of(MARSHALLER, COLUMNS); + + private OperatorsTableSchema() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java new file mode 100644 index 000000000..e15fe834e --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java @@ -0,0 +1,396 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.DiscreteDomain.integers; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static java.util.Comparator.comparing; +import static java.util.function.Function.identity; +import static java.util.stream.Collectors.joining; + +import com.google.common.base.Splitter; +import com.google.common.collect.ContiguousSet; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableRangeSet; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedSet; +import com.google.common.collect.Range; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.i18n.SimpleLanguageTag; +import com.google.i18n.phonenumbers.metadata.model.MetadataTableSchema.Regions; +import com.google.i18n.phonenumbers.metadata.proto.Enums.Provenance; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.table.Change; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.ColumnGroup; +import com.google.i18n.phonenumbers.metadata.table.CsvKeyMarshaller; +import com.google.i18n.phonenumbers.metadata.table.CsvSchema; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import com.google.i18n.phonenumbers.metadata.table.MultiValue; +import com.google.i18n.phonenumbers.metadata.table.RangeKey; +import com.google.i18n.phonenumbers.metadata.table.RangeTable; +import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import com.google.i18n.phonenumbers.metadata.table.Schema; +import java.time.ZoneId; +import 
java.util.List; +import java.util.NavigableSet; +import java.util.Optional; +import java.util.TreeSet; +import java.util.stream.Stream; + +/** + * The schema of the standard "Ranges" table with rows keyed by {@link RangeKey} and columns: + *

    + *
  1. {@link #TYPE}: The semantic type of numbers in a range (note that this is not + * the same as XmlNumberType or ValidNumberType). All ranges should be assigned a type. + *
  2. {@link #TARIFF}: The expected cost of numbers in a range (combining TYPE and TARIFF + * can yield the internal ValidNumberType). All ranges should be assigned a tariff. + *
  3. {@link #AREA_CODE_LENGTH}: The length of an optional prefix which may be removed from + * numbers in a range for local dialling. Local only lengths are derived using this column. + *
  4. {@link #NATIONAL_ONLY}: True if numbers in a range cannot be dialled from outside its + * region. The "noInternationalDialling" ranges are derived from this column. + *
  5. {@link #SMS}: True if numbers in a range are expected to support SMS. + *
  6. {@link #OPERATOR}: The expected operator (carrier) ID for a range (or empty if no carrier + * is known). + *
  7. {@link #FORMAT}: The expected format ID for a range (or empty if no formatting should be + * applied). + *
  8. {@link #TIMEZONE}: The timezone names for a range (or empty to imply the default + * timezones). Multiple timezones can be specified if separated by {@code '&'}. + *
  9. {@link #REGIONS}: A group of boolean columns in the form "Region:XX", where ranges are + * set {@code true} if that range is valid within the region {@code XX}. + *
  10. {@link #GEOCODES}: A group of String columns in the form "Geocode:XXX" containing the + * geocode string for a range, where {@code XXX} is the language code of the string. + *
  11. {@link #PROVENANCE}: Indicates the most important reason for a range to be valid. + *
  12. {@link #COMMENT}: Free text field usually containing evidence related to the provenance. + *
+ * + *

Row keys are serialized via the marshaller and produce leading columns: + *

    + *
  1. {@code Prefix}: The prefix (RangeSpecification) for the ranges in a row (e.g. "12[3-6]"). + *
  2. {@code Length}: A set of lengths for the ranges in a row (e.g. "9", "8,9" or "5,7-9"). + *
+ */ +public final class RangesTableSchema { + /** + * External number type enum. This is technically much better than ValidNumberType since it + * splits type and cost properly. Unfortunately the internal logic of the phonenumber library + * doesn't really cope with this, which is why we convert to {@code XmlRangesSchema} before + * creating legacy data structures. + * + *

This enum can be modified as new types are requested from data providers, providing the + * type mapping to ValidNumberType is updated appropriately. Note that until it's clear that + * mapping types such as {@link #M2M} to {@link ValidNumberType#UNKNOWN} will work okay, we + * should be very careful about using the additional types. Additional types need to be removed + * before the generated table can be turned into a {@link NumberingScheme}. + */ + public enum ExtType { + /** Default value not permitted in real data. */ + UNKNOWN, + /** Maps to {@link ValidNumberType#FIXED_LINE}. */ + FIXED_LINE, + /** Maps to {@link ValidNumberType#MOBILE}. */ + MOBILE, + /** Maps to {@link ValidNumberType#FIXED_LINE_OR_MOBILE}. */ + FIXED_LINE_OR_MOBILE, + /** Maps to {@link ValidNumberType#VOIP}. */ + VOIP, + /** Maps to {@link ValidNumberType#PAGER}. */ + PAGER, + /** Maps to {@link ValidNumberType#PERSONAL_NUMBER}. */ + PERSONAL_NUMBER, + /** Maps to {@link ValidNumberType#UAN}. */ + UAN, + /** Maps to {@link ValidNumberType#VOICEMAIL}. */ + VOICEMAIL, + /** Machine-to-machine numbers (additional type for future support). */ + M2M, + /** ISP dial-up numbers (additional type for future support). */ + ISP; + + private static final ImmutableMap TYPE_MAP = + Stream.of( + ExtType.FIXED_LINE, + ExtType.MOBILE, + ExtType.FIXED_LINE_OR_MOBILE, + ExtType.PAGER, + ExtType.PERSONAL_NUMBER, + ExtType.UAN, + ExtType.VOICEMAIL, + ExtType.VOIP) + .collect(toImmutableMap(identity(), v -> ValidNumberType.valueOf(v.name()))); + + /** Returns the same-named {@code ValidNumberType}, or empty for {@code UNKNOWN}, {@code M2M} and {@code ISP}. */ public Optional toValidNumberType() { + return Optional.ofNullable(TYPE_MAP.get(this)); + } + } + + /** + * External tariff enum. By splitting tariff information out from the "line type", we can + * represent a much wider (and more realistic) set of combinations for number ranges. When + * combined with {@link ExtType}, this maps back to {@code ValidNumberType}. + */ + public enum ExtTariff { + /** Does not affect ValidNumberType mapping.
*/ + STANDARD_RATE, + /** Maps to {@link ValidNumberType#TOLL_FREE}. */ + TOLL_FREE, + /** Maps to {@link ValidNumberType#SHARED_COST}. */ + SHARED_COST, + /** Maps to {@link ValidNumberType#PREMIUM_RATE}. */ + PREMIUM_RATE; + + private static final ImmutableMap TARIFF_MAP = + Stream.of(ExtTariff.TOLL_FREE, ExtTariff.SHARED_COST, ExtTariff.PREMIUM_RATE) + .collect(toImmutableMap(identity(), v -> ValidNumberType.valueOf(v.name()))); + + /** Returns the same-named {@code ValidNumberType}, or empty for {@link #STANDARD_RATE}. */ public Optional toValidNumberType() { + return Optional.ofNullable(TARIFF_MAP.get(this)); + } + } + + /** The value in the "TIMEZONE" column, which is effectively a list of timezone strings. */ + public static final class Timezones extends MultiValue { + public static Column column(String name) { + return Column.create(Timezones.class, name, new Timezones(""), Timezones::new); + } + + public Timezones(Iterable ids) { + super(ids, '&', comparing(ZoneId::getId), true); + } + + public Timezones(String s) { + super(s, ZoneId::of, '&', comparing(ZoneId::getId), true); + } + } + + public static final Column TYPE = Column.of(ExtType.class, "Type", ExtType.UNKNOWN); + + public static final Column TARIFF = + Column.of(ExtTariff.class, "Tariff", ExtTariff.STANDARD_RATE); + + /** + * The "Area Code Length" column in the range table, denoting the length of a prefix which can + * be removed from all numbers in a range to obtain locally diallable numbers. If an + * "area code" is not optional for dialling, then no value should be set here. + */ + public static final Column AREA_CODE_LENGTH = + Column.ofUnsignedInteger("Area Code Length"); + + /** Denotes ranges which cannot be dialled internationally. */ + public static final Column NATIONAL_ONLY = Column.ofBoolean("National Only"); + + /** Denotes ranges which can reasonably be expected to receive SMS. */ + public static final Column SMS = Column.ofBoolean("Sms"); + + /** The ID of the primary/original operator assigned to a range.
*/ + public static final Column OPERATOR = Column.ofString("Operator"); + + /** The ID of the format assigned to a range. */ + public static final Column FORMAT = Column.ofString("Format"); + + /** An '&'-separated list of timezone IDs associated with this range. */ + public static final Column TIMEZONE = Timezones.column("Timezone"); + + /** The "Region:XX" column group in the range table. */ + public static final ColumnGroup REGIONS = + ColumnGroup.byRegion(Column.ofBoolean("Region")); + + /** The "Regions" column in the CSV table. */ + public static final Column CSV_REGIONS = Regions.column("Regions"); + + /** The "Geocode:XXX" column group in the range table. */ + public static final ColumnGroup GEOCODES = + ColumnGroup.byLanguage(Column.ofString("Geocode")); + + /** The provenance column indicating why a range is considered valid. */ + public static final Column PROVENANCE = + Column.of(Provenance.class, "Provenance", Provenance.UNKNOWN); + + /** An arbitrary text comment, usually (at least) supplying information about the provenance. */ + public static final Column COMMENT = Column.ofString("Comment"); + + /** Marshaller for constructing CsvTable from RangeTable. */ + private static final CsvKeyMarshaller MARSHALLER = new CsvKeyMarshaller<>( + RangesTableSchema::write, + RangesTableSchema::read, + Optional.of(RangeKey.ORDERING), + "Prefix", + "Length"); + + /** The non-key columns of a range table. */ + public static final Schema TABLE_COLUMNS = + Schema.builder() + .add(TYPE) + .add(TARIFF) + .add(AREA_CODE_LENGTH) + .add(NATIONAL_ONLY) + .add(SMS) + .add(OPERATOR) + .add(FORMAT) + .add(TIMEZONE) + .add(REGIONS) + .add(GEOCODES) + .add(PROVENANCE) + .add(COMMENT) + .build(); + + /** + * The columns for the serialized CSV table. Note that the "REGIONS" column group is replaced + * by the CSV regions multi-value. This allows region codes to be serialized in a single column + * (which is far nicer when looking at data in a spreadsheet).
In the range table, this is + * normalized into the boolean column group (because that's far nicer to work with). + */ + private static final Schema CSV_COLUMNS = + Schema.builder() + .add(TYPE) + .add(TARIFF) + .add(AREA_CODE_LENGTH) + .add(NATIONAL_ONLY) + .add(SMS) + .add(OPERATOR) + .add(FORMAT) + .add(TIMEZONE) + .add(CSV_REGIONS) + .add(GEOCODES) + .add(PROVENANCE) + .add(COMMENT) + .build(); + + /** Schema instance defining the ranges CSV table. */ + public static final CsvSchema SCHEMA = CsvSchema.of(MARSHALLER, CSV_COLUMNS); + + /** + * Converts a {@link RangeTable} to a {@link CsvTable}, using {@link RangeKey}s as row keys and + * preserving the original table columns. The {@link CsvSchema} of the returned table is not + * guaranteed to be the {@link #SCHEMA} instance if the given table had different columns. + */ + @SuppressWarnings("unchecked") + public static CsvTable toCsv(RangeTable table) { + CsvTable.Builder csv = CsvTable.builder(SCHEMA); + ImmutableSet> regionColumns = + REGIONS.extractGroupColumns(table.getColumns()).values(); + TreeSet regions = new TreeSet<>(); + for (Change c : table.toChanges()) { + for (RangeKey k : RangeKey.decompose(c.getRanges())) { + regions.clear(); + c.getAssignments().forEach(a -> { + // We special case the regions column, converting a group of boolean columns into a + // multi-value of region codes. If the column is in the group, it must hold Booleans. + if (regionColumns.contains(a.column())) { + if (a.value().map(((Column) a.column())::cast).orElse(Boolean.FALSE)) { + regions.add(REGIONS.getKey(a.column())); + } + } else { + csv.put(k, a); + } + }); + // We can do this out-of-sequence because the table will order its columns. + if (!regions.isEmpty()) { + csv.put(k, CSV_REGIONS, Regions.of(regions)); + } + } + } + return csv.build(); + } + + /** + * Converts a {@link RangeKey} based {@link CsvTable} to a {@link RangeTable}, preserving the + * original table columns.
The returned table is built from the {@link #TABLE_COLUMNS} schema, with the + * CSV regions multi-value expanded into {@link #REGIONS} boolean column assignments. + */ + public static RangeTable toRangeTable(CsvTable csv) { + RangeTable.Builder out = RangeTable.builder(TABLE_COLUMNS); + for (RangeKey k : csv.getKeys()) { + Change.Builder change = Change.builder(k.asRangeTree()); + csv.getRow(k).forEach((c, v) -> { + // We special case the regions column, converting a comma separated list of region codes + // into a series of boolean column assignments. + if (c.equals(CSV_REGIONS)) { + CSV_REGIONS.cast(v).getValues().forEach(r -> change.assign(REGIONS.getColumn(r), true)); + } else { + change.assign(c, v); + } + }); + out.apply(change.build(), OverwriteMode.NEVER); + } + return out.build(); + } + + // Shared by ShortcodesTableSchema + public static Stream write(RangeKey key) { + return Stream.of(key.getPrefix().toString(), formatLength(key.getLengths())); + } + + // Shared by ShortcodesTableSchema + public static RangeKey read(List parts) { + return RangeKey.create(RangeSpecification.parse(parts.get(0)), parseLengths(parts.get(1))); + } + + /** Formats a non-empty length set as comma separated runs (see {@code formatRange}). */ private static String formatLength(ImmutableSortedSet lengthSet) { + checkArgument(!lengthSet.isEmpty()); + ImmutableRangeSet r = + ImmutableRangeSet.unionOf( + lengthSet.stream() + .map(n -> Range.singleton(n).canonical(integers())) + .collect(toImmutableList())); + return r.asRanges().stream().map(RangesTableSchema::formatRange).collect(joining(",")); + } + + /** Formats one contiguous run: a single value, "lo,hi" for two values, otherwise "lo-hi". */ private static String formatRange(Range r) { + ContiguousSet s = ContiguousSet.create(r, integers()); + switch (s.size()) { + case 1: + return String.valueOf(s.first()); + case 2: + return s.first() + "," + s.last(); + default: + return s.first() + "-" + s.last(); + } + } + + private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults(); + private static final Splitter RANGE_SPLITTER = Splitter.on('-').trimResults().limit(2); + + /** Parses a length set such as "5,7-9"; entries must be strictly increasing and ranges need lo < hi. */ private static NavigableSet parseLengths(String
s) { + NavigableSet lengths = new TreeSet<>(); + for (String lengthOrRange : COMMA_SPLITTER.split(s)) { + if (lengthOrRange.contains("-")) { + List lohi = RANGE_SPLITTER.splitToList(lengthOrRange); + int lo = parseInt(lohi.get(0)); + int hi = parseInt(lohi.get(1)); + checkArgument(lo < hi, "Invalid range: %s-%s", lo, hi); + checkArgument(lengths.isEmpty() || lo > lengths.last(), "Overlapping ranges: %s", s); + lengths.addAll(ContiguousSet.closed(lo, hi)); + } else { + int length = parseInt(lengthOrRange); + checkArgument(lengths.isEmpty() || length > lengths.last(), "Overlapping ranges: %s", s); + lengths.add(length); + } + } + return lengths; + } + + private static int parseInt(String s) { + return Integer.parseUnsignedInt(s, 10); + } + + private RangesTableSchema() {} +} + + diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ShortcodesTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ShortcodesTableSchema.java new file mode 100644 index 000000000..7df5fde98 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ShortcodesTableSchema.java @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableBiMap.toImmutableBiMap; +import static com.google.i18n.phonenumbers.metadata.model.ShortcodesTableSchema.ShortcodeType.EMERGENCY; +import static com.google.i18n.phonenumbers.metadata.model.ShortcodesTableSchema.ShortcodeType.EXPANDED_EMERGENCY; +import static java.util.function.Function.identity; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.ImmutableBiMap; +import com.google.common.collect.ImmutableSortedMap; +import com.google.common.collect.Maps; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtTariff; +import com.google.i18n.phonenumbers.metadata.proto.Enums.Provenance; +import com.google.i18n.phonenumbers.metadata.proto.Types.XmlShortcodeType; +import com.google.i18n.phonenumbers.metadata.table.Change; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.CsvKeyMarshaller; +import com.google.i18n.phonenumbers.metadata.table.CsvSchema; +import com.google.i18n.phonenumbers.metadata.table.CsvTable; +import com.google.i18n.phonenumbers.metadata.table.RangeKey; +import com.google.i18n.phonenumbers.metadata.table.RangeTable; +import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import com.google.i18n.phonenumbers.metadata.table.Schema; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Stream; + +/** + * The schema of the standard "Shortcodes" table with rows keyed by {@link RangeKey} and columns: + *

    + *
  1. {@link #TYPE}: The semantic type of numbers in a range. All ranges should be assigned a + * type. + *
  2. {@link #TARIFF}: The expected cost of numbers in a range. All ranges should be assigned a + * tariff. + *
  3. {@link #SMS}: True if numbers in a range are expected to support SMS. + *
  4. {@link #SUBREGION}: True if numbers in a range are expected to be only diallable from a + * geographic subregion (rather than the whole region). + *
  5. {@link #PROVENANCE}: Indicates the most important reason for a range to be valid. + *
  6. {@link #COMMENT}: Free text field usually containing evidence related to the provenance. + *
+ * + *

Row keys are serialized via the marshaller and produce leading columns: + *

    + *
  1. {@code Region}: The region code for which this range applies. + *
  2. {@code Prefix}: The prefix (RangeSpecification) for the ranges in a row (e.g. "12[3-6]"). + *
  3. {@code Length}: A set of lengths for the ranges in a row (e.g. "9", "8,9" or "5,7-9"). + *
+ * + *

Note that the region must be part of the key, since some shortcodes have different types + * between different regions. + */ +public final class ShortcodesTableSchema { + /** + * The row key of the shortcode table, specifying region and range key. This permits all + * shortcodes to be stored in a single table (which is very helpful in NANPA, where there are + * many regions, most with only a tiny amount of shortcode information). + */ + @AutoValue + public abstract static class ShortcodeKey { + private static final Comparator ORDERING = Comparator + .comparing(ShortcodeKey::getRegion) + .thenComparing(ShortcodeKey::getRangeKey, RangeKey.ORDERING); + + private static final CsvKeyMarshaller MARSHALLER = new CsvKeyMarshaller<>( + ShortcodeKey::write, + ShortcodeKey::read, + Optional.of(ShortcodeKey.ORDERING), + "Region", + "Prefix", + "Length"); + + private static Stream write(ShortcodeKey key) { + return Stream.concat( + Stream.of(key.getRegion().toString()), + RangesTableSchema.write(key.getRangeKey())); + } + + private static ShortcodeKey read(List parts) { + return ShortcodeKey.create( + PhoneRegion.of(parts.get(0)), + RangesTableSchema.read(parts.subList(1, parts.size()))); + } + + /** Creates a key for the given region and range key; the unknown region is rejected. */ public static ShortcodeKey create(PhoneRegion region, RangeKey rangeKey) { + checkArgument(!region.equals(PhoneRegion.getUnknown()), "region must be valid"); + return new AutoValue_ShortcodesTableSchema_ShortcodeKey(region, rangeKey); + } + + public abstract PhoneRegion getRegion(); + public abstract RangeKey getRangeKey(); + } + + /** Shortcode type enum. */ + public enum ShortcodeType { + /** Default value not permitted in real data. */ + UNKNOWN, + + /** + * General purpose non-governmental services including commercial or charity services. This is + * the default type for shortcodes if no other category is more applicable. + */ + COMMERCIAL, + /** + * Non-emergency, government run public services (e.g. directory enquiries).
+ */ + PUBLIC_SERVICE, + /** + * Public services which provide important non-emergency information for health or safety + * (e.g. https://www.police.uk/contact/101/). + */ + EXPANDED_EMERGENCY, + /** + * Primary public emergency numbers (i.e. police, fire or ambulance) which are available to + * everyone. Numbers in this category must be toll-free and not carrier specific. Mobile phone + * manufacturers will often allow these numbers to be dialled from a locked device, so it's + * important that they work for everyone. + */ + EMERGENCY; + } + + private static final ImmutableBiMap XML_TARIFF_MAP = + Stream.of(ExtTariff.TOLL_FREE, ExtTariff.STANDARD_RATE, ExtTariff.PREMIUM_RATE) + .collect(toImmutableBiMap(identity(), v -> XmlShortcodeType.valueOf("SC_" + v.name()))); + + private static final ImmutableBiMap XML_TYPE_MAP = + Stream.of(EXPANDED_EMERGENCY, EMERGENCY) + .collect(toImmutableBiMap(identity(), v -> XmlShortcodeType.valueOf("SC_" + v.name()))); + + /** Returns the XML type for a schema shortcode type, or empty if the type has no XML mapping. */ + public static Optional getXmlType(ShortcodeType type) { + return Optional.ofNullable(XML_TYPE_MAP.get(type)); + } + + /** Returns the XML type for a schema tariff, failing for tariffs shortcodes do not support.
*/ + public static XmlShortcodeType getXmlType(ExtTariff tariff) { + XmlShortcodeType xmlType = XML_TARIFF_MAP.get(tariff); + checkArgument(xmlType != null, "shortcodes do not support tariff: %s", tariff); + return xmlType; + } + + public static final Column TYPE = + Column.of(ShortcodeType.class, "Type", ShortcodeType.UNKNOWN); + + public static final Column TARIFF = RangesTableSchema.TARIFF; + public static final Column SMS = RangesTableSchema.SMS; + public static final Column CARRIER_SPECIFIC = Column.ofBoolean("Carrier Specific"); + public static final Column SUBREGION = Column.ofBoolean("Subregion"); + public static final Column FORMAT = RangesTableSchema.FORMAT; + public static final Column PROVENANCE = RangesTableSchema.PROVENANCE; + public static final Column COMMENT = RangesTableSchema.COMMENT; + + private static final Schema COLUMNS = + Schema.builder() + .add(TYPE) + .add(TARIFF) + .add(SMS) + .add(CARRIER_SPECIFIC) + .add(SUBREGION) + .add(FORMAT) + .add(PROVENANCE) + .add(COMMENT) + .build(); + + /** Schema instance defining the "Shortcodes" CSV table. */ + public static final CsvSchema SCHEMA = + CsvSchema.of(ShortcodeKey.MARSHALLER, COLUMNS); + + /** Merges per-region shortcode range tables into a single CSV table keyed by region and range key. + */ + public static CsvTable toCsv(Map tables) { + CsvTable.Builder csv = CsvTable.builder(SCHEMA); + tables.forEach((r, t) -> { + for (Change c : t.toChanges()) { + for (RangeKey k : RangeKey.decompose(c.getRanges())) { + csv.put(ShortcodeKey.create(r, k), c.getAssignments()); + } + } + }); + return csv.build(); + } + + /** + * Maps a single shortcode CSV table into a map of region specific range tables. Note that the + * ranges in these tables do not need to be consistent across regions (e.g. "toll free" in one + * might be "premium rate" in the other). + */ + public static ImmutableSortedMap toShortcodeTables( + CsvTable csv) { + // Retain order of regions in the CSV table while building (the sorted map returned below re-sorts by key).
+ Map builderMap = new LinkedHashMap<>(); + for (ShortcodeKey k : csv.getKeys()) { + // Basically the same as for RangesTableSchema, except that we deal with region codes in the + // key. + Change.Builder change = Change.builder(k.getRangeKey().asRangeTree()); + csv.getRow(k).forEach(change::assign); + PhoneRegion region = k.getRegion(); + RangeTable.Builder table = builderMap.get(region); + if (table == null) { + table = RangeTable.builder(COLUMNS); + builderMap.put(region, table); + } + table.apply(change.build(), OverwriteMode.NEVER); + } + return ImmutableSortedMap.copyOf(Maps.transformValues(builderMap, RangeTable.Builder::build)); + } + + private ShortcodesTableSchema() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/XmlRangesSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/XmlRangesSchema.java new file mode 100644 index 000000000..6a6c1b9fe --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/XmlRangesSchema.java @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.model.MetadataException.checkMetadata; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.UNKNOWN; +import static com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode.NEVER; + +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtTariff; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtType; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.ColumnGroup; +import com.google.i18n.phonenumbers.metadata.table.RangeTable; +import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import com.google.i18n.phonenumbers.metadata.table.Schema; +import java.util.Optional; + +/** + * A schema describing the columns which are required for creating a {@link NumberingScheme}. + *

    + *
  1. {@link #TYPE}: The semantic type of numbers in a range (note that this is not the same as + * an {@code XmlNumberType}). All ranges should be assigned a validation type. + *
  2. {@link #AREA_CODE_LENGTH}: The length of an optional prefix which may be removed from + * numbers in a range for local dialling. Local only lengths are derived using this column. + *
  3. {@link #NATIONAL_ONLY}: True if numbers in a range cannot be dialled from outside its + * region. The "noInternationalDialling" ranges are derived from this column. + *
  4. {@link #REGIONS}: A group of boolean columns in the form "Region:XX", where ranges are + * set {@code true} if that range is valid within the region {@code XX}. + *
+ * + *

This schema is sufficient for generating {@link NumberingScheme} instances, but isn't what we + * expect to import data from (which is why it doesn't have a {@code CsvKeyMarshaller} associated + * with it). That's covered by the {@code RangesTableSchema}. + */ +public final class XmlRangesSchema { + /** + * The internal "Type" column in the range table. This is present in the schema and used in a lot + * of places, but it is not what the type/tariff data is imported as (it's derived from other + * columns). + */ + public static final Column TYPE = + Column.of(ValidNumberType.class, "Type", UNKNOWN); + + /** + * The "Area Code Length" column in the range table, denoting the length of a prefix which can + * be removed from all numbers in a range to obtain locally diallable numbers. If an + * "area code" is not optional for dialling, then no value should be set here. + */ + public static final Column AREA_CODE_LENGTH = RangesTableSchema.AREA_CODE_LENGTH; + + /** Denotes ranges which cannot be dialled internationally. */ + public static final Column NATIONAL_ONLY = RangesTableSchema.NATIONAL_ONLY; + + /** Format specifier IDs. */ + public static final Column FORMAT = RangesTableSchema.FORMAT; + + /** The "Region:XX" column group in the range table. */ + public static final ColumnGroup REGIONS = RangesTableSchema.REGIONS; + + /** The standard columns required for generating a {@link NumberingScheme}. */ + public static final Schema COLUMNS = + Schema.builder() + .add(TYPE) + .add(AREA_CODE_LENGTH) + .add(NATIONAL_ONLY) + .add(FORMAT) + .add(REGIONS) + .build(); + + /** Columns for per-region tables (just {@link #COLUMNS} without {@link #REGIONS}).
*/ + public static final Schema PER_REGION_COLUMNS = + Schema.builder() + .add(TYPE) + .add(AREA_CODE_LENGTH) + .add(NATIONAL_ONLY) + .add(FORMAT) + .build(); + + /** Converts a table with the external {@link RangesTableSchema} columns into one built from {@link #COLUMNS}: area code length, national-only, format and region columns are copied, while {@link #TYPE} is derived from the external type and then overwritten by any mapped tariff. Ranges whose external type has no {@code ValidNumberType} mapping get no {@link #TYPE} assignment. */ public static RangeTable fromExternalTable(RangeTable src) { + checkArgument(RangesTableSchema.TABLE_COLUMNS.isSubSchemaOf(src.getSchema()), + "unexpected schema for source table, should be subschema of %s", + RangesTableSchema.TABLE_COLUMNS); + RangeTree unknown = src.getRanges(RangesTableSchema.TYPE, ExtType.UNKNOWN); + checkMetadata(unknown.isEmpty(), "source table contains unknown type for ranges\n%s", unknown); + checkSourceColumn(src, RangesTableSchema.TYPE); + checkSourceColumn(src, RangesTableSchema.TARIFF); + + // We can copy most columns verbatim. + RangeTable.Builder dst = RangeTable.builder(COLUMNS); + copyColumn(src, dst, AREA_CODE_LENGTH); + copyColumn(src, dst, NATIONAL_ONLY); + copyColumn(src, dst, FORMAT); + REGIONS.extractGroupColumns(src.getColumns()).values().forEach(c -> copyColumn(src, dst, c)); + + // But the type column must be inferred from a combination of the external type and tariff. + // Tariff takes precedence, so we do type first and then overwrite ranges for tariff. + // We also capture unsupported ranges as they must be ignored in this conversion. + RangeTree unsupportedRanges = RangeTree.empty(); + for (ExtType extType : src.getAssignedValues(RangesTableSchema.TYPE)) { + RangeTree ranges = src.getRanges(RangesTableSchema.TYPE, extType); + Optional t = extType.toValidNumberType(); + if (t.isPresent()) { + dst.assign(TYPE, t.get(), ranges, OverwriteMode.NEVER); + } else { + unsupportedRanges = unsupportedRanges.union(ranges); + } + } + // Because we know that both the type and tariff columns have assignments for every range (and + // there's no "unknown" values for these) we can just ignore "standard rate" tariff ranges + // since they must have had a type assigned above already.
+ for (ExtTariff extTariff : src.getAssignedValues(RangesTableSchema.TARIFF)) { + // Ignore unsupported ranges here (since otherwise they could add ranges based only on the + // tariff, which would be wrong). For example, a toll free ISP number range should NOT be + // in the table as TOLL_FREE, since ISP numbers should not be in the table at all (until + // such time as they are a fully supported type). + RangeTree ranges = + src.getRanges(RangesTableSchema.TARIFF, extTariff).subtract(unsupportedRanges); + extTariff.toValidNumberType() + .ifPresent(t -> dst.assign(TYPE, t, ranges, OverwriteMode.ALWAYS)); + } + return dst.build(); + } + + /** Fails the metadata check unless every range in the table has an assignment in the given column. */ private static void checkSourceColumn(RangeTable table, Column col) { + checkMetadata(table.getAssignedRanges(col).equals(table.getAllRanges()), + "table is missing assignments in column %s for ranges\n%s", + col, table.getAllRanges().subtract(table.getAssignedRanges(col))); + } + + /** Copies every assigned value of the column from src to dst (never overwriting); does nothing if src lacks the column. */ private static void copyColumn(RangeTable src, RangeTable.Builder dst, Column col) { + if (src.getColumns().contains(col)) { + src.getAssignedValues(col).forEach(v -> dst.assign(col, v, src.getRanges(col, v), NEVER)); + } + } + + private XmlRangesSchema() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Assignment.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Assignment.java new file mode 100644 index 000000000..88ed00ffe --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Assignment.java @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.i18n.phonenumbers.metadata.table;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import com.google.auto.value.AutoValue;
+import com.google.common.base.Splitter;
+import java.util.List;
+import java.util.Optional;
+import javax.annotation.Nullable;
+
+/**
+ * A single assignment of a column to a value. This can be used to change values in a
+ * {@code RangeTable} as well as to query for ranges with its value.
+ *
+ * <p>An assignment whose {@link #value()} is empty represents an "unassignment" in that column.
+ */
+@AutoValue
+public abstract class Assignment<T extends Comparable<T>> {
+  // Splits on the first '=' only (so a value may itself contain '='), trimming both halves.
+  private static final Splitter SPLITTER = Splitter.on("=").limit(2).trimResults();
+
+  /**
+   * Parses a string of the form {@code "<column>=<value>"} to create an assignment using the
+   * given schema. The named column must exist in the schema, and the associated value must be a
+   * valid value within that column.
+   *
+   * <p>Whitespace before and after the column or value is ignored. If the value is omitted, then
+   * an unassignment is returned.
+   *
+   * @param s the assignment string to parse
+   * @param schema the schema used to resolve the column name
+   * @return the parsed assignment (or unassignment if the value part is empty)
+   * @throws IllegalArgumentException if {@code s} contains no {@code '='} separator, or if the
+   *     value cannot be parsed by the column
+   */
+  public static Assignment<?> parse(String s, Schema schema) {
+    List<String> parts = SPLITTER.splitToList(s);
+    checkArgument(parts.size() == 2, "invalid assignment string: %s", s);
+    Column<?> column = schema.getColumn(parts.get(0));
+    // Column.parse() returns null for an empty string, which create() turns into an unassignment.
+    return create(column, column.parse(parts.get(1)));
+  }
+
+  // Type capture around AutoValue is a little painful, so this static helper ... helps.
+  private static <T extends Comparable<T>> Assignment<T> create(Column<T> c, @Nullable Object v) {
+    T value = c.cast(v);
+    return new AutoValue_Assignment<>(c, Optional.ofNullable(value));
+  }
+
+  /**
+   * Returns an assignment in the given column for the specified, non null, value.
+   *
+   * <p>Note that an assignment for the default value of a column will return an explicit
+   * assignment for that value, rather than an "unassignment" in that column; so
+   * {@code Assignment.of(c, c.defaultValue())} is not equal to {@code unassign(c)}, even though
+   * they may have the same effect when applied to a range table, and may even have the same
+   * {@link #toString()} representation (in the case of String columns).
+   *
+   * @throws NullPointerException if {@code v} is null
+   * @throws ClassCastException if {@code v} is not an instance of the column's type
+   */
+  public static <T extends Comparable<T>> Assignment<T> of(Column<T> c, Object v) {
+    return new AutoValue_Assignment<>(c, Optional.of(c.cast(v)));
+  }
+
+  /**
+   * Returns an assignment in the given column for the optional value; an empty value yields an
+   * unassignment.
+   *
+   * @throws ClassCastException if a value is present but is not an instance of the column's type
+   */
+  @SuppressWarnings("unchecked")
+  public static <T extends Comparable<T>> Assignment<T> ofOptional(
+      Column<T> c, Optional<?> v) {
+    // Casting the value makes the optional cast below safe.
+    v.ifPresent(c::cast);
+    return new AutoValue_Assignment<>(c, (Optional<T>) v);
+  }
+
+  /**
+   * Returns an unassignment in the given column. The {@link #value()} of this assignment is
+   * empty.
+   */
+  public static <T extends Comparable<T>> Assignment<T> unassign(Column<T> c) {
+    return new AutoValue_Assignment<>(c, Optional.empty());
+  }
+
+  /** The column in which the assignment applies. */
+  public abstract Column<T> column();
+
+  /** The value in the column, or empty to signify unassignment. */
+  public abstract Optional<T> value();
+
+  /** Returns {@code "<column name>=<value>"}, with an empty value part for unassignments. */
+  @Override
+  public final String toString() {
+    return String.format("%s=%s", column().getName(), value().map(Object::toString).orElse(""));
+  }
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Change.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Change.java
new file mode 100644
index 000000000..27d96403b
--- /dev/null
+++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Change.java
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2017 The Libphonenumber Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.i18n.phonenumbers.metadata.table;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+import com.google.auto.value.AutoValue;
+import com.google.common.collect.ImmutableList;
+import com.google.i18n.phonenumbers.metadata.RangeTree;
+import java.util.Arrays;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.Optional;
+
+/**
+ * A change which can be applied to a range table. Changes are applied sequentially to build a
+ * range table and new changes overwrite existing mappings. Changes are additive, and cannot be
+ * used to remove ranges from a table (but they can unassign previous assignments).
+ */
+@AutoValue
+public abstract class Change {
+  private static final Change EMPTY = of(RangeTree.empty(), ImmutableList.of());
+
+  /** A builder for changes that supports assigning and unassigning column values for a range. */
+  public static final class Builder {
+    private final RangeTree ranges;
+    // Keyed by column so that a second assignment to the same column can be rejected; insertion
+    // order is retained for the built change.
+    private final Map<Column<?>, Assignment<?>> assignments = new LinkedHashMap<>();
+
+    private Builder(RangeTree ranges) {
+      this.ranges = checkNotNull(ranges);
+    }
+
+    /**
+     * Assigns the optional value in the given column for the ranges of this builder (an empty
+     * value has the effect of unassigning the value in the table that this change is applied
+     * to).
+     *
+     * @throws IllegalArgumentException if the column was already assigned in this builder
+     */
+    public Builder assign(Assignment<?> assignment) {
+      Assignment<?> previous = assignments.put(assignment.column(), assignment);
+      checkArgument(previous == null, "Column already assigned: %s", assignment.column());
+      return this;
+    }
+
+    /** Assigns the non-null value in the given column for the ranges of this builder. */
+    public Builder assign(Column<?> column, Object value) {
+      Assignment<?> assignment = Assignment.of(column, value);
+      return assign(assignment);
+    }
+
+    /** Unassigns any values in the given column for the ranges of this builder. */
+    public Builder unassign(Column<?> column) {
+      Assignment<?> unassignment = Assignment.unassign(column);
+      return assign(unassignment);
+    }
+
+    /** Builds an immutable change from the current state of this builder. */
+    public Change build() {
+      return Change.of(ranges, assignments.values());
+    }
+  }
+
+  /** Returns a new builder for a change affecting the given ranges. */
+  public static Builder builder(RangeTree ranges) {
+    return new Builder(ranges);
+  }
+
+  /** Returns the empty change which has no effect when applied to any table. */
+  public static Change empty() {
+    return EMPTY;
+  }
+
+  /**
+   * Builds a change from a set of assignments (columns must be unique).
+   *
+   * @throws IllegalArgumentException if two assignments share the same column
+   */
+  public static Change of(RangeTree ranges, Iterable<Assignment<?>> assignments) {
+    ImmutableList<Assignment<?>> copy = ImmutableList.copyOf(assignments);
+    long distinctColumns = copy.stream().map(Assignment::column).distinct().count();
+    checkArgument(copy.size() == distinctColumns,
+        "cannot supply different assignments for the same column: %s", copy);
+    return new AutoValue_Change(ranges, copy);
+  }
+
+  /**
+   * Returns the ranges affected by this change. These ranges are added to the table and
+   * optionally assigned category values according to {@link #getAssignments()}. No other ranges
+   * will be affected by this change.
+   */
+  public abstract RangeTree getRanges();
+
+  /**
+   * Returns a list of assignments to be applied for this change. Note that the set of columns
+   * for these assignments is itself also a set (i.e. no two assignments in a change ever share
+   * the same column).
+   */
+  public abstract ImmutableList<Assignment<?>> getAssignments();
+
+  /** Returns whether this change contains any of the specified values in a given column. */
+  @SafeVarargs
+  public final <T extends Comparable<T>> boolean hasAssignment(Column<T> column, T... values) {
+    // Columns are unique within a change, so the single lookup below sees the only candidate.
+    return getAssignment(column).filter(Arrays.asList(values)::contains).isPresent();
+  }
+
+  /**
+   * Returns the value of the column in this change, or empty if the column has no assignment or
+   * its assignment is an unassignment. Because this conflates "no value" and "explicitly empty
+   * value", this method might not be suitable for changes that unassign values.
+   */
+  public final <T extends Comparable<T>> Optional<T> getAssignment(Column<T> column) {
+    return getAssignments().stream()
+        .filter(a -> column.equals(a.column()))
+        .findFirst()
+        .flatMap(a -> a.value().map(column::cast));
+  }
+
+  // Visible for AutoValue.
+  Change() {}
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Column.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Column.java
new file mode 100644
index 000000000..d78a2b8de
--- /dev/null
+++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Column.java
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2017 The Libphonenumber Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.i18n.phonenumbers.metadata.table;
+
+import static com.google.common.base.CharMatcher.inRange;
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static java.lang.Boolean.FALSE;
+import static java.lang.Boolean.TRUE;
+
+import com.google.auto.value.AutoValue;
+import com.google.common.base.CaseFormat;
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableMap;
+import java.util.function.Function;
+import javax.annotation.Nullable;
+
+/**
+ * A column specifier which holds a set of values that are allowed with a column.
+ *
+ * <p>Two columns are equal if they have the same name and value type; the parse/serialize
+ * functions and default value do not take part in equality.
+ */
+@AutoValue
+public abstract class Column<T extends Comparable<T>> {
+  private static final ImmutableMap<String, Boolean> BOOLEAN_MAP =
+      ImmutableMap.of("true", TRUE, "TRUE", TRUE, "false", FALSE, "FALSE", FALSE);
+  private static final CharMatcher ASCII_LETTER_OR_DIGIT =
+      inRange('a', 'z').or(inRange('A', 'Z')).or(inRange('0', '9'));
+  private static final CharMatcher LOWER_ASCII_LETTER_OR_DIGIT =
+      inRange('a', 'z').or(inRange('0', '9'));
+  private static final CharMatcher LOWER_UNDERSCORE =
+      CharMatcher.is('_').or(LOWER_ASCII_LETTER_OR_DIGIT);
+
+  /**
+   * Returns a column for the specified type with a given parsing function. Use alternate helper
+   * methods for creating columns of common types. Values are serialized via
+   * {@link String#valueOf(Object)}.
+   *
+   * @throws IllegalArgumentException if the name contains {@code ':'}
+   */
+  public static <T extends Comparable<T>> Column<T> create(
+      Class<T> clazz, String name, T defaultValue, Function<String, T> parseFn) {
+    return new AutoValue_Column<>(
+        checkName(name), clazz, parseFn, String::valueOf, defaultValue, null);
+  }
+
+  /**
+   * Returns a column for the specified enum type. The string representation of a value in this
+   * column is just the {@code toString()} value of the enum. When parsing, lower_underscore and
+   * lowerCamel spellings are also accepted and converted to the UPPER_UNDERSCORE enum name.
+   */
+  public static <T extends Enum<T>> Column<T> of(Class<T> clazz, String name, T defaultValue) {
+    return create(clazz, name, defaultValue, s -> Enum.valueOf(clazz, toEnumName(s)));
+  }
+
+  /**
+   * Returns a column for strings. In their serialized form, strings do not preserve leading or
+   * trailing whitespace, unless surrounded by double-quotes (e.g. {@code " foo "}). The quotes
+   * are stripped on parsing and added back for any String value with leading/trailing
+   * whitespace. The default value is the empty string.
+   */
+  public static Column<String> ofString(String name) {
+    return new AutoValue_Column<>(
+        checkName(name), String.class, Column::trimOrUnquote, Column::maybeQuote, "", null);
+  }
+
+  /**
+   * Returns a column for unsigned integers. The string representation of a value in this column
+   * matches the {@link Integer#toString(int)} value. The default value is {@code 0}.
+   */
+  public static Column<Integer> ofUnsignedInteger(String name) {
+    return create(Integer.class, name, 0, Integer::parseUnsignedInt);
+  }
+
+  /**
+   * Returns a column for booleans. The string representation of a value in this column can be
+   * any of "true", "false", "TRUE", "FALSE" (but not things like "True", "T" or "YES"). The
+   * default value is {@code false}.
+   */
+  public static Column<Boolean> ofBoolean(String name) {
+    // Unknown strings map to null, which parse() reports as an unknown value.
+    return create(Boolean.class, name, false, BOOLEAN_MAP::get);
+  }
+
+  // ':' is reserved as the separator between a group prototype's name and its suffix.
+  private static String checkName(String name) {
+    checkArgument(name.indexOf(':') == -1, "invalid column name: %s", name);
+    return name;
+  }
+
+  // Converts to UPPER_UNDERSCORE naming for enums.
+  private static String toEnumName(String name) {
+    // Allow conversion for lower_underscore and lowerCamel, since UPPER_UNDERSCORE is so "LOUD".
+    // We can be sloppy with respect to errors here since all runtime exceptions are handled.
+    if (LOWER_ASCII_LETTER_OR_DIGIT.matches(name.charAt(0))) {
+      if (LOWER_UNDERSCORE.matchesAllOf(name)) {
+        name = CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.UPPER_UNDERSCORE, name);
+      } else if (ASCII_LETTER_OR_DIGIT.matchesAllOf(name)) {
+        name = CaseFormat.LOWER_CAMEL.to(CaseFormat.UPPER_UNDERSCORE, name);
+      } else {
+        // Message/type not important here since all exceptions are replaced anyway.
+        throw new IllegalArgumentException();
+      }
+    }
+    return name;
+  }
+
+  // Trims whitespace from a serialized string, unless the value is surrounded by double-quotes
+  // (in which case the quotes are removed). This is done to permit the rare use of
+  // leading/trailing whitespace in data in a visually distinct and deliberate way.
+  private static String trimOrUnquote(String s) {
+    if (s.length() >= 2 && s.startsWith("\"") && s.endsWith("\"")) {
+      return s.substring(1, s.length() - 1);
+    }
+    return whitespace().trimFrom(s);
+  }
+
+  // Surrounds any string with whitespace at either end with double quotes.
+  private static String maybeQuote(String s) {
+    if (s.length() > 0
+        && (whitespace().matches(s.charAt(0))
+            || whitespace().matches(s.charAt(s.length() - 1)))) {
+      return '"' + s + '"';
+    }
+    return s;
+  }
+
+  // NOTE: The declaration order of the abstract properties below defines the AutoValue
+  // constructor parameter order, so they must not be reordered.
+
+  /** Returns the column name (which can be used as a human readable title if needed). */
+  public abstract String getName();
+
+  abstract Class<T> type();
+
+  // The parsing function from a string to a value.
+  abstract Function<String, T> parseFn();
+  // The serialization function from a value to a String. This must be the inverse of the parseFn.
+  abstract Function<T, String> serializeFn();
+
+  /** Default value for this column (inferred for unassigned ranges when a snapshot is built). */
+  public abstract T defaultValue();
+
+  // This is very private and should only be used in this class.
+  @Nullable abstract Column<T> owningGroup();
+
+  /**
+   * Attempts to cast the given instance to the runtime type of this column.
+   *
+   * @throws ClassCastException if the value is non-null and not of this column's type
+   */
+  @Nullable public final T cast(@Nullable Object value) {
+    return type().cast(value);
+  }
+
+  /**
+   * Returns the value of this column based on its serialized representation (which is not
+   * necessarily its {@code toString()} representation).
+   *
+   * @param id the serialized value
+   * @return the parsed value, or {@code null} if {@code id} is empty
+   * @throws IllegalArgumentException if the value is not recognised by this column; any exception
+   *     raised by the underlying parse function is attached as the cause
+   */
+  @Nullable public final T parse(String id) {
+    if (id.isEmpty()) {
+      return null;
+    }
+    // TODO: Technically wrong, since for String columns this will unquote strings. Hopefully this
+    // won't be an issue, since quoting is really only likely to be used for preserving whitespace.
+    RuntimeException cause = null;
+    try {
+      T value = parseFn().apply(id);
+      if (value != null) {
+        return value;
+      }
+    } catch (RuntimeException e) {
+      // Reported uniformly below, but keep the original failure for diagnosis.
+      cause = e;
+    }
+    throw new IllegalArgumentException(
+        String.format("unknown value '%s' in column '%s'", id, getName()), cause);
+  }
+
+  /**
+   * Returns the serialized representation of a value in this column. This is the stored
+   * representation of the value, not the value itself. A {@code null} value serializes to the
+   * empty string.
+   */
+  public final String serialize(@Nullable Object value) {
+    return (value != null) ? serializeFn().apply(cast(value)) : "";
+  }
+
+  // Only to be called by ColumnGroup. The new column is named "<prototype name>:<suffix>" and
+  // records this column as its owning group prototype.
+  final Column<T> fromPrototype(String suffix) {
+    String name = getName() + ":" + checkName(suffix);
+    return new AutoValue_Column<T>(
+        name, type(), parseFn(), serializeFn(), defaultValue(), this);
+  }
+
+  final boolean isIn(ColumnGroup<?, ?> group) {
+    return group.prototype().equals(owningGroup());
+  }
+
+  @Override
+  public final String toString() {
+    return "Column{'" + getName() + "'}";
+  }
+
+  @Override
+  public final boolean equals(Object obj) {
+    if (!(obj instanceof Column<?>)) {
+      return false;
+    }
+    Column<?> c = (Column<?>) obj;
+    return c.getName().equals(getName()) && c.type().equals(type());
+  }
+
+  @Override
+  public final int hashCode() {
+    return getName().hashCode() ^ type().hashCode();
+  }
+
+  // Visible only for AutoValue
+  Column() {}
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroup.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroup.java
new file mode 100644
index 000000000..d4e78efa6
--- /dev/null
+++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroup.java
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2017 The Libphonenumber Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.i18n.phonenumbers.metadata.table;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.collect.ImmutableBiMap.toImmutableBiMap;
+import static java.util.function.Function.identity;
+
+import com.google.auto.value.AutoValue;
+import com.google.common.collect.ImmutableBiMap;
+import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion;
+import com.google.i18n.phonenumbers.metadata.i18n.SimpleLanguageTag;
+import java.util.Set;
+import java.util.function.Function;
+
+/**
+ * A group of {@link RangeTable} columns. Columns in a group are named
+ * {@code "<prototype name>:<key>"} and share the type of the group's prototype column.
+ */
+@AutoValue
+public abstract class ColumnGroup<K, T extends Comparable<T>> {
+  /**
+   * Returns a group for columns with the same type as the given "prototype" column and which has
+   * a prefix that's the name of the prototype. Suffix values are parsed using the given function.
+   */
+  public static <K, T extends Comparable<T>> ColumnGroup<K, T> of(
+      Column<T> prototype, Function<String, K> parseFn) {
+    return new AutoValue_ColumnGroup<>(prototype, parseFn);
+  }
+
+  /** Returns a group for the specified prototype column keyed by {@link PhoneRegion}. */
+  public static <T extends Comparable<T>> ColumnGroup<PhoneRegion, T> byRegion(
+      Column<T> prototype) {
+    return of(prototype, PhoneRegion::of);
+  }
+
+  /** Returns a group for the specified prototype column keyed by {@link SimpleLanguageTag}. */
+  public static <T extends Comparable<T>> ColumnGroup<SimpleLanguageTag, T> byLanguage(
+      Column<T> prototype) {
+    return of(prototype, SimpleLanguageTag::of);
+  }
+
+  // Internal use only. (Declaration order defines the AutoValue constructor parameter order.)
+  abstract Column<T> prototype();
+  abstract Function<String, K> parseFn();
+
+  /**
+   * Returns the column for a specified key.
+   *
+   * @throws IllegalArgumentException if the key's string form is rejected by the group's parse
+   *     function
+   */
+  public Column<T> getColumn(K key) {
+    // The reason this does not just call "prototype().fromPrototype(...)" is that the key may not
+    // be parsable by the function just because it's the "right" type. This allows people to pass
+    // in a function that limits columns to some subset of the domain (e.g. a subset of region
+    // codes).
+    return getColumnFromId(key.toString());
+  }
+
+  /**
+   * Returns the column for a specified ID string.
+   *
+   * @throws IllegalArgumentException if the ID is rejected by the group's parse function
+   */
+  public Column<T> getColumnFromId(String id) {
+    try {
+      // Parsed only to validate the ID; the result itself is not needed here.
+      Object unused = parseFn().apply(id);
+    } catch (RuntimeException e) {
+      throw new IllegalArgumentException(
+          String.format("invalid column %s, not in group: %s", id, this), e);
+    }
+    return prototype().fromPrototype(id);
+  }
+
+  /**
+   * Returns the key of a column in this group.
+   *
+   * @throws IllegalArgumentException if the column does not belong to this group
+   */
+  @SuppressWarnings("unchecked")
+  public K getKey(Column<?> c) {
+    checkArgument(c.isIn(this), "column %s is not in group %s", c, this);
+    // Cast is safe since any column in this group is a Column<T>.
+    return extractKey((Column<T>) c);
+  }
+
+  /** Returns a bidirectional mapping from group key to column, for columns in this group. */
+  @SuppressWarnings("unchecked")
+  public ImmutableBiMap<K, Column<T>> extractGroupColumns(Set<Column<?>> columns) {
+    return columns.stream()
+        .filter(c -> c.isIn(this))
+        // Cast is safe since any column in this group is a Column<T>.
+        .map(c -> (Column<T>) c)
+        .collect(toImmutableBiMap(this::extractKey, identity()));
+  }
+
+  // Assumes we've already verified that the column is in this group. The key is whatever follows
+  // the last ':' in the column name.
+  private K extractKey(Column<T> column) {
+    String name = column.getName();
+    return parseFn().apply(name.substring(name.lastIndexOf(':') + 1));
+  }
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvKeyMarshaller.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvKeyMarshaller.java
new file mode 100644
index 000000000..327acfeea
--- /dev/null
+++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvKeyMarshaller.java
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2017 The Libphonenumber Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.i18n.phonenumbers.metadata.table;
+
+import static com.google.common.base.Preconditions.checkNotNull;
+import static java.util.Comparator.naturalOrder;
+
+import com.google.common.collect.ImmutableList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Optional;
+import java.util.function.Function;
+import java.util.stream.Stream;
+
+/** Marshaller to handle key serialization and ordering in {@code CsvTable}.
*/
+public final class CsvKeyMarshaller<K> {
+  // Converts a key into the values of its key columns (in column order).
+  private final Function<K, Stream<String>> serialize;
+  // Rebuilds a key from the values of its key columns.
+  private final Function<List<String>, K> deserialize;
+  // Optional row ordering for keys; empty means no ordering is defined by this marshaller.
+  private final Optional<Comparator<K>> ordering;
+  // Names of the key columns.
+  private final ImmutableList<String> columns;
+
+  /**
+   * Returns a marshaller for keys which are plain strings held in a single named column and
+   * ordered by their natural ordering.
+   */
+  public static CsvKeyMarshaller<String> ofSortedString(String columnName) {
+    Function<String, Stream<String>> toParts = Stream::of;
+    Function<List<String>, String> fromParts = parts -> parts.get(0);
+    Optional<Comparator<String>> order = Optional.of(naturalOrder());
+    return new CsvKeyMarshaller<String>(toParts, fromParts, order, columnName);
+  }
+
+  /** Creates a marshaller from its functions, ordering and key column names. */
+  public CsvKeyMarshaller(
+      Function<K, Stream<String>> serialize,
+      Function<List<String>, K> deserialize,
+      Optional<Comparator<K>> ordering,
+      List<String> columns) {
+    this.serialize = checkNotNull(serialize);
+    this.deserialize = checkNotNull(deserialize);
+    this.ordering = checkNotNull(ordering);
+    this.columns = ImmutableList.copyOf(columns);
+  }
+
+  /** Convenience overload taking the key column names as varargs. */
+  public CsvKeyMarshaller(
+      Function<K, Stream<String>> serialize,
+      Function<List<String>, K> deserialize,
+      Optional<Comparator<K>> ordering,
+      String... columns) {
+    this(serialize, deserialize, ordering, ImmutableList.copyOf(columns));
+  }
+
+  /** Returns the names of the key columns. */
+  public ImmutableList<String> getColumns() {
+    return columns;
+  }
+
+  Optional<Comparator<K>> ordering() {
+    return ordering;
+  }
+
+  K deserialize(List<String> keyParts) {
+    return deserialize.apply(keyParts);
+  }
+
+  Stream<String> serialize(K key) {
+    return serialize.apply(key);
+  }
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java
new file mode 100644
index 000000000..1138cfe8f
--- /dev/null
+++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2017 The Libphonenumber Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.i18n.phonenumbers.metadata.table;
+
+import static com.google.common.base.CharMatcher.isNot;
+import static com.google.common.base.CharMatcher.javaIsoControl;
+import static com.google.common.base.CharMatcher.whitespace;
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+
+import com.google.common.base.CharMatcher;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Streams;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.stream.Stream;
+import javax.annotation.Nullable;
+
+/**
+ * An efficient, fluent CSV parser which operates on a {@link Stream} of lines. It handles quoting
+ * of values, whitespace trimming and mapping values via a "schema" row.
+ *
+ * <p>This class is sadly necessary since the one in {@code com.google.common.text} doesn't
+ * support ignoring whitespace (and making it do so would take longer than writing this).
+ *
+ * <p>This class is immutable and thread-safe.
+ */
+// TODO: Investigate other "standard" CSV parsers such as org.apache.commons.csv.
+public final class CsvParser {
+  /**
+   * A consumer for CSV rows which can automatically map values according to a header row.
+   *
+   * <p>This class is immutable and thread-safe. Note however that the consumers returned by
+   * {@link #mapTo(Consumer)} are stateful (they remember the header row) and so each one should
+   * only be used to parse a single table.
+   */
+  public static final class RowMapper {
+    @Nullable private final Consumer<ImmutableList<String>> headerHandler;
+
+    private RowMapper(@Nullable Consumer<ImmutableList<String>> headerHandler) {
+      this.headerHandler = headerHandler;
+    }
+
+    /**
+     * Returns a row consumer which treats the first row it is given as the header and passes
+     * every subsequent row to the given handler as a map from header name to value. Empty values
+     * are omitted from the map.
+     *
+     * <p>The returned consumer throws {@link IllegalArgumentException} if the header contains
+     * duplicate names, or if a row has more values than the header.
+     */
+    public Consumer<Stream<String>> mapTo(Consumer<ImmutableMap<String, String>> handler) {
+      return new Consumer<Stream<String>>() {
+        private ImmutableList<String> header = null;
+
+        @Override
+        public void accept(Stream<String> row) {
+          if (header == null) {
+            // Can contain duplicates (but that's bad for mapping).
+            header = row.collect(toImmutableList());
+            checkArgument(
+                header.size() == header.stream().distinct().count(),
+                "duplicate values in CSV header: %s",
+                header);
+            if (headerHandler != null) {
+              headerHandler.accept(header);
+            }
+            return;
+          }
+          ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
+          Iterator<String> values = row.iterator();
+          // The column index must advance for every value, including empty ones, otherwise any
+          // value following an empty cell would be mapped to the wrong header.
+          for (int i = 0; values.hasNext(); i++) {
+            String v = values.next();
+            if (i >= header.size()) {
+              throw new IllegalArgumentException(
+                  String.format(
+                      "too many columns (expected %s): %s", header.size(), map.build()));
+            }
+            if (!v.isEmpty()) {
+              map.put(header.get(i), v);
+            }
+          }
+          handler.accept(map.build());
+        }
+      };
+    }
+  }
+
+  private static final CharMatcher NON_WHITESPACE = CharMatcher.whitespace().negate();
+  private static final char QUOTE = '"';
+  // Delimiters may be any visible non-quote character, or a plain space or tab.
+  private static final CharMatcher VALID_DELIMITER_CHAR =
+      NON_WHITESPACE.and(javaIsoControl().negate()).and(isNot(QUOTE)).or(CharMatcher.anyOf(" \t"));
+
+  /**
+   * Returns a parser using the given delimiter, with whitespace trimming and multi-line values
+   * both disabled.
+   *
+   * @throws IllegalArgumentException if the delimiter is a quote, a control character or
+   *     whitespace other than space or tab
+   */
+  public static CsvParser withSeparator(char delimiter) {
+    return new CsvParser(delimiter, false, false);
+  }
+
+  /** Returns a parser for comma separated values. */
+  public static CsvParser commaSeparated() {
+    return withSeparator(',');
+  }
+
+  /** Returns a parser for tab separated values. */
+  public static CsvParser tabSeparated() {
+    return withSeparator('\t');
+  }
+
+  /** Returns a row mapper which does not report the header row. */
+  public static RowMapper rowMapper() {
+    return new RowMapper(null);
+  }
+
+  /** Returns a row mapper which passes the header row to the given handler when it is read. */
+  public static RowMapper rowMapper(Consumer<ImmutableList<String>> headerHandler) {
+    return new RowMapper(headerHandler);
+  }
+
+  private final char delimiter;
+  private final boolean trimWhitespace;
+  private final boolean allowMultiline;
+
+  private CsvParser(char delimiter, boolean trimWhitespace, boolean allowMultiline) {
+    checkArgument(VALID_DELIMITER_CHAR.matches(delimiter),
+        "invalid delimiter: %s", delimiter);
+    this.delimiter = delimiter;
+    this.trimWhitespace = trimWhitespace;
+    this.allowMultiline = allowMultiline;
+  }
+
+  /**
+   * Returns a parser which trims whitespace around values (quoted values keep the whitespace
+   * inside their quotes).
+   *
+   * @throws IllegalArgumentException if the delimiter of this parser is itself whitespace
+   */
+  public CsvParser trimWhitespace() {
+    checkArgument(NON_WHITESPACE.matches(delimiter),
+        "cannot trim whitespace if delimiter is whitespace");
+    return new CsvParser(delimiter, true, allowMultiline);
+  }
+
+  /** Returns a parser which allows quoted values to span multiple lines. */
+  public CsvParser allowMultiline() {
+    return new CsvParser(delimiter, trimWhitespace, true);
+  }
+
+  /**
+   * Parses the given lines, invoking the callback once per row with the unescaped values of that
+   * row. A row normally corresponds to one line, but may span several if multi-line values are
+   * allowed.
+   *
+   * @throws IllegalArgumentException if the input is malformed (e.g. an unterminated quoted
+   *     value, or a quote inside an unquoted value)
+   */
+  public void parse(Stream<String> lines, Consumer<Stream<String>> rowCallback) {
+    // The row list and buffer are reused between rows to avoid reallocation.
+    List<String> row = new ArrayList<>();
+    StringBuilder buffer = new StringBuilder();
+    Iterator<String> it = lines.iterator();
+    while (parseRow(it, row, buffer)) {
+      rowCallback.accept(row.stream());
+      row.clear();
+    }
+  }
+
+  // Parses the next row into "row" (which must be empty), consuming one or more lines. Returns
+  // false if there were no more lines. The buffer is scratch space for quoted values and is
+  // always left empty on return.
+  private boolean parseRow(Iterator<String> lines, List<String> row, StringBuilder buffer) {
+    if (!lines.hasNext()) {
+      return false;
+    }
+    // First line of potentially several which make up this row.
+    String line = lines.next();
+    int start = maybeTrimWhitespace(line, 0);
+    while (start < line.length()) {
+      // "start" is the start of the next part and must be a valid index into current "line".
+      // Could be high or low surrogate if badly formed string, or just point at the delimiter.
+      char c = line.charAt(start);
+      int pos;
+      if (c == QUOTE) {
+        // Quoted value, maybe parse and unescape multiple lines here.
+        pos = ++start;
+        while (true) {
+          // This must be a loop (not a single check) since continuation lines can be empty, in
+          // which case we are immediately at end-of-line again.
+          while (pos == line.length()) {
+            buffer.append(line, start, pos);
+            checkArgument(allowMultiline && lines.hasNext(),
+                "unterminated quoted value: %s", buffer);
+            buffer.append('\n');
+            line = lines.next();
+            start = 0;
+            pos = 0;
+          }
+          c = line.charAt(pos);
+          if (c == QUOTE) {
+            buffer.append(line, start, pos++);
+            if (pos == line.length()) {
+              break;
+            }
+            if (line.charAt(pos) != QUOTE) {
+              pos = maybeTrimWhitespace(line, pos);
+              checkArgument(pos == line.length() || line.codePointAt(pos) == delimiter,
+                  "unexpected character (expected delimiter) in: %s", line);
+              break;
+            }
+            // "Double double quotes, what does it mean?" (oh yeah, a single double quote).
+            buffer.append(QUOTE);
+            start = pos + 1;
+          }
+          pos++;
+        }
+        row.add(buffer.toString());
+        buffer.setLength(0);
+      } else if (c == delimiter) {
+        // Empty unquoted value (e.g. "foo,,bar").
+        row.add("");
+        pos = start;
+      } else {
+        // Non-empty unquoted value.
+        pos = line.indexOf(delimiter, start + 1);
+        if (pos == -1) {
+          pos = line.length();
+        }
+        String value = line.substring(start, maybeTrimTrailingWhitespace(line, pos));
+        checkArgument(value.indexOf(QUOTE) == -1,
+            "quotes cannot appear in unquoted values: %s", value);
+        row.add(value);
+      }
+      if (pos == line.length()) {
+        // We hit end-of-line at the end of a value, so just return (no trailing empty value).
+        return true;
+      }
+      // If not end-of-line, "pos" points at the last delimiter, so we can find the next start.
+      start = maybeTrimWhitespace(line, pos + 1);
+    }
+    // We hit end-of-line either immediately, or after a delimiter. Either way we always need to
+    // add a trailing empty value for consistency.
+    row.add("");
+    return true;
+  }
+
+  // Returns the index of the first non-whitespace character at or after "i" (or the string
+  // length if there is none) when trimming is enabled, otherwise returns "i" unchanged.
+  private int maybeTrimWhitespace(String s, int i) {
+    if (trimWhitespace) {
+      i = NON_WHITESPACE.indexIn(s, i);
+      if (i == -1) {
+        i = s.length();
+      }
+    }
+    return i;
+  }
+
+  // Returns the end index of the value ending at "i" with trailing whitespace excluded when
+  // trimming is enabled, otherwise returns "i" unchanged.
+  private int maybeTrimTrailingWhitespace(String s, int i) {
+    if (trimWhitespace) {
+      // There is no "lastIndexIn(String, int)" sadly.
+      while (i > 0 && whitespace().matches(s.charAt(i - 1))) {
+        i--;
+      }
+    }
+    return i;
+  }
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvSchema.java
new file mode 100644
index 000000000..0e59bf17b
--- /dev/null
+++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvSchema.java
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2017 The Libphonenumber Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.google.i18n.phonenumbers.metadata.table;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+import com.google.auto.value.AutoValue;
+import com.google.common.collect.ImmutableList;
+import java.io.IOException;
+import java.io.Reader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Optional;
+import java.util.function.BiConsumer;
+
+/**
+ * A CSV schema is a combination of a key marshaller and table columns.
A CSV schema defines a + * CSV table with key columns, followed by non-key columns. + */ +@AutoValue +public abstract class CsvSchema { + /** + * Returns a schema for a CSV file using the given marshaller to define key columns, and a table + * schema to define any additional columns in a row. + */ + public static CsvSchema of(CsvKeyMarshaller marshaller, Schema columns) { + return new AutoValue_CsvSchema<>(marshaller, columns); + } + + /** The marshaller defining table keys and how they are serialized in CSV. */ + public abstract CsvKeyMarshaller keyMarshaller(); + + /** The table schema defining non-key columns in the table. */ + public abstract Schema columns(); + + /** Returns the ordering for keys in the CSV table, as defined by the key marshaller. */ + public Optional> rowOrdering() { + return keyMarshaller().ordering(); + } + + /** + * Returns the ordering for additional non-key columns in the CSV table as defined by the table + * schema. + */ + public Comparator> columnOrdering() { + return columns().ordering(); + } + + /** + * Extracts the non-key columns of a table from the header row. The header row is expected to + * contain the names of all columns (including key columns) in the CSV table and this method + * verifies that the key columns are present as expected before resolving the non-key columns + * in order. + */ + public ImmutableList> parseHeader(List header) { + int hsize = keyMarshaller().getColumns().size(); + checkArgument(header.size() >= hsize, "CSV header too short: %s", header); + checkArgument(header.subList(0, hsize).equals(keyMarshaller().getColumns()), + "Invalid CSV header: %s", header); + ImmutableList.Builder> columns = ImmutableList.builder(); + header.subList(hsize, header.size()).forEach(s -> columns.add(columns().getColumn(s))); + return columns.build(); + } + + /** Parses a row from a CSV table containing unescaped values. 
*/ + public void parseRow( + ImmutableList> columns, List row, BiConsumer>> fn) { + int hsize = keyMarshaller().getColumns().size(); + checkArgument(row.size() >= hsize, "CSV row too short: %s", row); + K key = keyMarshaller().deserialize(row.subList(0, hsize)); + List> rowAssignments = new ArrayList<>(); + for (int n = 0; n < row.size() - hsize; n++) { + Column c = columns.get(n); + rowAssignments.add( + Assignment.ofOptional(c, Optional.ofNullable(c.parse(row.get(n + hsize))))); + } + fn.accept(key, rowAssignments); + } + + public CsvTable load(Path file) throws IOException { + if (!Files.exists(file)) { + return CsvTable.builder(this).build(); + } + try (Reader csv = Files.newBufferedReader(file)) { + return CsvTable.importCsv(this, csv); + } + } + + public CsvTable load(Reader reader) throws IOException { + return CsvTable.importCsv(this, reader); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java new file mode 100644 index 000000000..7a96596c9 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java @@ -0,0 +1,589 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.ImmutableSortedSet.toImmutableSortedSet; +import static com.google.i18n.phonenumbers.metadata.table.DiffKey.Status.LHS_CHANGED; +import static com.google.i18n.phonenumbers.metadata.table.DiffKey.Status.LHS_ONLY; +import static com.google.i18n.phonenumbers.metadata.table.DiffKey.Status.RHS_CHANGED; +import static com.google.i18n.phonenumbers.metadata.table.DiffKey.Status.RHS_ONLY; +import static com.google.i18n.phonenumbers.metadata.table.DiffKey.Status.UNCHANGED; + +import com.google.auto.value.AutoValue; +import com.google.common.base.CharMatcher; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedSet; +import com.google.common.collect.Maps; +import com.google.common.collect.Ordering; +import com.google.common.collect.Sets; +import com.google.common.collect.Table; +import com.google.common.collect.Tables; +import com.google.common.collect.TreeBasedTable; +import com.google.common.escape.CharEscaperBuilder; +import com.google.common.escape.Escaper; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.io.StringWriter; +import java.io.Writer; +import java.util.Arrays; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; +import java.util.TreeMap; +import java.util.function.Consumer; +import java.util.function.Predicate; +import java.util.function.Supplier; +import 
java.util.stream.Stream; +import javax.annotation.Nullable; + +/** + * A general tabular representation of {@link Column} based data, which can include range data + * (via {@link RangeTable}) or other tabular data using a specified row key implementation. + * + * @param the row key type. + */ +@AutoValue +public abstract class CsvTable { + // Trim whitespace (since CSV files may be textually aligned) but don't allow multiline values + // (we handle that by JSON style escaping to keep the "one row per line" assumption true). + public static final String DEFAULT_DELIMETER = ";"; + private static final CsvParser CSV_PARSER = + CsvParser.withSeparator(DEFAULT_DELIMETER.charAt(0)).trimWhitespace(); + + /** + * Mode to control how diffs are generated. If a diff table, rows have an additional + * {@code Status} applied to describe whether they are unchanged, modified or exclusive (i.e. + * exist only in one of the source tables). + */ + public enum DiffMode { + /** Include all rows in the "diff table" (unchanged, modified or exclusive). */ + ALL, + /** Include only changed rows in the "diff table" (modified or exclusive). */ + CHANGES, + /** Include only left-hand-side rows in the "diff table" (unchanged, modified or exclusive). */ + LHS, + /** Include only right-hand-side rows in the "diff table" (unchanged, modified or exclusive). */ + RHS, + } + + /** A simple builder for programmatic generation of CSV tables. */ + public static final class Builder { + private final CsvSchema schema; + private final Table, Object> table; + + private Builder(CsvSchema schema) { + this.schema = checkNotNull(schema); + + // Either use insertion order or sorted order for rows (depends on schema). 
+ if (schema.rowOrdering().isPresent()) { + this.table = TreeBasedTable.create(schema.rowOrdering().get(), schema.columnOrdering()); + } else { + this.table = Tables.newCustomTable( + new LinkedHashMap<>(), + () -> new TreeMap<>(schema.columnOrdering())); + } + } + + /** + * Puts a row into the table using the specific mappings (potentially overwriting any existing + * row). + */ + public Builder putRow(T key, Map, ?> row) { + table.rowMap().remove(key); + return addRow(key, row); + } + + /** + * Adds a new row to the table using the specific mappings (the row must not already be + * present). + */ + public Builder addRow(T key, Map, ?> row) { + checkArgument(!table.containsRow(key), "row '%s' already added\n%s", key, this); + row.forEach((c, v) -> table.put(key, c, v)); + return this; + } + + /** + * Adds a new row to the table using the specific mappings (the row must not already be + * present). + */ + public Builder addRow(T key, List> row) { + checkArgument(!table.containsRow(key), "row '%s' already added\n%s", key, this); + put(key, row); + return this; + } + + /** Puts (overwrites) a single value in the table. */ + public > Builder put(T key, Column c, @Nullable V v) { + schema.columns().checkColumn(c); + if (v != null) { + table.put(key, c, c.cast(v)); + } else { + table.remove(key, c); + } + return this; + } + + /** Puts (overwrites) a sequence of values in the table. */ + public Builder put(T key, Iterable> assign) { + for (Assignment a : assign) { + if (a.value().isPresent()) { + table.put(key, a.column(), a.value().get()); + } else { + table.remove(key, a.column()); + } + } + return this; + } + + /** Puts (overwrites) a sequence of values in the table. */ + public Builder put(T key, Assignment... assign) { + return put(key, Arrays.asList(assign)); + } + + /** Returns an unmodifiable view of the keys for the table. */ + public Set getKeys() { + return Collections.unmodifiableSet(table.rowKeySet()); + } + + /** Gets a single value in the table (or null). 
*/ + public > V get(T key, Column c) { + return c.cast(table.get(key, c)); + } + + /** Removes an entire row from the table (does nothing if the row did not exist). */ + public Builder removeRow(T key) { + table.rowKeySet().remove(key); + return this; + } + + /** Filters the rows of a table, keeping those which match the given predicate. */ + public Builder filterRows(Predicate predicate) { + Set rows = table.rowKeySet(); + // Copy to avoid concurrent modification exception. + for (T key : ImmutableSet.copyOf(table.rowKeySet())) { + if (!predicate.test(key)) { + rows.remove(key); + } + } + return this; + } + + /** Filters the columns of a table, keeping only those which match the given predicate. */ + public Builder filterColumns(Predicate> predicate) { + Set> toRemove = + table.columnKeySet().stream().filter(predicate.negate()).collect(toImmutableSet()); + table.columnKeySet().removeAll(toRemove); + return this; + } + + /** Builds the immutable CSV table. */ + public CsvTable build() { + return from(schema, table); + } + + @Override + public String toString() { + return build().toString(); + } + } + + /** Returns a builder for a CSV table with the expected key and column semantics. */ + public static Builder builder(CsvSchema schema) { + return new Builder<>(schema); + } + + /** Returns a CSV table based on the given table with the expected key and column semantics. */ + public static CsvTable from(CsvSchema schema, Table, Object> table) { + ImmutableSet> columns = table.columnKeySet().stream() + .sorted(schema.columnOrdering()) + .collect(toImmutableSet()); + columns.forEach(schema.columns()::checkColumn); + return new AutoValue_CsvTable<>( + schema, + ImmutableMap.copyOf(Maps.transformValues(table.rowMap(), ImmutableMap::copyOf)), + columns); + } + + /** + * Imports a semicolon separated CSV file. The CSV file needs to have the following layout: + *

+   * Key1 ; Key2 ; Column1 ; Column2 ; Column3
+   * k1   ; k2   ; OTHER   ; "Text"  ; true
+   * ...
+   * 
+ * Where the first {@code N} columns represent the row key (as encapsulated by the key + * {@link CsvKeyMarshaller}) and the remaining columns correspond to the given {@link Schema} + * via the column names. + *

+ * Column values are represented in a semi-typed fashion according to the associated column (some + * columns require values to be escaped, others do not). Note that it's the column that defines + * whether the value needs escaping, not the content of the value itself (all values in a String + * column are required to be quoted). + */ + public static CsvTable importCsv(CsvSchema schema, Reader csv) throws IOException { + return importCsv(schema, csv, CSV_PARSER); + } + + /** Imports a CSV file using a specified parser. */ + public static CsvTable importCsv(CsvSchema schema, Reader csv, CsvParser csvParser) + throws IOException { + TableParser parser = new TableParser<>(schema); + try (BufferedReader r = new BufferedReader(csv)) { + csvParser.parse( + r.lines(), + row -> parser.accept( + row.map(CsvTable::unescapeSingleLineCsvText).collect(toImmutableList()))); + } + return parser.done(); + } + + /** + * Imports a sequence of rows to create a CSV table. The values in the rows are unescaped and + * require no explicit parsing. + */ + public static CsvTable importRows(CsvSchema schema, Supplier> rows) { + TableParser parser = new TableParser<>(schema); + List row; + while ((row = rows.get()) != null) { + parser.accept(row); + } + return parser.done(); + } + /** + * Creates a "diff table" based on the given left and right table inputs. The resulting table + * has a new key column which indicates (via the {@code Status} enum) how rows differ between + * the left and right tables.
+ */ + public static CsvTable> diff(CsvTable lhs, CsvTable rhs, DiffMode mode) { + checkArgument(lhs.getSchema().equals(rhs.getSchema()), "Cannot diff with different schemas"); + checkNotNull(mode, "Must specify a diff mode"); + + CsvKeyMarshaller> marshaller = DiffKey.wrap(lhs.getSchema().keyMarshaller()); + CsvSchema> diffSchema = CsvSchema.of(marshaller, lhs.getSchema().columns()); + + Builder> diff = CsvTable.builder(diffSchema); + if (mode != DiffMode.RHS) { + Sets.difference(lhs.getKeys(), rhs.getKeys()) + .forEach(k -> diff.addRow(DiffKey.of(LHS_ONLY, k), lhs.getRow(k))); + } + if (mode != DiffMode.LHS) { + Sets.difference(rhs.getKeys(), lhs.getKeys()) + .forEach(k -> diff.addRow(DiffKey.of(RHS_ONLY, k), rhs.getRow(k))); + } + for (K key : Sets.intersection(lhs.getKeys(), rhs.getKeys())) { + Map, Object> lhsRow = lhs.getRow(key); + Map, Object> rhsRow = rhs.getRow(key); + if (lhsRow.equals(rhsRow)) { + if (mode != DiffMode.CHANGES) { + diff.addRow(DiffKey.of(UNCHANGED, key), lhsRow); + } + } else { + if (mode != DiffMode.RHS) { + diff.addRow(DiffKey.of(LHS_CHANGED, key), lhsRow); + } + if (mode != DiffMode.LHS) { + diff.addRow(DiffKey.of(RHS_CHANGED, key), rhsRow); + } + } + } + return diff.build(); + } + + /** Returns the schema for this table. */ + public abstract CsvSchema getSchema(); + + /** Returns the rows of the table (not public to avoid untyped access). */ + // Note that this cannot easily be replaced by ImmutableTable (as of Jan 2019) because + // ImmutableTable has severe limitations on how row/column ordering is handled that make the + // row/column ordering required in CsvTable currently impossible. + abstract ImmutableMap, Object>> getRows(); + + /** + * Returns the set of columns for the table (excluding the synthetic key columns, which are + * handled by the marshaller). + */ + public abstract ImmutableSet> getColumns(); + + /** Returns whether the table has no rows.
*/ + public boolean isEmpty() { + return getRows().isEmpty(); + } + + /** Returns the set of keys for the table. */ + public ImmutableSet getKeys() { + return getRows().keySet(); + } + + /** Returns a single row as a map of column assignments. */ + public ImmutableMap, Object> getRow(K rowKey) { + ImmutableMap, Object> row = getRows().get(rowKey); + return row != null ? row : ImmutableMap.of(); + } + + /** Returns whether a row is in the table. */ + public boolean containsRow(K rowKey) { + return getKeys().contains(rowKey); + } + + public Builder toBuilder() { + Builder builder = builder(getSchema()); + getRows().forEach(builder::putRow); + return builder; + } + + /** Returns the table column names, including the key columns, in schema order. */ + public Stream getCsvHeader() { + return Stream.concat( + getSchema().keyMarshaller().getColumns().stream(), + getColumns().stream().map(Column::getName)); + } + + /** Returns the unescaped CSV values for the specified row, in order. */ + public Stream getCsvRow(K key) { + checkArgument(getKeys().contains(key), "no such row: %s", key); + // Note that we pass the raw value (possibly null) to serialize so that we don't conflate + // missing and default values. + return Stream.concat( + getSchema().keyMarshaller().serialize(key), + getColumns().stream().map(c -> c.serialize(getOrNull(key, c)))); + } + + /** + * Exports the given table by writing its values as semicolon separated "CSV", with or without + * alignment. For example (with alignment): + * + *

+   * Key1 ; Key2 ; Column1 ; Column2 ; Column3
+   * k1   ; k2   ; OTHER   ; "Text"  ; true
+   * ...
+   * 
+ * + * Where the first {@code N} columns represent the row key (as encapsulated by the key {@link + * CsvKeyMarshaller}) and the remaining columns correspond to the given {@link Schema} via the + * column names. + */ + public boolean exportCsv(Writer writer, boolean align) { + return exportCsvHelper(writer, align, getColumns()); + } + + /** + * Exports the given table by writing its values as semicolon separated "CSV", with or without + * alignment. For example (with alignment): + * + *
+   * Key1 ; Key2 ; Column1 ; Column2 ; Column3
+   * k1   ; k2   ; OTHER   ; "Text"  ; true
+   * ...
+   * 
+ * + * Where the first {@code N} columns represent the row key (as encapsulated by the key {@link + * CsvKeyMarshaller}) and the remaining columns correspond to the given {@link Schema} via the + * column names. This will add columns that are part of the schema for the given table but have no + * assigned values. + */ + public boolean exportCsvWithEmptyColumnsPresent(Writer writer, boolean align) { + + return exportCsvHelper( + writer, + align, + Stream.concat(getSchema().columns().getColumns().stream(), getColumns().stream()) + .collect(ImmutableSet.toImmutableSet())); + } + + private boolean exportCsvHelper( + Writer writer, boolean align, ImmutableSet> columnsToExport) { + + if (isEmpty()) { + // Exit for empty tables (CSV file is truncated). The caller may then delete the empty file. + return false; + } + CsvTableCollector collector = new CsvTableCollector(align); + collector.accept( + Stream.concat( + getSchema().keyMarshaller().getColumns().stream(), + columnsToExport.stream().map(Column::getName)) + .distinct()); + for (K k : getKeys()) { + // Format raw values (possibly null) to avoid default values everywhere. + collector.accept( + Stream.concat( + getSchema().keyMarshaller().serialize(k), + columnsToExport.stream().map(c -> formatValue(c, getOrNull(k, c))))); + } + collector.writeCsv(writer); + return true; + } + + @Nullable private > T getOrNull(K rowKey, Column column) { + return column.cast(getRow(rowKey).get(column)); + } + + /** + * Returns the value from the underlying table for the given row and column if present. + */ + public > Optional get(K rowKey, Column column) { + return Optional.ofNullable(getOrNull(rowKey, column)); + } + + /** + * Returns the value from the underlying table for the given row and column, or the (non-null) + * default value. + */ + public > T getOrDefault(K rowKey, Column column) { + T value = getOrNull(rowKey, column); + return value != null ? 
value : column.defaultValue(); + } + + /** + * Returns the set of unique values in the given column. Note that if some rows do not have a + * value, then this will NOT result in the column default value being in the returned set. An + * empty column will result in an empty set being returned here. + */ + public > ImmutableSortedSet getValues(Column column) { + return getKeys().stream() + .map(k -> getOrNull(k, column)) + .filter(Objects::nonNull) + .collect(toImmutableSortedSet(Ordering.natural())); + } + + @Override + public final String toString() { + StringWriter w = new StringWriter(); + exportCsv(w, true); + return w.toString(); + } + + /** Parses CSV data on per-row basis, deserializing keys and adding values to a table. */ + static class TableParser implements Consumer> { + private final Builder table; + // Set when the header row is processed. + private ImmutableList> columns = null; + + TableParser(CsvSchema schema) { + this.table = builder(schema); + } + + @Override + public void accept(List row) { + if (columns == null) { + columns = table.schema.parseHeader(row); + } else { + table.schema.parseRow(columns, row, table::addRow); + } + } + + public CsvTable done() { + return table.build(); + } + } + + // Newlines can, in theory, be emitted "raw" in the CSV output inside a quoted string, but + // this breaks all sorts of nice properties of CSV files, since there's no longer one row per + // line. This export process escapes literal newlines and other control characters into Json + // like escape sequences ('\n', '\t', '\\' etc...). Unlike Json however, any double-quotes are + // _not_ escaped via '\' since the CSV way to escape those is via doubling. We leave other + // non-ASCII characters as-is, since this is meant to be as human readable as possible. 
+ private static final Escaper ESCAPER = new CharEscaperBuilder() + .addEscape('\n', "\\n") + .addEscape('\r', "\\r") + .addEscape('\t', "\\t") + .addEscape('\\', "\\\\") + // This is a special case only required when writing CSV file (since the parser handles + // unescaping quotes when they are read back in). In theory it should be part of a separate + // step during CSV writing, but it's not worth splitting it out. This is not considered an + // unsafe char (since it definitely does appear). + .addEscape('"', "\"\"") + .toEscaper(); + + private static final CharMatcher ESCAPED_CHARS = CharMatcher.anyOf("\n\r\t\\"); + private static final CharMatcher UNSAFE_CHARS = + CharMatcher.javaIsoControl().and(ESCAPED_CHARS.negate()); + + private static String formatValue(Column column, @Nullable Object value) { + String unescaped = column.serialize(value); + if (unescaped.isEmpty()) { + return unescaped; + } + // Slightly risky with enums, since an enum could have ';' in its toString() representation. + // However since columns and their semantics are tightly controlled, this should never happen. + if (Number.class.isAssignableFrom(column.type()) + || column.type() == Boolean.class + || column.type().isEnum()) { + checkArgument(ESCAPED_CHARS.matchesNoneOf(unescaped), "Bad 'safe' value: %s", unescaped); + return unescaped; + } + return escapeForSingleLineCsv(unescaped); + } + + /** + * Escapes and quotes an arbitrary text string, ensuring it is safe for use as a single-line CSV + * value. Newlines, carriage returns and tabs are backslash escaped (as is backslash itself) and + * other ISO control characters are not permitted. + * + *

The purpose of this method is to make arbitrary Unicode text readable in a single line of + * a CSV file so that we can rely on per-line processing tools, such as "grep" or "sed" if needed + * without requiring expensive conversion to/from a spreadsheet. + */ + public static String escapeForSingleLineCsv(String unescaped) { + checkArgument(UNSAFE_CHARS.matchesNoneOf(unescaped), "Bad string value: %s", unescaped); + return '"' + ESCAPER.escape(unescaped) + '"'; + } + + /** + * Unescapes a line of text escaped by {@link #escapeForSingleLineCsv(String)} to restore literal + * newlines and other backslash-escaped characters. Note that if the given string already has + * newlines present, they are preserved but will then be escaped if the text is re-escaped later. + */ + public static String unescapeSingleLineCsvText(String s) { + int i = s.indexOf('\\'); + if (i == -1) { + return s; + } + StringBuilder out = new StringBuilder(); + int start = 0; + do { + out.append(s, start, i); + char c = s.charAt(++i); + out.append(checkNotNull(UNESCAPE.get(c), "invalid escape sequence: \\%s", c)); + start = i + 1; + i = s.indexOf('\\', start); + } while (i != -1); + return out.append(s, start, s.length()).toString(); + } + + private static final ImmutableMap UNESCAPE = + ImmutableMap.builder() + .put('n', '\n') + .put('r', '\r') + .put('t', '\t') + .put('\\', '\\') + .build(); + + // Visible for AutoValue only. + CsvTable() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTableCollector.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTableCollector.java new file mode 100644 index 000000000..ae852786b --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTableCollector.java @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static java.util.stream.Collectors.joining; + +import com.google.common.collect.ImmutableList; +import java.io.PrintWriter; +import java.io.Writer; +import java.util.ArrayList; +import java.util.List; +import java.util.NavigableMap; +import java.util.TreeMap; +import java.util.function.Consumer; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** Collects cell values and tracks maximum cell width to make it easy to output aligned CSV. */ +public final class CsvTableCollector implements Consumer> { + private final NavigableMap maxLengths = new TreeMap<>(); + private final List> cells = new ArrayList<>(); + private final boolean align; + + public CsvTableCollector(boolean align) { + this.align = align; + } + + /** + * Writes the contents of this table, with optional alignment, as a CSV table. The given writer is + * closed once all rows have been written. + */ + public void writeCsv(Writer writer) { + try (PrintWriter out = new PrintWriter(writer)) { + // Pad elements with whitespace when aligning (since we've gone to all the effort of padding + // everything else). + String joiner = align ? " ; " : ";"; + for (int rowIndex = 0; rowIndex < cells.size(); rowIndex++) { + // No need to use CharMatcher to trim "properly" since only ASCII space is possible.
+ out.println(getRow(rowIndex).collect(joining(joiner)).trim()); + } + } + } + + /** + * Accepts the next row in the CSV table. Note that the first consumer returned is expected to + * have the title row written to it. + * + *

Values passed into the accept method of the returned consumer are expected to have already + * been escaped if necessary. The caller must call the {@link Consumer#accept(Object)} method for + * every column of the table, even if only to pass an empty string to indicate an empty cell. + */ + @Override + public void accept(Stream row) { + ImmutableList rowValues = row.collect(toImmutableList()); + for (int i = 0; i < rowValues.size(); i++) { + updateMaxLength(rowValues.get(i), i); + } + cells.add(rowValues); + } + + private Stream getRow(int index) { + List row = cells.get(index); + int length = row.size(); + while (length > 0 && row.get(length - 1).isEmpty()) { + length--; + } + if (align) { + return IntStream.range(0, length).mapToObj(n -> pad(row.get(n), maxLength(n))); + } + return row.stream().limit(length); + } + + private static String pad(String s, int len) { + return len > 0 ? String.format("%-" + len + "s", s) : ""; + } + + private int maxLength(int index) { + return maxLengths.getOrDefault(index, 0); + } + + private void updateMaxLength(String s, int index) { + // Note: This isn't Unicode aware, but in reality it's not that important. + maxLengths.put(index, Math.max(s.length(), maxLength(index))); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DiffKey.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DiffKey.java new file mode 100644 index 000000000..075b1fafe --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DiffKey.java @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Maps; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.List; +import java.util.Optional; +import java.util.function.Function; +import java.util.stream.Stream; + +/** Key for use in "diff" tables, allowing rows to be marked with a diff status. */ +@AutoValue +public abstract class DiffKey { + /** + * Status for rows in a "diff table". Every row in a diff table has a {@code DiffKey}, with a + * status. Modified rows appear twice in the diff table, once for the left-side row, and once for + * the right-side row. + */ + public enum Status { + /** A row which appears exclusively in the left-hand-side of the diff. */ + LHS_ONLY("----"), + /** A row which appears exclusively in the right-hand-side of the diff. */ + RHS_ONLY("++++"), + /** The left-hand-side row which was modified by the diff. */ + LHS_CHANGED("<<<<"), + /** The right-hand-side row which was modified by the diff. */ + RHS_CHANGED(">>>>"), + /** A row unchanged by the diff. 
*/ + UNCHANGED("===="); + + private static final ImmutableMap MAP = + Maps.uniqueIndex(EnumSet.allOf(Status.class), Status::getLabel); + + private final String label; + + Status(String label) { + this.label = label; + } + + String getLabel() { + return label; + } + + static Status parse(String s) { + return MAP.get(s); + } + } + + static CsvKeyMarshaller> wrap(CsvKeyMarshaller keyMarshaller) { + List keyColumns = new ArrayList<>(); + keyColumns.add("Diff"); + keyColumns.addAll(keyMarshaller.getColumns()); + return new CsvKeyMarshaller<>( + serialize(keyMarshaller), deserialize(keyMarshaller), ordering(keyMarshaller), keyColumns); + } + + static DiffKey of(Status status, K key) { + return new AutoValue_DiffKey<>(status, key); + } + + public abstract Status getStatus(); + + public abstract K getOriginalKey(); + + private static Function, Stream> serialize(CsvKeyMarshaller m) { + return k -> Stream.concat(Stream.of(k.getStatus().getLabel()), m.serialize(k.getOriginalKey())); + } + + private static Function, DiffKey> deserialize(CsvKeyMarshaller m) { + return r -> + new AutoValue_DiffKey<>(Status.parse(r.get(0)), m.deserialize(r.subList(1, r.size()))); + } + + private static Optional>> ordering(CsvKeyMarshaller m) { + return m.ordering().map(o -> { + // Weird bug (possibly IntelliJ) means it really doesn't do well inferring types over lambdas + // for this sort of chained API call. Pulling into separate variables works fine. 
+ Comparator> keyFn = Comparator.comparing(DiffKey::getOriginalKey, o); + return keyFn.thenComparing(DiffKey::getStatus); + }); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DisjointRangeMap.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DisjointRangeMap.java new file mode 100644 index 000000000..5ed4d4404 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/DisjointRangeMap.java @@ -0,0 +1,186 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.collect.Maps.filterValues; +import static com.google.common.collect.Maps.transformValues; + +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedMap; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import java.util.Map.Entry; +import java.util.SortedMap; +import java.util.TreeMap; +import javax.annotation.Nullable; + +/** + * A mapping from category values to a set of disjoint ranges. This is used only by the RangeTable + * class to represent a column of values. 
+ */ +final class DisjointRangeMap> { + + static final class Builder> { + private final Column column; + private final SortedMap map = new TreeMap<>(); + // Cache of all assigned ranges, used repeatedly by RangeTable . This could be recalculated + // every time it's needed, but it's just as easy to keep it cached here. + private RangeTree assignedRanges = RangeTree.empty(); + + Builder(Column column) { + this.column = checkNotNull(column); + } + + /** + * Returns the ranges assigned to the given value (returns the empty range if the given value + * is unassigned in this column). Note that unlike table operations, it makes no sense to allow + * {@code null} to be used to determine the unassigned ranges, since calculating that requires + * knowledge of the table in which this column exists. + */ + RangeTree getRanges(Object value) { + T checkedValue = column.cast(checkNotNull(value)); + return map.getOrDefault(checkedValue, RangeTree.empty()); + } + + /** Returns the currently assigned ranges for this column. */ + RangeTree getAssignedRanges() { + return assignedRanges; + } + + /** + * Checks whether the "proposed" assignment would succeed with the specified overwrite mode + * (assignments always succeed if the mode is {@link OverwriteMode#ALWAYS} ALWAYS). If the + * given value is {@code null} and the mode is not {@code ALWAYS}, this method ensures that + * none of the given ranges are assigned to any value in this column. + *

+ * This is useful as a separate method when multiple changes are to be made which cannot be + * allowed to fail halfway through. + * + * @throws IllegalArgumentException if the value cannot be added to the column. + * @throws RangeException if the write is not possible with the given mode. + */ + T checkAssign(@Nullable Object value, RangeTree ranges, OverwriteMode mode) { + // Always check the proposed value (for consistency). + T checkedValue = column.cast(value); + if (mode != OverwriteMode.ALWAYS) { + checkArgument(checkedValue != null, + "Assigning a null value (unassignment) with mode other than ALWAYS makes no sense: %s", + mode); + if (mode == OverwriteMode.SAME) { + // Don't care about ranges that are already in the map. + ranges = ranges.subtract(map.getOrDefault(checkedValue, RangeTree.empty())); + } + RangeException.checkDisjoint(column, checkedValue, assignedRanges, ranges, mode); + } + return checkedValue; + } + + /** + * Assigns the given ranges to the specified value in this column. After a call to + * {@code assign()} with a non-null value it is true that: + *

    + *
  • The result of {@code getRanges(value)} will contain at least the given ranges. + *
  • No ranges assigned to any other category value will intersect with the given ranges. + *
+ * If ranges are "assigned" to {@code null}, it has the effect of unassigning them. + * + * @param value the category value to assign ranges to, or {@code null} to unassign. + * @param ranges the ranges to assign to the category value with ID {@code id}. + * @param mode the overwrite mode describing how to handle existing assignments. + * @throws IllegalArgumentException if the assignment violates the given {@link OverwriteMode}. + */ + void assign(@Nullable Object value, RangeTree ranges, OverwriteMode mode) { + T checkedValue = checkAssign(value, ranges, mode); + // Now unassign the ranges for all other values (only necessary if mode is "ALWAYS" since in + // other modes we've already ensured there's no intersection). + if (mode == OverwriteMode.ALWAYS) { + RangeTree overlap = assignedRanges.intersect(ranges); + if (!overlap.isEmpty()) { + for (Entry e : map.entrySet()) { + // Skip needless extra work for the value we are about to assign. + if (!e.getKey().equals(checkedValue)) { + e.setValue(e.getValue().subtract(overlap)); + } + } + } + } + if (checkedValue != null) { + map.put(checkedValue, ranges.union(map.getOrDefault(checkedValue, RangeTree.empty()))); + assignedRanges = assignedRanges.union(ranges); + } else { + assignedRanges = assignedRanges.subtract(ranges); + } + } + + /** Builds the range map. */ + DisjointRangeMap build() { + return new DisjointRangeMap(column, map, assignedRanges); + } + } + + private final Column column; + private final ImmutableSortedMap map; + private final RangeTree assignedRanges; + + private DisjointRangeMap( + Column column, SortedMap map, RangeTree assignedRanges) { + this.column = checkNotNull(column); + this.map = ImmutableSortedMap.copyOfSorted(filterValues(map, r -> !r.isEmpty())); + this.assignedRanges = assignedRanges; + } + + /** + * Returns the ranges assigned to the given value. + * + * @throws IllegalArgumentException if {@code value} is not a value in this category. 
+ */ + RangeTree getRanges(Object value) { + return map.get(column.cast(value)); + } + + /** Returns all values assigned to non-empty ranges in this column. */ + ImmutableSet getAssignedValues() { + return map.keySet(); + } + + /** Returns the union of all assigned ranges in this column. */ + RangeTree getAssignedRanges() { + return assignedRanges; + } + + /** Intersects this column with the given bounds. */ + DisjointRangeMap intersect(RangeTree bounds) { + return new DisjointRangeMap( + column, transformValues(map, r -> r.intersect(bounds)), assignedRanges.intersect(bounds)); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof DisjointRangeMap)) { + return false; + } + // No need to check "assignedRanges" since it's just a cache of other values anyway. + DisjointRangeMap other = (DisjointRangeMap) obj; + return this == other || (column.equals(other.column) && map.equals(other.map)); + } + + @Override + public int hashCode() { + return column.hashCode() ^ map.hashCode(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/MultiValue.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/MultiValue.java new file mode 100644 index 000000000..462bb051f --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/MultiValue.java @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.CharMatcher.whitespace; +import static com.google.common.collect.ImmutableList.toImmutableList; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Comparators; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSortedSet; +import java.util.Comparator; +import java.util.function.Function; + +/** + * A wrapper to permit sets of values to be specified as a single "cell" in a CsvTable or + * RangeTable. Currently only sets of values are permitted (not lists) so duplicate elements are + * not allowed. This is easy to change in future, but the real data suggests no use case for that. + * + *

The expectation of this class is that specific, non-generic subclasses will be made to + * "solidify" the choice of value type, separator and value ordering. This is why those specific + * attributes are not tested in the equals()/hashCode() methods, since they are expected to be + * constant for a given implementation. Subclasses should be final, and look something like: + *

 {@code
+ * public static final class Foos extends MultiValue {
+ *   private static final Foos EMPTY = new Foos(ImmutableSet.of());
+ *
+ *   public static Column column(String name) {
+ *     return Column.create(Foos.class, name, EMPTY, Foos::new);
+ *   }
+ *
+ *   public static Foos of(Iterable foos) {
+ *     return new Foos(foos);
+ *   }
+ *
+ *   private Foos(Iterable foos) { super(foos, <separator>, <comparator>, <sorted>); }
+ *   private Foos(String s) { super(s, <fn>, <separator>, <comparator>, <sorted>); }
+ * }
+ * }
+ * where {@code }, {@code } and {@code } are the same constants in + * both places. + */ +public abstract class MultiValue> + implements Comparable { + + private final ImmutableSet values; + private final char separator; + private final Comparator> comparator; + + protected MultiValue( + String s, Function fn, char separator, Comparator comparator, boolean sorted) { + this(parse(s, fn, separator), separator, comparator, sorted); + } + + protected MultiValue( + Iterable values, char separator, Comparator comparator, boolean sorted) { + this.separator = separator; + this.values = + sorted ? ImmutableSortedSet.copyOf(comparator, values) : ImmutableSet.copyOf(values); + this.comparator = Comparators.lexicographical(comparator); + } + + private static ImmutableList parse(String s, Function fn, char separator) { + Splitter splitter = Splitter.on(separator).omitEmptyStrings().trimResults(whitespace()); + return splitter.splitToList(s).stream().map(fn).collect(toImmutableList()); + } + + public final ImmutableSet getValues() { + return values; + } + + public final char separator() { + return separator; + } + + @Override + public final int compareTo(M that) { + // The separator doesn't factor in here since it's always the same. + return comparator.compare(this.getValues(), that.getValues()); + } + + @Override + @SuppressWarnings({"unchecked", "EqualsGetClass"}) + public final boolean equals(Object obj) { + // Check exact subclass, since we expect separators and ordering to always be the same. 
+ if (obj == null || obj.getClass() != getClass()) { + return false; + } + return getValues().equals(((MultiValue) obj).getValues()); + } + + @Override + public final int hashCode() { + return getValues().hashCode(); + } + + @Override + public final String toString() { + return Joiner.on(separator()).join(getValues()); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeException.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeException.java new file mode 100644 index 000000000..b497949ed --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeException.java @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static java.util.stream.Collectors.joining; + +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import javax.annotation.Nullable; + +/** A structured exception which should be used whenever structural errors occur in table data. */ +public final class RangeException extends IllegalArgumentException { + // Called when assigning ranges, depending on the overwrite mode. 
As more cases are added, + // consider refactoring and subclassing for clean semantics. + static > void checkDisjoint( + Column column, T value, RangeTree existing, RangeTree ranges, OverwriteMode mode) { + RangeTree intersection = existing.intersect(ranges); + if (!intersection.isEmpty()) { + // A non-empty intersection implies both inputs are also non-empty. + throw new RangeException(column, value, existing, ranges, intersection, mode); + } + } + + RangeException(Column column, + @Nullable Object value, + RangeTree existing, + RangeTree ranges, + RangeTree intersection, + OverwriteMode mode) { + super(explain(checkNotNull(column), value, existing, ranges, intersection, checkNotNull(mode))); + } + + private static String explain( + Column column, + @Nullable Object value, + RangeTree existing, + RangeTree ranges, + RangeTree intersection, + OverwriteMode mode) { + return String.format( + "cannot assign non-disjoint ranges for value '%s' in column '%s' using overwrite mode: %s\n" + + "overlapping ranges:\n%s" + + "existing ranges:\n%s" + + "new ranges:\n%s", + value, column, mode, toLines(intersection), toLines(existing), toLines(ranges)); + } + + private static String toLines(RangeTree ranges) { + checkArgument(!ranges.isEmpty()); + return ranges.asRangeSpecifications().stream().map(s -> " " + s + "\n").collect(joining()); + } + + // We suppress stack traces for "semantic" exceptions, since these aren't intended to indicate + // bugs, but rather user error (for which a stack trace is not very useful). 
+ @Override + public synchronized Throwable fillInStackTrace() { + return this; + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeKey.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeKey.java new file mode 100644 index 000000000..5f6f0f301 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeKey.java @@ -0,0 +1,215 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; +import static java.lang.Integer.numberOfTrailingZeros; +import static java.util.Comparator.comparing; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSortedSet; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer; +import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.NavigableSet; +import java.util.Set; + +/** + * A range key is somewhat similar to a {@link RangeSpecification}, except that it can encode + * multiple possible lengths for a single range prefix. Range keys are particularly useful as + * unique "row keys" when representing range trees as tabular data. + */ +@AutoValue +public abstract class RangeKey { + /** + * Order by prefix first and then minimum length. For row keys representing disjoint ranges, this + * will be a total ordering (since the comparison is really with the "shortest" digit sequence in + * the ranges, which must be distinct for disjoint ranges). 
+ */ + public static final Comparator ORDERING = + comparing(RangeKey::getPrefix, comparing(s -> s.min().toString())) + .thenComparing(RangeKey::getLengths, comparing(NavigableSet::first)); + + /** + * Creates a range key representing ranges with a prefix of some set of lengths. The prefix must + * not be longer than the possible lengths and cannot end with an "any" edge (i.e. "x"). + */ + public static RangeKey create(RangeSpecification prefix, Set lengths) { + checkArgument(prefix.length() == 0 || prefix.getBitmask(prefix.length() - 1) != ALL_DIGITS_MASK, + "prefix cannot end with an 'any' edge: %s", prefix); + ImmutableSortedSet sorted = ImmutableSortedSet.copyOf(lengths); + checkArgument(sorted.first() >= prefix.length(), + "lengths cannot be shorter than the prefix: %s - %s", prefix, lengths); + return new AutoValue_RangeKey(prefix, sorted); + } + + /** + * Decomposes the given range tree into a sorted sequence of keys, representing the same digit + * sequences. The resulting keys form a disjoint covering of the original range set, and no + * two keys will contain the same prefix (but prefixes of keys may overlap, even if the ranges + * they ultimately represent do not). The resulting sequence is ordered by {@link #ORDERING}. + */ + public static ImmutableList decompose(RangeTree tree) { + List keys = new ArrayList<>(); + // The ALLOW_EDGE_SPLITTING strategy works best for the case of generating row keys because it + // helps avoid having the same sequence appear in multiple rows. Note however than even this + // strategy isn't perfect, and partially overlapping ranges with different lengths can still + // cause issues. For example, 851 appears as a prefix for 2 rows in the following (real world) + // example. + // prefix=85[1-9], length=10 + // prefix=8[57]1, length=11 + // However a given digit sequence will still only appear in (at most) one range key based on + // its length. 
+ for (RangeTree f : RangeTreeFactorizer.factor(tree, MergeStrategy.ALLOW_EDGE_SPLITTING)) { + KeyVisitor.visit(f, keys); + } + return ImmutableList.sortedCopyOf(ORDERING, keys); + } + + // A recursive descent visitor that splits range keys from the visited tree on the upward phase + // of visitation. After finding the terminal node, the visitor tries to strip as much of the + // trailing "any" path as possible, to leave the prefix. Note that the visitor can never start + // another downward visitation while its processing the "any" paths, because if it walks up + // through an "any" path, the node it reaches cannot have any other edges coming from it (the + // "any" path is all the possible edges). + private static class KeyVisitor implements DfaVisitor { + private static void visit(RangeTree tree, List keys) { + KeyVisitor v = new KeyVisitor(keys); + tree.accept(v); + // We may still need to emit a key for ranges with "any" paths that reach the root node. + int lengthMask = v.lengthMask; + // Shouldn't happen for phone numbers, since it implies the existence of "zero length" digit + // sequences. + if (tree.getInitial().canTerminate()) { + lengthMask |= 1; + } + if (lengthMask != 0) { + // Use the empty specification as a prefix since the ranges are defined purely by length. + keys.add(new AutoValue_RangeKey(RangeSpecification.empty(), buildLengths(lengthMask))); + } + } + + // Collection of extracted keys. + private final List keys; + // Current path from the root of the tree being visited. + private RangeSpecification path = RangeSpecification.empty(); + // Non-zero when we are in the "upward" phase of visitation, processing trailing "any" paths. + // When zero we are either in a "downward" phase or traversing up without stripping paths. 
+ private int lengthMask = 0; + + private KeyVisitor(List keys) { + this.keys = checkNotNull(keys); + } + + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + checkState(lengthMask == 0, + "during downward tree traversal, length mask should be zero (was %s)", lengthMask); + RangeSpecification oldPath = path; + path = path.extendByMask(edge.getDigitMask()); + if (target.equals(RangeTree.getTerminal())) { + lengthMask = (1 << path.length()); + // We might emit the key immediately for ranges without trailing paths (e.g. "1234"). + maybeEmitKey(); + } else { + target.accept(this); + // If we see a terminating node, we are either adding a new possible length to an existing + // key or starting to process a new key (we don't know and it doesn't matter providing we + // capture the current length in the mask). + if (target.canTerminate()) { + lengthMask |= (1 << path.length()); + } + maybeEmitKey(); + } + path = oldPath; + } + + // Conditionally emits a key for the current path prefix and possible lengths if we've found + // the "end" of an "any" path (e.g. we have possible lengths and the edge above us is not an + // "any" path). + private void maybeEmitKey() { + if (lengthMask != 0 && path.getBitmask(path.length() - 1) != ALL_DIGITS_MASK) { + keys.add(new AutoValue_RangeKey(path, buildLengths(lengthMask))); + lengthMask = 0; + } + } + } + + /** + * Returns the prefix for this range key. All digit sequences matches by this key are of the + * form {@code "xxxx"} for some number of "any" edges. This prefix can be "empty" for + * ranges such as {@code "xxxx"}. + */ + public abstract RangeSpecification getPrefix(); + + /** + * Returns the possible lengths for digit sequences matched by this key. The returned set is + * never empty. + */ + public abstract ImmutableSortedSet getLengths(); + + /** + * Converts the range key into a sequence of range specifications, ordered by length. The + * returned set is never empty. 
+ */ + public final ImmutableList asRangeSpecifications() { + RangeSpecification s = getPrefix(); + return getLengths().stream() + .map(n -> s.extendByLength(n - s.length())) + .collect(toImmutableList()); + } + + public final RangeTree asRangeTree() { + RangeSpecification s = getPrefix(); + return RangeTree.from(getLengths().stream().map(n -> s.extendByLength(n - s.length()))); + } + + /* + * Checks if the RangeKey contains a range represented by the given prefix and length. + */ + public boolean contains(DigitSequence prefix, Integer length) { + return asRangeSpecifications().stream() + .anyMatch( + specification -> + specification.matches( + prefix.extendBy(DigitSequence.zeros(length - prefix.length())))); + } + + private static ImmutableSortedSet buildLengths(int lengthMask) { + checkArgument(lengthMask != 0); + ImmutableSortedSet.Builder lengths = ImmutableSortedSet.naturalOrder(); + do { + int length = numberOfTrailingZeros(lengthMask); + lengths.add(length); + // Clear each bit as we go. + lengthMask &= ~(1 << length); + } while (lengthMask != 0); + return lengths.build(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java new file mode 100644 index 000000000..07be2fee1 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java @@ -0,0 +1,951 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.google.common.collect.Iterables.transform; +import static com.google.common.collect.Maps.immutableEntry; +import static java.util.Comparator.comparing; +import static java.util.Map.Entry.comparingByKey; +import static java.util.stream.Collectors.joining; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableTable; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.google.common.collect.Table; +import com.google.common.collect.TreeBasedTable; +import com.google.common.collect.UnmodifiableIterator; +import com.google.i18n.phonenumbers.metadata.PrefixTree; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.function.Function; +import javax.annotation.Nullable; + +/** + * A tabular representation of attributes, assigned to number ranges. + *

+ * A {@code RangeTable} is equivalent to {@code Table<RangeSpecification, Column, Value>}, + * but is expressed as a mapping of {@code (Column, Value) -> RangeTree} (since {@code RangeTree} + * is not a good key). To keep the data structurally equivalent to its tabular form, it's important + * that within a column, all assigned ranges are mutually disjoint (and thus a digit sequence can + * have at most one value assigned in any column). + * + *

Table Schemas

+ * A table requires a {@link Schema}, which defines the columns which can be present and their + * order. Column ordering is important since it relates to how rules are applied (see below). + * + *

Columns and Column Groups

+ * A {@link Column} defines a category of values of a particular type (e.g. String, Boolean, + * Integer or user specified enums) and a default value. New columns can be implemented easily and + * can choose to limit their values to some known set. + *

+ * A {@link ColumnGroup} defines a related set of columns of the same type. The exact set of + * columns available in a group is not necessarily known in advance. A good example of a column + * group is having columns for names in different languages. A column group of "Name" could define + * columns such as "Name:en", "Name:fr", "Name:ja" etc. which contain the various translations of + * the value. The first time a value is added for a column inferred by a column group, that column + * is created. + *

+ * An {@link Assignment} is a useful way to encapsulate "a value in a column" and can be used to + * assign or unassign values to ranges, or query for the ranges which have that assignment. + * + *

Builders and Unassigned Values

+ * To allow a {@code RangeTable} to fully represent data in a tabular way, it must be possible to + * have rows in a table for which no value is assigned in any column. Unassigned ranges can be + * added to a builder via the {@link Builder#add(RangeTree)} method, and these "empty rows" are + * preserved in the final table. + *

+ * This is useful since it allows a {@link Change} to affect no columns, but still have an effect + * on the final table. It's also useful when applying rules to infer values and fill-in column + * defaults. + */ +public final class RangeTable { + + /** Overwrite rules for modifying range categorization. */ + public enum OverwriteMode { + /** Only assign ranges that were previously unassigned. */ + NEVER, + /** Only assign ranges that were either unassigned or had the same value. */ + SAME, + /** Always assign ranges (and unassign them from any other values in the same category). */ + ALWAYS; + } + + /** A builder for an immutable range table to which changes and rules can be applied. */ + public static final class Builder { + // The schema for the table to be built. + private final Schema schema; + // The map of per-column ranges. + private final SortedMap, DisjointRangeMap.Builder> columnRanges; + // The union of all ranges added to the builder (either by assignment or range addition). + // This is not just a cache of all the assigned ranges, since assigning and unassigning a range + // will not cause it to be removed from the table altogether (even if it is no longer assigned + // in any column). + private RangeTree allRanges = RangeTree.empty(); + + private Builder(Schema schema) { + this.schema = checkNotNull(schema); + this.columnRanges = new TreeMap<>(schema.ordering()); + } + + // Helper to return an on-demand builder for a column. + private > DisjointRangeMap.Builder getOrAddRangeMap(Column c) { + // The generic type of the builder is defined by the column it's building for, and the map + // just uses that column as its key. Thus, if the given column is recognized by the schema, + // the returned builder must be of the same type. 
+ @SuppressWarnings("unchecked") + DisjointRangeMap.Builder ranges = (DisjointRangeMap.Builder) + columnRanges.computeIfAbsent(schema.checkColumn(c), DisjointRangeMap.Builder::new); + return ranges; + } + + // ---- Read-only API ---- + + /** Returns the schema for this builder. */ + public Schema getSchema() { + return schema; + } + + /** + * Returns ranges for the given assignment. If the value is {@code empty}, then the unassigned + * ranges in the column are returned. + */ + public RangeTree getRanges(Assignment assignment) { + return getRanges(assignment.column(), assignment.value().orElse(null)); + } + + /** + * Returns ranges for the given value in the specified column. If the value is {@code null}, + * then the unassigned ranges in the column are returned. If the column has no values assigned, + * then the empty range is returned (or, if {@code value == null}, all ranges in the table). + */ + public RangeTree getRanges(Column column, @Nullable Object value) { + getSchema().checkColumn(column); + DisjointRangeMap.Builder rangeMap = columnRanges.get(column); + if (value != null) { + return rangeMap != null ? rangeMap.getRanges(value) : RangeTree.empty(); + } else { + RangeTree all = getAllRanges(); + return rangeMap != null ? all.subtract(rangeMap.getAssignedRanges()) : all; + } + } + + /** + * Returns all assigned ranges in the specified column. If the column doesn't exist in the + * table, the empty range is returned). + */ + public RangeTree getAssignedRanges(Column column) { + getSchema().checkColumn(column); + DisjointRangeMap.Builder rangeMap = columnRanges.get(column); + return rangeMap != null ? rangeMap.getAssignedRanges() : RangeTree.empty(); + } + + /** + * Returns ranges which were added to this builder, either directly via {@link #add(RangeTree)} + * or indirectly via assignment. + */ + public RangeTree getAllRanges() { + return allRanges; + } + + /** Returns all ranges present in this table which are not assigned in any column. 
*/ + public RangeTree getUnassignedRanges() { + RangeTree allAssigned = columnRanges.values().stream() + .map(DisjointRangeMap.Builder::getAssignedRanges) + .reduce(RangeTree.empty(), RangeTree::union); + return allRanges.subtract(allAssigned); + } + + /** + * Returns a snapshot of the columns in schema order (including empty columns which may have + * been added explicitly or exist due to values being unassigned). + */ + public ImmutableSet> getColumns() { + return columnRanges.entrySet().stream() + .map(Entry::getKey) + .collect(toImmutableSet()); + } + + // ---- Range assignment/addition/removal ---- + + /** + * Assigns the specified ranges to the given assignment. If the value is {@code empty}, then + * this has the effect of unassigning the given ranges, but does not remove them from the + * table. If {@code ranges} is empty, this method has no effect. + * + * @throws RangeException if assignment cannot be performed according to the overwrite mode + * (no change will have occurred in the table if this occurs). + */ + public Builder assign(Assignment assignment, RangeTree ranges, OverwriteMode mode) { + assign(assignment.column(), assignment.value().orElse(null), ranges, mode); + return this; + } + + /** + * Assigns the specified ranges to a value within a column (other columns unaffected). If the + * value is {@code null}, then this has the effect of unassigning the given ranges, but does + * not remove them from the table. If {@code ranges} is empty, this method has no effect. + * + * @throws RangeException if assignment cannot be performed according to the overwrite mode + * (no change will have occurred in the table if this occurs). 
+ */ + public Builder assign( + Column column, @Nullable Object value, RangeTree ranges, OverwriteMode mode) { + if (!ranges.isEmpty()) { + getOrAddRangeMap(column).assign(value, ranges, mode); + allRanges = allRanges.union(ranges); + } + return this; + } + + /** + * Unconditionally assigns all values, ranges and columns in the given table. This does not + * clear any already assigned ranges. + */ + public Builder add(RangeTable table) { + add(table.getAllRanges()); + add(table.getColumns()); + for (Column column : table.getColumns()) { + for (Object value : table.getAssignedValues(column)) { + assign(column, value, table.getRanges(column, value), OverwriteMode.ALWAYS); + } + } + return this; + } + + /** + * Ensures that the given ranges exist in the table, even if no assignments are ever made in + * any columns. + */ + public Builder add(RangeTree ranges) { + allRanges = allRanges.union(ranges); + return this; + } + + /** Ensures that the given column exists in the table (even if there are no assignments). */ + public Builder add(Column column) { + getOrAddRangeMap(checkNotNull(column)); + return this; + } + + /** Ensures that the given columns exist in the table (even if there are no assignments). */ + public Builder add(Collection> columns) { + columns.forEach(this::add); + return this; + } + + /** Removes the given ranges from the table, including all assignments in all columns. */ + public Builder remove(RangeTree ranges) { + for (DisjointRangeMap.Builder rangeMap : columnRanges.values()) { + rangeMap.assign(null, ranges, OverwriteMode.ALWAYS); + } + allRanges = allRanges.subtract(ranges); + return this; + } + + /** Removes the given column from the table (has no effect if the column is not present). */ + public Builder remove(Column column) { + columnRanges.remove(checkNotNull(column)); + return this; + } + + /** Removes the given columns from the table (has no effect if columns are not present). 
*/ + public Builder remove(Collection> columns) { + columns.forEach(this::remove); + return this; + } + + /** Copies the assigned, non-default, values of the specified column. */ + public > Builder copyNonDefaultValues( + Column column, RangeTable src, OverwriteMode mode) { + for (T v : src.getAssignedValues(column)) { + if (!column.defaultValue().equals(v)) { + assign(column, v, src.getRanges(column, v), mode); + } + } + return this; + } + + // ---- Applying changes ---- + + /** + * Unconditionally applies the given change to this range table. Unlike + * {@link #apply(Change, OverwriteMode)}, this method cannot fail, since changes are applied + * unconditionally. + */ + public Builder apply(Change change) { + return apply(change, OverwriteMode.ALWAYS); + } + + /** + * Applies the given change to this range table. A change adds ranges to the table, optionally + * assigning them specific category values within columns. + * + * @throws RangeException if the overwrite mode prohibits the modification in this change (the + * builder remains unchanged). + */ + public Builder apply(Change change, OverwriteMode mode) { + RangeTree ranges = change.getRanges(); + if (!ranges.isEmpty()) { + // Check first that the assignments will succeed before attempting them (so as not to + // leave the builder in an inconsistent state if it fails). + if (mode != OverwriteMode.ALWAYS) { + for (Assignment a : change.getAssignments()) { + getOrAddRangeMap(a.column()).checkAssign(a.value().orElse(null), ranges, mode); + } + } + for (Assignment a : change.getAssignments()) { + getOrAddRangeMap(a.column()).assign(a.value().orElse(null), ranges, mode); + } + allRanges = allRanges.union(ranges); + } + return this; + } + + // ---- Builder related methods ---- + + /** Builds the range table from the current state of the builder. 
*/ + public RangeTable build() { + ImmutableMap, DisjointRangeMap> columnMap = columnRanges.entrySet().stream() + .map(e -> immutableEntry(e.getKey(), e.getValue().build())) + .sorted(comparingByKey(schema.ordering())) + .collect(toImmutableMap(Entry::getKey, Entry::getValue)); + return new RangeTable(schema, columnMap, allRanges, getUnassignedRanges()); + } + + /** + * Returns a new builder with the same state as the current builder. This is useful when state + * is being built up incrementally. + */ + public Builder copy() { + // Can be made more efficient if necessary... + return build().toBuilder(); + } + + /** Builds a minimal version of this table in which empty columns are no longer present. */ + public RangeTable buildMinimal() { + ImmutableSet> empty = columnRanges.entrySet().stream() + .filter(e -> e.getValue().getAssignedRanges().isEmpty()) + .map(Entry::getKey) + .collect(toImmutableSet()); + remove(empty); + return build(); + } + + @Override + public final String toString() { + return build().toString(); + } + } + + /** Returns a builder for a range table with the specified column mapping. */ + public static Builder builder(Schema schema) { + return new Builder(schema); + } + + public static RangeTable from( + Schema schema, Table, Optional> t) { + Builder table = builder(schema); + for (Entry, Optional>> row : t.rowMap().entrySet()) { + List> assignments = row.getValue().entrySet().stream() + .map(e -> Assignment.ofOptional(e.getKey(), e.getValue())) + .collect(toImmutableList()); + table.apply(Change.of(RangeTree.from(row.getKey()), assignments)); + } + return table.build(); + } + + // Definition of table columns. + private final Schema schema; + // Mapping to the assigned ranges for each column type. + private final ImmutableMap, DisjointRangeMap> columnRanges; + // All ranges in this table (possibly larger than union of all assigned ranges in all columns). 
+ private final RangeTree allRanges; + // Ranges unassigned in any column (a subset of, or equal to allRanges). + private final RangeTree unassigned; + + private RangeTable( + Schema schema, + ImmutableMap, DisjointRangeMap> columnRanges, + RangeTree allRanges, + RangeTree unassigned) { + this.schema = checkNotNull(schema); + this.columnRanges = checkNotNull(columnRanges); + this.allRanges = checkNotNull(allRanges); + this.unassigned = checkNotNull(unassigned); + } + + /** Returns a builder initialized to the ranges and assignements in this table. */ + public Builder toBuilder() { + // Any mode would work here (the builder is empty) but the "always overwrite" mode is fastest. + return new Builder(schema).add(this); + } + + private Optional> getRangeMap(Column column) { + return Optional.ofNullable(columnRanges.get(schema.checkColumn(column))); + } + + public Schema getSchema() { + return schema; + } + + public ImmutableSet> getColumns() { + return columnRanges.keySet(); + } + + /** + * Returns the set of values with assigned ranges in the given column. + * + * @throws IllegalArgumentException if the specified column does not exist in this table. + */ + public > ImmutableSet getAssignedValues(Column column) { + getSchema().checkColumn(column); + // Safe since if the column is in the schema the values must have been checked when added. + @SuppressWarnings("unchecked") + DisjointRangeMap rangeMap = + (DisjointRangeMap) columnRanges.get(schema.checkColumn(column)); + return rangeMap != null ? rangeMap.getAssignedValues() : ImmutableSet.of(); + } + + /** Returns all assigned ranges in the specified column. */ + public RangeTree getAssignedRanges(Column column) { + return getRangeMap(column).map(DisjointRangeMap::getAssignedRanges).orElse(RangeTree.empty()); + } + + /** + * Returns ranges for the given assignment. If the value is {@code empty}, then the unassigned + * ranges in the column are returned. 
+ */ + public RangeTree getRanges(Assignment assignment) { + return getRanges(assignment.column(), assignment.value().orElse(null)); + } + + /** + * Returns ranges for the given value in the specified column. If the value is {@code null}, then + * the unassigned ranges in the column are returned. + */ + public RangeTree getRanges(Column column, @Nullable Object value) { + getSchema().checkColumn(column); + if (value == null) { + return getAllRanges().subtract(getAssignedRanges(column)); + } else { + return getRangeMap(column).map(m -> m.getRanges(value)).orElse(RangeTree.empty()); + } + } + + /** Returns all ranges present in this table. */ + public RangeTree getAllRanges() { + return allRanges; + } + + /** Returns all ranges present in this table which are not assigned in any column. */ + public RangeTree getUnassignedRanges() { + return unassigned; + } + + /** + * Returns whether this table contains no ranges (assigned or unassigned). Note that not all + * empty tables are equal, since they may still differ by the columns they have. + */ + public boolean isEmpty() { + return allRanges.isEmpty(); + } + + /** + * Returns a sub-table with rows and columns limited by the specified bounds. The schema of the + * returned table is the same as this table. + */ + public RangeTable subTable(RangeTree bounds, Set> columns) { + // Columns must be a subset of what's allowed in this schema. + columns.forEach(getSchema()::checkColumn); + return subTable(bounds, getSchema(), columns); + } + + /** + * Returns a sub-table with rows and columns limited by the specified bounds. The schema of the + * returned table is the same as this table. + */ + public RangeTable subTable(RangeTree bounds, Column first, Column... rest) { + return subTable(bounds, ImmutableSet.>builder().add(first).add(rest).build()); + } + + /** + * Returns a table with rows and columns limited by the specified bounds. The schema of the + * returned table is the given sub-schema. 
+ */ + public RangeTable subTable(RangeTree bounds, Schema subSchema) { + checkArgument(subSchema.isSubSchemaOf(getSchema()), + "expected sub-schema of %s, got %s", getSchema(), subSchema); + return subTable(bounds, subSchema, Sets.filter(getColumns(), subSchema::isValidColumn)); + } + + // Callers MUST validate that the given set of columns are all valid in the subSchema. + private RangeTable subTable(RangeTree bounds, Schema subSchema, Set> columns) { + ImmutableMap, DisjointRangeMap> columnMap = columns.stream() + // Bound the given columns which exist in this table. + .map(c -> immutableEntry(c, getRangeMap(c).map(r -> r.intersect(bounds)))) + // Reject columns we didn't already have (but allow empty columns if they exist). + .filter(e -> e.getValue().isPresent()) + // Sort to our schema (since the given set of columns is not required to be sorted). + .sorted(comparingByKey(schema.ordering())) + .collect(toImmutableMap(Entry::getKey, e -> e.getValue().get())); + return new RangeTable( + subSchema, columnMap, allRanges.intersect(bounds), unassigned.intersect(bounds)); + } + + /** + * Returns the assigned rows of a RangeTable as a minimal list of disjoint changes, which can + * be applied to an empty table to recreate this table. No two changes affect the same columns + * in the same way and changes are ordered by the minimal values of their ranges. This is + * essentially the same information as returned in {@link #toImmutableTable()} but does not + * decompose ranges into range specifications, and it thus more amenable to compact + * serialization. + */ + // Note that the minimal nature of the returned changes is essential for some algorithms that + // operate on tables and this must not be changed. 
+ public ImmutableList toChanges() { + Table, Optional, RangeTree> table = HashBasedTable.create(); + for (Column c : getColumns()) { + for (Object v : getAssignedValues(c)) { + table.put(c, Optional.of(v), getRanges(c, v)); + } + } + return toChanges(schema, table, getAllRanges()); + } + + /** + * Returns a minimum set of changes based on a table of assignments (column plus value). This is + * not expected to be used often (since RangeTable is usually a better representation of the data + * but can be useful in representing things like updates and patches in which only some rows or + * columns are represented. + * @param schema a schema for the columns in the given Table (used to determine column order). + * @param table the table of assignments to assigned ranges. + * @param allRanges the set of all ranges affected by the changes (this might include ranges not + * present anywhere in the table, which correspond to empty rows). + */ + public static ImmutableList toChanges( + Schema schema, Table, Optional, RangeTree> table, RangeTree allRanges) { + return ImmutableList.copyOf( + transform(toRows(table, allRanges, schema.ordering()), Row::toChange)); + } + + /** + * Returns the data in this table represented as a {@link ImmutableTable}. Row keys are disjoint + * range specifications (in order). The returned table has the smallest number of rows necessary + * to represent the data in this range table. This is useful as a human readable serialized form + * since any digit sequence in the table is contained in a unique row. 
+ */ + public ImmutableTable, Optional> toImmutableTable() { + Table, Optional, RangeTree> table = HashBasedTable.create(); + for (Column c : getColumns()) { + for (Object v : getAssignedValues(c)) { + table.put(c, Optional.of(v), getRanges(c, v)); + } + RangeTree unassigned = getAllRanges().subtract(getAssignedRanges(c)); + if (!unassigned.isEmpty()) { + table.put(c, Optional.empty(), unassigned); + } + } + // Unique changes contain disjoint ranges, each associated with a unique combination of + // assignments. + TreeBasedTable, Optional> out = + TreeBasedTable.create(comparing(RangeSpecification::min), schema.ordering()); + for (Change c : toChanges(schema, table, getAllRanges())) { + List keys = c.getRanges().asRangeSpecifications(); + for (Assignment a : c.getAssignments()) { + for (RangeSpecification k : keys) { + out.put(k, a.column(), a.value()); + } + } + } + return ImmutableTable.copyOf(out); + } + + /** + * Extracts a map for a single column in this table containing the minimal prefix tree for each + * of the assigned values. The returned prefixes are the shortest prefixes possible for + * distinguishing each value in the column. This method is especially useful if you want to + * categorize partial digit sequences efficiently (i.e. prefix matching). + * + *

A minimal length can be specified to avoid creating prefixes that are "too short" for some + * circumstances. Note that returned prefixes are never zero length, so {@code 1} is the lowest + * meaningful value (although zero is still accepted to imply "no length restriction"). + * + *

Note that for some table data, it is technically impossible to obtain perfect prefix + * information and in cases where overlap occurs, this method returns the shortest prefixes. This + * means that for some valid inputs it might be true that more than one prefix is matched. It + * is therefore up to the caller to determine a "best order" for testing the prefixes if this + * matters. See {@link PrefixTree#minimal(RangeTree, RangeTree, int)} for more information. + * + *

An example of an "impossible" prefix would be if "123" has value A, "1234" has value B and + * "12345" has value A again. In this case there is no prefix which can distinguish A and B + * (the calculated map would be { "123" => A, "1234" => B }). In this situation, testing for the + * longer prefix would help preserve as much of the original mapping as possible, but it would + * never be possible to correctly distinguish all inputs. + */ + public > ImmutableMap getPrefixMap( + Column column, int minPrefixLength) { + ImmutableMap.Builder map = ImmutableMap.builder(); + // Important: Don't just use the assigned ranges in the column, use the assigned ranges of the + // entire table. This ensures unassigned ranges in the column are not accidentally captured by + // any of the generated prefixes. + RangeTree allRanges = getAllRanges(); + for (T value : getAssignedValues(column)) { + RangeTree include = getRanges(column, value); + map.put(value, PrefixTree.minimal(include, allRanges.subtract(include), minPrefixLength)); + } + return map.build(); + } + + // Constants for the simplification routine below. + // Use -1 for unassigned rows (these are the "overlap" ranges and they don't have an index). + private static final Column INDEX = + Column.create(Integer.class, "Change Index", -1, Integer::parseInt); + private static final Schema INDEX_SCHEMA = Schema.builder().add(INDEX).build(); + + /** + * Applies a simplification function to the rows defined by the given columns of this table. The + * returned table will only have (at most) the specified columns present. + * + *

The simplification function is used to produce ranges which satisfy some business logic + * criteria (such as having at most N significant digits, or merging lengths). Range + * simplification enables easier comparison between data sources of differing precision, and + * helps to reduce unnecessary complexity in generated regular expressions. + * + *

The simplification function should return a range that's at least as large as the input + * range. This is to ensure that simplification cannot unassign ranges, even accidentally. The + * returned range is automatically restricted to preserve disjoint ranges in the final table. + * + *

By passing a {@link Change} rather than just a {@link RangeTree}, the simplification + * function has access to the row assignments for the range it is simplifying. This allows it to + * select different strategies according to the values in specific columns (e.g. area code + * length). + * + *

Note that unassigned ranges in the original table will be preserved and simplified ranges + * will not overwrite them. This can be useful for defining "no go" ranges which should be left + * alone. + */ + public RangeTable simplify( + Function simplifyFn, + int minPrefixLength, + Column first, + Column... rest) { + // Build the single column "index" table (one index for each change) and simplify its ranges. + // This only works because "toChanges()" produces the minimal set of changes such that each + // unique combination of assignments appears only once. + ImmutableList rows = subTable(getAllRanges(), first, rest).toChanges(); + RangeTable simplifiedIndexTable = simplifyIndexTable(rows, simplifyFn, minPrefixLength); + + // Reconstruct the output table by assigning values from the original change set according to + // the indices in the simplified index table. + Builder simplified = RangeTable.builder(getSchema()).add(simplifiedIndexTable.getAllRanges()); + for (int i : simplifiedIndexTable.getAssignedValues(INDEX)) { + RangeTree simplifiedRange = simplifiedIndexTable.getRanges(INDEX, i); + for (Assignment a : rows.get(i).getAssignments()) { + simplified.assign(a, simplifiedRange, OverwriteMode.NEVER); + } + } + return simplified.build(); + } + + /** + * Helper function to simplify an index table based on the given rows. The resulting table will + * have a single "index" column with simplified ranges, where the index value {@code N} + * references the Nth row in the given list of disjoint changes. This is a 3 stage process: + *

    + *
  1. Step 1: Determine which ranges can overlap with respect to the set of range prefixes. + *
  2. Step 2: Do simplification on the non-overlapping "prefix disjoint" ranges in the table, + * which are then re-partitioned by the disjoint prefixes. + *
  3. Step 3: Copy over any overlapping ranges from the original table (these don't get + * simplified since it's not possible to easily re-partition them). + *
+ */ + private static > RangeTable simplifyIndexTable( + ImmutableList rows, Function simplifyFn, int minPrefixLength) { + RangeTable indexTable = makeIndexTable(rows); + + // Step 1: Determine overlapping ranges from the index table, retaining minimum prefix length. + ImmutableMap nonDisjointPrefixes = + indexTable.getPrefixMap(INDEX, minPrefixLength); + // Don't just use the assigned ranges (we need to account for valid but unassigned ranges when + // determining overlaps). + RangeTree allRanges = indexTable.getAllRanges(); + RangeTree overlaps = RangeTree.empty(); + for (int n : indexTable.getAssignedValues(INDEX)) { + RangeTree otherRanges = allRanges.subtract(indexTable.getRanges(INDEX, n)); + overlaps = overlaps.union(nonDisjointPrefixes.get(n).retainFrom(otherRanges)); + } + + // Step 2: Determine the "prefix disjoint" ranges in a new table and simplify it. + // + // Before getting the new set of prefixes, add the overlapping ranges back to the table, but + // without assigning them to anything. This keeps the generated prefixes as long as necessary + // to avoid creating conflicting assignments for different values. Essentially we're trying to + // keep ranges "away from" any overlaps. Note however that it is still possible for simplified + // ranges encroach on the overlapping areas, so we must still forcibly overwrite the original + // overlapping values after siplification. Consider: + // A = { "12x", "12xxx" }, B = { "123x" } + // where the simplification function just creates any "any" range for all lengths between the + // minimum and maximum range lengths (e.g. { "123", "45678" } ==> { "xxx", "xxxx", "xxxxx" }. 
+ // + // The (non disjoint) prefix table is Pre(A) => { "12" }, Pre(B) => { "123" } and this + // captures the overlaps: + // Pre(A).retainFrom(B) = { "123x" } = B + // Pre(B).retainFrom(A) = { "123xx" } + // + // Since is of "B" is entirely contained by the overlap, it is not simplified, but A is + // simplified to: + // { "xxx", "xxxx", "xxxxx" } + // and the re-captured by the "disjoint" prefix (which is still just "12") to: + // { "12x", "12xx", "12xxx" } + // + // However now, when the original overlaps are added back at the end (in step 3) we find that + // both "123xx" already exists (with the same index) and "123x" exists with a different index. + // The resolution is to just overwrite all overlaps back into the table, since these represent + // the original (unsimplified) values. + // + // Thus in this case, the simplified table is: + // Sim(A) = { "12x", "12[0-24-9]x", "12xxx" }, Sim(B) = { "123x" } + // + // And it is still true that: Sim(A).containsAll(A) and Sim(B).containsAll(B) + RangeTable prefixDisjointTable = indexTable + .subTable(allRanges.subtract(overlaps), INDEX) + .toBuilder() + .add(overlaps) + .build(); + + // NOTE: Another way to do this would be to implement an "exclusive prefix" method which could + // be used to immediately return a set of truly "disjoint" prefixes (although this would change + // the algorithm's behaviour since more ranges would be considered "overlapping" than now). + // TODO: Experiment with an alternate "exclusive" prefix function. + ImmutableMap disjointPrefixes = prefixDisjointTable.getPrefixMap(INDEX, 1); + // Not all values from the original table need be present in the derived table (since some + // overlaps account for all the ranges of a value). 
+ Builder simplified = RangeTable.builder(INDEX_SCHEMA); + for (int n : prefixDisjointTable.getAssignedValues(INDEX)) { + RangeTree disjointRange = prefixDisjointTable.getRanges(INDEX, n); + // Pass just the assignments, not the whole row (Change) because that also contains a range, + // which might not be the same as the disjoint range (so it could be rather confusing). + PrefixTree disjointPrefix = disjointPrefixes.get(n); + RangeTree simplifiedRange = + simplifyFn.apply(Change.of(disjointRange, rows.get(n).getAssignments())); + // Technically this check is not strictly required, but there's probably no good use-case in + // which you'd want to remove assignments via the simplification process. + checkArgument(simplifiedRange.containsAll(disjointRange), + "simplification should return a superset of the given range\n" + + "input: %s\n" + + "output: %s\n" + + "missing: %s", + disjointRange, simplifiedRange, disjointRange.subtract(simplifiedRange)); + // Repartition the simplified ranges by the "disjoint" prefixes to restore most of the + // simplified ranges. These ranges should never overlap with each other. + RangeTree repartitionedRange = disjointPrefix.retainFrom(simplifiedRange); + simplified.assign(INDEX, n, repartitionedRange, OverwriteMode.NEVER); + } + + // Step 3: Copy remaining overlapping ranges from the original table back into the result. + // Note that we may end up overwriting values here, but that's correct since it restores + // original "unsimplifiable" ranges. + for (int n : indexTable.getAssignedValues(INDEX)) { + simplified.assign( + INDEX, n, indexTable.getRanges(INDEX, n).intersect(overlaps), OverwriteMode.ALWAYS); + } + return simplified.build(); + } + + // Helper to make a table with a single column than references a list of disjoint changes by + // index (against the range of that change). 
+ private static RangeTable makeIndexTable(ImmutableList rows) { + Builder indexTable = RangeTable.builder(INDEX_SCHEMA); + for (int i = 0; i < rows.size(); i++) { + // Empty rows are added to the table, but not assigned an index. Their existence in the index + // table prevents over simplification from affecting unassigned rows of the original table. + if (rows.get(i).getAssignments().isEmpty()) { + indexTable.add(rows.get(i).getRanges()); + } else { + indexTable.assign(INDEX, i, rows.get(i).getRanges(), OverwriteMode.NEVER); + } + } + return indexTable.build(); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof RangeTable)) { + return false; + } + RangeTable other = (RangeTable) obj; + return this == other + || (schema.equals(other.schema) + && allRanges.equals(other.allRanges) + && columnRanges.values().asList().equals(other.columnRanges.values().asList())); + } + + @Override + public int hashCode() { + // This could be memoized if it turns out to be slow. + return schema.hashCode() ^ columnRanges.hashCode() ^ allRanges.hashCode(); + } + + // TODO: Prettier format for toString(). + @Override + public final String toString() { + ImmutableTable, Optional> table = toImmutableTable(); + return table.rowMap().entrySet().stream() + .map(e -> String.format("%s, %s", e.getKey(), rowToString(e.getValue()))) + .collect(joining("\n")); + } + + private static String rowToString(Map, Optional> r) { + return r.values().stream() + .map(v -> v.map(Object::toString).orElse("UNSET")) + .collect(joining(", ")); + } + + // Helper method to convert a table of values into a minimal set of changes. This is used to + // turn a single RangeTable into an ImmutableTable, but also to convert a Patch into a minimal + // sequence of Changes. Each returned "row" defines a range, and a unique sequence of assignments + // over that range (i.e. no two rows have the same assignments in). 
The assignments are ordered + // in column order within each row, and the rows are ordered by the minimum digit sequence in + // each range and the ranges form a disjoint covering of the ranges in the original table. + // + // See go/phonenumber-v2-data-structure for more details. + private static ImmutableList toRows( + Table, Optional, RangeTree> src, + RangeTree allRanges, + Comparator> columnOrdering) { + // Get the non-empty columns in _reverse_ iteration order. We build up rows as a linked list + // structure, started from the "right hand side". This avoids a lot of copying as new columns + // are processed. + ImmutableList> reversedColumns = src.rowMap().entrySet().stream() + .filter(e -> !e.getValue().isEmpty()) + .map(Entry::getKey) + .sorted(columnOrdering.reversed()) + .collect(toImmutableList()); + List uniqueRows = new ArrayList<>(); + uniqueRows.add(Row.empty(allRanges)); + for (Column col : reversedColumns) { + // Loop backward here so that rows can be (a) removed in place and (b) added at the end. + for (int i = uniqueRows.size() - 1; i >= 0; i--) { + Row row = uniqueRows.get(i); + // Track the unprocessed range for each row as we extend it. + RangeTree remainder = row.getRanges(); + for (Entry, RangeTree> e : src.row(col).entrySet()) { + RangeTree overlap = e.getValue().intersect(remainder); + if (overlap.isEmpty()) { + continue; + } + // Extend the existing row by the current column value and reduce the remaining ranges. + uniqueRows.add(Row.of(overlap, col, e.getKey(), row)); + remainder = remainder.subtract(overlap); + if (remainder.isEmpty()) { + // We've accounted for all of the existing row in the new column, so remove it. + uniqueRows.remove(i); + break; + } + } + if (!remainder.isEmpty()) { + // The existing row is not completely covered by the new column, so retain what's left. 
+ uniqueRows.set(i, row.bound(remainder)); + } + } + } + return ImmutableList.sortedCopyOf(comparing(r -> r.getRanges().first()), uniqueRows); + } + + /** + * A notional "row" with some set of assignments in a range table or table like structure. Note + * that a Row can represent unassignment as well as assignment, and not all rows need to contain + * all columns. Rows are used for representing value in a table, but also changes between tables. + */ + @AutoValue + abstract static class Row implements Iterable> { + private static Row empty(RangeTree row) { + return new AutoValue_RangeTable_Row(row, null); + } + + private static Row of(RangeTree row, Column col, Optional val, Row next) { + checkArgument(!row.isEmpty(), "empty ranges not permitted (col=%s, val=%s)", col, val); + return new AutoValue_RangeTable_Row( + row, new AutoValue_RangeTable_Cell(Assignment.ofOptional(col, val), next.head())); + } + + public abstract RangeTree getRanges(); + @Nullable abstract Cell head(); + + Change toChange() { + return Change.of(getRanges(), this); + } + + private Row bound(RangeTree ranges) { + return new AutoValue_RangeTable_Row(getRanges().intersect(ranges), head()); + } + + @Override + public Iterator> iterator() { + return new UnmodifiableIterator>() { + @Nullable private Cell cur = Row.this.head(); + + @Override + public boolean hasNext() { + return cur != null; + } + + @Override + public Assignment next() { + Cell c = cur; + if (c == null) { + throw new NoSuchElementException(); + } + cur = cur.next(); + return c.assignment(); + } + }; + } + + @Override + public final String toString() { + return "Row{" + getRanges() + " >> " + Iterables.toString(this) + "}"; + } + } + + @AutoValue + abstract static class Cell { + abstract Assignment assignment(); + @Nullable abstract Cell next(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java new file 
mode 100644 index 000000000..a6d7429e9 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableList.toImmutableList; + +import com.google.auto.value.AutoValue; +import com.google.common.collect.ImmutableCollection; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Ordering; +import java.util.Comparator; + +/** + * Representation of ordered {@link Column}s in a table. Schemas define columns in both + * {@code RangeTable} and {@code CsvTable}. + */ +@AutoValue +public abstract class Schema { + /** + * Builder for a table schema. Columns are ordered in the order in which they, or their owning + * group is added to the schema. + */ + public static final class Builder { + private final ImmutableSet.Builder names = ImmutableSet.builder(); + private final ImmutableMap.Builder> columns = ImmutableMap.builder(); + private final ImmutableMap.Builder> groups = ImmutableMap.builder(); + + /** Adds the given column to the schema. 
*/ + public Builder add(Column column) { + names.add(column.getName()); + columns.put(column.getName(), column); + return this; + } + + /** Adds the given column group to the schema. */ + public Builder add(ColumnGroup group) { + names.add(group.prototype().getName()); + groups.put(group.prototype().getName(), group); + return this; + } + + public Schema build() { + return new AutoValue_Schema(names.build(), columns.build(), groups.build()); + } + } + + private static final Schema EMPTY = builder().build(); + + /** Returns an empty schema with no assigned columns. */ + public static Schema empty() { + return EMPTY; + } + + /** Returns a new schema builder. */ + public static Builder builder() { + return new Builder(); + } + + // Visible for AutoValue only. + Schema() {} + + // List of column/group names used to determine column order: + // E.g. if "names" is: ["col1", "grp1", "col2", "col3"] + // You can have the table <<"col1", "grp1:xx", "grp1:yy", "col3">> + // Not all columns need to be present and groups are ordered contiguously as the group prefix + // appears in the names list. + abstract ImmutableSet names(); + abstract ImmutableMap> columns(); + abstract ImmutableMap> groups(); + + /** + * Returns the column for the specified key string. For "plain" columns (not in groups) the key + * is just the column name. For group columns, the key takes the form "prefix:suffix", where the + * prefix is the name of the "prototype" column, and the "suffix" is an ID of a value within the + * group. For example: + *

{@code + * // Schema has a plain column called "Type" in it. + * typeCol = table.getColumn("Type"); + * + * // Schema has a group called "Region" in it which can parse RegionCodes. + * usRegionCol = table.getColumn("Region:US"); + * }

+ */ + public Column getColumn(String key) { + int split = key.indexOf(':'); + Column column; + if (split == -1) { + column = columns().get(key); + } else { + ColumnGroup group = groups().get(key.substring(0, split)); + checkArgument(group != null, "invalid column %s, not in schema: %s", key, this); + column = group.getColumnFromId(key.substring(split + 1)); + } + checkArgument(column != null, "invalid column %s, not in schema: %s", key, this); + return column; + } + + /** Returns whether the given column is valid within this schema. */ + public > boolean isValidColumn(Column column) { + int split = column.getName().indexOf(':'); + if (split == -1) { + return columns().containsValue(column); + } else { + ColumnGroup group = groups().get(column.getName().substring(0, split)); + return group != null && column.isIn(group); + } + } + + /** + * Checks whether the given column is valid within this schema, otherwise throws + * IllegalArgumentException. This is expected to be internal use only, since table users are + * meant to always know which columns are valid. + */ + > Column checkColumn(Column column) { + checkArgument(isValidColumn(column), "invalid column %s, not in schema: %s", column, this); + return column; + } + + /** + * Returns whether this schema has a subset of columns/groups, in the same order as the + * given schema. + */ + public boolean isSubSchemaOf(Schema schema) { + return schema.columns().values().containsAll(columns().values()) + && schema.groups().entrySet().containsAll(groups().entrySet()) + && names().asList().equals( + schema.names().stream().filter(names()::contains).collect(toImmutableList())); + } + + /** Returns an ordering for all columns in this schema. 
*/ + public Comparator> ordering() { + return Comparator + .comparing(Schema::getPrefix, Ordering.explicit(names().asList())) + .thenComparing(Schema::getSuffix); + } + + public ImmutableSet getNames() { + return names(); + } + + public ImmutableCollection> getColumns() { + return columns().values(); + } + + private static String getPrefix(Column column) { + int split = column.getName().indexOf(':'); + return split != -1 ? column.getName().substring(0, split) : column.getName(); + } + + private static String getSuffix(Column column) { + int split = column.getName().indexOf(':'); + return split == -1 ? "" : column.getName().substring(split + 1); + } +} diff --git a/metadata/src/main/proto/enums.proto b/metadata/src/main/proto/enums.proto new file mode 100644 index 000000000..44784bee3 --- /dev/null +++ b/metadata/src/main/proto/enums.proto @@ -0,0 +1,69 @@ +// Copyright (C) 2017 The Libphonenumber Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package i18n.phonenumbers.metadata; + +option java_package = "com.google.i18n.phonenumbers.metadata.proto"; + +// The possible provenance which can be assigned to a range. +// This enum is NOT stable and must only be stored in text based protocol +// buffers. +enum Provenance { + // Having a distinct default/unknown enum with a zero value is a proto3 thing. + // No data should actually ever have this value. 
+ UNKNOWN = 0; + + // Indicates that the ranges were defined in an official ITU document. The + // comment associated with this range should contain a link to the document. + // This is the most trusted form of evidence and will usually replace any + // previous "lower" provenance (though this is not always true for some + // countries). + ITU = 10; + + // Indicates that the ranges were defined in an official IR21 document. The + // comment associated with this range should contain a link to the document. + // This is the most trusted form of evidence and will usually replace any + // previous "lower" provenance (though this is not always true for some + // countries). + IR21 = 20; + + // Indicates that evidence for a range was found in a website belonging to + // an official, government endorsed entity (e.g. national telecoms operator), + // but not part of either an official ITU or IR21 document. + // The comment associated with this range should contain a URL to the + // appropriate page where the evidence was found. + GOVERNMENT = 30; + + // Indicates that evidence for a range was found in a website belonging to a + // telecoms operator (mobile carrier, MVNO etc...). The comment associated + // with this range should contain a URL to the appropriate page where the + // evidence was found. + TELECOMS = 40; + + // Indicates that evidence for a range was found in an unofficial website + // (e.g. Facebook or a general company home page). The comment associated + // with this range should contain a URL to the appropriate page where the + // evidence was found. + WEB = 50; + + // Used to indicate special situations in which a number is accepted as + // valid, despite no citeable evidence. When this provenance is used, the comment text + // should indicate some bug report or internal reasoning as to why this range + // should be accepted. 
This provenance should be used only in exceptional + // circumstances and the comment may be scrubbed from externally published + // versions of the range data. + INTERNAL = 100; +} diff --git a/metadata/src/main/proto/types.proto b/metadata/src/main/proto/types.proto new file mode 100644 index 000000000..5b90c89e9 --- /dev/null +++ b/metadata/src/main/proto/types.proto @@ -0,0 +1,82 @@ +// Copyright (C) 2017 The Libphonenumber Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package i18n.phonenumbers.metadata; + +option java_package = "com.google.i18n.phonenumbers.metadata.proto"; + +// Enum names must match the element names in the XML metadata modulo casing. +enum XmlNumberType { + // Having a distinct default/unknown enum with a zero value is a proto3 thing. + // No data should actually ever have this value. + XML_UNKNOWN = 0; + + XML_NO_INTERNATIONAL_DIALLING = 1; + XML_FIXED_LINE = 2; + XML_MOBILE = 3; + XML_PAGER = 4; + XML_TOLL_FREE = 5; + XML_PREMIUM_RATE = 6; + XML_SHARED_COST = 7; + XML_PERSONAL_NUMBER = 8; + XML_VOIP = 9; + XML_UAN = 10; + XML_VOICEMAIL = 11; +} + +// Validation types for phone number ranges. Each valid range is categorized as +// exactly one of these types. This does not include NO_INTERNATIONAL_DIALLING +// since it is an attribute of ranges rather than their fundamental type. +enum ValidNumberType { + // Having a distinct default/unknown enum with a zero value is a proto3 thing. 
+ // No data should actually ever have this value. + UNKNOWN = 0; + + FIXED_LINE = 1; + MOBILE = 2; + FIXED_LINE_OR_MOBILE = 3; + PAGER = 4; + TOLL_FREE = 5; + PREMIUM_RATE = 6; + SHARED_COST = 7; + PERSONAL_NUMBER = 8; + VOIP = 9; + UAN = 10; + VOICEMAIL = 11; +} + +// Enum names must match the element names in the XML metadata modulo casing. +// Unlike main metadata, these types are not required to be exclusive for a number. +enum XmlShortcodeType { + // Having a distinct default/unknown enum with a zero value is a proto3 thing. + // No data should actually ever have this value. + SC_UNKNOWN = 0; + + // General short codes without a more specific representation (unlike + // generalDesc, which can just be the leading digits, this must be precise). + SC_SHORT_CODE = 1; + + // Mutually exclusive sub-set of types for tariff. + SC_TOLL_FREE = 2; + SC_STANDARD_RATE = 3; + SC_PREMIUM_RATE = 4; + + // Use-case types. + SC_CARRIER_SPECIFIC = 5; + SC_EMERGENCY = 6; + SC_EXPANDED_EMERGENCY = 7; + SC_SMS_SERVICES = 8; +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/DigitSequenceTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/DigitSequenceTest.java new file mode 100644 index 000000000..80ebc30d9 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/DigitSequenceTest.java @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.DigitSequence.domain; +import static org.junit.Assert.assertThrows; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class DigitSequenceTest { + + @Test + public void testEmpty() { + Object e = DigitSequence.of(""); + assertThat(e).isSameInstanceAs(DigitSequence.empty()); + assertThat(DigitSequence.empty().length()).isEqualTo(0); + assertThrows(IndexOutOfBoundsException.class, () -> DigitSequence.empty().getDigit(0)); + assertThat(DigitSequence.empty().toString()).isEqualTo(""); + } + + @Test + public void testCreate() { + DigitSequence s = DigitSequence.of("0123456789"); + assertThat(s).isEqualTo(DigitSequence.of("0123456789")); + assertThat(s).isNotEqualTo(DigitSequence.of("1111111111")); + } + + @Test + public void testGetDigit() { + DigitSequence s = DigitSequence.of("0123456789"); + assertThat(s.length()).isEqualTo(10); + for (int n = 0; n < s.length(); n++) { + assertThat(s.getDigit(n)).isEqualTo(n); + } + assertThat(s.toString()).isEqualTo("0123456789"); + } + + @Test + public void testBadArguments() { + assertThrows(NullPointerException.class, () -> DigitSequence.of(null)); + assertThrows(IllegalArgumentException.class, () -> DigitSequence.of("123X")); + // Too long (19 digits). 
+ assertThrows(IllegalArgumentException.class, () -> DigitSequence.of("1234567890123456789")); + } + + @Test + public void testMin() { + assertThat(domain().minValue()).isEqualTo(DigitSequence.empty()); + assertThat(domain().next(DigitSequence.empty())).isNotNull(); + assertThat(domain().previous(DigitSequence.empty())).isNull(); + } + + @Test + public void testMax() { + DigitSequence max = DigitSequence.of("999999999999999999"); + assertThat(domain().maxValue()).isEqualTo(max); + assertThat(domain().previous(max)).isNotNull(); + assertThat(domain().next(max)).isNull(); + } + + @Test + public void testDistance() { + assertThat(domain().distance(DigitSequence.empty(), DigitSequence.of("0"))) + .isEqualTo(1); + assertThat(domain().distance(DigitSequence.of("0"), DigitSequence.of("1"))) + .isEqualTo(1); + assertThat(domain().distance(DigitSequence.of("0"), DigitSequence.of("00"))) + .isEqualTo(10); + assertThat(domain().distance(DigitSequence.of("0"), DigitSequence.of("10"))) + .isEqualTo(20); + assertThat(domain().distance(DigitSequence.of("10"), DigitSequence.of("0"))) + .isEqualTo(-20); + assertThat(domain().distance(DigitSequence.empty(), DigitSequence.of("000000"))) + .isEqualTo(111111); + assertThat(domain().distance(DigitSequence.of("000"), DigitSequence.of("000000"))) + .isEqualTo(111000); + // Max distance is one less than the total number of digit sequences. 
+ assertThat(domain().distance(domain().minValue(), domain().maxValue())) + .isEqualTo(1111111111111111110L); + } + + @Test + public void testLexicographicalOrdering() { + testComparator( + DigitSequence.empty(), + DigitSequence.of("0"), + DigitSequence.of("1"), + DigitSequence.of("9"), + DigitSequence.of("00"), + DigitSequence.of("01"), + DigitSequence.of("10"), + DigitSequence.of("99"), + DigitSequence.of("000"), + DigitSequence.of("123"), + DigitSequence.of("124"), + DigitSequence.of("999")); + } + + @Test + public void testExtend() { + assertThat(DigitSequence.empty().extendBy(0)).isEqualTo(DigitSequence.of("0")); + assertThat(DigitSequence.of("1234").extendBy(DigitSequence.of("5678"))) + .isEqualTo(DigitSequence.of("12345678")); + } + + private static > void testComparator(T... items) { + for (int i = 0; i < items.length; i++) { + assertThat(items[i]).isEqualTo(items[i]); + assertThat(items[i]).isEquivalentAccordingToCompareTo(items[i]); + for (int j = i + 1; j < items.length; j++) { + assertThat(items[i]).isNotEqualTo(items[j]); + assertThat(items[i]).isLessThan(items[j]); + assertThat(items[j]).isGreaterThan(items[i]); + } + } + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/PrefixTreeTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/PrefixTreeTest.java new file mode 100644 index 000000000..1549fa868 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/PrefixTreeTest.java @@ -0,0 +1,213 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.RangeTree.empty; +import static com.google.i18n.phonenumbers.metadata.testing.RangeTreeSubject.assertThat; + +import java.util.Arrays; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class PrefixTreeTest { + @Test + public void testNewInstancesNormalized() { + assertThat(prefixes("123", "1234")).containsExactly("123"); + assertThat(prefixes("70x", "7[1-9]")).containsExactly("7"); + // Regression test for b/68707522 + assertThat(prefixes("123xxx", "123x_xxx", "567xxx", "567x_xxx")).containsExactly("123", "567"); + } + + @Test + public void testRetainFrom() { + PrefixTree prefix = prefixes("123", "124", "126", "555"); + RangeTree ranges = ranges("1xxxxxx", "5xxxxxx", "6xxxxxx"); + assertThat(prefix.retainFrom(ranges)).containsExactly("12[346]xxxx", "555xxxx"); + } + + @Test + public void testPrefixes() { + PrefixTree prefix = prefixes("123", "124", "126", "555"); + assertThat(prefix.prefixes(seq("1230000"))).isTrue(); + assertThat(prefix.prefixes(seq("555000"))).isTrue(); + assertThat(prefix.prefixes(seq("12"))).isFalse(); + assertThat(prefix.prefixes(seq("120000"))).isFalse(); + } + + @Test + public void testEmptyVsZeroLength() { + PrefixTree empty = PrefixTree.from(empty()); + PrefixTree zeroLength = prefixes("xxx"); + + assertThat(empty).isEmpty(); + assertThat(zeroLength).isNotEmpty(); + 
assertThat(zeroLength).hasSize(1); + assertThat(zeroLength).containsExactly(RangeSpecification.empty()); + + // While the empty prefix tree filters out everything, the zero length tree allows everything + // to pass. This is because the zero length prefix tree represents a single prefix of length + // zero and all digit sequences start with a zero length sub-sequence. + RangeTree ranges = ranges("12x", "3xx", "456"); + assertThat(empty.retainFrom(ranges)).isEqualTo(empty()); + assertThat(zeroLength.retainFrom(ranges)).isEqualTo(ranges); + } + + @Test + public void testNoTrailingAnyPath() { + assertThat(prefixes("123xxx", "456xx", "789x")).containsExactly("123", "456", "789"); + } + + @Test + public void testRangeAndPrefixSameLength() { + PrefixTree prefix = prefixes("1234"); + RangeTree ranges = ranges("xxxx"); + assertThat(prefix.retainFrom(ranges)).containsExactly("1234"); + } + + @Test + public void testRangeShorterThanPrefix() { + PrefixTree prefix = prefixes("1234"); + RangeTree ranges = ranges("xxx"); + assertThat(prefix.retainFrom(ranges)).isEmpty(); + } + + @Test + public void testComplex() { + PrefixTree prefix = prefixes("[12]", "3x4x5", "67890", "987xx9"); + RangeTree ranges = ranges("x", "xx", "xxx", "1234xx", "234xxx", "3xx8xx", "67890"); + assertThat(prefix.retainFrom(ranges)) + .containsExactly("[12]", "[12]x", "[12]xx", "67890", "1234xx", "234xxx", "3x485x"); + } + + @Test + public void testEmptyPrefixTree() { + // The empty filter filters everything out, since a filter operation is defined to return + // only ranges which are prefixed by an element in the filter (of which there are none). + assertThat(PrefixTree.from(empty()).retainFrom(ranges("12xxx"))).isEmpty(); + } + + @Test + public void testZeroLengthPrefix() { + // The non-empty prefix tree which contains a single prefix of zero length. This has no effect + // as a filter, since all ranges "have a zero length prefix". 
+ PrefixTree prefix = PrefixTree.from(RangeTree.from(RangeSpecification.empty())); + RangeTree input = ranges("12xxx"); + assertThat(prefix.retainFrom(input)).isEqualTo(input); + } + + @Test + public void testUnion() { + // Overlapping prefixes retain the more general (shorter) one. + assertThat(prefixes("1234").union(prefixes("12"))).containsExactly("12"); + // Identical prefixes treated like normal union. + assertThat(prefixes("12").union(prefixes("12"))).containsExactly("12"); + // Non-overlapping prefixes treated like normal union. + assertThat(prefixes("123").union(prefixes("124"))).containsExactly("12[34]"); + // Complex case where prefixes are split into 2 lengths due to a partial overlap. + assertThat(prefixes("1234", "45", "800").union(prefixes("12", "4x67"))) + .containsExactly("12", "45", "4[0-46-9]67", "800"); + } + + @Test + public void testIntersection() { + // Overlapping prefixes retain the more specific (longer) one. + assertThat(prefixes("1234").intersect(prefixes("12"))).containsExactly("1234"); + // Identical prefixes treated like normal intersection. + assertThat(prefixes("12").intersect(prefixes("12"))).containsExactly("12"); + // Non-overlapping prefixes treated like normal intersection. + assertThat(prefixes("123").intersect(prefixes("124"))).isEmpty(); + // Unlike the union case, with intersection, only the longest prefix remains. + assertThat(prefixes("1234", "45x", "800").intersect(prefixes("12x", "4x67"))) + .containsExactly("1234", "4567"); + } + + @Test + public void testTrim() { + assertThat(prefixes("1234").trim(3)).containsExactly("123"); + assertThat(prefixes("12").trim(3)).containsExactly("12"); + assertThat(prefixes("1234").trim(0)).containsExactly(RangeSpecification.empty()); + // Trimming can result in prefixes shorter than the stated length if by collapsing the original + // prefix tree you end up with trailing any digit sequences. 
+ assertThat(prefixes("12[0-4]5", "12[5-9]").trim(3)).containsExactly("12"); + assertThat(prefixes("7001", "70[1-9]", "7[1-9]").trim(3)).containsExactly("7"); + } + + @Test + public void testMinimal() { + // If there are no ranges to include, the minimal prefix is empty (matching nothing). + assertThat(PrefixTree.minimal(RangeTree.empty(), ranges("123x"), 0)).isEmpty(); + + // If the prefix for the included ranges is the identity, then the result is the identity + // (after converting to a prefix, ranges like "xxx.." become the identity prefix). + assertThat(PrefixTree.minimal(ranges("xxxx"), ranges("123"), 0).isIdentity()).isTrue(); + // Without an exclude set, the prefix returned (at zero length) can just accept everything. + assertThat(PrefixTree.minimal(ranges("123x"), RangeTree.empty(), 0).isIdentity()).isTrue(); + + assertThat(PrefixTree.minimal(ranges("123x", "456x"), ranges("13xx", "459x"), 0)) + .containsExactly("12", "456"); + + assertThat(PrefixTree.minimal(ranges("123x", "456x"), empty(), 1)).containsExactly("[14]"); + assertThat(PrefixTree.minimal(ranges("123x", "456x"), empty(), 2)).containsExactly("12", "45"); + + // Pick the shortest prefix when several suffice. + assertThat(PrefixTree.minimal(ranges("12", "1234", "56"), ranges("1xx", "5xxx"), 0)) + .containsExactly("12", "56"); + assertThat(PrefixTree.minimal(ranges("12", "1234", "56"), ranges("1xx", "5xxx"), 3)) + .containsExactly("12", "56"); + + // When ranges are contested, split the prefix (only "12" is contested out of "1[2-4]"). + assertThat(PrefixTree.minimal(ranges("1[2-4]5xx", "189xx"), ranges("128xx"), 0)) + .containsExactly("125", "1[348]"); + + // If the include range already prefixes an entire path of the exclude set, ignore that path. + // Here '12' (the shorter path) already captures '123', so '123' is ignored. 
+ assertThat(PrefixTree.minimal(ranges("12", "1234", "56"), ranges("123", "5xxx"), 0)) + .containsExactly("1", "56"); + // Now all exclude paths are ignored, so you get the "identity" prefix that catches everything. + assertThat(PrefixTree.minimal(ranges("12", "1234", "56"), ranges("123", "5678"), 0)) + .containsExactly(""); + } + + @Test + public void testMinimal_regression() { + // This is extracted from a real case in which the old algorithm would fail for this case. The + // "281xxxxxxx" path was necessary for failing since while visiting this, the old algorithm + // became "confused" and added an additional "250" path to the minimal prefix, meaning that + // the resulting range tree was "250", "250395". When this was turned into a prefix tree, the + // shorter, early terminating, path took precedence and the result was (incorrectly) "250". + assertThat( + PrefixTree.minimal( + ranges("250395xxxx"), + ranges("250[24-9]xxxxxx", "2503[0-8]xxxxx", "25039[0-46-9]xxxx", "281xxxxxxx"), + 3)) + .containsExactly("250395"); + } + + private static DigitSequence seq(String s) { + return DigitSequence.of(s); + } + + private static PrefixTree prefixes(String... specs) { + return PrefixTree.from(ranges(specs)); + } + + private static RangeTree ranges(String... specs) { + return RangeTree.from(Arrays.stream(specs).map(RangeSpecification::parse)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java new file mode 100644 index 000000000..7a8622e90 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java @@ -0,0 +1,308 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.DigitSequence.domain; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.parse; +import static java.util.Arrays.asList; +import static org.junit.Assert.assertThrows; + +import com.google.common.collect.ImmutableRangeSet; +import com.google.common.collect.Range; +import com.google.common.collect.RangeSet; +import com.google.common.truth.Truth; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RangeSpecificationTest { + @Test + public void testParse() { + assertThat(parse("")).isSameInstanceAs(RangeSpecification.empty()); + assertThat(parse("0").toString()).isEqualTo("0"); + assertThat(parse("0").length()).isEqualTo(1); + assertThat(parse("01234").toString()).isEqualTo("01234"); + assertThat(parse("01234").length()).isEqualTo(5); + assertThat(parse("012[0-9]").toString()).isEqualTo("012x"); + assertThat(parse("012[0234789]xxx").toString()).isEqualTo("012[02-47-9]xxx"); + assertThat(parse("0_1_2").toString()).isEqualTo("012"); + assertThat(parse("0_12[3-8]_xxx_xxx").toString()).isEqualTo("012[3-8]xxxxxx"); + } + + @Test + public void testParseBad() 
{ + assertThrows(NullPointerException.class, () -> parse(null)); + assertThrows(IllegalArgumentException.class, () -> parse("#")); + assertThrows(IllegalArgumentException.class, () -> parse("[")); + assertThrows(IllegalArgumentException.class, () -> parse("[]")); + assertThrows(IllegalArgumentException.class, () -> parse("[0-")); + assertThrows(IllegalArgumentException.class, () -> parse("[0-]")); + assertThrows(IllegalArgumentException.class, () -> parse("[0--9]")); + assertThrows(IllegalArgumentException.class, () -> parse("[0..9]")); + assertThrows(IllegalArgumentException.class, () -> parse("[33]")); + assertThrows(IllegalArgumentException.class, () -> parse("[32]")); + assertThrows(IllegalArgumentException.class, () -> parse("[3-3]")); + assertThrows(IllegalArgumentException.class, () -> parse("[3-2]")); + assertThrows(IllegalArgumentException.class, () -> parse("123[9-0]456")); + assertThrows(IllegalArgumentException.class, () -> parse("1234_")); + assertThrows(IllegalArgumentException.class, () -> parse("_1234")); + assertThrows(IllegalArgumentException.class, () -> parse("12__34")); + assertThrows(IllegalArgumentException.class, () -> parse("1[2_4]5")); + } + + @Test + public void testSingleton() { + assertThat(RangeSpecification.singleton(asList(0, 1, 2, 4, 5, 7, 8, 9))) + .isEqualTo(parse("[0-2457-9]")); + } + + @Test + public void testMatches() { + assertThat(RangeSpecification.empty().matches(DigitSequence.empty())).isTrue(); + + assertAllMatch(parse("0"), "0"); + assertNoneMatch(parse("0"), "00", "1"); + + assertAllMatch(parse("01234"), "01234"); + assertNoneMatch(parse("01234"), "01233", "01235"); + + assertAllMatch(parse("012x"), "0120", "0125", "0129"); + assertNoneMatch(parse("012x"), "012", "0119", "0130", "01200"); + + assertAllMatch(parse("012[3-689]xxx"), "0124000", "0128999"); + assertNoneMatch(parse("012[3-689]xxx"), "0122000", "0127999"); + } + + @Test + public void testMinMax() { + 
assertThat(parse("123xxx").min()).isEqualTo(DigitSequence.of("123000")); + assertThat(parse("123xxx").max()).isEqualTo(DigitSequence.of("123999")); + assertThat(parse("1x[2-3]x4").min()).isEqualTo(DigitSequence.of("10204")); + assertThat(parse("1x[2-3]x4").max()).isEqualTo(DigitSequence.of("19394")); + } + + @Test + public void testSequenceCount() { + assertThat(RangeSpecification.empty().getSequenceCount()).isEqualTo(1); + assertThat(parse("1xx").getSequenceCount()).isEqualTo(100); + assertThat(parse("1[2-46-8]x").getSequenceCount()).isEqualTo(60); + assertThat(parse("1xx[0-27-9]").getSequenceCount()).isEqualTo(600); + } + + @Test + public void testFrom() { + assertThat(RangeSpecification.from(DigitSequence.empty())) + .isEqualTo(RangeSpecification.empty()); + assertThat(RangeSpecification.from(DigitSequence.of("1"))).isEqualTo(parse("1")); + assertThat(RangeSpecification.from(DigitSequence.of("1234"))).isEqualTo(parse("1234")); + } + + @Test + public void testAny() { + assertThat(RangeSpecification.any(0)).isEqualTo(RangeSpecification.empty()); + assertThat(RangeSpecification.any(2)).isEqualTo(parse("xx")); + assertThat(RangeSpecification.any(10)).isEqualTo(parse("xxxxxxxxxx")); + assertThrows(IllegalArgumentException.class, () -> RangeSpecification.any(-1)); + assertThrows(IllegalArgumentException.class, () -> RangeSpecification.any(19)); + } + + @Test + public void testFirst() { + RangeSpecification spec = parse("123[4-7]xxxx"); + assertThat(spec.first(3)).isEqualTo(parse("123")); + assertThat(spec.first(6)).isEqualTo(parse("123[4-7]xx")); + assertThat(spec.first(spec.length())).isSameInstanceAs(spec); + assertThat(spec.first(100)).isSameInstanceAs(spec); + assertThat(spec.first(0)).isEqualTo(RangeSpecification.empty()); + assertThrows(IllegalArgumentException.class, () -> spec.first(-1)); + } + + @Test + public void testLast() { + RangeSpecification spec = parse("123[4-7]xxxx"); + assertThat(spec.last(3)).isEqualTo(parse("xxx")); + 
assertThat(spec.last(6)).isEqualTo(parse("3[4-7]xxxx")); + assertThat(spec.last(spec.length())).isSameInstanceAs(spec); + assertThat(spec.last(100)).isSameInstanceAs(spec); + assertThat(spec.last(0)).isEqualTo(RangeSpecification.empty()); + assertThrows(IllegalArgumentException.class, () -> spec.last(-1)); + } + + @Test + public void testGetPrefix() { + assertThat(RangeSpecification.empty().getPrefix()).isEqualTo(RangeSpecification.empty()); + assertThat(parse("xxxx").getPrefix()).isEqualTo(RangeSpecification.empty()); + assertThat(parse("xx1x").getPrefix()).isEqualTo(parse("xx1")); + assertThat(parse("123[4-7]xxxx").getPrefix()).isEqualTo(parse("123[4-7]")); + } + + @Test + public void testOrdering_simple() { + // For specifications representing a single DigitSequence, the ordering should be the same. + testComparator( + RangeSpecification.empty(), + parse("0"), + parse("00"), + parse("000"), + parse("01"), + parse("1"), + parse("10"), + parse("123"), + parse("124"), + parse("4111"), + parse("4200"), + parse("4555"), + parse("9"), + parse("99"), + parse("999")); + } + + @Test + public void testOrdering_disjoint() { + // NOT the same as using the min() sequence for ordering (since "4555" > "4200" > "4111"). + testComparator( + parse("12xx"), + parse("13xx"), + parse("14xx"), + parse("1[5-8]00"), + parse("[2-3]xxx"), + parse("[4-6]555"), + parse("[45]111"), + parse("[45]2xx"), + parse("4999")); + } + + @Test + public void testOrdering_overlapping() { + // Ordering for overlapping ranges is well defined but not particularly intuitive. 
+ testComparator( + parse("01xxx"), + parse("01xx[0-5]"), + parse("01x0[0-5]"), + parse("01x00"), + parse("01[0-6]00"), + parse("01[2-7]xx"), + parse("01[2-7]00"), + parse("01[2-7]67"), + parse("01[4-9]00")); + } + + @Test + public void testToString() { + assertThat(parse("0").toString()).isEqualTo("0"); + assertThat(parse("01234").toString()).isEqualTo("01234"); + assertThat(parse("012[3-4]").toString()).isEqualTo("012[34]"); + assertThat(parse("012[0-9]").toString()).isEqualTo("012x"); + assertThat(parse("012[3-689]xxx").toString()).isEqualTo("012[3-689]xxx"); + } + + @Test + public void testBitmaskToString() { + assertThat(RangeSpecification.toString(1 << 0)).isEqualTo("0"); + assertThat(RangeSpecification.toString(1 << 9)).isEqualTo("9"); + assertThat(RangeSpecification.toString(0xF)).isEqualTo("[0-3]"); + assertThat(RangeSpecification.toString(0xF1)).isEqualTo("[04-7]"); + assertThat(RangeSpecification.toString(ALL_DIGITS_MASK)).isEqualTo("x"); + + assertThrows(IllegalArgumentException.class, () -> RangeSpecification.toString(0)); + assertThrows(IllegalArgumentException.class, () -> RangeSpecification.toString(0x400)); + } + + @Test + public void testRangeProcessing_singleBlock() { + Truth.assertThat(RangeSpecification.from(setOf(range("1200", "1299")))) + .isEqualTo(specs("12xx")); + } + + @Test + public void testRangeProcessing_fullRange() { + Truth.assertThat(RangeSpecification.from(setOf(range("0000", "9999")))) + .isEqualTo(specs("xxxx")); + } + + @Test + public void testRangeProcessing_edgeCases() { + Truth.assertThat(RangeSpecification.from(setOf(range("1199", "1300")))).isEqualTo(specs( + "1199", + "12xx", + "1300")); + } + + @Test + public void testRangeProcessing_complex() { + Truth.assertThat(RangeSpecification.from(setOf(range("123", "45678")))).isEqualTo(specs( + "12[3-9]", + "1[3-9]x", + "[2-9]xx", + "xxxx", + "[0-3]xxxx", + "4[0-4]xxx", + "45[0-5]xx", + "456[0-6]x", + "4567[0-8]")); + } + + @Test + public void testAsRanges_edgeCase() { + // The 
middle 2 ranges abut. + assertThat(RangeSpecification.parse("12[34][0189]x").asRanges()) + .containsExactly(range("12300", "12319"), range("12380", "12419"), range("12480", "12499")) + .inOrder(); + } + + // Asserts that the specification matches every one of the given digit sequences. + private static void assertAllMatch(RangeSpecification r, String... sequences) { + for (String digits : sequences) { + assertThat(r.matches(DigitSequence.of(digits))).isTrue(); + } + } + + // Asserts that the specification matches none of the given digit sequences. + private static void assertNoneMatch(RangeSpecification r, String... sequences) { + for (String digits : sequences) { + assertThat(r.matches(DigitSequence.of(digits))).isFalse(); + } + } + + // Parses each string into a RangeSpecification, preserving the argument order. + List<RangeSpecification> specs(String... s) { + return Stream.of(s).map(RangeSpecification::parse).collect(toImmutableList()); + } + + // Builds the closed range [lo, hi] and canonicalizes it over the DigitSequence domain. + private static Range<DigitSequence> range(String lo, String hi) { + return Range.closed(DigitSequence.of(lo), DigitSequence.of(hi)).canonical(domain()); + } + + // Copies the given ranges into an immutable range set (the varargs array is only read). + @SafeVarargs + private static RangeSet<DigitSequence> setOf(Range<DigitSequence>... r) { + return ImmutableRangeSet.copyOf(Arrays.asList(r)); + } + + // Asserts that the items are listed in strictly ascending order: every item must be equal to + // (and compare as equivalent to) itself, and be unequal to and less than every later item. + @SafeVarargs + private static <T extends Comparable<T>> void testComparator(T... items) { + for (int i = 0; i < items.length; i++) { + assertThat(items[i]).isEqualTo(items[i]); + assertThat(items[i]).isEquivalentAccordingToCompareTo(items[i]); + for (int j = i + 1; j < items.length; j++) { + assertThat(items[i]).isNotEqualTo(items[j]); + assertThat(items[i]).isLessThan(items[j]); + assertThat(items[j]).isGreaterThan(items[i]); + } + } + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizerTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizerTest.java new file mode 100644 index 000000000..e1a5aea6f --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeFactorizerTest.java @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.RangeTree.empty; +import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.ALLOW_EDGE_SPLITTING; +import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES; +import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.factor; + +import java.util.List; +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RangeTreeFactorizerTest { + @Test + public void testEmpty() { + assertThat(factor(empty(), REQUIRE_EQUAL_EDGES)).isEmpty(); + assertThat(factor(empty(), ALLOW_EDGE_SPLITTING)).isEmpty(); + } + + @Test + public void testSimplePrefix() { + RangeTree t = ranges("123x", "123xx", "123xxx"); + assertThat(factor(t, REQUIRE_EQUAL_EDGES)).containsExactly(t); + assertThat(factor(t, ALLOW_EDGE_SPLITTING)).containsExactly(t); + } + + @Test + public void testDisjointBranchesNotFactored() { + RangeTree t = ranges("123xxx", "124xx", "125x"); + assertThat(factor(t, REQUIRE_EQUAL_EDGES)).containsExactly(t); + assertThat(factor(t, ALLOW_EDGE_SPLITTING)).containsExactly(t); + } + + @Test + public void testOverlappingBranchesAreFactored() { + RangeTree t = ranges("123xxx", "1234x", "1234", "123"); + assertThat(factor(t, 
REQUIRE_EQUAL_EDGES)) + .containsExactly(ranges("123xxx", "123"), ranges("1234x", "1234")) + .inOrder(); + assertThat(factor(t, ALLOW_EDGE_SPLITTING)) + .containsExactly(ranges("123xxx", "123"), ranges("1234x", "1234")) + .inOrder(); + } + + @Test + public void testStrategyDifference() { + // When factoring with REQUIRE_EQUAL_EDGES the [3-9] edge in the shorter path cannot be merged + // into the longer path of the first factor, since [3-5] already exists and is not equal to + // [3-9]. However since [3-5] is contained by [3-9], when we ALLOW_EDGE_SPLITTING, we can split + // the edge we are trying to merge to add paths for both [3-5] and [6-9]. This isn't always a + // win for regular expression length, and in fact for the most complex cases, + // REQUIRE_EQUAL_EDGES often ends up smaller. + RangeTree splittable = ranges("12[3-5]xx", "12[3-9]x"); + assertThat(factor(splittable, REQUIRE_EQUAL_EDGES)) + .containsExactly(ranges("12[3-5]xx"), ranges("12[3-9]x")) + .inOrder(); + assertThat(factor(splittable, ALLOW_EDGE_SPLITTING)) + .containsExactly(ranges("12[3-5]xx", "12[3-9]x")); + + // In this case, the [3-5] edge in the first factor is only a partial overlap with the [4-9] + // edge we are trying to merge in. Now both strategies will prefer to treat the shorter path + // as a separate factor, since there's no clean way to merge into the existing edge. + RangeTree unsplittable = ranges("12[3-5]xx", "12[4-9]x"); + assertThat(factor(unsplittable, REQUIRE_EQUAL_EDGES)) + .containsExactly(ranges("12[3-5]xx"), ranges("12[4-9]x")) + .inOrder(); + assertThat(factor(unsplittable, ALLOW_EDGE_SPLITTING)) + .containsExactly(ranges("12[3-5]xx"), ranges("12[4-9]x")) + .inOrder(); + + // TODO: Find a non-complex example where REQUIRE_EQUAL_EDGES yields a smaller regex. + // Approximately 50 out of the 1000+ regexes in the XML get smaller with REQUIRE_EQUAL_EDGES. + } + + RangeTree ranges(String... s) { + return RangeTree.from(specs(s)); + } + + List specs(String... 
s) { + return Stream.of(s).map(RangeSpecification::parse).collect(toImmutableList()); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeTest.java new file mode 100644 index 000000000..646a63c1d --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeTreeTest.java @@ -0,0 +1,555 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.DigitSequence.domain; +import static com.google.i18n.phonenumbers.metadata.testing.RangeTreeSubject.assertThat; +import static java.util.Arrays.asList; +import static org.junit.Assert.assertThrows; + +import com.google.auto.value.AutoValue; +import com.google.common.base.Strings; +import com.google.common.collect.ImmutableRangeSet; +import com.google.common.collect.Range; +import com.google.common.collect.RangeSet; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ForkJoinPool; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RangeTreeTest { + @Test + public void testEmptyTree() { + assertThat(RangeTree.empty()).containsExactly(); + assertThat(RangeTree.empty()).hasSize(0); + } + + @Test + public void testEmptySequenceTree() { + // The tree that matches a zero length input is a perfectly valid range tree (zero length input + // is perfectly valid input). This is very distinct from the empty tree, which cannot match any + // input. It's not used very often, but it is well defined. + RangeTree r = RangeTree.from(RangeSpecification.empty()); + assertThat(r).containsExactly(RangeSpecification.empty()); + assertThat(r).hasSize(1); + } + + @Test + public void testFromRangeSetSimple() { + // Single ranges produce minimal/canonical range specifications. 
+ RangeTree r = RangeTree.from(rangeSetOf(range("1000", "4999"))); + assertThat(r).containsExactly("[1-4]xxx"); + assertThat(r).hasSize(4000); + } + + @Test + public void testFromRangeSetMinMax() { + RangeTree r = RangeTree.from(rangeSetOf(range("0000", "9999"))); + assertThat(r).containsExactly("xxxx"); + assertThat(r).hasSize(10000); + } + + @Test + public void testFromRangeSetAllValues() { + // Just checking for any out-of-bounds issues at the end of the domain. + RangeTree r = RangeTree.from(rangeSetOf(range("0", domain().maxValue().toString()))); + assertThat(r).containsExactly( + "x", + "xx", + "xxx", + "xxxx", + "xxxxx", + "xxxxxx", + "xxxxxxx", + "xxxxxxxx", + "xxxxxxxxx", + "xxxxxxxxxx", + "xxxxxxxxxxx", + "xxxxxxxxxxxx", + "xxxxxxxxxxxxx", + "xxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxx", + "xxxxxxxxxxxxxxxxxx"); + } + + @Test + public void testContains() { + // The tree generated from the empty range specification actually contains one digit sequence + // (the empty one). This is not the same as RangeTree.empty() which really contains nothing. 
+ assertThat(RangeTree.empty()).doesNotContain(""); + assertThat(RangeTree.from(RangeSpecification.empty())).contains(""); + assertThat(RangeTree.from(spec("x"))).contains("7"); + assertThat(RangeTree.from(spec("1"))).contains("1"); + assertThat(RangeTree.from(spec("1"))).doesNotContain("5"); + assertThat(RangeTree.from(spec("xx"))).contains("99"); + assertThat(RangeTree.from(spec("xx"))).doesNotContain("100"); + assertThat(RangeTree.from(spec("0[123]x[456]x[789]"))).contains("027617"); + } + + @Test + public void testMatchCount() { + assertThat(RangeTree.empty()).hasSize(0); + assertThat(RangeTree.from(RangeSpecification.empty())).hasSize(1); + assertThat(RangeTree.from(spec("x"))).hasSize(10); + assertThat(RangeTree.from(spec("1"))).hasSize(1); + assertThat(RangeTree.from(spec("[123]"))).hasSize(3); + assertThat(RangeTree.from(spec("xx"))).hasSize(100); + assertThat(RangeTree.from(spec("[234]xx"))).hasSize(300); + assertThat(RangeTree.from(spec("1[234]xx"))).hasSize(300); + assertThat(RangeTree.from(spec("1[234][567]xx"))).hasSize(900); + assertThat(RangeTree.from(spec("0[123]x[456]x[789]"))).hasSize(2700); + } + + @Test + public void testUnion() { + RangeTree a = ranges("12xx", "456xx"); + assertThat(a.union(a)).isEqualTo(a); + assertThat(a.union(RangeTree.empty())).isEqualTo(a); + assertThat(RangeTree.empty().union(a)).isEqualTo(a); + + RangeTree b = ranges("1234", "4xxxx", "999"); + assertThat(a.union(b)).containsExactly("999", "12xx", "4xxxx"); + assertThat(b.union(a)).containsExactly("999", "12xx", "4xxxx"); + } + + @Test + public void testIntersection() { + RangeTree a = ranges("12xx", "456xx"); + assertThat(a.intersect(a)).isEqualTo(a); + assertThat(a.intersect(RangeTree.empty())).isSameInstanceAs(RangeTree.empty()); + assertThat(RangeTree.empty().intersect(a)).isSameInstanceAs(RangeTree.empty()); + + RangeTree b = ranges("1234", "4xxxx", "999"); + assertThat(a.intersect(b)).containsExactly("1234", "456xx"); + 
assertThat(b.intersect(a)).containsExactly("1234", "456xx"); + } + + @Test + public void testSubtraction() { + RangeTree a = ranges("12xx", "456xx"); + assertThat(a.subtract(a)).isSameInstanceAs(RangeTree.empty()); + assertThat(a.subtract(RangeTree.empty())).isEqualTo(a); + assertThat(RangeTree.empty().subtract(a)).isSameInstanceAs(RangeTree.empty()); + + RangeTree b = ranges("1234", "4xxxx", "999"); + assertThat(a.subtract(b)).containsExactly("12[0-24-9]x", "123[0-35-9]"); + assertThat(b.subtract(a)).containsExactly("999", "4[0-46-9]xxx", "45[0-57-9]xx"); + } + + @Test + public void testContainsAll() { + RangeTree a = ranges("12[3-6]xx", "13[5-8]xx", "456xxxx"); + assertThat(a.containsAll(a)).isTrue(); + assertThat(a.containsAll(RangeTree.empty())).isTrue(); + assertThat(RangeTree.empty().containsAll(a)).isFalse(); + // Test branching, since 12.. and 13... are distinct branches but both contain ..[56][78]x + assertThat(a.containsAll(ranges("1[23][56][78]x", "4567890"))).isTrue(); + + // Path 127.. is not contained. + assertThat(a.containsAll(ranges("12[357]xx"))).isFalse(); + // Hard to test for, but this should fail immediately (due to length mismatch). + assertThat(a.containsAll(ranges("123456"))).isFalse(); + + // Check edge case for zero-length paths. + assertThat(ranges("", "1").containsAll(ranges(""))).isTrue(); + assertThat(RangeTree.empty().containsAll(ranges(""))).isFalse(); + } + + @Test + public void testVennDiagram() { + // Test basic set-theoretic assumptions about the logical operations. + // In theory we could run this test with any non-disjoint pair of trees. + RangeTree a = ranges("12xx", "456xx"); + RangeTree b = ranges("1234", "4xxxx", "999"); + + RangeTree intAB = a.intersect(b); + RangeTree subAB = a.subtract(b); + RangeTree subBA = b.subtract(a); + + // (A\B) and (B\A) are disjoint with (A^B) and each other. 
+ assertThat(subAB.intersect(intAB)).isSameInstanceAs(RangeTree.empty()); + assertThat(subBA.intersect(intAB)).isSameInstanceAs(RangeTree.empty()); + assertThat(subAB.intersect(subBA)).isSameInstanceAs(RangeTree.empty()); + + // Even the union of (A\B) and (B\A) is disjoint from the intersection. + assertThat(subAB.union(subBA).intersect(intAB)).isSameInstanceAs(RangeTree.empty()); + + // (A\B) + (A^B) = A, (B\A) + (A^B) = B, (A\B) + (B\A) + (A^B) == (A+B) + assertThat(subAB.union(intAB)).isEqualTo(a); + assertThat(subBA.union(intAB)).isEqualTo(b); + assertThat(subAB.union(subBA).union(intAB)).isEqualTo(a.union(b)); + } + + @Test + public void testFromRaggedRange() { + RangeTree r = RangeTree.from(rangeSetOf(range("123980", "161097"))); + // Very 'ragged' ranges produce a lot of range specifications. + assertThat(r).containsExactly( + "1239[8-9]x", + "12[4-9]xxx", + "1[3-5]xxxx", + "160xxx", + "1610[0-8]x", + "16109[0-7]"); + } + + @Test + public void testComplexSpecsToSimpleRange() { + List specs = specs( + "12[3-9]", + "1[3-9]x", + "[2-9]xx", + "xxxx", + "[0-3]xxxx", + "4[0-4]xxx", + "45[0-5]xx", + "456[0-6]x", + "4567[0-8]"); + RangeTree r = RangeTree.from(specs); + assertThat(r).containsExactly(specs); + assertThat(r.asRangeSet()).isEqualTo(rangeSetOf(range("123", "45678"))); + } + + @Test + public void testAsRangeSetMultipleGroups() { + // The range specification has 4 ranges, one each for the four 012[3-58] prefixes. + RangeTree r = ranges("012[3-58][2-7]x"); + assertThat(r.asRangeSet()).isEqualTo(rangeSetOf( + range("012320", "012379"), + range("012420", "012479"), + range("012520", "012579"), + range("012820", "012879"))); + } + + @Test + public void testAsRangeSetMerging() { + // In isolation, the first specification represents two ranges, and the second represents one. + RangeTree r = ranges("12[3-4][7-9]x", "125[0-5]x"); + // The range ending 12499 merges with the range starting 12500, giving 2 rather than 3 ranges. 
+ assertThat(r.asRangeSet()).isEqualTo(rangeSetOf( + range("12370", "12399"), + range("12470", "12559"))); + } + + @Test + public void testVisitor() { + // Carefully construct DFA so depth first visitation order is just incrementing from 0. + RangeTree r = ranges("012", "345", "367", "3689"); + TestVisitor v = new TestVisitor(); + r.accept(v); + + DfaNode initial = r.getInitial(); + DfaNode terminal = RangeTree.getTerminal(); + + assertThat(v.visited).hasSize(10); + // Edges 0 & 3 leave the initial state, edges 2,5,7,9 reach the terminal. + assertThat(v.visited.stream().map(Edge::source).filter(initial::equals).count()).isEqualTo(2); + assertThat(v.visited.stream().map(Edge::target).filter(terminal::equals).count()).isEqualTo(4); + // Check expected edge value masks. + for (int n = 0; n < 10; n++) { + assertThat(v.visited.get(n).digitMask()).isEqualTo(1 << n); + } + } + + @Test + public void testMin() { + assertThrows(IllegalStateException.class, () -> RangeTree.empty().first()); + assertThat(RangeTree.from(RangeSpecification.empty()).first()).isEqualTo(DigitSequence.empty()); + RangeTree tree = ranges("[1-6]xxxx", "[6-9]xx", "[89]xxx"); + assertThat(tree.first()).isEqualTo(DigitSequence.of("600")); + assertThat(tree.subtract(ranges("[6-8]xx")).first()).isEqualTo(DigitSequence.of("900")); + assertThat(tree.subtract(ranges("xxx")).first()).isEqualTo(DigitSequence.of("8000")); + assertThat(tree.subtract(ranges("xxx", "8[0-6]xx")).first()) + .isEqualTo(DigitSequence.of("8700")); + assertThat(tree.subtract(ranges("xxx", "xxxx")).first()).isEqualTo(DigitSequence.of("10000")); + } + + @Test + public void testSample() { + assertThrows(IndexOutOfBoundsException.class, () -> RangeTree.empty().sample(0)); + assertThat(RangeTree.from(RangeSpecification.empty()).sample(0)) + .isEqualTo(DigitSequence.empty()); + RangeTree tree = ranges("[1-6]xxxx", "[6-9]xx", "[89]xxx"); + // sometimes iteration looks ordered ... 
+ assertThat(tree.sample(0)).isEqualTo(DigitSequence.of("10000")); + assertThat(tree.sample(1)).isEqualTo(DigitSequence.of("10001")); + assertThat(tree.sample(10)).isEqualTo(DigitSequence.of("10010")); + // but in general sample(n).next() != sample(n+1) + assertThat(tree.sample(49999)).isEqualTo(DigitSequence.of("59999")); + assertThat(tree.sample(50000)).isEqualTo(DigitSequence.of("600")); + assertThat(tree.sample(50001)).isEqualTo(DigitSequence.of("60000")); + assertThat(tree.sample(tree.size() - 1)).isEqualTo(DigitSequence.of("9999")); + // One past the last valid index of the non-empty tree must be rejected (the empty tree is + // already covered by the first assertion of this test). + assertThrows(IndexOutOfBoundsException.class, () -> tree.sample(tree.size())); + } + + @Test + public void testSignificantDigits() { + RangeTree ranges = ranges("123xx", "14567", "789"); + assertThat(ranges.significantDigits(3)).containsExactly("123xx", "145xx", "789"); + assertThat(ranges.significantDigits(2)).containsExactly("12xxx", "14xxx", "78x"); + assertThat(ranges.significantDigits(1)).containsExactly("1xxxx", "7xx"); + assertThat(ranges.significantDigits(0)).containsExactly("xxxxx", "xxx"); + } + + @Test + public void testPrefixWith() { + RangeTree ranges = ranges("123xx", "456x"); + assertThat(ranges.prefixWith(spec("00"))).isEqualTo(ranges("00123xx", "00456x")); + assertThat(ranges.prefixWith(RangeSpecification.empty())).isSameInstanceAs(ranges); + // The prefixing of an empty tree is empty (all paths that exist have been prefixed correctly). 
+ assertThat(RangeTree.empty().prefixWith(spec("00"))).isEqualTo(RangeTree.empty()); + } + + @Test + public void testSlicing() { + RangeTree ranges = ranges("", "1", "123", "125xx", "456x"); + assertThat(ranges.slice(1)).isEqualTo(ranges("[14]")); + assertThat(ranges.slice(2)).isEqualTo(ranges("12", "45")); + assertThat(ranges.slice(3)).isEqualTo(ranges("12[35]", "456")); + assertThat(ranges.slice(4)).isEqualTo(ranges("125x", "456x")); + assertThat(ranges.slice(2, 4)).isEqualTo(ranges("123", "125x", "456x")); + assertThat(ranges.slice(0, 5)).isEqualTo(ranges); + } + + @Test + public void testSerializingRealWorldExample() { + List expected = specs( + "11[2-7]xxxxxxx", + "12[0-249][2-7]xxxxxx", + "12[35-8]x[2-7]xxxxx", + "13[0-25][2-7]xxxxxx", + "13[346-9]x[2-7]xxxxx", + "14[145][2-7]xxxxxx", + "14[236-9]x[2-7]xxxxx", + "1[59][0235-9]x[2-7]xxxxx", + "1[59][14][2-7]xxxxxx", + "16[014][2-7]xxxxxx", + "16[235-9]x[2-7]xxxxx", + "17[1257][2-7]xxxxxx", + "17[34689]x[2-7]xxxxx", + "18[01346][2-7]xxxxxx", + "18[257-9]x[2-7]xxxxx", + "2[02][2-7]xxxxxxx", + "21[134689]x[2-7]xxxxx", + "21[257][2-7]xxxxxx", + "23[013][2-7]xxxxxx", + "23[24-8]x[2-7]xxxxx", + "24[01][2-7]xxxxxx", + "24[2-8]x[2-7]xxxxx", + "25[0137][2-7]xxxxxx", + "25[25689]x[2-7]xxxxx", + "26[0158][2-7]xxxxxx", + "26[2-4679]x[2-7]xxxxx", + "27[13-79]x[2-7]xxxxx", + "278[2-7]xxxxxx", + "28[1568][2-7]xxxxxx", + "28[2-479]x[2-7]xxxxx", + "29[14][2-7]xxxxxx", + "29[235-9]x[2-7]xxxxx", + "301x[2-7]xxxxx", + "31[79]x[2-7]xxxxx", + "32[1-5]x[2-7]xxxxx", + "326[2-7]xxxxxx", + "33[2-7]xxxxxxx", + "34[13][2-7]xxxxxx", + "342[0189][2-7]xxxxx", + "342[2-7]xxxxxx", + "34[5-8]x[2-7]xxxxx", + "35[125689]x[2-7]xxxxx", + "35[34][2-7]xxxxxx", + "36[01489][2-7]xxxxxx", + "36[235-7]x[2-7]xxxxx", + "37[02-46][2-7]xxxxxx", + "37[157-9]x[2-7]xxxxx", + "38[159][2-7]xxxxxx", + "38[2-467]x[2-7]xxxxx", + "4[04][2-7]xxxxxxx", + "41[14578]x[2-7]xxxxx", + "41[36][2-7]xxxxxx", + "42[1-47][2-7]xxxxxx", + "42[5689]x[2-7]xxxxx", + 
"43[15][2-7]xxxxxx", + "43[2-467]x[2-7]xxxxx", + "45[12][2-7]xxxxxx", + "45[4-7]x[2-7]xxxxx", + "46[0-26-9][2-7]xxxxxx", + "46[35]x[2-7]xxxxx", + "47[0-24-9][2-7]xxxxxx", + "473x[2-7]xxxxx", + "48[013-57][2-7]xxxxxx", + "48[2689]x[2-7]xxxxx", + "49[014-7][2-7]xxxxxx", + "49[2389]x[2-7]xxxxx", + "51[025][2-7]xxxxxx", + "51[146-9]x[2-7]xxxxx", + "52[14-8]x[2-7]xxxxx", + "522[2-7]xxxxxx", + "53[1346]x[2-7]xxxxx", + "53[25][2-7]xxxxxx", + "54[14-69]x[2-7]xxxxx", + "54[28][2-7]xxxxxx", + "55[12][2-7]xxxxxx", + "55[46]x[2-7]xxxxx", + "56[146-9]x[2-7]xxxxx", + "56[25][2-7]xxxxxx", + "571[2-7]xxxxxx", + "57[2-4]x[2-7]xxxxx", + "581[2-7]xxxxxx", + "58[2-8]x[2-7]xxxxx", + "59[15][2-7]xxxxxx", + "59[246]x[2-7]xxxxx", + "61[1358]x[2-7]xxxxx", + "612[2-7]xxxxxx", + "621[2-7]xxxxxx", + "62[2457]x[2-7]xxxxx", + "631[2-7]xxxxxx", + "63[2-4]x[2-7]xxxxx", + "641[2-7]xxxxxx", + "64[235-7]x[2-7]xxxxx", + "65[17][2-7]xxxxxx", + "65[2-689]x[2-7]xxxxx", + "66[13][2-7]xxxxxx", + "66[24578]x[2-7]xxxxx", + "671[2-7]xxxxxx", + "67[235689]x[2-7]xxxxx", + "674[0189][2-7]xxxxx", + "674[2-7]xxxxxx", + "680[2-7]xxxxxx", + "68[1-6]x[2-7]xxxxx", + "71[013-9]x[2-7]xxxxx", + "712[2-7]xxxxxx", + "72[0235-9]x[2-7]xxxxx", + "72[14][2-7]xxxxxx", + "73[134][2-7]xxxxxx", + "73[2679]x[2-7]xxxxx", + "74[1-35689]x[2-7]xxxxx", + "74[47][2-7]xxxxxx", + "75[15][2-7]xxxxxx", + "75[2-46-9]x[2-7]xxxxx", + "7[67][02-9]x[2-7]xxxxx", + "7[67]1[2-7]xxxxxx", + "78[013-7]x[2-7]xxxxx", + "782[0-6][2-7]xxxxx", + "788[0189][2-7]xxxxx", + "788[2-7]xxxxxx", + "79[0189]x[2-7]xxxxx", + "79[2-7]xxxxxxx", + "80[2-467]xxxxxxx", + "81[1357-9]x[2-7]xxxxx", + "816[2-7]xxxxxx", + "82[014][2-7]xxxxxx", + "82[235-8]x[2-7]xxxxx", + "83[03-57-9]x[2-7]xxxxx", + "83[126][2-7]xxxxxx", + "84[0-24-9]x[2-7]xxxxx", + "85xx[2-7]xxxxx", + "86[136][2-7]xxxxxx", + "86[2457-9]x[2-7]xxxxx", + "87[078][2-7]xxxxxx", + "87[1-6]x[2-7]xxxxx", + "88[1256]x[2-7]xxxxx", + "88[34][2-7]xxxxxx", + "891[2-7]xxxxxx", + "89[2-4]x[2-7]xxxxx"); + + RangeTree t1 = 
RangeTree.from(expected); + assertThat(t1).containsExactly(expected); + assertThat(RangeTree.from(t1.asRangeSet())).containsExactly(expected); + } + + @Test + public void testThreadSafety() throws ExecutionException, InterruptedException { + // For 10^5 this takes ~500ms. For 10^6 it starts to take non-trivial time (~10 seconds). + int numDigits = 5; + // At 1000 threads this starts to take non-trivial time. + int numThreads = 100; + + // Collect 10^N ranges from "00..." to "99...", all distinct. + List<RangeTree> ranges = Stream + .iterate(DigitSequence.zeros(numDigits), DigitSequence::next) + .limit((int) Math.pow(10, numDigits)) + .map(RangeTreeTest::singletonRange) + .collect(Collectors.toCollection(ArrayList::new)); + Collections.shuffle(ranges, new Random(1234L)); + + // Recombining all 10^N ranges should give a single combined block (i.e. "xx..."). Doing it + // with high parallelism should test the thread safety of the concurrent interning map. + ForkJoinPool pool = new ForkJoinPool(numThreads); + RangeTree combined; + try { + combined = pool + .submit(() -> ranges.parallelStream().reduce(RangeTree.empty(), RangeTree::union)) + .get(); + } finally { + // Always release the worker threads, even if the reduction fails. + pool.shutdown(); + } + assertThat(combined).isEqualTo(ranges(Strings.repeat("x", numDigits))); + } + + // Value type recording one visited DFA edge: its source node, target node and digit mask. + @AutoValue + abstract static class Edge { + static Edge of(DfaNode source, DfaNode target, DfaEdge edge) { + return new AutoValue_RangeTreeTest_Edge(source, target, edge.getDigitMask()); + } + abstract DfaNode source(); + abstract DfaNode target(); + abstract int digitMask(); + } + + // Range tree visitor that captures edges visited (in depth first order) + private static final class TestVisitor implements DfaVisitor { + List<Edge> visited = new ArrayList<>(); + + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + visited.add(Edge.of(source, target, edge)); + // Recurse into the target so that edges are recorded depth first. + target.accept(this); + } + } + + RangeTree ranges(String... 
s) { + return RangeTree.from(specs(s)); + } + + private static RangeSpecification spec(String s) { + return RangeSpecification.parse(s); + } + + private static List specs(String... s) { + return Stream.of(s).map(RangeSpecification::parse).collect(toImmutableList()); + } + + private static Range range(String lo, String hi) { + return Range.closed(DigitSequence.of(lo), DigitSequence.of(hi)).canonical(domain()); + } + + private static RangeSet rangeSetOf(Range... r) { + return ImmutableRangeSet.copyOf(asList(r)); + } + + private static RangeTree singletonRange(DigitSequence s) { + return RangeTree.from(spec(s.toString())); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegionTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegionTest.java new file mode 100644 index 000000000..251d483f0 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/PhoneRegionTest.java @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.i18n; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.common.truth.Truth8.assertThat; +import static org.junit.Assert.assertThrows; + +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class PhoneRegionTest { + @Test + public void testOrdering() { + assertThat(Stream.of(r("US"), r("GB"), r("AE"), r("001"), r("KR"), r("MN")).sorted()) + .containsAtLeast(r("AE"), r("GB"), r("KR"), r("MN"), r("US"), r("001")) + .inOrder(); + } + + @Test + public void testWorld() { + assertThat(PhoneRegion.getWorld()).isEqualTo(r("001")); + } + + @Test + public void testBadArgs() { + assertThat(assertThrows(IllegalArgumentException.class, () -> PhoneRegion.of("ABC"))) + .hasMessageThat() + .contains("ABC"); + assertThat(assertThrows(IllegalArgumentException.class, () -> PhoneRegion.of("us"))) + .hasMessageThat() + .contains("us"); + assertThat(assertThrows(IllegalArgumentException.class, () -> PhoneRegion.of("000"))) + .hasMessageThat() + .contains("000"); + } + + private static PhoneRegion r(String cldrCode) { + return PhoneRegion.of(cldrCode); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTagTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTagTest.java new file mode 100644 index 000000000..d7706e62c --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/i18n/SimpleLanguageTagTest.java @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.i18n; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class SimpleLanguageTagTest { + @Test + public void testSimple() { + assertThat(SimpleLanguageTag.of("en").toString()).isEqualTo("en"); + assertThat(SimpleLanguageTag.of("zh_Hant").toString()).isEqualTo("zh-Hant"); + } + + @Test + public void testBadArgs() { + assertThat(assertThrows(IllegalArgumentException.class, () -> SimpleLanguageTag.of("x"))) + .hasMessageThat().contains("x"); + assertThat(assertThrows(IllegalArgumentException.class, () -> SimpleLanguageTag.of("EN"))) + .hasMessageThat().contains("EN"); + assertThat(assertThrows(IllegalArgumentException.class, () -> SimpleLanguageTag.of("003"))) + .hasMessageThat().contains("003"); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpecTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpecTest.java new file mode 100644 index 000000000..12e4edd84 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatSpecTest.java @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.common.truth.Truth8.assertThat; +import static org.junit.Assert.assertThrows; + +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.model.FormatSpec.FormatTemplate; +import java.util.Optional; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class AltFormatSpecTest { + @Test + public void testSimple() { + FormatTemplate template = FormatTemplate.parse("XXXX XXXX"); + RangeSpecification prefix = RangeSpecification.parse("123"); + + AltFormatSpec spec = AltFormatSpec.create(template, prefix, "foo", Optional.of("Comment")); + + assertThat(spec.template()).isEqualTo(template); + assertThat(spec.prefix()).isEqualTo(prefix); + assertThat(spec.parentFormatId()).isEqualTo("foo"); + assertThat(spec.comment()).hasValue("Comment"); + assertThat(spec.specifier()).isEqualTo("123X XXXX"); + } + + @Test + public void testGoodTemplateAndPrefix() { + assertGoodTemplateAndPrefix("XXX XXX", "", "XXX XXX"); + assertGoodTemplateAndPrefix("XXX XXX", "123", "123 XXX"); + assertGoodTemplateAndPrefix("XXX XXX", "1234", "123 4XX"); + assertGoodTemplateAndPrefix("XXX XXX", "123456", "123 456"); + assertGoodTemplateAndPrefix("XXX XXX**", "123", "123 XXX**"); + assertGoodTemplateAndPrefix("XXX XXX", "12[3-6]", "12[3-6] XXX"); + assertGoodTemplateAndPrefix("XXX XXX", "1x3", "1X3 XXX"); + } + + @Test + 
public void testBadTemplateOrPrefix() { + // Prefix too long. + assertBadTemplateAndPrefix("XXXX", "12345"); + // Prefix too long for min length. + assertBadTemplateAndPrefix("XXXX**", "12345"); + // Bad template chars. + assertBadTemplateAndPrefix("XXX-XXX", "123"); + // Extra whitespace. + assertBadTemplateAndPrefix(" XXXXXX", "123"); + // Prefix must not end with "any digit". + assertBadTemplateAndPrefix(" XXXXXX", "123xx"); + } + + private static void assertGoodTemplateAndPrefix(String template, String prefix, String spec) { + FormatTemplate t = FormatTemplate.parse(template); + RangeSpecification p = RangeSpecification.parse(prefix); + assertThat(AltFormatSpec.create(t, p, "foo", Optional.empty()).specifier()).isEqualTo(spec); + } + + private static void assertBadTemplateAndPrefix(String template, String prefix) { + FormatTemplate t = FormatTemplate.parse(template); + RangeSpecification p = RangeSpecification.parse(prefix); + assertThrows(IllegalArgumentException.class, + () -> AltFormatSpec.create(t, p, "foo", Optional.empty())); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchemaTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchemaTest.java new file mode 100644 index 000000000..ae4855203 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/AltFormatsSchemaTest.java @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.truth.Truth.assertThat; + +import com.google.common.base.CharMatcher; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; +import java.util.Arrays; +import java.util.List; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class AltFormatsSchemaTest { + + @Test + public void testSimple_export() throws IOException { + assertThat( + exportCsv( + altFormat("123 XXX XXXX", "foo", "Hello World"))) + .containsExactly( + "Format ; Parent Format ; Comment", + "123 XXX XXXX ; foo ; \"Hello World\"") + .inOrder(); + } + + @Test + public void testSimple_import() throws IOException { + assertThat( + importCsv( + "Format ; Parent Format ; Comment", + "123 XXX XXXX ; foo ; \"Hello World\"")) + .containsExactly( + altFormat("123 XXX XXXX", "foo", "Hello World")); + } + + @Test + public void testEscapedText_export() throws IOException { + assertThat( + exportCsv( + altFormat("123 XXX XXXX", "foo", "\tHello\nWorld\\"))) + .containsExactly( + "Format ; Parent Format ; Comment", + "123 XXX XXXX ; foo ; \"\\tHello\\nWorld\\\\\"") + .inOrder(); + } + + @Test + public void testEscapedText_import() throws IOException { + assertThat( + importCsv( + "Format ; Parent Format ; Comment", + "123 XXX XXXX ; foo ; \"\\tHello\\nWorld\\\\\"")) + .containsExactly( + altFormat("123 XXX XXXX", "foo", "\tHello\nWorld\\")); + } + + @Test + public void testRetainsExplicitOrdering() throws IOException { + assertThat( + exportCsv( + altFormat("123 XXXXXX", "foo", "First"), + altFormat("XX XXXX", "bar", "Second"), + altFormat("9X XXX XXX", "baz", "Third"))) + 
.containsExactly( + "Format ; Parent Format ; Comment", + "123 XXXXXX ; foo ; \"First\"", + "XX XXXX ; bar ; \"Second\"", + "9X XXX XXX ; baz ; \"Third\"") + .inOrder(); + } + + private AltFormatSpec altFormat(String spec, String parentId, String comment) { + return AltFormatsSchema.parseAltFormat(spec, parentId, comment); + } + + private static List exportCsv(AltFormatSpec... altFormats) throws IOException { + try (StringWriter out = new StringWriter()) { + AltFormatsSchema.exportCsv(out, Arrays.asList(altFormats)); + // Ignore trailing empty lines. + return Splitter.on('\n').splitToList(CharMatcher.is('\n').trimTrailingFrom(out.toString())); + } + } + + private static ImmutableList importCsv(String... lines) + throws IOException { + // Add a trailing newline, since that's what we expect in the real CSV files. + StringReader file = new StringReader(Joiner.on('\n').join(lines) + "\n"); + return AltFormatsSchema.importAltFormats(file); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchemaTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchemaTest.java new file mode 100644 index 000000000..bf9735854 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/CommentsSchemaTest.java @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment.anchor; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_FIXED_LINE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_MOBILE; + +import com.google.common.base.CharMatcher; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.ImmutableList; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment.Anchor; +import java.io.IOException; +import java.io.StringReader; +import java.io.StringWriter; +import java.util.Arrays; +import java.util.List; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CommentsSchemaTest { + private static final PhoneRegion REGION_US = PhoneRegion.of("US"); + private static final PhoneRegion REGION_CA = PhoneRegion.of("CA"); + + private static final Anchor US_TOP = Comment.anchor(REGION_US); + private static final Anchor US_FIXED_LINE = anchor(REGION_US, XML_FIXED_LINE); + private static final Anchor US_MOBILE = anchor(REGION_US, XML_MOBILE); + private static final Anchor US_SHORTCODE = Comment.shortcodeAnchor(REGION_US); + private static final Anchor CA_FIXED_LINE = anchor(REGION_CA, XML_FIXED_LINE); + + @Test + public void testSimple_export() throws IOException { + assertThat( + exportCsv( + comment(US_FIXED_LINE, "Hello World"))) + .containsExactly( + "Region ; Label ; Comment", + "US ; XML_FIXED_LINE ; \"Hello World\"") + .inOrder(); + } + + @Test + public void testSimple_import() throws IOException { + assertThat( + importCsv( + "Region ; Label ; Comment", + "US ; XML_FIXED_LINE 
; \"Hello World\"")) + .containsExactly( + comment(US_FIXED_LINE, "Hello World")); + } + + @Test + public void testEscapedText_export() throws IOException { + assertThat( + exportCsv( + comment(US_FIXED_LINE, "\tHello", "World\\"))) + .containsExactly( + "Region ; Label ; Comment", + "US ; XML_FIXED_LINE ; \"\\tHello\\nWorld\\\\\"") + .inOrder(); + } + + @Test + public void testEscapedText_import() throws IOException { + assertThat( + importCsv( + "Region ; Label ; Comment", + "US ; XML_FIXED_LINE ; \"\\tHello\\nWorld\\\\\"")) + .containsExactly( + comment(US_FIXED_LINE, "\tHello", "World\\")); + } + + @Test + public void testOrdering_export() throws IOException { + assertThat( + exportCsv( + comment(US_FIXED_LINE, "First"), + comment(US_FIXED_LINE, "Second"), + comment(US_FIXED_LINE, "Third"), + comment(US_TOP, "Top Level Comment"), + comment(US_SHORTCODE, "Shortcode Comment"), + comment(US_MOBILE, "Other Type"), + comment(CA_FIXED_LINE, "Other Region"))) + .containsExactly( + "Region ; Label ; Comment", + "CA ; XML_FIXED_LINE ; \"Other Region\"", + "US ; SC ; \"Shortcode Comment\"", + "US ; XML ; \"Top Level Comment\"", + "US ; XML_FIXED_LINE ; \"First\"", + "US ; XML_FIXED_LINE ; \"Second\"", + "US ; XML_FIXED_LINE ; \"Third\"", + "US ; XML_MOBILE ; \"Other Type\"") + .inOrder(); + } + + @Test + public void testOrdering_import() throws IOException { + assertThat( + importCsv( + "Region ; Label ; Comment", + "US ; XML_FIXED_LINE ; \"First\"", + "US ; XML_FIXED_LINE ; \"Second\"", + "US ; XML_FIXED_LINE ; \"Third\"", + "US ; XML ; \"Top Level Comment\"", + "US ; SC ; \"Shortcode Comment\"", + "US ; XML_MOBILE ; \"Other Type\"", + "CA ; XML_FIXED_LINE ; \"Other Region\"")) + .containsExactly( + comment(CA_FIXED_LINE, "Other Region"), + comment(US_SHORTCODE, "Shortcode Comment"), + comment(US_TOP, "Top Level Comment"), + comment(US_FIXED_LINE, "First"), + comment(US_FIXED_LINE, "Second"), + comment(US_FIXED_LINE, "Third"), + comment(US_MOBILE, "Other Type")) + 
.inOrder(); + } + + private Comment comment(Anchor a, String... lines) { + return Comment.create(a, Arrays.asList(lines)); + } + + private static List exportCsv(Comment... comments) throws IOException { + try (StringWriter out = new StringWriter()) { + CommentsSchema.exportCsv(out, Arrays.asList(comments)); + // Ignore trailing empty lines. + return Splitter.on('\n').splitToList(CharMatcher.is('\n').trimTrailingFrom(out.toString())); + } + } + + private static ImmutableList importCsv(String... lines) + throws IOException { + // Add a trailing newline, since that's what we expect in the real CSV files. + StringReader file = new StringReader(Joiner.on('\n').join(lines) + "\n"); + return CommentsSchema.importComments(file); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/FormatSpecTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/FormatSpecTest.java new file mode 100644 index 000000000..5b7b5d6d0 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/model/FormatSpecTest.java @@ -0,0 +1,160 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.model; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.common.truth.Truth8.assertThat; +import static java.util.Optional.empty; +import static org.junit.Assert.assertThrows; + +import com.google.i18n.phonenumbers.metadata.model.FormatSpec.FormatTemplate; +import java.util.Optional; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class FormatSpecTest { + @Test + public void testCreate_national() { + national("XXXX"); + national("XXX***"); + national("#XXX XXX"); + national("(#XXX) XX**-XXX"); + assertThat(national("XX\\XXX").national().skeleton()).isEqualTo("$1X$2"); + } + + @Test + public void testCreate_international() { + // The international spec can be a duplicate (signifies international formatting is permitted). + international("XXX XXXX", "XXX XXXX"); + // Or it can be different (including grouping and separators). + international("(#XXX) XXXX", "XXX-XXXX"); + } + + @Test + public void testCreate_carrier() { + carrier("# XXX XXXX", "# @ XXX XXXX"); + carrier("XXX XXXX", "@ XXX XXXX"); + // Carrier and national prefix can differ on whether national prefix is needed. + carrier("XXX XXXX", "#@ XXX XXXX"); + } + + @Test + public void testCreate_national_bad() { + assertThrows(IllegalArgumentException.class, () -> national("")); + assertThrows(IllegalArgumentException.class, () -> national("Hello")); + assertThrows(IllegalArgumentException.class, () -> national("$1")); + assertThrows(IllegalArgumentException.class, () -> national("XX**XX")); + assertThrows(IllegalArgumentException.class, () -> national("****")); + assertThrows(IllegalArgumentException.class, () -> national("@ XXX")); + } + + @Test + public void testCreate_international_bad() { + // National prefix is not allowed. + assertThrows(IllegalArgumentException.class, () -> international("#XXXX", "#XXXX")); + // Groups must match. 
+ assertThrows(IllegalArgumentException.class, () -> international("# XXXX", "XX XX")); + assertThrows(IllegalArgumentException.class, () -> international("# XXXX", "XXX")); + } + + @Test + public void testCreate_carrier_bad() { + // Carrier specs must have '@' present. + assertThrows(IllegalArgumentException.class, () -> carrier("XXX XXXX", "XXX XXXX")); + // Carrier specs cannot differ after the first group (including separator). + assertThrows(IllegalArgumentException.class, () -> carrier("#XXX XXXX", "#@XXX-XXXX")); + // National prefix (if present) must come first (if this is ever relaxed, we would need to + // change how carrier prefixes are handled and how nationalPrefixForParsing is generated). + assertThrows(IllegalArgumentException.class, () -> carrier("# XXX XXXX", "@# XXX XXXX")); + } + + @Test + public void testTemplate_splitPrefix() { + FormatTemplate t = FormatTemplate.parse("(#) XXX - XXX**"); + assertThat(t.getXmlCapturingPattern()).isEqualTo("(\\d{3})(\\d{3,5})"); + assertThat(t.getXmlFormat()).isEqualTo("$1 - $2"); + assertThat(t.getXmlPrefix()).hasValue("($NP) $FG"); + assertThat(t.hasNationalPrefix()).isTrue(); + assertThat(t.hasCarrierCode()).isFalse(); + } + + @Test + public void testTemplate_noPrefix() { + FormatTemplate t = FormatTemplate.parse("XXX XX-XX"); + assertThat(t.getXmlCapturingPattern()).isEqualTo("(\\d{3})(\\d{2})(\\d{2})"); + assertThat(t.getXmlFormat()).isEqualTo("$1 $2-$3"); + assertThat(t.getXmlPrefix()).isEmpty(); + assertThat(t.hasNationalPrefix()).isFalse(); + assertThat(t.hasCarrierCode()).isFalse(); + } + + @Test + public void testTemplate_replacementNoNationalPrefix() { + FormatTemplate t = FormatTemplate.parse("{XXX>123} XX-XX"); + assertThat(t.getXmlCapturingPattern()).isEqualTo("(\\d{3})(\\d{2})(\\d{2})"); + assertThat(t.getXmlFormat()).isEqualTo("$2-$3"); + assertThat(t.getXmlPrefix()).hasValue("123 $FG"); + assertThat(t.hasNationalPrefix()).isFalse(); + assertThat(t.hasCarrierCode()).isFalse(); + } + + @Test + 
public void testTemplate_replacementWithNationalPrefix() { + FormatTemplate t = FormatTemplate.parse("#{XXX>123} XX-XX"); + assertThat(t.getXmlCapturingPattern()).isEqualTo("(\\d{3})(\\d{2})(\\d{2})"); + assertThat(t.getXmlFormat()).isEqualTo("$2-$3"); + assertThat(t.getXmlPrefix()).hasValue("$NP123 $FG"); + assertThat(t.hasNationalPrefix()).isTrue(); + assertThat(t.hasCarrierCode()).isFalse(); + } + + @Test + public void testTemplate_replacementNotFirstGroup() { + FormatTemplate t = FormatTemplate.parse("XXX {XX>ABC} XX"); + assertThat(t.getXmlCapturingPattern()).isEqualTo("(\\d{3})(\\d{2})(\\d{2})"); + assertThat(t.getXmlFormat()).isEqualTo("$1 ABC $3"); + assertThat(t.getXmlPrefix()).isEmpty(); + assertThat(t.hasNationalPrefix()).isFalse(); + assertThat(t.hasCarrierCode()).isFalse(); + } + + @Test + public void testTemplate_removeFirstGroupViaReplacement() { + // This test is very important for Argentina, where the leading group must be removed (and a + // different mobile token is used after the area code). 
+ FormatTemplate t = FormatTemplate.parse("{XX>}XXX XXXX"); + assertThat(t.getXmlCapturingPattern()).isEqualTo("(\\d{2})(\\d{3})(\\d{4})"); + assertThat(t.getXmlFormat()).isEqualTo("$2 $3"); + assertThat(t.getXmlPrefix()).isEmpty(); + assertThat(t.hasNationalPrefix()).isFalse(); + assertThat(t.hasCarrierCode()).isFalse(); + } + + + private static FormatSpec national(String national) { + return FormatSpec.of(national, empty(), empty(), empty(), false, empty()); + } + + private static FormatSpec international(String national, String intl) { + return FormatSpec.of(national, empty(), Optional.of(intl), empty(), false, empty()); + } + + private static FormatSpec carrier(String national, String carrier) { + return FormatSpec.of(national, Optional.of(carrier), empty(), empty(), false, empty()); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/AssignmentTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/AssignmentTest.java new file mode 100644 index 000000000..b27ae9643 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/AssignmentTest.java @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.common.truth.Truth8.assertThat; +import static org.junit.Assert.assertThrows; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class AssignmentTest { + private static final Column COL_A = Column.ofString("A"); + private static final Column COL_B = Column.ofString("B"); + private static final Column COL_X = Column.ofUnsignedInteger("X"); + + private static final Schema SCHEMA = Schema.builder().add(COL_A).add(COL_B).add(COL_X).build(); + + @Test + public void testParsing() { + assertAssignment(Assignment.parse("A=foo", SCHEMA), COL_A, "foo"); + assertAssignment(Assignment.parse(" B = bar ", SCHEMA), COL_B, "bar"); + assertUnassignment(Assignment.parse("A=", SCHEMA), COL_A); + assertAssignment(Assignment.parse("X=23", SCHEMA), COL_X, 23); + assertThrows(IllegalArgumentException.class, () -> Assignment.parse("C=Nope", SCHEMA)); + assertThrows(IllegalArgumentException.class, () -> Assignment.parse("X=NaN", SCHEMA)); + } + + @Test + public void testOf() { + assertAssignment(Assignment.of(COL_A, "foo"), COL_A, "foo"); + assertThat(Assignment.of(COL_A, "foo")).isNotEqualTo(Assignment.of(COL_A, "bar")); + assertThat(Assignment.of(COL_A, "")).isNotEqualTo(Assignment.of(COL_B, "")); + assertThat(Assignment.of(COL_A, COL_A.defaultValue())).isNotEqualTo(Assignment.unassign(COL_A)); + assertThrows(NullPointerException.class, () -> Assignment.of(COL_A, null)); + } + + @Test + public void testUnassign() { + // Not much else to do here... 
+ assertThat(Assignment.unassign(COL_A)).isEqualTo(Assignment.unassign(COL_A)); + assertUnassignment(Assignment.unassign(COL_A), COL_A); + } + + private static > void assertAssignment( + Assignment a, Column c, T v) { + assertThat(a.column()).isSameInstanceAs(c); + assertThat(a.value()).hasValue(v); + } + + private static void assertUnassignment(Assignment a, Column c) { + assertThat(a.column()).isSameInstanceAs(c); + assertThat(a.value()).isEmpty(); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ChangeTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ChangeTest.java new file mode 100644 index 000000000..0bd62427c --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ChangeTest.java @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.testing.RangeTreeSubject.assertThat; +import static java.util.Arrays.asList; +import static org.junit.Assert.assertThrows; + +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import java.util.Arrays; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class ChangeTest { + private static final Column COL_A = Column.ofString("A"); + private static final Column COL_B = Column.ofString("B"); + + @Test + public void testEmpty() { + assertThat(Change.empty().getRanges()).isEmpty(); + assertThat(Change.empty().getAssignments()).isEmpty(); + // Not all "no-op" changes are equal to the "empty" change (unlike RangeTree). This should be + // fine however since Changes are expected to have a very short lifecycle in most code and not + // be used as keys in maps etc... + assertThat(Change.empty()) + .isNotEqualTo(Change.builder(RangeTree.empty()).assign(COL_A, "foo").build()); + assertThat(Change.empty()).isNotEqualTo(Change.builder(ranges("12xxxx")).build()); + } + + @Test + public void testBuilder() { + Change c = Change.builder(ranges("12xxxx")).assign(COL_A, "foo").assign(COL_B, "bar").build(); + assertThat(c.getRanges()).containsExactly("12xxxx"); + Assignment assignFoo = Assignment.of(COL_A, "foo"); + Assignment assignBar = Assignment.of(COL_B, "bar"); + assertThat(c.getAssignments()).containsExactly(assignFoo, assignBar); + assertThat(c).isEqualTo(Change.of(ranges("12xxxx"), asList(assignFoo, assignBar))); + // Don't allow same column twice (this could be relaxed in future if necessary)! 
+ assertThrows(IllegalArgumentException.class, + () -> Change.builder(ranges("12xxxx")).assign(COL_A, "foo").assign(COL_A, "bar").build()); + } + + @Test + public void testBuilderUnassignment() { + Change c = Change.builder(ranges("12xxxx")).unassign(COL_A).build(); + Assignment unassign = Assignment.unassign(COL_A); + assertThat(c.getAssignments()).containsExactly(unassign); + assertThat(c).isEqualTo(Change.of(ranges("12xxxx"), asList(unassign))); + } + + private static RangeTree ranges(String... rangeSpecs) { + return RangeTree.from(Arrays.stream(rangeSpecs).map(RangeSpecification::parse)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroupTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroupTest.java new file mode 100644 index 000000000..6d9b577cc --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnGroupTest.java @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; + +import com.google.common.collect.ImmutableSet; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class ColumnGroupTest { + @Test + public void testGroupColumns() { + Column prototype = Column.ofBoolean("Region"); + ColumnGroup group = ColumnGroup.byRegion(prototype); + + Column us = group.getColumnFromId("US"); + assertThat(us.getName()).isEqualTo("Region:US"); + assertThat(us.type()).isEqualTo(Boolean.class); + + Column ca = group.getColumn(PhoneRegion.of("CA")); + assertThat(ca.getName()).isEqualTo("Region:CA"); + + // Only the suffix part should be given to get the column from the group. + assertThrows(IllegalArgumentException.class, () -> group.getColumnFromId("Region:US")); + } + + @Test + public void testExtractGroupColumns() { + Column first = Column.ofString("FirstColumn"); + Column last = Column.ofString("LastColumn"); + Column prototype = Column.ofBoolean("Region"); + ColumnGroup group = ColumnGroup.byRegion(prototype); + Column us = group.getColumnFromId("US"); + Column ca = group.getColumn(PhoneRegion.of("CA")); + + // The prototype is a valid column, but it's not part of its own group. 
+ assertThat(group.extractGroupColumns(ImmutableSet.of(first, us, prototype, ca, last))) + .containsExactly(PhoneRegion.of("US"), us, PhoneRegion.of("CA"), ca).inOrder(); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnTest.java new file mode 100644 index 000000000..b46fad1e5 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/ColumnTest.java @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.FIXED_LINE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.UNKNOWN; +import static com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType.XML_UNKNOWN; +import static java.lang.Boolean.FALSE; +import static java.lang.Boolean.TRUE; +import static org.junit.Assert.assertThrows; + +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.proto.Types.XmlNumberType; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class ColumnTest { + @Test + public void testBooleanColumn() { + Column column = Column.ofBoolean("bool"); + assertThat(column.getName()).isEqualTo("bool"); + assertThat(column.type()).isEqualTo(Boolean.class); + assertThat(column.cast(true)).isTrue(); + assertThrows(ClassCastException.class, () -> column.cast("")); + // All upper or all lower case are accepted. + assertThat(column.parse("true")).isTrue(); + assertThat(column.parse("false")).isFalse(); + assertThat(column.parse("TRUE")).isTrue(); + assertThat(column.parse("FALSE")).isFalse(); + assertThat(column.serialize(TRUE)).isEqualTo("true"); + assertThat(column.serialize(FALSE)).isEqualTo("false"); + // We're lenient, but not that lenient. 
+ assertThrows(IllegalArgumentException.class, () -> column.parse("TruE")); + assertThrows(IllegalArgumentException.class, () -> column.parse("FaLse")); + assertThrows(IllegalArgumentException.class, () -> Column.ofBoolean("Foo:Bar")); + } + + @Test + public void testStringColumn() { + Column column = Column.ofString("string"); + assertThat(column.getName()).isEqualTo("string"); + assertThat(column.type()).isEqualTo(String.class); + assertThat(column.cast("hello")).isEqualTo("hello"); + assertThat(column.parse("")).isNull(); + assertThrows(ClassCastException.class, () -> column.cast(true)); + // Anything other than the empty string is permitted. + assertThat(column.parse("world")).isEqualTo("world"); + assertThat(column.serialize("world")).isEqualTo("world"); + // Unquoted whitespace is stripped. + assertThat(column.parse(" world ")).isEqualTo("world"); + // You can preserve whitespace by surrounding the string with double quotes. + assertThat(column.parse("\" world \"")).isEqualTo(" world "); + assertThat(column.serialize(" world ")).isEqualTo("\" world \""); + // And null is always the empty string. + assertThat(column.serialize(null)).isEqualTo(""); + assertThrows(IllegalArgumentException.class, () -> Column.ofString("Foo:Bar")); + } + + @Test + public void testEnumColumn() { + Column column = Column.of(ValidNumberType.class, "type", UNKNOWN); + assertThat(column.getName()).isEqualTo("type"); + assertThat(column.type()).isEqualTo(ValidNumberType.class); + assertThat(column.cast(FIXED_LINE)).isEqualTo(FIXED_LINE); + assertThrows(ClassCastException.class, () -> column.cast("")); + + // Several case formats are supported. + assertThat(column.parse("FIXED_LINE")).isEqualTo(FIXED_LINE); + assertThat(column.parse("fixed_line")).isEqualTo(FIXED_LINE); + assertThat(column.parse("fixedLine")).isEqualTo(FIXED_LINE); + + // We're lenient, but not that lenient. 
+ assertThrows(IllegalArgumentException.class, () -> column.parse("fIxEdLiNe")); + assertThrows(IllegalArgumentException.class, + () -> Column.of(XmlNumberType.class, "Foo:Bar", XML_UNKNOWN)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvParserTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvParserTest.java new file mode 100644 index 000000000..14cf0e67d --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvParserTest.java @@ -0,0 +1,177 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.table.CsvParser.rowMapper; +import static org.junit.Assert.assertThrows; + +import com.google.common.collect.ImmutableMap; +import com.google.i18n.phonenumbers.metadata.table.CsvParser.RowMapper; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CsvParserTest { + @Test + public void testSimple() { + // Simplest case. 
+ assertSingleRow(CsvParser.commaSeparated(), "Hello,World!", "Hello", "World!"); + + // Empty row yields one empty value in the "first column" (matches behaviour with quoting). + assertSingleRow(CsvParser.commaSeparated(), "", ""); + assertSingleRow(CsvParser.commaSeparated(), "\"\"", ""); + + // Trailing delimiter yields a trailing empty value (matches behaviour with quoting). + assertSingleRow(CsvParser.commaSeparated(), "foo,", "foo", ""); + assertSingleRow(CsvParser.commaSeparated(), "foo,\"\"", "foo", ""); + } + + @Test + public void testOtherDelimiters() { + // Tabs sequences are not "folded" (maybe this could be an option?) + assertSingleRow(CsvParser.tabSeparated(), "Hello\t\tWorld!", "Hello", "", "World!"); + assertSingleRow(CsvParser.withSeparator(';'), "Hello;World!", "Hello", "World!"); + } + + @Test + public void testWhitespaceTrimming() { + // Whitespace is preserved by default, but can be trimmed. + assertSingleRow(CsvParser.commaSeparated(), + " foo, bar, baz ", " foo", " bar", " baz "); + assertSingleRow(CsvParser.commaSeparated().trimWhitespace(), + " foo, bar, baz ", "foo", "bar", "baz"); + assertSingleRow(CsvParser.commaSeparated().trimWhitespace(), + " foo, , ", "foo", "", ""); + + } + + @Test + public void testQuoting() { + // Quoting works as expected (and combines with whitespace trimming). + assertSingleRow(CsvParser.commaSeparated(), + "\"foo\",\"\"\"bar, baz\"\"\"", "foo", "\"bar, baz\""); + assertSingleRow(CsvParser.commaSeparated().trimWhitespace(), + " \"foo\" , \"\"\"bar, baz\"\"\" ", "foo", "\"bar, baz\""); + } + + @Test + public void testQuoting_illegal() { + // Without whitespace trimming any quotes in "unquoted" values are not permitted. 
+ assertThrows(IllegalArgumentException.class, () -> + parse(CsvParser.commaSeparated(), "foo, \"bar, baz\"")); + } + + @Test + public void testDelimiter() { + assertSingleRow(CsvParser.tabSeparated(), "Hello\tWorld!", "Hello", "World!"); + assertSingleRow(CsvParser.withSeparator(';'), "Hello;World!", "Hello", "World!"); + } + + @Test + public void testUnicode() { + assertSingleRow(CsvParser.withSeparator('-'), "😱-😂-💩", "😱", "😂", "💩"); + assertSingleRow(CsvParser.commaSeparated(), "\0,😱😂,\n", "\0", "😱😂", "\n"); + // Fun fact, not all ISO control codes count as "whitespace". + assertSingleRow(CsvParser.commaSeparated().trimWhitespace(), "\0,😱😂,\n", "\0", "😱😂", ""); + } + + @Test + public void testMultiline() { + // Newlines become literals in quoted values. + List> rows = parse(CsvParser.commaSeparated().allowMultiline(), + "foo,\"Hello,", + "World!\""); + assertThat(rows).hasSize(1); + assertThat(rows.get(0)).containsExactly("foo", "Hello,\nWorld!").inOrder(); + } + + @Test + public void testMultilineWithTrimming() { + List> rows = parse( + CsvParser.commaSeparated().allowMultiline().trimWhitespace(), + " foo , \" Hello,", + "World! \" "); + assertThat(rows).hasSize(1); + assertThat(rows.get(0)).containsExactly("foo", " Hello,\nWorld! ").inOrder(); + } + + @Test + public void testMultiline_illegal() { + // If not configured for multiline values, this is an unterminated quoted value. 
+ assertThrows(IllegalArgumentException.class, () -> + parse(CsvParser.commaSeparated(), "foo,\"Hello,", "World!\"")); + // This fails because no more lines exist (even if multiline is allowed) + assertThrows(IllegalArgumentException.class, () -> + parse(CsvParser.commaSeparated().allowMultiline(), "foo,\"Hello,")); + } + + @Test + public void testRowMapping() { + List> rows = parseMap( + CsvParser.commaSeparated(), + rowMapper(), + "FOO,BAR", + "foo,bar", + "Hello,World!", + "No Trailing,", + ",", + ""); + assertThat(rows).hasSize(5); + assertThat(rows.get(0)).containsExactly("FOO", "foo", "BAR", "bar").inOrder(); + assertThat(rows.get(1)).containsExactly("FOO", "Hello", "BAR", "World!").inOrder(); + assertThat(rows.get(2)).containsExactly("FOO", "No Trailing").inOrder(); + assertThat(rows.get(3)).isEmpty(); + assertThat(rows.get(4)).isEmpty(); + } + + @Test + public void testRowMapping_withHeader() { + List header = new ArrayList<>(); + List> rows = parseMap( + CsvParser.commaSeparated(), + rowMapper(header::addAll), + "FOO,BAR", + "foo,bar"); + assertThat(rows).hasSize(1); + assertThat(header).containsExactly("FOO", "BAR").inOrder(); + assertThat(rows.get(0)).containsExactly("FOO", "foo", "BAR", "bar").inOrder(); + } + + private void assertSingleRow(CsvParser parser, String line, String... values) { + List> rows = parse(parser, line); + assertThat(rows).hasSize(1); + assertThat(rows.get(0)).containsExactlyElementsIn(values).inOrder(); + } + + private static List> parse(CsvParser parser, String... lines) { + List> rows = new ArrayList<>(); + parser.parse(Stream.of(lines), r -> rows.add(r.collect(toImmutableList()))); + return rows; + } + + private static List> parseMap( + CsvParser p, RowMapper mapper, String... 
lines) { + List> rows = new ArrayList<>(); + p.parse(Stream.of(lines), mapper.mapTo(rows::add)); + return rows; + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvTableTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvTableTest.java new file mode 100644 index 000000000..6817d25e9 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/CsvTableTest.java @@ -0,0 +1,275 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.AREA_CODE_LENGTH; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.COMMENT; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtType.FIXED_LINE; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtType.FIXED_LINE_OR_MOBILE; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtType.MOBILE; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.FORMAT; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.REGIONS; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.TABLE_COLUMNS; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.TYPE; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.toCsv; +import static com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.toRangeTable; +import static com.google.i18n.phonenumbers.metadata.table.CsvTable.DiffMode.ALL; +import static com.google.i18n.phonenumbers.metadata.table.CsvTable.DiffMode.CHANGES; +import static com.google.i18n.phonenumbers.metadata.table.CsvTable.DiffMode.LHS; +import static com.google.i18n.phonenumbers.metadata.table.CsvTable.DiffMode.RHS; +import static org.junit.Assert.assertThrows; + +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Table; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import 
com.google.i18n.phonenumbers.metadata.model.ExamplesTableSchema; +import com.google.i18n.phonenumbers.metadata.model.ExamplesTableSchema.ExampleNumberKey; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import java.io.IOException; +import java.io.PrintWriter; +import java.io.StringReader; +import java.io.StringWriter; +import java.util.Optional; +import java.util.stream.IntStream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CsvTableTest { + private static final CsvKeyMarshaller TEST_MARSHALLER = + CsvKeyMarshaller.ofSortedString("Id"); + + private static final Column REGION_CA = REGIONS.getColumn(PhoneRegion.of("CA")); + private static final Column REGION_US = REGIONS.getColumn(PhoneRegion.of("US")); + + @Test + public void testRangeTableExport() throws IOException { + ImmutableList> columns = + ImmutableList.of(TYPE, AREA_CODE_LENGTH, REGION_CA, REGION_US, COMMENT); + RangeTable table = RangeTable.builder(TABLE_COLUMNS) + .apply(row(columns, key("1", 7), MOBILE, 0, true, true)) + .apply(row(columns, key("2x[34]", 7, 8), FIXED_LINE_OR_MOBILE, 0, true, null, "Foo Bar")) + .apply(row(columns, key("345", 8), FIXED_LINE, 3, true, null)) + .apply(row(columns, key("456x8", 8), FIXED_LINE, 3, null, true)) + .build(); + CsvTable csv = toCsv(table); + assertCsv(csv, + "Prefix ; Length ; Type ; Area Code Length ; Regions ; Comment", + "1 ; 7 ; MOBILE ; 0 ; \"CA,US\"", + "2x[34] ; 7,8 ; FIXED_LINE_OR_MOBILE ; 0 ; \"CA\" ; \"Foo Bar\"", + "345 ; 8 ; FIXED_LINE ; 3 ; \"CA\"", + "456x8 ; 8 ; FIXED_LINE ; 3 ; \"US\""); + assertThat(toRangeTable(csv)).isEqualTo(table); + } + + @Test + public void testExampleNumberExport() throws IOException { + Table table = HashBasedTable.create(); + table.put(PhoneRegion.of("US"), ValidNumberType.TOLL_FREE, DigitSequence.of("800123456")); + 
table.put(PhoneRegion.of("US"), ValidNumberType.PREMIUM_RATE, DigitSequence.of("945123456")); + table.put(PhoneRegion.of("CA"), ValidNumberType.MOBILE, DigitSequence.of("555123456")); + // Ordering is well defined in the CSV output. + // TODO: Consider making columns able to identify if their values need CSV escaping. + CsvTable csv = ExamplesTableSchema.toCsv(table); + assertCsv(csv, + "Region ; Type ; Number", + "CA ; MOBILE ; \"555123456\"", + "US ; TOLL_FREE ; \"800123456\"", + "US ; PREMIUM_RATE ; \"945123456\""); + assertThat(ExamplesTableSchema.toExampleTable(csv)).isEqualTo(table); + } + + @Test + public void testDiff() throws IOException { + ImmutableList> columns = ImmutableList.of(COMMENT); + RangeTable lhs = RangeTable.builder(TABLE_COLUMNS) + .apply(row(columns, key("1", 6), "Left Side Only")) + .apply(row(columns, key("3", 6), "Left Value")) + .apply(row(columns, key("4", 6), "Same Value")) + .build(); + RangeTable rhs = RangeTable.builder(TABLE_COLUMNS) + .apply(row(columns, key("2", 6), "Right Side Only")) + .apply(row(columns, key("3", 6), "Right Value")) + .apply(row(columns, key("4", 6), "Same Value")) + .build(); + assertCsv(CsvTable.diff(toCsv(lhs), toCsv(rhs), ALL), + "Diff ; Prefix ; Length ; Comment", + "---- ; 1 ; 6 ; \"Left Side Only\"", + "++++ ; 2 ; 6 ; \"Right Side Only\"", + "<<<< ; 3 ; 6 ; \"Left Value\"", + ">>>> ; 3 ; 6 ; \"Right Value\"", + "==== ; 4 ; 6 ; \"Same Value\""); + assertCsv(CsvTable.diff(toCsv(lhs), toCsv(rhs), CHANGES), + "Diff ; Prefix ; Length ; Comment", + "---- ; 1 ; 6 ; \"Left Side Only\"", + "++++ ; 2 ; 6 ; \"Right Side Only\"", + "<<<< ; 3 ; 6 ; \"Left Value\"", + ">>>> ; 3 ; 6 ; \"Right Value\""); + assertCsv(CsvTable.diff(toCsv(lhs), toCsv(rhs), LHS), + "Diff ; Prefix ; Length ; Comment", + "---- ; 1 ; 6 ; \"Left Side Only\"", + "<<<< ; 3 ; 6 ; \"Left Value\"", + "==== ; 4 ; 6 ; \"Same Value\""); + assertCsv(CsvTable.diff(toCsv(lhs), toCsv(rhs), RHS), + "Diff ; Prefix ; Length ; Comment", + "++++ ; 2 ; 6 ; 
\"Right Side Only\"", + ">>>> ; 3 ; 6 ; \"Right Value\"", + "==== ; 4 ; 6 ; \"Same Value\""); + } + + @Test + public void testEscaping() throws IOException { + ImmutableList> columns = ImmutableList.of(COMMENT); + RangeTable table = RangeTable.builder(TABLE_COLUMNS) + .apply(row(columns, key("1", 6), "Doubling \" Double Quotes")) + .apply(row(columns, key("2", 6), "Escaping \n Newlines")) + .apply(row(columns, key("3", 6), "Other \t \\ \r Escaping")) + .build(); + assertCsv(toCsv(table), + "Prefix ; Length ; Comment", + "1 ; 6 ; \"Doubling \"\" Double Quotes\"", + "2 ; 6 ; \"Escaping \\n Newlines\"", + "3 ; 6 ; \"Other \\t \\\\ \\r Escaping\""); + } + + @Test + public void testOrdering() throws IOException { + // This came up in relation to discovering that ImmutableSet.copyOf(TreeBasedTable) does not + // result in rows/columns in the order of the TreeBasedTable's column comparator. Hence the + // code does a copy via a temporary ImmutableTable.Builder. + ImmutableList> columns = + ImmutableList.of(TYPE, AREA_CODE_LENGTH, REGION_US, COMMENT); + RangeTable table = RangeTable.builder(TABLE_COLUMNS) + .apply(row(columns, key("1", 4), null, null, null, "Foo Bar")) + .apply(row(columns, key("2", 4), null, null, true)) + .apply(row(columns, key("3", 4), null, 2)) + .apply(row(columns, key("4", 4), MOBILE)) + .build(); + CsvTable csv = toCsv(table); + assertCsv( + csv, + "Prefix ; Length ; Type ; Area Code Length ; Regions ; Comment", + "1 ; 4 ; ; ; ; \"Foo Bar\"", + "2 ; 4 ; ; ; \"US\"", + "3 ; 4 ; ; 2", + "4 ; 4 ; MOBILE"); + assertThat(toRangeTable(csv)).isEqualTo(table); + } + + // This is (Jan 2019) currently impossible using ImmutableTable. 
+ @Test + public void testOptionalRowOrdering() throws IOException { + CsvKeyMarshaller unorderedIntegerMarshaller = + new CsvKeyMarshaller<>( + n -> IntStream.of(n).boxed().map(Object::toString), + p -> Integer.parseInt(p.get(0)), + Optional.empty(), + "Unordered"); + CsvSchema schema = + CsvSchema.of(unorderedIntegerMarshaller, RangesTableSchema.SCHEMA.columns()); + + CsvTable.Builder csv = CsvTable.builder(schema); + csv.putRow(4, ImmutableMap.of(COMMENT, "Foo Bar")); + csv.putRow(1, ImmutableMap.of(FORMAT, "Quux")); + csv.putRow(3, ImmutableMap.of(AREA_CODE_LENGTH, 2)); + csv.putRow(2, ImmutableMap.of(TYPE, MOBILE)); + + assertCsv( + csv.build(), + "Unordered ; Type ; Area Code Length ; Format ; Comment", + "4 ; ; ; ; \"Foo Bar\"", + "1 ; ; ; \"Quux\"", + "3 ; ; 2", + "2 ; MOBILE"); + } + + @Test + public void testUnsafeString() { + Column unsafe = Column.ofString("unsafe"); + CsvSchema schema = CsvSchema.of(TEST_MARSHALLER, Schema.builder().add(unsafe).build()); + CsvTable csv = + CsvTable.builder(schema).put("key", unsafe, "Control chars Not \0 Allowed").build(); + assertThrows(IllegalArgumentException.class, () -> export(csv, false)); + } + + private enum Perverse { + UNSAFE_VALUE() { + @Override + public String toString() { + return "Unsafe ; for \n \"CSV\""; + } + }; + } + + @Test + public void testPerverseEdgeCase() { + Column unsafe = Column.of(Perverse.class, "Unsafe", Perverse.UNSAFE_VALUE); + CsvSchema schema = CsvSchema.of(TEST_MARSHALLER, Schema.builder().add(unsafe).build()); + CsvTable csv = + CsvTable.builder(schema).put("key", unsafe, Perverse.UNSAFE_VALUE).build(); + assertThrows(IllegalArgumentException.class, () -> export(csv, false)); + } + + private static void assertCsv(CsvTable csv, String... lines) throws IOException { + String aligned = join(lines); + // Assumes test values don't contain semi-colons where space matters. 
+ String unaligned = aligned.replaceAll(" *; *", ";"); + String exported = export(csv, true); + assertThat(exported).isEqualTo(aligned); + assertThat(export(csv, false)).isEqualTo(unaligned); + CsvTable imported = CsvTable.importCsv(csv.getSchema(), new StringReader(exported)); + assertThat(csv).isEqualTo(imported); + } + + private static String export(CsvTable csv, boolean align) { + StringWriter out = new StringWriter(); + csv.exportCsv(new PrintWriter(out), align); + return out.toString(); + } + + private static Change row(ImmutableList> columns, RangeKey key, Object... values) { + Change.Builder row = Change.builder(key.asRangeTree()); + checkArgument(values.length <= columns.size()); + int n = 0; + for (Object v : values) { + if (v != null) { + Column c = columns.get(n); + row.assign(c, c.cast(v)); + } + n++; + } + return row.build(); + } + + private static String join(String... lines) { + return String.join("\n", lines) + "\n"; + } + + private static RangeKey key(String spec, Integer... lengths) { + RangeSpecification prefix = + spec.isEmpty() ? RangeSpecification.empty() : RangeSpecification.parse(spec); + return RangeKey.create(prefix, ImmutableSet.copyOf(lengths)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeKeyTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeKeyTest.java new file mode 100644 index 000000000..baa38f856 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeKeyTest.java @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.truth.Truth.assertThat; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.PrefixTree; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RangeKeyTest { + @Test + public void testEmpty() { + ImmutableList keys = RangeKey.decompose(RangeTree.empty()); + assertThat(keys).isEmpty(); + } + + @Test + public void testZeroLengthMatch() { + ImmutableList keys = RangeKey.decompose(RangeTree.from(RangeSpecification.empty())); + assertThat(keys).containsExactly(key("", 0)); + } + + @Test + public void testOnlyAnyPath() { + ImmutableList keys = RangeKey.decompose(ranges("xxx", "xxxx", "xxxxx")); + assertThat(keys).containsExactly(key("", 3, 4, 5)); + } + + @Test + public void testSimple() { + ImmutableList keys = RangeKey.decompose(ranges("123xxx", "123xxxx", "123xxxxx")); + assertThat(keys).containsExactly(key("123", 6, 7, 8)); + } + + @Test + public void testEmbeddedRanges() { + ImmutableList keys = + RangeKey.decompose(ranges("1x", "1xx", "1xx23", "1xx23x", "1xx23xx")); + assertThat(keys).containsExactly(key("1", 2, 3), key("1xx23", 5, 6, 7)).inOrder(); + } + + @Test + public void 
testSplitFactors() { + ImmutableList keys = RangeKey.decompose(ranges("123xxxx", "1234x", "1234xx")); + // If the input wasn't "factored" first, this would result in: + // key("123[0-35-9]", 7), key("1234", 5, 6, 7) + assertThat(keys).containsExactly(key("123", 7), key("1234", 5, 6)).inOrder(); + } + + @Test + public void testMergeStrategy() { + ImmutableList keys = RangeKey.decompose(ranges("12[0-4]xxx", "12xxx", "12xx")); + // The merge strategy for factorizing the tree will prefer to keep the longer paths intact + // and split shorter paths around it. Using the other strategy we would get: + // key("12", 4, 5), key("12[0-4]", 6) + assertThat(keys).containsExactly(key("12[0-4]", 4, 5, 6), key("12[5-9]", 4, 5)).inOrder(); + } + + @Test + public void testAsRangeSpecifications() { + assertThat(key("", 3, 4, 5).asRangeSpecifications()) + .containsExactly(spec("xxx"), spec("xxxx"), spec("xxxxx")).inOrder(); + assertThat(key("1[2-4]", 3, 4, 5).asRangeSpecifications()) + .containsExactly(spec("1[2-4]x"), spec("1[2-4]xx"), spec("1[2-4]xxx")).inOrder(); + assertThat(key("1x[468]", 3, 5, 7).asRangeSpecifications()) + .containsExactly(spec("1x[468]"), spec("1x[468]xx"), spec("1x[468]xxxx")).inOrder(); + } + + @Test + public void testSimpleRealWorldData() { + // From ITU German numbering plan, first few fixed line ranges. 
+ PrefixTree prefixes = + PrefixTree.from(ranges("20[1-389]", "204[135]", "205[1-468]", "206[4-6]", "20[89]")); + RangeTree ranges = prefixes.retainFrom( + ranges("xxxxxx", "xxxxxxx", "xxxxxxxx", "xxxxxxxxx", "xxxxxxxxxx", "xxxxxxxxxxx")); + ImmutableList keys = RangeKey.decompose(ranges); + assertThat(keys).containsExactly( + key("20[1-389]", 6, 7, 8, 9, 10, 11), + key("204[135]", 6, 7, 8, 9, 10, 11), + key("205[1-468]", 6, 7, 8, 9, 10, 11), + key("206[4-6]", 6, 7, 8, 9, 10, 11)) + .inOrder(); + } + + @Test + public void testContains() { + RangeKey key = key("1[23]", 7, 8, 9); + assertThat(key.contains(digitSequence("12"), 8)).isTrue(); + assertThat(key.contains(digitSequence("12"), 10)).isFalse(); + assertThat(key.contains(digitSequence("7"), 8)).isFalse(); + } + + private static RangeKey key(String spec, Integer... lengths) { + RangeSpecification prefix = + spec.isEmpty() ? RangeSpecification.empty() : RangeSpecification.parse(spec); + return RangeKey.create(prefix, ImmutableSet.copyOf(lengths)); + } + + private static RangeTree ranges(String... spec) { + return RangeTree.from(Stream.of(spec).map(RangeSpecification::parse)); + } + + private static RangeSpecification spec(String spec) { + return RangeSpecification.parse(spec); + } + + private static DigitSequence digitSequence(String spec) { + return DigitSequence.of(spec); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeTableTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeTableTest.java new file mode 100644 index 000000000..78311690a --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/RangeTableTest.java @@ -0,0 +1,412 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.FIXED_LINE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.MOBILE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.PREMIUM_RATE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.SHARED_COST; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.TOLL_FREE; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.UNKNOWN; +import static com.google.i18n.phonenumbers.metadata.testing.RangeTableSubject.assertThat; +import static com.google.i18n.phonenumbers.metadata.testing.RangeTreeSubject.assertThat; +import static java.util.stream.IntStream.rangeClosed; +import static org.junit.Assert.assertThrows; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Table; +import com.google.common.collect.Table.Cell; +import com.google.common.collect.Tables; +import com.google.i18n.phonenumbers.metadata.PrefixTree; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import 
com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import java.util.Arrays; +import java.util.Map; +import java.util.Optional; +import java.util.function.Function; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RangeTableTest { + + private static final Column TYPE = + Column.of(ValidNumberType.class, "Type", UNKNOWN); + public static final Column AREA_CODE_LENGTH = Column.ofUnsignedInteger("AreaCodeLength"); + + private static final ColumnGroup REGIONS = + ColumnGroup.byRegion(Column.ofBoolean("Region")); + private static final Column REGION_US = REGIONS.getColumn(PhoneRegion.of("US")); + private static final Column REGION_CA = REGIONS.getColumn(PhoneRegion.of("CA")); + + private static final Schema SCHEMA = + Schema.builder().add(TYPE).add(AREA_CODE_LENGTH).add(REGIONS).build(); + + // This is essentially the most "extreme" simplification you can have. All detail is removed and + // lengths are merged into a contiguous range. It's basically like turning a range into "\d{n,m}" + // For example, { "123", "12345" } becomes { "xxx", "xxxx", "xxxxx" }. 
+ private static final Function EXTREME_SIMPLIFICATION = + c -> RangeTree.from( + rangeClosed(c.getRanges().getLengths().first(), c.getRanges().getLengths().last()) + .mapToObj(RangeSpecification::any)); + + @Test + public void testEmptyMap() { + RangeTable table = RangeTable.builder(SCHEMA).build(); + assertThat(table).isEmpty(); + } + + @Test + public void testBasicAssign() { + RangeTable.Builder table = RangeTable.builder(SCHEMA); + + table.assign(TYPE, MOBILE, ranges("1[234]xxxx"), OverwriteMode.ALWAYS); + assertThat(table.getRanges(TYPE, MOBILE)).isEqualTo(ranges("1[234]xxxx")); + + table.assign(TYPE, null, ranges("13xxxx"), OverwriteMode.ALWAYS); + assertThat(table.getRanges(TYPE, MOBILE)).isEqualTo(ranges("1[24]xxxx")); + + Assignment fixedLine = Assignment.of(TYPE, FIXED_LINE); + + // Overwrite an existing range. + table.assign(fixedLine, ranges("14xxxx"), OverwriteMode.ALWAYS); + assertThat(table.getRanges(TYPE, MOBILE)).isEqualTo(ranges("12xxxx")); + assertThat(table.getRanges(TYPE, FIXED_LINE)).isEqualTo(ranges("14xxxx")); + + // Partially overwrite an existing range (same value). + table.assign(fixedLine, ranges("1[34]xxxx"), OverwriteMode.SAME); + assertThat(table.getRanges(TYPE, MOBILE)).isEqualTo(ranges("12xxxx")); + assertThat(table.getRanges(TYPE, FIXED_LINE)).isEqualTo(ranges("1[34]xxxx")); + + // Fail to overwrite range with a different value in "SAME" mode. + assertThrows(IllegalArgumentException.class, + () -> table.assign(fixedLine, ranges("1[23]xxxx"), OverwriteMode.SAME)); + + // Add new ranges (but never overwriting). + table.assign(fixedLine, ranges("15xxxx"), OverwriteMode.NEVER); + assertThat(table.getRanges(TYPE, MOBILE)).isEqualTo(ranges("12xxxx")); + assertThat(table.getRanges(TYPE, FIXED_LINE)).isEqualTo(ranges("1[3-5]xxxx")); + + // Fail to write ranges with the same value in "NEVER" mode. 
+ assertThrows(IllegalArgumentException.class, + () -> table.assign(fixedLine, ranges("15xxxx"), OverwriteMode.NEVER)); + + // Unassignment (null value) makes no sense for modes other than "ALWAYS". + // TODO: This highlights the way this API is bad, make a separate "unassign" method. + assertThrows(IllegalArgumentException.class, + () -> table.assign(TYPE, null, ranges("123"), OverwriteMode.SAME)); + assertThrows(IllegalArgumentException.class, + () -> table.assign(TYPE, null, ranges("123"), OverwriteMode.NEVER)); + } + + @Test + public void testApplyChanges() { + // Changes ordered top-to-bottom. + RangeTable table = RangeTable.builder(SCHEMA) + .apply(assign( + ranges("[18]2xxxxx"), ImmutableMap.of(TYPE, MOBILE, AREA_CODE_LENGTH, 3))) + .apply(assign(ranges("7xxxxxx"), TYPE, MOBILE)) + .apply(assign(ranges("[1-3]xxxxxx"), TYPE, FIXED_LINE)) + .build(); + // The union of all the ranges. + assertThat(table).allRanges().containsExactly("[1-37]xxxxxx", "82xxxxx"); + // The ranges assigned for various columns. + assertThat(table).assigned(TYPE).containsExactly("[1-37]xxxxxx", "82xxxxx"); + assertThat(table).assigned(AREA_CODE_LENGTH).containsExactly("[18]2xxxxx"); + + // Note that the 12xxxxx range is replaced by the fixed line in the type map. + assertThat(table).assigned(TYPE, FIXED_LINE).containsExactly("[1-3]xxxxxx"); + assertThat(table).assigned(TYPE, MOBILE).containsExactly("7xxxxxx", "82xxxxx"); + // Area code length unaffected by update of the 12xxxxx range (only type was affected). 
+ assertThat(table).assigned(AREA_CODE_LENGTH, 3).containsExactly("[18]2xxxxx"); + } + + @Test + public void testBareRangeAddition() { + RangeTable table = RangeTable.builder(SCHEMA) + .add(ranges("1xxxxx")) + .apply(assign(ranges("12xxxx"), TYPE, MOBILE)) + .build(); + assertThat(table).allRanges().containsExactly("1xxxxx"); + // Note that there is not "getUnassignedRanges()" on RangeTable (yet), so we fudge it by + // checking that there's only one column and looking at all the assigned ranges in it. + assertThat(table).hasColumns(TYPE); + assertThat(table).assigned(TYPE).containsExactly("12xxxx"); + + // Also check that the re-built builder remembers the unassigned ranges. + RangeTable.Builder builder = table.toBuilder(); + assertThat(builder.getAllRanges()).containsExactly("1xxxxx"); + assertThat(builder.getAssignedRanges(TYPE)).containsExactly("12xxxx"); + } + + @Test + public void testAssignAndUnassign() { + RangeTable table = RangeTable.builder(SCHEMA) + .apply(assign(ranges("1xxxxx"), TYPE, MOBILE)) + .apply(unassign(ranges("1[0-4]xxxx"), TYPE)) + .build(); + assertThat(table).allRanges().containsExactly("1xxxxx"); + assertThat(table).hasColumns(TYPE); + assertThat(table).assigned(TYPE).containsExactly("1[5-9]xxxx"); + + // Also check that the re-built builder remembers the unassigned ranges. 
+ RangeTable.Builder builder = table.toBuilder(); + assertThat(builder.getAllRanges()).containsExactly("1xxxxx"); + assertThat(builder.getAssignedRanges(TYPE)).containsExactly("1[5-9]xxxx"); + } + + @Test + public void testAssignAndRemove() { + RangeTable table = RangeTable.builder(SCHEMA) + .apply(assign(ranges("1xxxxx"), TYPE, MOBILE)) + .remove(ranges("1[5-9]xxxx")) + .build(); + assertThat(table).allRanges().containsExactly("1[0-4]xxxx"); + assertThat(table).hasColumns(TYPE); + assertThat(table).assigned(TYPE).containsExactly("1[0-4]xxxx"); + + RangeTable.Builder builder = table.toBuilder(); + assertThat(builder.getAllRanges()).containsExactly("1[0-4]xxxx"); + assertThat(builder.getAssignedRanges(TYPE)).containsExactly("1[0-4]xxxx"); + } + + @Test + public void testTableImportExport() { + RangeTable original = RangeTable.builder(SCHEMA) + .apply(assign(ranges("[13]xxxxxx"), TYPE, MOBILE)) + .apply(assign(ranges("[24]xxxxxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("[14]xxxxxx"), AREA_CODE_LENGTH, 3)) + .apply(assign(ranges("[23]xxxxxx"), AREA_CODE_LENGTH, 2)) + .build(); + + Table, Optional> exported = original.toImmutableTable(); + assertThat(exported).hasSize(8); + assertThat(exported).containsCell(assigned("1xxxxxx", TYPE, MOBILE)); + assertThat(exported).containsCell(assigned("1xxxxxx", AREA_CODE_LENGTH, 3)); + assertThat(exported).containsCell(assigned("2xxxxxx", TYPE, FIXED_LINE)); + assertThat(exported).containsCell(assigned("2xxxxxx", AREA_CODE_LENGTH, 2)); + assertThat(exported).containsCell(assigned("3xxxxxx", TYPE, MOBILE)); + assertThat(exported).containsCell(assigned("3xxxxxx", AREA_CODE_LENGTH, 2)); + assertThat(exported).containsCell(assigned("4xxxxxx", TYPE, FIXED_LINE)); + assertThat(exported).containsCell(assigned("4xxxxxx", AREA_CODE_LENGTH, 3)); + + RangeTable imported = RangeTable.from(SCHEMA, exported); + assertThat(imported).isEqualTo(original); + assertThat(imported.toImmutableTable()).isEqualTo(exported); + } + + @Test + public void 
testColumnGroupMapping() { + // Changes ordered top-to-bottom. + RangeTable table = RangeTable.builder(SCHEMA) + .apply(assign(ranges("1xxxxx"), ImmutableMap.of(REGION_US, true))) + .apply(assign(ranges("2xxxxx"), ImmutableMap.of(REGION_CA, true))) + .apply(assign(ranges("3xxxxx"), ImmutableMap.of(REGION_US, true, REGION_CA, true))) + .build(); + // The union of all the ranges. + assertThat(table).allRanges().containsExactly("[1-3]xxxxx"); + Map> regionMap = REGIONS.extractGroupColumns(table.getColumns()); + assertThat(regionMap.keySet()).containsExactly(PhoneRegion.of("US"), PhoneRegion.of("CA")); + assertThat(table.getAssignedRanges(regionMap.get(PhoneRegion.of("US")))).containsExactly("[13]xxxxx"); + assertThat(table.getAssignedRanges(regionMap.get(PhoneRegion.of("CA")))).containsExactly("[23]xxxxx"); + // If a column in a group is not present, it counts as having no ranges, but if a plain column + // is not in the schema at all, it's an error. + assertThat(table.getAssignedRanges(REGIONS.getColumn(PhoneRegion.of("CH")))).isEmpty(); + Column bogus = Column.ofString("Bogus"); + assertThrows(IllegalArgumentException.class, () -> table.getAssignedRanges(bogus)); + Column nope = ColumnGroup.byRegion(bogus).getColumn(PhoneRegion.of("US")); + assertThrows(IllegalArgumentException.class, () -> table.getAssignedRanges(nope)); + } + + @Test + public void testSubTable() { + RangeTable original = RangeTable.builder(SCHEMA) + .apply(assign(ranges("[13]xxxxxx"), TYPE, MOBILE)) + .apply(assign(ranges("[24]xxxxxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("[14]xxxxxx"), AREA_CODE_LENGTH, 3)) + .apply(assign(ranges("[23]xxxxxx"), AREA_CODE_LENGTH, 2)) + .build(); + // Restrict to the ranges in which area code length is 2, but keep only the type column. 
+ RangeTable subTable = original.subTable(original.getRanges(AREA_CODE_LENGTH, 2), TYPE); + + assertThat(subTable).hasColumns(TYPE); + assertThat(subTable).hasRowCount(2); + assertThat(subTable).hasRanges("2xxxxxx", FIXED_LINE); + assertThat(subTable).hasRanges("3xxxxxx", MOBILE); + } + + @Test + public void testGetPrefixMap() { + RangeTable table = RangeTable.builder(SCHEMA) + .apply(assign(ranges("1234xxxx", "1256xxxx"), TYPE, MOBILE)) + .apply(assign(ranges("1236xxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("4xxxx"), TYPE, TOLL_FREE)) + .apply(assign(ranges("49xxxx"), TYPE, PREMIUM_RATE)) + .build(); + + ImmutableMap map = table.getPrefixMap(TYPE, 0); + + assertThat(map).containsEntry(MOBILE, PrefixTree.from(ranges("1234", "125"))); + assertThat(map).containsEntry(FIXED_LINE, PrefixTree.from(ranges("1236"))); + // The ranges 4xxxx and 49xxxx overlap (since 49 is a prefix for both) and the prefix map + // contains the shortest unique prefix for each range. The mapping from TOLL_FREE could not + // contain only "4[0-8]" since that would not match "49123". Overlapping range lengths with + // different types is thus highly problematic, but the prefix map will contain mappings for + // both, and it's up to the caller to handle this, possibly by ordering any checks made. 
+ assertThat(map).containsEntry(TOLL_FREE, PrefixTree.from(ranges("4"))); + assertThat(map).containsEntry(PREMIUM_RATE, PrefixTree.from(ranges("49"))); + } + + @Test + public void testGetPrefixMap_minLength() { + RangeTable table = RangeTable.builder(SCHEMA) + .apply(assign(ranges("123xxxxx", "1256xxxx"), TYPE, MOBILE)) + .apply(assign(ranges("124xxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("4xxxx"), TYPE, TOLL_FREE)) + .apply(assign(ranges("49xxxx"), TYPE, PREMIUM_RATE)) + .build(); + + ImmutableMap map = table.getPrefixMap(TYPE, 3); + + assertThat(map).containsEntry(MOBILE, PrefixTree.from(ranges("12[35]"))); + assertThat(map).containsEntry(FIXED_LINE, PrefixTree.from(ranges("124"))); + assertThat(map).containsEntry(TOLL_FREE, PrefixTree.from(ranges("4"))); + assertThat(map).containsEntry(PREMIUM_RATE, PrefixTree.from(ranges("49"))); + } + + @Test + public void testSimplify_multipleColumns() { + RangeTable table = RangeTable.builder(SCHEMA) + // This can't be simplified since expanding any of the area code length ranges will overlap + // (possibly with the unassigned area code length ranges). + .apply(assign(ranges("1[0-4]x_xxxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("12x_xxxx"), AREA_CODE_LENGTH, 2)) + .apply(assign(ranges("123_xxxx"), AREA_CODE_LENGTH, 3)) + .apply(assign(ranges("123_4xxx"), AREA_CODE_LENGTH, 4)) + // This can be simplified since it expands into "empty" ranges. + .apply(assign(ranges("156_xxxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("156_xxxx"), AREA_CODE_LENGTH, 3)) + .apply(assign(ranges("234_xxxx"), TYPE, MOBILE)) + // This should be ignored since simplification happens only on the other columns. + .apply(assign(ranges("[12]23_xxxx"), REGION_CA, true)) + .build(); + + RangeTable simplified = + table.simplify(c -> c.getRanges().significantDigits(2), 0, TYPE, AREA_CODE_LENGTH); + + assertThat(simplified).hasColumns(TYPE, AREA_CODE_LENGTH); + // The 156 range got pulled back to 2 digits (the other was already 2 digits). 
+ assertThat(simplified).assigned(TYPE, FIXED_LINE).containsExactly("1[0-4]x_xxxx", "15x_xxxx"); + // The 234 range got pulled back to 2 digits. + assertThat(simplified).assigned(TYPE, MOBILE).containsExactly("23x_xxxx"); + assertThat(simplified).assigned(AREA_CODE_LENGTH, 2).containsExactly("12[0-24-9]_xxxx"); + // The 123 ranges were preserved, but the 156 range was pulled back to 2 digits. + assertThat(simplified).assigned(AREA_CODE_LENGTH, 3) + .containsExactly("123_[0-35-9]xxx", "15x_xxxx"); + assertThat(simplified).assigned(AREA_CODE_LENGTH, 4).containsExactly("123_4xxx"); + } + + @Test + public void testSimplify_chineseRanges() { + // This mimics real data found in the CN regular expression whereby a SHARED_COST range + // partially overlaps with the fixed line prefixes. + RangeTable table = RangeTable.builder(SCHEMA) + // The pattern is: + // abc | length=10 | FIXED_LINE + // abc100 | length=8 | FIXED_LINE + // abc95 | length=8,9 | FIXED_LINE + // abc96 | length=8,9 | SHARED_COST + .apply(assign(ranges("123_xxx_xxxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("123_100xx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("123_95xxx", "123_95xxxx"), TYPE, FIXED_LINE)) + .apply(assign(ranges("123_96xxx", "123_96xxxx"), TYPE, SHARED_COST)) + // Just add a range that sits "either side" of what's being simplified to ensure it + // doesn't "leak". + .apply(assign(ranges("1[13]4_56xx_xxxx"), TYPE, MOBILE)) + .build(); + + RangeTable simplified = table.simplify(c -> c.getRanges().significantDigits(3), 0, TYPE); + + // The simplification function just takes the first 3 significant digits. If the "shared cost" + // ranges were not overlapping, this would result in a "fixed line" range of "123xxx..." with + // lengths 8,9,10. 
However to avoid corrupting the shared cost range, we end up with: + // abc | length=10 | FIXED_LINE + // abc[0-8] | length=8,9 | FIXED_LINE + // abc9[0-57-9] | length=8,9 | FIXED_LINE + // abc96 | length=8,9 | SHARED_COST + assertThat(simplified).hasColumns(TYPE); + assertThat(simplified).assigned(TYPE, FIXED_LINE).containsExactly( + "123_xxx_xxxx", + "123_[0-8]xx_xx", + "123_[0-8]xx_xxx", + "123_9[0-57-9]x_xx", + "123_9[0-57-9]x_xxx"); + assertThat(simplified).assigned(TYPE, SHARED_COST).containsExactly( + "123_96x_xx", + "123_96x_xxx"); + assertThat(simplified).assigned(TYPE, MOBILE).containsExactly( + "1[13]4_xxxx_xxxx"); + } + + @Test + public void testSimplify_overlappingCheck() { + Schema shortcodeSchema = Schema.builder().add(TYPE).build(); + + RangeTable table = RangeTable.builder(shortcodeSchema) + .apply(assign(ranges("123x"), TYPE, FIXED_LINE)) + .apply(assign(ranges("12x", "12xxx"), TYPE, MOBILE)) + .build(); + + // The simplification function here is good for testing edge case behaviour since it's + // essentially the most "extreme" simplification you can have. + RangeTable simplified = table.simplify(EXTREME_SIMPLIFICATION, 0, TYPE); + + assertThat(simplified).hasColumns(TYPE); + assertThat(simplified).assigned(TYPE, FIXED_LINE).containsExactly("123x"); + assertThat(simplified).assigned(TYPE, MOBILE).containsExactly("12x", "12[0-24-9]x", "12xxx"); + } + + private static RangeTree ranges(String... 
rangeSpecs) { + return RangeTree.from(Arrays.stream(rangeSpecs).map(RangeSpecification::parse)); + } + + private static > Change assign( + RangeTree ranges, Column column, T value) { + return Change.builder(ranges).assign(column, value).build(); + } + + private static > Change unassign(RangeTree ranges, Column column) { + return Change.builder(ranges).unassign(column).build(); + } + + private Change assign(RangeTree ranges, Map, ?> map) { + return Change.of(ranges, + map.entrySet().stream() + .map(e -> Assignment.of(e.getKey(), e.getValue())) + .collect(toImmutableList())); + } + + private static Cell, Optional> assigned( + String range, Column column, Object value) { + return Tables.immutableCell(RangeSpecification.parse(range), column, Optional.of(value)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/SchemaTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/SchemaTest.java new file mode 100644 index 000000000..a7847c1b1 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/table/SchemaTest.java @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.table; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.common.truth.Truth8.assertThat; +import static com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType.UNKNOWN; +import static org.junit.Assert.assertThrows; + +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class SchemaTest { + + private static final Column TYPE = + Column.of(ValidNumberType.class, "Type", UNKNOWN); + private static final Column OPERATORS = Column.ofString("Operators"); + + private static final ColumnGroup REGIONS = + ColumnGroup.byRegion(Column.ofBoolean("Region")); + private static final Column REGION_US = REGIONS.getColumn(PhoneRegion.of("US")); + private static final Column REGION_CA = REGIONS.getColumn(PhoneRegion.of("CA")); + + private static final Column BOGUS = Column.ofBoolean("Bogus"); + + private static final Schema SCHEMA = + Schema.builder().add(TYPE).add(OPERATORS).add(REGIONS).build(); + + @Test + public void testColumnOrdering() { + assertThat(Stream.of(OPERATORS, REGION_US, TYPE, REGION_CA).sorted(SCHEMA.ordering())) + .containsExactly(TYPE, OPERATORS, REGION_CA, REGION_US) + .inOrder(); + // The names are the columns/groups (but not the names of columns in groups, such as + // "Region:US", since those are functionally generated and aren't known by the schema. 
+ assertThat(SCHEMA.names()).containsExactly("Type", "Operators", "Region").inOrder(); + } + + @Test + public void test() { + assertThat(SCHEMA.getColumn("Type")).isEqualTo(TYPE); + assertThat(SCHEMA.getColumn("Region:US")).isEqualTo(REGION_US); + assertThrows(IllegalArgumentException.class, () -> SCHEMA.getColumn("Region")); + assertThrows(IllegalArgumentException.class, () -> SCHEMA.getColumn("Bogus")); + } + + @Test + public void testCheckColumn() { + assertThat(SCHEMA.checkColumn(TYPE)).isEqualTo(TYPE); + assertThat(SCHEMA.checkColumn(REGION_US)).isEqualTo(REGION_US); + assertThrows(IllegalArgumentException.class, () -> SCHEMA.checkColumn(BOGUS)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTableSubject.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTableSubject.java new file mode 100644 index 000000000..614ae2e8b --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTableSubject.java @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.testing; + +import static com.google.common.base.Strings.lenientFormat; +import static com.google.common.truth.Fact.simpleFact; +import static com.google.common.truth.Truth.assertAbout; +import static java.util.Arrays.asList; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableTable; +import com.google.common.truth.FailureMetadata; +import com.google.common.truth.Subject; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.RangeTable; +import java.util.Optional; +import javax.annotation.Nullable; + +/** A Truth subject for asserting on {@link RangeTable} instances. */ +public class RangeTableSubject extends Subject { + /** Returns a Truth subject for asserting on a {@link RangeTable}. */ + public static RangeTableSubject assertThat(@Nullable RangeTable table) { + return assertAbout(RangeTableSubject.SUBJECT_FACTORY).that(table); + } + + private static final Factory SUBJECT_FACTORY = + RangeTableSubject::new; + + private final RangeTable actual; + + private RangeTableSubject(FailureMetadata failureMetadata, @Nullable RangeTable subject) { + super(failureMetadata, subject); + this.actual = subject; + } + + // Add more methods below as needed. + + /** Asserts that the table is empty. */ + public void isEmpty() { + if (!actual.isEmpty()) { + failWithActual(simpleFact("expected to be empty")); + } + } + + /** Asserts that the table has exactly the given columns and no others (order is not checked: {@code inOrder()} is not called). */ + public void hasColumns(Column... columns) { + check("getColumns()").that(actual.getColumns()).containsExactlyElementsIn(asList(columns)); + } + + /** Asserts that the table has the specified number of rows. 
*/ + public void hasRowCount(int count) { + check("toImmutableTable().rowKeySet().size()") + .that(actual.toImmutableTable().rowKeySet().size()) + .isEqualTo(count); + } + + /** + * Asserts the specified range has the given values for each column. All columns need to be + * specified, with {@code null} meaning "no value present". This method does not ensure that no + * other ranges were also assigned the same values, so for complete coverage in a test it's best + * to use this in conjunction with something like {@link #allRanges()}. Values are matched positionally against the row's columns. + */ + public void hasRanges(String spec, Object... values) { + ImmutableTable, Optional> table = + this.actual.toImmutableTable(); + RangeSpecification rowKey = RangeSpecification.parse(spec); + if (!table.rowKeySet().contains(rowKey)) { + failWithoutActual( + simpleFact( + lenientFormat( + "specified row %s does not exist in the table: rows=%s", + rowKey, table.rowKeySet()))); + } + ImmutableMap, Optional> row = table.row(rowKey); + if (row.size() != values.length) { + failWithoutActual( + simpleFact( + lenientFormat( + "incorrect number of columns: expected %s, got %s", row.size(), values.length))); + } + int n = 0; + for (Optional actual : row.values()) { + Object expected = values[n++]; + if (actual.isPresent()) { + if (!actual.get().equals(expected)) { + failWithoutActual( + simpleFact( + lenientFormat("unexpected value in row: expected %s, got %s", expected, actual))); + } + } else if (expected != null) { + failWithoutActual(simpleFact(lenientFormat("missing value in row: expected %s", expected))); + } + } + } + + /** + * Returns a {@link RangeTreeSubject} for asserting about the ranges assigned to the given value + * in the specified column. 
+ */ + public RangeTreeSubject assigned(Column column, Object value) { + return RangeTreeSubject.assertWithMessageThat( + actual.getRanges(column, value), "%s in column %s", value, column); + } + + /** + * Returns a {@link RangeTreeSubject} for asserting about all ranges assigned in the specified + * column. + */ + public RangeTreeSubject assigned(Column column) { + return RangeTreeSubject.assertWithMessageThat( + actual.getAssignedRanges(column), "column %s", column); + } + + /** Returns a {@link RangeTreeSubject} for asserting about all ranges in the table. */ + public RangeTreeSubject allRanges() { + return RangeTreeSubject.assertWithMessageThat(actual.getAllRanges(), "all ranges"); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTreeSubject.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTreeSubject.java new file mode 100644 index 000000000..5103a86c0 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/RangeTreeSubject.java @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.testing; + +import static com.google.common.truth.Fact.simpleFact; +import static com.google.common.truth.Truth.assertAbout; +import static com.google.common.truth.Truth.assertWithMessage; + +import com.google.common.collect.FluentIterable; +import com.google.common.collect.ImmutableSet; +import com.google.common.truth.FailureMetadata; +import com.google.common.truth.Subject; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.PrefixTree; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import javax.annotation.Nullable; + +/** A Truth subject for asserting on {@link RangeTree} instances; a {@link PrefixTree} is asserted on via its {@code asRangeTree()} view. */ +public class RangeTreeSubject extends Subject { + + public static RangeTreeSubject assertThat(@Nullable RangeTree tree) { + return assertAbout(RangeTreeSubject.SUBJECT_FACTORY).that(tree); + } + + public static RangeTreeSubject assertThat(@Nullable PrefixTree tree) { + return assertAbout(RangeTreeSubject.SUBJECT_FACTORY).that(tree.asRangeTree()); + } + + public static RangeTreeSubject assertWithMessageThat( + @Nullable RangeTree tree, String message, Object... args) { + return assertWithMessage(message, args).about( + RangeTreeSubject.SUBJECT_FACTORY).that(tree); + } + + private static final Factory SUBJECT_FACTORY = + RangeTreeSubject::new; + + private final RangeTree actual; + + private RangeTreeSubject(FailureMetadata failureMetadata, @Nullable RangeTree subject) { + super(failureMetadata, subject); + this.actual = subject; + } + + // Add more methods below as needed. NOTE(review): the assertions below dereference the subject without a null check, despite the @Nullable factory parameters. 
+ + public void isEmpty() { + if (!actual.isEmpty()) { + failWithActual(simpleFact("expected to be empty")); + } + } + + public void isNotEmpty() { + if (actual.isEmpty()) { + failWithActual(simpleFact("expected not to be empty")); + } + } + + public void hasSize(long size) { + check("size()").withMessage("size").that(actual.size()).isEqualTo(size); + } + + public void contains(String digits) { + DigitSequence seq = digits.isEmpty() ? DigitSequence.empty() : DigitSequence.of(digits); + if (!actual.contains(seq)) { + failWithActual("expected to contain ", digits); + } + } + + public void doesNotContain(String digits) { + DigitSequence seq = digits.isEmpty() ? DigitSequence.empty() : DigitSequence.of(digits); + if (actual.contains(seq)) { + failWithActual("expected not to contain", digits); + } + } + + public void containsExactly(RangeSpecification spec) { + RangeTree tree = RangeTree.from(spec); + if (!actual.equals(tree)) { + failWithActual("expected to be equal to", spec); + } + } + + public void containsExactly(Iterable specs) { + RangeTree tree = RangeTree.from(specs); + if (!actual.equals(tree)) { + failWithActual("expected to be equal to", specs); + } + } + + public void containsExactly(String spec) { + containsExactly(RangeSpecification.parse(spec)); + } + + public void containsExactly(String... specs) { + containsExactly(FluentIterable.from(specs).transform(RangeSpecification::parse)); + } + + public void hasLengths(Integer... 
lengths) { + check("getLengths()") + .that(actual.getLengths()) + .containsExactlyElementsIn(ImmutableSet.copyOf(lengths)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/TestNumberingScheme.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/TestNumberingScheme.java new file mode 100644 index 000000000..59eed428c --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/testing/TestNumberingScheme.java @@ -0,0 +1,477 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.google.i18n.phonenumbers.metadata.testing; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static java.lang.Boolean.TRUE; +import static java.util.function.Function.identity; + +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import com.google.common.collect.Table; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.Types; +import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; +import com.google.i18n.phonenumbers.metadata.i18n.SimpleLanguageTag; +import com.google.i18n.phonenumbers.metadata.model.AltFormatSpec; +import com.google.i18n.phonenumbers.metadata.model.FormatSpec; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Attributes; +import com.google.i18n.phonenumbers.metadata.model.NumberingScheme.Comment; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtTariff; +import com.google.i18n.phonenumbers.metadata.model.RangesTableSchema.ExtType; +import com.google.i18n.phonenumbers.metadata.model.ShortcodesTableSchema; +import com.google.i18n.phonenumbers.metadata.model.ShortcodesTableSchema.ShortcodeType; +import com.google.i18n.phonenumbers.metadata.model.XmlRangesSchema; +import 
com.google.i18n.phonenumbers.metadata.proto.Types.ValidNumberType; +import com.google.i18n.phonenumbers.metadata.table.Column; +import com.google.i18n.phonenumbers.metadata.table.RangeTable; +import com.google.i18n.phonenumbers.metadata.table.RangeTable.OverwriteMode; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Optional; +import java.util.stream.Stream; + +/** + * Reusable test-only builder for numbering schemes. More methods can be added as necessary to + * support whatever is needed for testing. + * + *

<p>Note that the various "modifier" classes returned by methods such as "addRanges()" are + * designed only as fluent APIs and instances of modifiers should never be assigned to variables + * and especially not interleaved with other mutations of the range tables. + */ +public final class TestNumberingScheme { + /** + * Returns a mutable numbering scheme builder for testing. Since an IDD is always required by + * NumberingScheme for geographic regions, an international prefix of "00" is set by default. This can be + * overridden or reset by {@code setInternationalPrefix()} and {@code clearInternationalPrefix()}. + */ + public static TestNumberingScheme forCallingCode( + String cc, PhoneRegion main, PhoneRegion... others) { + return new TestNumberingScheme(DigitSequence.of(cc), main, ImmutableSet.copyOf(others)); + } + + private final DigitSequence callingCode; + private final PhoneRegion mainRegion; + private final ImmutableSet otherRegions; + private final ImmutableMap> regionMap; + + // See setNationalPrefix() / clearNationalPrefix() + private final List nationalPrefix = new ArrayList<>(); + + // See setInternationalPrefix() / clearInternationalPrefix() + private Optional internationalPrefix = Optional.empty(); + + // See setCarrierPrefixes() + private RangeTree carrierPrefixes = RangeTree.empty(); + + // Uses the CSV schema (rather than XML) since that handles type/tariff better. + private final RangeTable.Builder csvRanges = RangeTable.builder(RangesTableSchema.TABLE_COLUMNS); + private final Map shortcodes = new HashMap<>(); + private final Map formats = new LinkedHashMap<>(); + + // Alternate formats are largely separate from everything else. + private ImmutableList altFormats = ImmutableList.of(); + + // Explicit example numbers. 
+ private final Table examples = + HashBasedTable.create(); + + private final List comments = new ArrayList<>(); + + private TestNumberingScheme( + DigitSequence cc, PhoneRegion main, ImmutableSet others) { + checkArgument(!others.contains(main), "duplicate regions"); + this.callingCode = checkNotNull(cc); + this.mainRegion = checkNotNull(main); + this.otherRegions = others; + this.regionMap = Stream.concat(Stream.of(main), others.stream()) + .collect(toImmutableMap(identity(), RangesTableSchema.REGIONS::getColumn)); + // Set a reasonable IDD default for geographic regions. + if (!main.equals(PhoneRegion.getWorld())) { + setInternationalPrefix("00"); + } + } + + /** Sets the national prefix of this scheme, replacing any previous value. */ + public TestNumberingScheme setNationalPrefix(String prefix) { + checkArgument(!prefix.isEmpty(), "national prefix must not be empty"); + this.nationalPrefix.clear(); + this.nationalPrefix.add(DigitSequence.of(prefix)); + return this; + } + + /** Sets the national prefixes of this scheme, replacing any previous values. */ + public TestNumberingScheme setNationalPrefixes(String... prefix) { + List prefixes = Arrays.asList(prefix); + this.nationalPrefix.clear(); + prefixes.forEach(p -> { + checkArgument(!p.isEmpty(), "national prefix must not be empty"); + this.nationalPrefix.add(DigitSequence.of(p)); + }); + return this; + } + + /** Removes all national prefixes. */ + public TestNumberingScheme clearNationalPrefix() { + this.nationalPrefix.clear(); + return this; + } + + /** Sets the international prefix of this scheme, replacing any previous value. 
*/ + public TestNumberingScheme setInternationalPrefix(String prefix) { + checkState(!mainRegion.equals(PhoneRegion.getWorld()), + "[%s] cannot set IDD for non-geographic calling code", callingCode); + this.internationalPrefix = Optional.of(DigitSequence.of(prefix)); + return this; + } + + /** Removes the international prefix. */ + public TestNumberingScheme clearInternationalPrefix() { + this.internationalPrefix = Optional.empty(); + return this; + } + + /** Sets the carrier prefixes of this scheme (each parsed as a range specification), replacing any previous value. */ + public TestNumberingScheme setCarrierPrefixes(String... prefix) { + this.carrierPrefixes = RangeTree.from(Arrays.stream(prefix).map(RangeSpecification::parse)); + return this; + } + + /** + * Adds ranges (which must not already exist) to the scheme. This method returns a fluent API + * for modifying the newly added ranges. + */ + public RangeModifier addRanges(ExtType type, ExtTariff tariff, String... specs) { + return addRanges(type, tariff, rangesOf(specs)); + } + + /** + * Adds ranges (which must not already exist) to the scheme. This method returns a fluent API + * for modifying the newly added ranges. + */ + public RangeModifier addRanges(ExtType type, ExtTariff tariff, RangeTree ranges) { + RangeTree overlap = csvRanges.getAllRanges().intersect(ranges); + checkArgument(overlap.isEmpty(), "ranges already added: %s", overlap); + csvRanges.assign(RangesTableSchema.TYPE, checkNotNull(type), ranges, OverwriteMode.NEVER); + csvRanges.assign(RangesTableSchema.TARIFF, checkNotNull(tariff), ranges, OverwriteMode.NEVER); + // Setting all regions here generates "legal" numbering schemes by default. + regionMap.values().forEach(c -> csvRanges.assign(c, true, ranges, OverwriteMode.NEVER)); + return new RangeModifier(ranges); + } + + /** Removes ranges (which need not already exist) from the scheme. */ + public void removeRanges(String... 
specs) { + removeRanges(rangesOf(specs)); + } + + /** Removes ranges (which need not already exist) from the scheme. */ + public void removeRanges(RangeTree ranges) { + csvRanges.remove(ranges); + } + + /** Returns a fluent API for modifying existing ranges (constrained by the given bounds). */ + public RangeModifier forRangesIn(String... specs) { + return forRangesIn(rangesOf(specs)); + } + + /** Returns a fluent API for modifying existing ranges (constrained by the given bounds). */ + public RangeModifier forRangesIn(RangeTree ranges) { + return new RangeModifier(ranges.intersect(csvRanges.getAllRanges())); + } + + /** + * Adds shortcodes (which must not already exist) to a given region in the scheme. This method + * returns a fluent API for modifying the newly added shortcodes. + */ + public ShortcodeModifier addShortcodes( + PhoneRegion region, ShortcodeType type, ExtTariff tariff, String... specs) { + return addShortcodes(region, type, tariff, rangesOf(specs)); + } + + /** + * Adds shortcodes (which must not already exist) to a given region in the scheme. This method + * returns a fluent API for modifying the newly added shortcodes. + */ + public ShortcodeModifier addShortcodes( + PhoneRegion region, ShortcodeType type, ExtTariff tariff, RangeTree ranges) { + RangeTable.Builder table = shortcodes + .computeIfAbsent(region, r -> RangeTable.builder(ShortcodesTableSchema.SCHEMA.columns())); + RangeTree overlap = table.getAllRanges().intersect(ranges); + checkArgument(overlap.isEmpty(), "ranges already added: %s", overlap); + table.assign(ShortcodesTableSchema.TYPE, checkNotNull(type), ranges, OverwriteMode.NEVER); + table.assign(ShortcodesTableSchema.TARIFF, checkNotNull(tariff), ranges, OverwriteMode.NEVER); + return new ShortcodeModifier(region, ranges); + } + + /** Returns a fluent API for modifying existing shortcodes (constrained by the given bounds). */ + public ShortcodeModifier forShortcodesIn(PhoneRegion region, String... 
specs) { + return forShortcodesIn(region, rangesOf(specs)); + } + + /** Returns a fluent API for modifying existing shortcodes (constrained by the given bounds). */ + public ShortcodeModifier forShortcodesIn(PhoneRegion region, RangeTree ranges) { + RangeTable.Builder shortcodeTable = + checkNotNull(shortcodes.get(region), "no shortcodes in region %s", region); + return new ShortcodeModifier(region, ranges.intersect(shortcodeTable.getAllRanges())); + } + + public TypeModifier forRangeTypes(PhoneRegion region, ExtType type, ExtTariff tariff) { + return new TypeModifier(region, type, tariff); + } + + public TestNumberingScheme setAlternateFormats(List altFormats) { + this.altFormats = ImmutableList.copyOf(altFormats); + return this; + } + + /** Builds a valid numbering scheme from the current state of this builder. */ + public NumberingScheme build() { + Attributes attributes = Attributes.create( + callingCode, + mainRegion, + otherRegions, + ImmutableSet.copyOf(nationalPrefix), + carrierPrefixes, + // This is currently simplistic (only 1 value) and could be extended for tests if needed. + internationalPrefix.map(Object::toString).orElse(""), + internationalPrefix.map(p -> RangeTree.from(RangeSpecification.from(p))) + .orElse(RangeTree.empty()), + "", + ImmutableSet.of()); + RangeTable xmlTable = XmlRangesSchema.fromExternalTable(csvRanges.build()); + ImmutableMap shortcodeMap = + shortcodes.entrySet().stream() + .collect(toImmutableMap(Entry::getKey, e -> e.getValue().build())); + // Some formats may have been unassigned by modifications to the test scheme. Only copy the + // formats with keys that exist in the range tables at the time the scheme is built. 
+ ImmutableSet assignedFormats = Stream.concat( + xmlTable.getAssignedValues(XmlRangesSchema.FORMAT).stream(), + shortcodeMap.values().stream() + .flatMap(t -> t.getAssignedValues(ShortcodesTableSchema.FORMAT).stream())) + .collect(toImmutableSet()); + ImmutableMap formatMap = formats.entrySet().stream() + .filter(e -> assignedFormats.contains(e.getValue())) + .collect(toImmutableMap(Entry::getValue, Entry::getKey)); + return NumberingScheme.from( + attributes, + xmlTable, + Maps.transformValues(shortcodes, RangeTable.Builder::build), + formatMap, + altFormats, + fillInMissingExampleNumbersFrom(xmlTable, examples), + comments); + } + + public TerritoryModifier forTerritory(PhoneRegion region) { + return new TerritoryModifier(region); + } + + /** Fluent API for modifying a set of ranges. */ + public final class RangeModifier { + private final RangeTree ranges; + + private RangeModifier(RangeTree ranges) { + checkArgument(!ranges.isEmpty(), "cannot modify empty ranges"); + this.ranges = ranges; + } + + /** Sets the regions in which the ranges are valid. */ + public RangeModifier setRegions(PhoneRegion... regions) { + ImmutableSet regionsToSet = ImmutableSet.copyOf(regions); + checkArgument(regionMap.keySet().containsAll(regionsToSet)); + regionMap.forEach((r, c) -> + csvRanges.assign(c, regionsToSet.contains(r), ranges, OverwriteMode.ALWAYS)); + return this; + } + + /** Sets ranges to be "national only" dialing. */ + public RangeModifier setNationalOnly(boolean nationalOnly) { + csvRanges.assign(RangesTableSchema.NATIONAL_ONLY, nationalOnly, ranges, OverwriteMode.ALWAYS); + return this; + } + + /** Sets the area code length of the ranges. */ + public RangeModifier setAreaCodeLength(int n) { + csvRanges.assign(RangesTableSchema.AREA_CODE_LENGTH, n, ranges, OverwriteMode.ALWAYS); + return this; + } + + /** Sets the format assigned to the ranges. 
*/ + public RangeModifier setFormat(FormatSpec format) { + String id = + formats.computeIfAbsent(format, f -> String.format("__fmt_%02d", formats.size() + 1)); + csvRanges.assign(RangesTableSchema.FORMAT, id, ranges, OverwriteMode.ALWAYS); + return this; + } + + public RangeModifier setFormat(String id, FormatSpec format) { + formats.put(format, id); + csvRanges.assign(RangesTableSchema.FORMAT, id, ranges, OverwriteMode.ALWAYS); + return this; + } + + /** Clears the format assigned to the ranges. */ + public RangeModifier clearFormat() { + csvRanges.assign(RangesTableSchema.FORMAT, null, ranges, OverwriteMode.ALWAYS); + return this; + } + + public RangeModifier setGeocode(SimpleLanguageTag lang, String name) { + csvRanges.assign( + RangesTableSchema.GEOCODES.getColumn(lang), name, ranges, OverwriteMode.ALWAYS); + return this; + } + } + + /** Fluent API for modifying a set of shortcodes in a region. */ + public final class ShortcodeModifier { + private final PhoneRegion region; + private final RangeTree ranges; + + private ShortcodeModifier(PhoneRegion region, RangeTree ranges) { + checkArgument(!ranges.isEmpty(), "cannot modify empty ranges"); + this.region = checkNotNull(region); + this.ranges = ranges; + } + + private RangeTable.Builder shortcode() { + return shortcodes.get(region); + } + + /** Sets the format assigned to the shortcodes. */ + public ShortcodeModifier setFormat(FormatSpec format) { + String id = + formats.computeIfAbsent(format, f -> String.format("__fmt_%02d", formats.size() + 1)); + shortcode().assign(ShortcodesTableSchema.FORMAT, id, ranges, OverwriteMode.ALWAYS); + return this; + } + + /** Sets the format assigned to the shortcodes. */ + public ShortcodeModifier setFormat(String id, FormatSpec format) { + formats.put(format, id); + shortcode().assign(ShortcodesTableSchema.FORMAT, id, ranges, OverwriteMode.ALWAYS); + return this; + } + + /** Clears the format assigned to the shortcodes. 
*/ + public ShortcodeModifier clearFormat() { + shortcode().assign(ShortcodesTableSchema.FORMAT, null, ranges, OverwriteMode.ALWAYS); + return this; + } + } + + /** Fluent API for modifying attributes of range types. */ + public final class TypeModifier { + private final PhoneRegion region; + private final ExtType type; + private final ExtTariff tariff; + + public TypeModifier(PhoneRegion region, ExtType type, ExtTariff tariff) { + this.region = checkNotNull(region); + this.type = checkNotNull(type); + this.tariff = checkNotNull(tariff); + checkArgument(regionMap.containsKey(region), + "invalid test region '%s' not in: %s", region, regionMap.keySet()); + } + + public TypeModifier setExampleNumber(String ex) { + inferValidNumberType(type, tariff) + .ifPresent(t -> examples.put(region, t, DigitSequence.of(ex))); + return this; + } + + public TypeModifier addComment(String... lines) { + inferValidNumberType(type, tariff) + .flatMap(Types::toXmlType) + .ifPresent(t -> comments.add( + Comment.create(Comment.anchor(region, t), Arrays.asList(lines)))); + return this; + } + } + + /** Fluent API for modifying territory-level attributes. */ + public final class TerritoryModifier { + private final PhoneRegion region; + + public TerritoryModifier(PhoneRegion region) { + this.region = checkNotNull(region); + } + + public TerritoryModifier addComment(String... lines) { + comments.add(Comment.create(Comment.anchor(region), Arrays.asList(lines))); + return this; + } + } + + private Table fillInMissingExampleNumbersFrom( + RangeTable xmlTable, Table examples) { + // Take a copy since the build() method is not meant to be modifying the builder itself. 
+ HashBasedTable examplesCopy = + HashBasedTable.create(examples); + addMissingExampleNumbersFor(mainRegion, xmlTable, examplesCopy); + otherRegions.forEach(r -> addMissingExampleNumbersFor(r, xmlTable, examplesCopy)); + return examplesCopy; + } + + private static void addMissingExampleNumbersFor( + PhoneRegion region, + RangeTable xmlTable, + Table examples) { + Column regionColumn = XmlRangesSchema.REGIONS.getColumn(region); + RangeTable regionTable = + xmlTable.subTable(xmlTable.getRanges(regionColumn, TRUE), XmlRangesSchema.TYPE); + for (ValidNumberType type : regionTable.getAssignedValues(XmlRangesSchema.TYPE)) { + if (examples.contains(region, type)) { + continue; + } + RangeTree ranges = regionTable.getRanges(XmlRangesSchema.TYPE, type); + // Assigned types must be assigned via non-empty ranges (so first() cannot fail). + examples.put(region, type, ranges.first()); + } + } + + private static RangeTree rangesOf(String... specs) { + checkArgument(specs.length > 0, "must provide at least one range specifier"); + RangeTree ranges = RangeTree.from(Arrays.stream(specs).map(RangeSpecification::parse)); + checkArgument(!ranges.getInitial().canTerminate(), "cannot add the empty digit sequence"); + return ranges; + } + + private static Optional inferValidNumberType(ExtType type, ExtTariff tariff) { + // Tariff takes precedence over type. + Optional vnt = tariff.toValidNumberType(); + if (!vnt.isPresent()) { + vnt = type.toValidNumberType(); + } + return vnt; + } +}