From ae71e8b72650a0fcf01e59d7d22ea81f7d8f2aac Mon Sep 17 00:00:00 2001 From: lararennie Date: Wed, 21 Sep 2016 11:39:16 +0200 Subject: [PATCH] Using possibleLengths info for length calculations in Java (#1355) Changing PhoneNumberUtil to use the possibleLengths information, not the reg-exes. Note the API is not changing, but the metadata is now somewhat stricter for many countries, since before we applied only a minimum and maximum length for most countries, and now we specify exactly which lengths are possible. This has a flow-on effect when parsing, since we decide whether to do certain operations like strip a national prefix based on whether the number is a possible length before/after - when parsing, if the number is shorter than the *national* pattern, we no longer strip the national prefix. Affected countries: AD (7 digits now invalid) AM (7 digits now invalid) AR (9 digits now invalid) AZ (8 digits now invalid) BG (4 digits now valid for local-only numbers) BJ (5-7 digits now invalid) CC/CX (5 digit numbers now possible: this should always have been the case, but the generalDesc was wrong and didn't reflect its child elements. We now calculate it based on them, which allows 5 digit numbers.) CO (9 digits now invalid) CR (9 digits now invalid) ET (8 digits now invalid) GE (7 and 8 digits now invalid) GH (8 digits now invalid) IL (5 and 6 digits now invalid) IM/JE/GG (7, 8 and 9 digits now invalid, shortest national number length now 10, so parsing affected for numbers shorter than this) IS (8 digits now invalid) KG (7,8 digits now invalid) KR (11 digits now invalid) LA (7 digits now invalid) LI (8 digits now invalid) LY (8 digits now invalid) MV (8 and 9 digits now invalid) MW (8 digits now invalid) MX (9 digits now invalid) NP (9 digits now invalid) SE (11 digits now invalid) SG (9 digits now invalid) SL (7 digits now invalid) SM (7-9 digits now invalid) UA (8 digits now invalid) UG (8 digits now invalid) UZ (8 digits now invalid) --- .../i18n/phonenumbers/carrier/data/config | Bin 2230 -> 2230 bytes .../i18n/phonenumbers/geocoding/data/config | Bin 5792 -> 5792 bytes .../i18n/phonenumbers/PhoneNumberUtil.java | 79 +++++++++--------- .../phonenumbers/PhoneNumberUtilTest.java | 8 +- .../PhoneNumberMetadataProtoForTesting_AD | Bin 299 -> 349 bytes .../PhoneNumberMetadataProtoForTesting_BB | Bin 300 -> 376 bytes .../PhoneNumberMetadataProtoForTesting_BR | Bin 297 -> 373 bytes .../PhoneNumberMetadataProtoForTesting_CA | Bin 300 -> 376 bytes .../PhoneNumberMetadataProtoForTesting_CC | Bin 297 -> 373 bytes .../PhoneNumberMetadataProtoForTesting_CX | Bin 299 -> 375 bytes .../PhoneNumberMetadataProtoForTesting_FR | Bin 407 -> 415 bytes .../PhoneNumberMetadataProtoForTesting_GB | Bin 665 -> 705 bytes .../PhoneNumberMetadataProtoForTesting_GG | Bin 297 -> 373 bytes .../PhoneNumberMetadataProtoForTesting_SE | Bin 299 -> 349 bytes .../PhoneNumberMetadataProtoForTesting_US | Bin 693 -> 697 bytes java/pending_code_changes.txt | 6 ++ resources/PhoneNumberMetadataForTesting.xml | 71 +++++++++++++++- 17 files changed, 118 insertions(+), 46 deletions(-) diff --git a/java/carrier/src/com/google/i18n/phonenumbers/carrier/data/config b/java/carrier/src/com/google/i18n/phonenumbers/carrier/data/config index 4a4e6e4416ab78468d03ce90837cc45d92598fd7..99bb17154d96ae64310fa01c03879ea428626340 100644 GIT binary patch delta 116 zcmdlcxJ__F7oz}EY90f7RYtr=VqOUYQ&q;~S&VvYKryDY#L4Fvm7vVc-x!&hrHV=! pK*|{y%NZCLm>HOoQW=;aVw;6oYq6NMfUS%zu?VDTvkdzzMgTh&9>o9v delta 116 zcmdlcxJ__F7oz}sRYtr=VqOUYQ&k27Q)=GiS&VvYX^CLwIYuQYbMrSwW@f3>JO-ws oQU(Ubas~zlW{?svpD8JIvoLEd7PA(xm9c>oB^GU#VV}ha07Bdz#Q*>R diff --git a/java/geocoder/src/com/google/i18n/phonenumbers/geocoding/data/config b/java/geocoder/src/com/google/i18n/phonenumbers/geocoding/data/config index 7b3822ce12cab781ea865da7b37952393af3ee81..38783fb3952de0d397278d91fa23ccacf9d3e29d 100644 GIT binary patch delta 538 zcmYjOO-jR15S~o>Bd)rNSm;7GYM~yWixx%1tu957v`tKGOv6jlLZP~MWqB8uIZ^GGZ@YG*0VYOzbP-I3l7Q1WQG7i+Q~x zD9DiAD=Og_c)xh75%2+`L_F|JH#nm3aa<9FB+?D*K8N8zKU>bjILLWK=SJt&UA-^x z-p0%j{Rr`yK&y9ST*3=yL*g`|>MXcEb7K_yXio8k;Rqs;G2o@d;gooWL49Bs^$7%J zP!erG8RW6j9j-fK5j>(xV@BR6#q8vkeOEv7ZqD*uFy0H6uk;(Y4NGaZ+fC_}bz(lz z_kSkv55hM7(COQ1-T7tzuHk2?xrWv9;bw}3$|Ag2cgx+Q{A{^-vL;5&oDx60BeJF@ e*F)DTPu3i{)lpNWU&~od4qU8n7QCEPu4;d(OL4XU delta 538 zcmY*WO-jRH5S>h#R9tlxvCxHX)IvQ#7cGj2TV0AEX`7g|HI}q#p-|m-WyuBV89adC z85DQ!^a5VMH<<)<@saoD@4cC<8|%j7mhDBvak8qNR{fY*STrP}9Xux1*lJO21xc1A z+S5!p$@#wW_%7!M;3eXoM=Uk@g!f~6G?a?8X58ChFN05p^B{_o06phAM|j@_zQEh2 z@;tacb0Z9z2QUheu1h=((FD60*$B;<|_=1Ns;sJVG;O-cj$c->dt;pgV z2qMvU(-v19(SQM>OEn`JupF^t`;Y}(@~)#06>nOlE2gj9v~$R2v#u~3PxSS-68Ia> zERK`)>eOWRiaNY9N8h>WRDb8odGi_J799pK#YP`5w~ WKUFS+RbH&5^8dVE^n7ApRek}~z;U(! diff --git a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java index e1fe8d949..0b25dbf6b 100644 --- a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java +++ b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java @@ -1306,8 +1306,8 @@ public class PhoneNumberUtil { // short numbers, which are always dialled in national format. PhoneMetadata regionMetadata = getMetadataForRegion(regionCallingFrom); if (canBeInternationallyDialled(numberNoExt) - && !isShorterThanPossibleNormalNumber(regionMetadata, - getNationalSignificantNumber(numberNoExt))) { + && testNumberLength(getNationalSignificantNumber(numberNoExt), + regionMetadata.getGeneralDesc()) != ValidationResult.TOO_SHORT) { formattedNumber = format(numberNoExt, PhoneNumberFormat.INTERNATIONAL); } else { formattedNumber = format(numberNoExt, PhoneNumberFormat.NATIONAL); @@ -2113,19 +2113,19 @@ public class PhoneNumberUtil { return metadataSource.getMetadataForNonGeographicalRegion(countryCallingCode); } - boolean isNumberPossibleForDesc(String nationalNumber, PhoneNumberDesc numberDesc) { - Matcher possibleNumberPatternMatcher = - regexCache.getPatternForRegex(numberDesc.getPossibleNumberPattern()) - .matcher(nationalNumber); - return possibleNumberPatternMatcher.matches(); - } - boolean isNumberMatchingDesc(String nationalNumber, PhoneNumberDesc numberDesc) { + // Check if any possible number lengths are present; if so, we use them to avoid checking the + // validation pattern if they don't match. If they are absent, this means they match the general + // description, which we have already checked before checking a specific number type. + int actualLength = nationalNumber.length(); + List possibleLengths = numberDesc.getPossibleLengthList(); + if (possibleLengths.size() > 0 && !possibleLengths.contains(actualLength)) { + return false; + } Matcher nationalNumberPatternMatcher = regexCache.getPatternForRegex(numberDesc.getNationalNumberPattern()) .matcher(nationalNumber); - return isNumberPossibleForDesc(nationalNumber, numberDesc) - && nationalNumberPatternMatcher.matches(); + return nationalNumberPatternMatcher.matches(); } /** @@ -2362,32 +2362,35 @@ public class PhoneNumberUtil { } /** - * Helper method to check a number against a particular pattern and determine whether it matches, - * or is too short or too long. Currently, if a number pattern suggests that numbers of length 7 - * and 10 are possible, and a number in between these possible lengths is entered, such as of - * length 8, this will return TOO_LONG. + * Helper method to check a number against possible lengths for this number, and determine whether + * it matches, or is too short or too long. Currently, if a number pattern suggests that numbers + * of length 7 and 10 are possible, and a number in between these possible lengths is entered, + * such as of length 8, this will return TOO_LONG. */ - private ValidationResult testNumberLengthAgainstPattern(Pattern numberPattern, String number) { - Matcher numberMatcher = numberPattern.matcher(number); - if (numberMatcher.matches()) { + private ValidationResult testNumberLength(String number, PhoneNumberDesc phoneNumberDesc) { + List possibleLengths = phoneNumberDesc.getPossibleLengthList(); + List localLengths = phoneNumberDesc.getPossibleLengthLocalOnlyList(); + int actualLength = number.length(); + if (localLengths.contains(actualLength)) { return ValidationResult.IS_POSSIBLE; } - if (numberMatcher.lookingAt()) { - return ValidationResult.TOO_LONG; - } else { + // There should always be "possibleLengths" set for every element. This will be a build-time + // check once ShortNumberMetadata.xml is migrated to contain this information as well. + int minimumLength = possibleLengths.get(0); + if (minimumLength == actualLength) { + return ValidationResult.IS_POSSIBLE; + } else if (minimumLength > actualLength) { return ValidationResult.TOO_SHORT; + } else if (possibleLengths.get(possibleLengths.size() - 1) < actualLength) { + return ValidationResult.TOO_LONG; } - } - - /** - * Helper method to check whether a number is too short to be a regular length phone number in a - * region. - */ - private boolean isShorterThanPossibleNormalNumber(PhoneMetadata regionMetadata, String number) { - Pattern possibleNumberPattern = regexCache.getPatternForRegex( - regionMetadata.getGeneralDesc().getPossibleNumberPattern()); - return testNumberLengthAgainstPattern(possibleNumberPattern, number) == - ValidationResult.TOO_SHORT; + // Note that actually the number is not too long if possibleLengths does not contain the length: + // we know it is less than the highest possible number length, and higher than the lowest + // possible number length. However, we don't currently have an enum to express this, so we + // return TOO_LONG in the short-term. + // We skip the first element; we've already checked it. + return possibleLengths.subList(1, possibleLengths.size()).contains(actualLength) + ? ValidationResult.IS_POSSIBLE : ValidationResult.TOO_LONG; } /** @@ -2424,9 +2427,7 @@ public class PhoneNumberUtil { String regionCode = getRegionCodeForCountryCode(countryCode); // Metadata cannot be null because the country calling code is valid. PhoneMetadata metadata = getMetadataForRegionOrCallingCode(countryCode, regionCode); - Pattern possibleNumberPattern = - regexCache.getPatternForRegex(metadata.getGeneralDesc().getPossibleNumberPattern()); - return testNumberLengthAgainstPattern(possibleNumberPattern, nationalNumber); + return testNumberLength(nationalNumber, metadata.getGeneralDesc()); } /** @@ -2596,15 +2597,12 @@ public class PhoneNumberUtil { regexCache.getPatternForRegex(generalDesc.getNationalNumberPattern()); maybeStripNationalPrefixAndCarrierCode( potentialNationalNumber, defaultRegionMetadata, null /* Don't need the carrier code */); - Pattern possibleNumberPattern = - regexCache.getPatternForRegex(generalDesc.getPossibleNumberPattern()); // If the number was not valid before but is valid now, or if it was too long before, we // consider the number with the country calling code stripped to be a better result and // keep that instead. if ((!validNumberPattern.matcher(fullNumber).matches() && validNumberPattern.matcher(potentialNationalNumber).matches()) - || testNumberLengthAgainstPattern(possibleNumberPattern, fullNumber.toString()) - == ValidationResult.TOO_LONG) { + || testNumberLength(fullNumber.toString(), generalDesc) == ValidationResult.TOO_LONG) { nationalNumber.append(potentialNationalNumber); if (keepRawInput) { phoneNumber.setCountryCodeSource(CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN); @@ -3013,7 +3011,8 @@ public class PhoneNumberUtil { // We require that the NSN remaining after stripping the national prefix and carrier code be // long enough to be a possible length for the region. Otherwise, we don't do the stripping, // since the original number could be a valid short number. - if (!isShorterThanPossibleNormalNumber(regionMetadata, potentialNationalNumber.toString())) { + if (testNumberLength(potentialNationalNumber.toString(), regionMetadata.getGeneralDesc()) + != ValidationResult.TOO_SHORT) { normalizedNationalNumber = potentialNationalNumber; if (keepRawInput) { phoneNumber.setPreferredDomesticCarrierCode(carrierCode.toString()); diff --git a/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java b/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java index e5313c7b3..274f874c4 100644 --- a/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java +++ b/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java @@ -145,8 +145,12 @@ public class PhoneNumberUtilTest extends TestMetadataTestCase { assertEquals("[13-689]\\d{9}|2[0-35-9]\\d{8}", metadata.getGeneralDesc().getNationalNumberPattern()); assertEquals("\\d{7}(?:\\d{3})?", metadata.getGeneralDesc().getPossibleNumberPattern()); - assertTrue(metadata.getGeneralDesc().exactlySameAs(metadata.getFixedLine())); + // Fixed-line data should be inherited from the general desc for the national number pattern, + // since it wasn't overridden. + assertEquals(metadata.getGeneralDesc().getNationalNumberPattern(), + metadata.getFixedLine().getNationalNumberPattern()); assertEquals("\\d{10}", metadata.getTollFree().getPossibleNumberPattern()); + assertEquals(1, metadata.getGeneralDesc().getPossibleLengthCount()); assertEquals(10, metadata.getGeneralDesc().getPossibleLength(0)); // Possible lengths are the same as the general description, so aren't stored separately in the // toll free element as well. @@ -1315,7 +1319,7 @@ public class PhoneNumberUtilTest extends TestMetadataTestCase { assertTrue(phoneUtil.isPossibleNumber("253-0000", RegionCode.US)); assertTrue(phoneUtil.isPossibleNumber("+1 650 253 0000", RegionCode.GB)); assertTrue(phoneUtil.isPossibleNumber("+44 20 7031 3000", RegionCode.GB)); - assertTrue(phoneUtil.isPossibleNumber("(020) 7031 3000", RegionCode.GB)); + assertTrue(phoneUtil.isPossibleNumber("(020) 7031 300", RegionCode.GB)); assertTrue(phoneUtil.isPossibleNumber("7031 3000", RegionCode.GB)); assertTrue(phoneUtil.isPossibleNumber("3331 6005", RegionCode.NZ)); assertTrue(phoneUtil.isPossibleNumber("+800 1234 5678", RegionCode.UN001)); diff --git a/java/libphonenumber/test/com/google/i18n/phonenumbers/data/PhoneNumberMetadataProtoForTesting_AD b/java/libphonenumber/test/com/google/i18n/phonenumbers/data/PhoneNumberMetadataProtoForTesting_AD index 42e2c85721298889f9f0731ba77422941164b49c..517691e37fa8e3b2ab86b339ef1ee2a26e8e4c97 100644 GIT binary patch literal 349 zcmZ4UmVvd3fq^lE0Z1@1GO)&^RGZbpXpkTfvjH(k8dDZ5fmH@YACsRWBLk2Cnerb9 kz}kT34JvVvH?TSg;&<#)OpY!V!Z delta 53 lcmcc1w3-JpcW(r#9UCC9jFkOI-nAeDxC5d u<}mp=GBN-Okgoqg0MkFH#6cdy>L7?;u}d*IIf0}Zm<1*JKF3UR3eDgmj&DUV?elb<6a n1CRje`VRy!{ewyz-JpcW(r#9UCC9jFkOI-nAeDxC5d t<}mp=GBN-Okgoqg0MkFH#6cdy>L7?;u}d*IgFVH-Y+z^z@fDba5&-SpK}P@p delta 59 pcmeytw1!E0&07Z6Dh38dWd1*O@53UR3eDgmj&DUV?elb<6a n1CRje`VRy!{ewyz1*JKF3UR3eDgmj&DUV?elb<6a o1CRje`VRy!{ewyz8ftX{Wk)s+TkO7ilU|?iq zV7D-cNvXD|Wn^Fj(uM}LAYmZpnmAE`6~fr017+NofinI;8IvU$VJv?}xy>Pru8aVC COcYB1 delta 99 zcmX@eI+K-i&07Z6Dh39o2@^SGco`WO7`T8KM6*q_bCd(pK)}ezz;0m>lTvL_%gDe6 qqzw&fLE;maDljrkJf#C9e#1*O@53UR3eDgmj&DUV?elb<6a n1CRje`VRy!{ewyztXec!d!F@n#CB delta 49 rcmdnVx|Nl4&07Z6Dh39oH4`}<85t(}J8?2HFfecdF)C-{(<_Vs6%7gd diff --git a/java/pending_code_changes.txt b/java/pending_code_changes.txt index 74cf0cd95..5bbc20bcc 100644 --- a/java/pending_code_changes.txt +++ b/java/pending_code_changes.txt @@ -4,6 +4,12 @@ Build changes: - Fix geocoding file generation to handle language codes that are not just two letters long properly and to process filenames in the same order. Code changes: + - Switching the internal implementation of isPossible and related functions + to use the new possibleLengths metadata. This affects a lot of countries, + making isPossible more restrictive as more precise data is available. It + also affects parsing ambiguous and invalid numbers, as we decide whether + to strip a possible national prefix (1) or country code based on the length + of the number and whether it is possible before or after this. - Formatting, naming (LOGGER -> logger) and comment tweaks to follow style guide - Removal of unneeded canBeGeocoded method in the diff --git a/resources/PhoneNumberMetadataForTesting.xml b/resources/PhoneNumberMetadataForTesting.xml index 7e1913803..e1f1c33e7 100644 --- a/resources/PhoneNumberMetadataForTesting.xml +++ b/resources/PhoneNumberMetadataForTesting.xml @@ -23,6 +23,13 @@ @@ -169,12 +176,27 @@ + + 246\d{7} + \d{7,10} + + + + + recognised by the library. It is also used for formatInOriginalFormat tests, so some length + metadata is needed. --> + + \d{8,10} + \d{8,10} + + + + @@ -262,13 +284,27 @@ + + 226\d{7} + \d{7,10} + + + + + recognised by the library, and some length information is needed for parsing. --> + + \d{6,10} + \d{6,10} + + + + @@ -326,8 +362,15 @@ + recognised by the library, and some length information is needed for parsing. --> + + \d{8,10} + \d{8,10} + + + + @@ -415,6 +458,9 @@ 3\d{6} \d{7} + + + @@ -445,7 +491,7 @@ [1-6]\d{9} - + 7[1-57-9]\d{8} @@ -488,6 +534,13 @@ + + \d{6,10} + \d{6,10} + + + + @@ -903,6 +956,13 @@ + + \d{9} + \d{9} + + + + @@ -976,6 +1036,9 @@ \d{7}(?:\d{3})? 1234567890 + + + 800\d{7}