diff --git a/cpp/src/phonenumbers/phonenumbermatcher.cc b/cpp/src/phonenumbers/phonenumbermatcher.cc index a3ebbd15b..d69225998 100644 --- a/cpp/src/phonenumbers/phonenumbermatcher.cc +++ b/cpp/src/phonenumbers/phonenumbermatcher.cc @@ -49,6 +49,7 @@ #include "phonenumbers/phonenumberutil.h" #include "phonenumbers/regexp_adapter.h" #include "phonenumbers/regexp_adapter_icu.h" +#include "phonenumbers/regexp_cache.h" #include "phonenumbers/stringutil.h" #ifdef I18N_PHONENUMBERS_USE_RE2 @@ -58,7 +59,6 @@ using std::map; using std::numeric_limits; using std::string; -using std::vector; namespace i18n { namespace phonenumbers { @@ -117,7 +117,7 @@ bool AllNumberGroupsRemainGrouped( const PhoneNumberUtil& util, const PhoneNumber& number, const string& normalized_candidate, - const vector& formatted_number_groups) { + const std::vector& formatted_number_groups) { size_t from_index = 0; if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) { // First skip the country code if the normalized candidate contained it. @@ -227,6 +227,10 @@ class PhoneNumberMatcherRegExps : public Singleton { scoped_ptr regexp_factory_for_pattern_; scoped_ptr regexp_factory_; + // A cache for popular reg-exps of leading digits used to match formatting + // patterns and the factory used to create it. + mutable RegExpCache regexp_cache_; + // Matches strings that look like publication pages. Example: // Computing Complete Answers to Queries in the Presence of Limited Access // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003). @@ -254,7 +258,7 @@ class PhoneNumberMatcherRegExps : public Singleton { // // Note that if there is a match, we will always check any text found up to // the first match as well. - scoped_ptr > inner_matches_; + scoped_ptr > inner_matches_; scoped_ptr capture_up_to_second_number_start_pattern_; scoped_ptr capturing_ascii_digits_pattern_; // Compiled reg-ex representing lead_class_; @@ -289,6 +293,12 @@ class PhoneNumberMatcherRegExps : public Singleton { #else regexp_factory_(new ICURegExpFactory()), #endif // I18N_PHONENUMBERS_USE_RE2 + // A cache for frequently used country-specific regular expressions. Set + // to 32 to cover ~2-3 countries being used for the same doc with ~10 + // patterns for each country. Some pages will have a lot more countries + // in use, but typically fewer numbers for each so expanding the cache + // for that use-case won't have a lot of benefit. + regexp_cache_(*regexp_factory_, 32), pub_pages_(regexp_factory_->CreateRegExp( "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")), slash_separated_dates_(regexp_factory_->CreateRegExp( @@ -300,19 +310,19 @@ class PhoneNumberMatcherRegExps : public Singleton { matching_brackets_(regexp_factory_->CreateRegExp( StrCat(leading_maybe_matched_bracket_, non_parens_, "+", bracket_pairs_, non_parens_, "*"))), - inner_matches_(new vector()), + inner_matches_(new std::vector()), capture_up_to_second_number_start_pattern_( regexp_factory_->CreateRegExp( PhoneNumberUtil::kCaptureUpToSecondNumberStart)), capturing_ascii_digits_pattern_( regexp_factory_->CreateRegExp("(\\d+)")), lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)), - pattern_(regexp_factory_for_pattern_->CreateRegExp( - StrCat("((?:", lead_class_, punctuation_, ")", lead_limit_, - digit_sequence_, "(?:", punctuation_, digit_sequence_, ")", - block_limit_, "(?i)(?:", - PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(), - ")?)"))) { + pattern_(regexp_factory_for_pattern_->CreateRegExp(StrCat( + "((?:", lead_class_, punctuation_, ")", lead_limit_, + digit_sequence_, "(?:", punctuation_, digit_sequence_, ")", + block_limit_, "(?i)(?:", + PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(), + ")?)"))) { inner_matches_->push_back( // Breaks on the slash - e.g. "651-234-2345/332-445-1234" regexp_factory_->CreateRegExp("/+(.*)")); @@ -521,7 +531,7 @@ bool PhoneNumberMatcher::VerifyAccordingToLeniency( return false; } ResultCallback4&>* callback = + const string&, const std::vector&>* callback = NewPermanentCallback(&AllNumberGroupsRemainGrouped); bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); delete(callback); @@ -536,7 +546,7 @@ bool PhoneNumberMatcher::VerifyAccordingToLeniency( return false; } ResultCallback4&>* callback = + const string&, const std::vector&>* callback = NewPermanentCallback( this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent); bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback); @@ -553,7 +563,7 @@ bool PhoneNumberMatcher::VerifyAccordingToLeniency( bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset, PhoneNumberMatch* match) { DCHECK(match); - for (vector::const_iterator regex = + for (std::vector::const_iterator regex = reg_exps_->inner_matches_->begin(); regex != reg_exps_->inner_matches_->end(); regex++) { scoped_ptr candidate_input( @@ -668,30 +678,42 @@ bool PhoneNumberMatcher::CheckNumberGroupingIsValid( const PhoneNumber& phone_number, const string& candidate, ResultCallback4&>* checker) const { + const string&, const std::vector&>* checker) const { DCHECK(checker); - // TODO: Evaluate how this works for other locales (testing has been limited - // to NANPA regions) and optimise if necessary. string normalized_candidate = NormalizeUTF8::NormalizeDecimalDigits(candidate); - vector formatted_number_groups; - GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern - &formatted_number_groups); + std::vector formatted_number_groups; + GetNationalNumberGroups(phone_number, &formatted_number_groups); if (checker->Run(phone_util_, phone_number, normalized_candidate, formatted_number_groups)) { return true; } - // If this didn't pass, see if there are any alternate formats, and try them - // instead. + // If this didn't pass, see if there are any alternate formats that match, and + // try them instead. const PhoneMetadata* alternate_formats = alternate_formats_->GetAlternateFormatsForCountry( phone_number.country_code()); if (alternate_formats) { + string national_significant_number; + phone_util_.GetNationalSignificantNumber(phone_number, + &national_significant_number); for (RepeatedPtrField::const_iterator it = alternate_formats->number_format().begin(); it != alternate_formats->number_format().end(); ++it) { + if (it->leading_digits_pattern_size() > 0) { + std::unique_ptr nsn_input( + reg_exps_->regexp_factory_->CreateInput( + national_significant_number)); + // There is only one leading digits pattern for alternate formats. + if (!reg_exps_->regexp_cache_.GetRegExp( + it->leading_digits_pattern(0)).Consume(nsn_input.get())) { + // Leading digits don't match; try another one. + continue; + } + } formatted_number_groups.clear(); - GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups); + GetNationalNumberGroupsForPattern(phone_number, &*it, + &formatted_number_groups); if (checker->Run(phone_util_, phone_number, normalized_candidate, formatted_number_groups)) { return true; @@ -701,40 +723,40 @@ bool PhoneNumberMatcher::CheckNumberGroupingIsValid( return false; } -// Helper method to get the national-number part of a number, formatted without -// any national prefix, and return it as a set of digit blocks that would be -// formatted together. void PhoneNumberMatcher::GetNationalNumberGroups( const PhoneNumber& number, - const NumberFormat* formatting_pattern, - vector* digit_blocks) const { + std::vector* digit_blocks) const { string rfc3966_format; - if (!formatting_pattern) { - // This will be in the format +CC-DG;ext=EXT where DG represents groups of - // digits. - phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format); - // We remove the extension part from the formatted string before splitting - // it into different groups. - size_t end_index = rfc3966_format.find(';'); - if (end_index == string::npos) { - end_index = rfc3966_format.length(); - } - // The country-code will have a '-' following it. - size_t start_index = rfc3966_format.find('-') + 1; - SplitStringUsing(rfc3966_format.substr(start_index, - end_index - start_index), - "-", digit_blocks); - } else { - // We format the NSN only, and split that according to the separator. - string national_significant_number; - phone_util_.GetNationalSignificantNumber(number, - &national_significant_number); - phone_util_.FormatNsnUsingPattern(national_significant_number, - *formatting_pattern, - PhoneNumberUtil::RFC3966, - &rfc3966_format); - SplitStringUsing(rfc3966_format, "-", digit_blocks); + // This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX + // represents groups of digits. + phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format); + // We remove the extension part from the formatted string before splitting + // it into different groups. + size_t end_index = rfc3966_format.find(';'); + if (end_index == string::npos) { + end_index = rfc3966_format.length(); } + // The country-code will have a '-' following it. + size_t start_index = rfc3966_format.find('-') + 1; + SplitStringUsing(rfc3966_format.substr(start_index, + end_index - start_index), + "-", digit_blocks); +} + +void PhoneNumberMatcher::GetNationalNumberGroupsForPattern( + const PhoneNumber& number, + const NumberFormat* formatting_pattern, + std::vector* digit_blocks) const { + string rfc3966_format; + // We format the NSN only, and split that according to the separator. + string national_significant_number; + phone_util_.GetNationalSignificantNumber(number, + &national_significant_number); + phone_util_.FormatNsnUsingPattern(national_significant_number, + *formatting_pattern, + PhoneNumberUtil::RFC3966, + &rfc3966_format); + SplitStringUsing(rfc3966_format, "-", digit_blocks); } bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired( @@ -788,10 +810,10 @@ bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent( const PhoneNumberUtil& util, const PhoneNumber& phone_number, const string& normalized_candidate, - const vector& formatted_number_groups) const { + const std::vector& formatted_number_groups) const { const scoped_ptr candidate_number( reg_exps_->regexp_factory_->CreateInput(normalized_candidate)); - vector candidate_groups; + std::vector candidate_groups; string digit_block; while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume( candidate_number.get(), diff --git a/cpp/src/phonenumbers/phonenumbermatcher.h b/cpp/src/phonenumbers/phonenumbermatcher.h index 471c89acb..90cea9c15 100644 --- a/cpp/src/phonenumbers/phonenumbermatcher.h +++ b/cpp/src/phonenumbers/phonenumbermatcher.h @@ -135,7 +135,17 @@ class PhoneNumberMatcher { ResultCallback4&>* checker) const; + // Helper method to get the national-number part of a number, formatted + // without any national prefix, and return it as a set of digit blocks that + // would be formatted together following standard formatting rules. void GetNationalNumberGroups( + const PhoneNumber& number, + vector* digit_blocks) const; + + // Helper method to get the national-number part of a number, formatted + // without any national prefix, and return it as a set of digit blocks that + // should be formatted together according to the formatting pattern passed in. + void GetNationalNumberGroupsForPattern( const PhoneNumber& number, const NumberFormat* formatting_pattern, vector* digit_blocks) const; diff --git a/cpp/test/phonenumbers/phonenumbermatcher_test.cc b/cpp/test/phonenumbers/phonenumbermatcher_test.cc index 49818f167..78520608f 100644 --- a/cpp/test/phonenumbers/phonenumbermatcher_test.cc +++ b/cpp/test/phonenumbers/phonenumbermatcher_test.cc @@ -35,7 +35,6 @@ namespace i18n { namespace phonenumbers { using std::string; -using std::vector; using icu::UnicodeString; namespace { @@ -103,10 +102,10 @@ class PhoneNumberMatcherTest : public testing::Test { // Tests each number in the test cases provided is found in its entirety for // the specified leniency level. void DoTestNumberMatchesForLeniency( - const vector& test_cases, + const std::vector& test_cases, PhoneNumberMatcher::Leniency leniency) const { scoped_ptr matcher; - for (vector::const_iterator test = test_cases.begin(); + for (std::vector::const_iterator test = test_cases.begin(); test != test_cases.end(); ++test) { matcher.reset(GetMatcherWithLeniency( test->raw_string_, test->region_, leniency)); @@ -126,10 +125,10 @@ class PhoneNumberMatcherTest : public testing::Test { // Tests no number in the test cases provided is found for the specified // leniency level. void DoTestNumberNonMatchesForLeniency( - const vector& test_cases, + const std::vector& test_cases, PhoneNumberMatcher::Leniency leniency) const { scoped_ptr matcher; - for (vector::const_iterator test = test_cases.begin(); + for (std::vector::const_iterator test = test_cases.begin(); test != test_cases.end(); ++test) { matcher.reset(GetMatcherWithLeniency( test->raw_string_, test->region_, leniency)); @@ -183,13 +182,13 @@ class PhoneNumberMatcherTest : public testing::Test { // -- if is_possible is true, they all find a test number inserted in the // middle when leniency of matching is set to POSSIBLE; else no test number // should be extracted at that leniency level - void FindMatchesInContexts(const vector& contexts, + void FindMatchesInContexts(const std::vector& contexts, bool is_valid, bool is_possible, const string& region, const string& number) { if (is_valid) { DoTestInContext(number, region, contexts, PhoneNumberMatcher::VALID); } else { - for (vector::const_iterator it = contexts.begin(); + for (std::vector::const_iterator it = contexts.begin(); it != contexts.end(); ++it) { string text = StrCat(it->leading_text_, number, it->trailing_text_); PhoneNumberMatcher matcher(text, region); @@ -199,7 +198,7 @@ class PhoneNumberMatcherTest : public testing::Test { if (is_possible) { DoTestInContext(number, region, contexts, PhoneNumberMatcher::POSSIBLE); } else { - for (vector::const_iterator it = contexts.begin(); + for (std::vector::const_iterator it = contexts.begin(); it != contexts.end(); ++it) { string text = StrCat(it->leading_text_, number, it->trailing_text_); PhoneNumberMatcher matcher(phone_util_, text, region, @@ -211,7 +210,7 @@ class PhoneNumberMatcherTest : public testing::Test { } // Variant of FindMatchesInContexts that uses a default number and region. - void FindMatchesInContexts(const vector& contexts, + void FindMatchesInContexts(const std::vector& contexts, bool is_valid, bool is_possible) { const string& region = RegionCode::US(); const string number("415-666-7777"); @@ -223,7 +222,7 @@ class PhoneNumberMatcherTest : public testing::Test { // PhoneNumberMatcher::POSSIBLE. void FindPossibleInContext(const string& number, const string& default_country) { - vector context_pairs; + std::vector context_pairs; context_pairs.push_back(NumberContext("", "")); // no context context_pairs.push_back(NumberContext(" ", "\t")); // whitespace only context_pairs.push_back(NumberContext("Hello ", "")); // no context at end @@ -267,7 +266,7 @@ class PhoneNumberMatcherTest : public testing::Test { // Tests valid numbers in contexts that fail for PhoneNumberMatcher::POSSIBLE // but are valid for PhoneNumberMatcher::VALID. void FindValidInContext(const string& number, const string& default_country) { - vector context_pairs; + std::vector context_pairs; // With other small numbers. context_pairs.push_back(NumberContext("It's only 9.99! Call ", " to buy")); // With a number Day.Month.Year date. @@ -284,9 +283,9 @@ class PhoneNumberMatcherTest : public testing::Test { } void DoTestInContext(const string& number, const string& default_country, - const vector& context_pairs, + const std::vector& context_pairs, PhoneNumberMatcher::Leniency leniency) { - for (vector::const_iterator it = context_pairs.begin(); + for (std::vector::const_iterator it = context_pairs.begin(); it != context_pairs.end(); ++it) { string prefix = it->leading_text_; string text = StrCat(prefix, number, it->trailing_text_); @@ -654,7 +653,7 @@ TEST_F(PhoneNumberMatcherTest, IsLatinLetter) { } TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingLatinChars) { - vector possible_only_contexts; + std::vector possible_only_contexts; possible_only_contexts.push_back(NumberContext("abc", "def")); possible_only_contexts.push_back(NumberContext("abc", "")); possible_only_contexts.push_back(NumberContext("", "def")); @@ -669,7 +668,7 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingLatinChars) { } TEST_F(PhoneNumberMatcherTest, MoneyNotSeenAsPhoneNumber) { - vector possible_only_contexts; + std::vector possible_only_contexts; possible_only_contexts.push_back(NumberContext("$", "")); possible_only_contexts.push_back(NumberContext("", "$")); possible_only_contexts.push_back(NumberContext("\xC2\xA3" /* "£" */, "")); @@ -678,14 +677,14 @@ TEST_F(PhoneNumberMatcherTest, MoneyNotSeenAsPhoneNumber) { } TEST_F(PhoneNumberMatcherTest, PercentageNotSeenAsPhoneNumber) { - vector possible_only_contexts; + std::vector possible_only_contexts; possible_only_contexts.push_back(NumberContext("", "%")); // Numbers followed by % should be dropped. FindMatchesInContexts(possible_only_contexts, false, true); } TEST_F(PhoneNumberMatcherTest, PhoneNumberWithLeadingOrTrailingMoneyMatches) { - vector contexts; + std::vector contexts; contexts.push_back(NumberContext("$20 ", "")); contexts.push_back(NumberContext("", " 100$")); // Because of the space after the 20 (or before the 100) these dollar amounts @@ -695,7 +694,7 @@ TEST_F(PhoneNumberMatcherTest, PhoneNumberWithLeadingOrTrailingMoneyMatches) { TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingLatinCharsAndLeadingPunctuation) { - vector possible_only_contexts; + std::vector possible_only_contexts; // Contexts with trailing characters. Leading characters are okay here since // the numbers we will insert start with punctuation, but trailing characters // are still not allowed. @@ -712,7 +711,7 @@ TEST_F(PhoneNumberMatcherTest, FindMatchesInContexts(possible_only_contexts, false, true, RegionCode::US(), number_with_brackets); - vector valid_contexts; + std::vector valid_contexts; valid_contexts.push_back(NumberContext("abc", "")); valid_contexts.push_back(NumberContext("\xC3\x89" /* "É" */, "")); valid_contexts.push_back( @@ -728,7 +727,7 @@ TEST_F(PhoneNumberMatcherTest, } TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingChineseChars) { - vector valid_contexts; + std::vector valid_contexts; valid_contexts.push_back(NumberContext( /* "我的电话号码是" */ "\xE6\x88\x91\xE7\x9A\x84\xE7\x94\xB5\xE8\xAF\x9D\xE5\x8F\xB7\xE7\xA0\x81" @@ -747,7 +746,7 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingChineseChars) { } TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingPunctuation) { - vector valid_contexts; + std::vector valid_contexts; // At end of text. valid_contexts.push_back(NumberContext("My number-", "")); // At start of text. @@ -858,6 +857,10 @@ static const NumberTest kValidCases[] = { NumberTest("03 0 -3 2 23 12 34", RegionCode::DE()), NumberTest("(0)3 0 -3 2 23 12 34", RegionCode::DE()), NumberTest("0 3 0 -3 2 23 12 34", RegionCode::DE()), +#ifdef I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS + // Fits an alternate pattern, but the leading digits don't match. + NumberTest("+52 332 123 23 23", RegionCode::MX()), +#endif // I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS }; // Strings with number-like things that should only be found up to and including @@ -926,38 +929,40 @@ static const NumberTest kExactGroupingCases[] = { }; TEST_F(PhoneNumberMatcherTest, MatchesWithPossibleLeniency) { - vector test_cases; + std::vector test_cases; test_cases.insert(test_cases.begin(), kPossibleOnlyCases, kPossibleOnlyCases + arraysize(kPossibleOnlyCases)); test_cases.insert(test_cases.begin(), kValidCases, kValidCases + arraysize(kValidCases)); - test_cases.insert(test_cases.begin(), kStrictGroupingCases, - kStrictGroupingCases + arraysize(kStrictGroupingCases)); + test_cases.insert( + test_cases.begin(), kStrictGroupingCases, + kStrictGroupingCases + arraysize(kStrictGroupingCases)); test_cases.insert(test_cases.begin(), kExactGroupingCases, kExactGroupingCases + arraysize(kExactGroupingCases)); DoTestNumberMatchesForLeniency(test_cases, PhoneNumberMatcher::POSSIBLE); } TEST_F(PhoneNumberMatcherTest, NonMatchesWithPossibleLeniency) { - vector test_cases; + std::vector test_cases; test_cases.insert(test_cases.begin(), kImpossibleCases, kImpossibleCases + arraysize(kImpossibleCases)); DoTestNumberNonMatchesForLeniency(test_cases, PhoneNumberMatcher::POSSIBLE); } TEST_F(PhoneNumberMatcherTest, MatchesWithValidLeniency) { - vector test_cases; + std::vector test_cases; test_cases.insert(test_cases.begin(), kValidCases, kValidCases + arraysize(kValidCases)); - test_cases.insert(test_cases.begin(), kStrictGroupingCases, - kStrictGroupingCases + arraysize(kStrictGroupingCases)); + test_cases.insert( + test_cases.begin(), kStrictGroupingCases, + kStrictGroupingCases + arraysize(kStrictGroupingCases)); test_cases.insert(test_cases.begin(), kExactGroupingCases, kExactGroupingCases + arraysize(kExactGroupingCases)); DoTestNumberMatchesForLeniency(test_cases, PhoneNumberMatcher::VALID); } TEST_F(PhoneNumberMatcherTest, NonMatchesWithValidLeniency) { - vector test_cases; + std::vector test_cases; test_cases.insert(test_cases.begin(), kImpossibleCases, kImpossibleCases + arraysize(kImpossibleCases)); test_cases.insert(test_cases.begin(), kPossibleOnlyCases, @@ -966,9 +971,10 @@ TEST_F(PhoneNumberMatcherTest, NonMatchesWithValidLeniency) { } TEST_F(PhoneNumberMatcherTest, MatchesWithStrictGroupingLeniency) { - vector test_cases; - test_cases.insert(test_cases.begin(), kStrictGroupingCases, - kStrictGroupingCases + arraysize(kStrictGroupingCases)); + std::vector test_cases; + test_cases.insert( + test_cases.begin(), kStrictGroupingCases, + kStrictGroupingCases + arraysize(kStrictGroupingCases)); test_cases.insert(test_cases.begin(), kExactGroupingCases, kExactGroupingCases + arraysize(kExactGroupingCases)); DoTestNumberMatchesForLeniency(test_cases, @@ -976,7 +982,7 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithStrictGroupingLeniency) { } TEST_F(PhoneNumberMatcherTest, NonMatchesWithStrictGroupingLeniency) { - vector test_cases; + std::vector test_cases; test_cases.insert(test_cases.begin(), kImpossibleCases, kImpossibleCases + arraysize(kImpossibleCases)); test_cases.insert(test_cases.begin(), kPossibleOnlyCases, @@ -988,7 +994,7 @@ TEST_F(PhoneNumberMatcherTest, NonMatchesWithStrictGroupingLeniency) { } TEST_F(PhoneNumberMatcherTest, MatchesWithExactGroupingLeniency) { - vector test_cases; + std::vector test_cases; test_cases.insert(test_cases.begin(), kExactGroupingCases, kExactGroupingCases + arraysize(kExactGroupingCases)); DoTestNumberMatchesForLeniency(test_cases, @@ -996,15 +1002,16 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithExactGroupingLeniency) { } TEST_F(PhoneNumberMatcherTest, NonMatchesWithExactGroupingLeniency) { - vector test_cases; + std::vector test_cases; test_cases.insert(test_cases.begin(), kImpossibleCases, kImpossibleCases + arraysize(kImpossibleCases)); test_cases.insert(test_cases.begin(), kPossibleOnlyCases, kPossibleOnlyCases + arraysize(kPossibleOnlyCases)); test_cases.insert(test_cases.begin(), kValidCases, kValidCases + arraysize(kValidCases)); - test_cases.insert(test_cases.begin(), kStrictGroupingCases, - kStrictGroupingCases + arraysize(kStrictGroupingCases)); + test_cases.insert( + test_cases.begin(), kStrictGroupingCases, + kStrictGroupingCases + arraysize(kStrictGroupingCases)); DoTestNumberNonMatchesForLeniency(test_cases, PhoneNumberMatcher::EXACT_GROUPING); } @@ -1107,11 +1114,11 @@ TEST_F(PhoneNumberMatcherTest, MaxMatches) { // Matches all 100. Max only applies to failed cases. PhoneNumber number; phone_util_.Parse("+14156667777", RegionCode::US(), &number); - vector expected(100, number); + std::vector expected(100, number); PhoneNumberMatcher matcher( phone_util_, numbers, RegionCode::US(), PhoneNumberMatcher::VALID, 10); - vector actual; + std::vector actual; PhoneNumberMatch match; while (matcher.HasNext()) { matcher.Next(&match); @@ -1144,11 +1151,11 @@ TEST_F(PhoneNumberMatcherTest, MaxMatchesMixed) { PhoneNumber number; phone_util_.Parse("+14156667777", RegionCode::ZZ(), &number); - vector expected(10, number); + std::vector expected(10, number); PhoneNumberMatcher matcher( phone_util_, numbers, RegionCode::US(), PhoneNumberMatcher::VALID, 10); - vector actual; + std::vector actual; PhoneNumberMatch match; while (matcher.HasNext()) { matcher.Next(&match); diff --git a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java index 3a6e3a687..0cb73fd1b 100644 --- a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java +++ b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java @@ -23,7 +23,7 @@ import com.google.i18n.phonenumbers.Phonemetadata.NumberFormat; import com.google.i18n.phonenumbers.Phonemetadata.PhoneMetadata; import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber.CountryCodeSource; import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber; - +import com.google.i18n.phonenumbers.internal.RegexCache; import java.lang.Character.UnicodeBlock; import java.util.Iterator; import java.util.NoSuchElementException; @@ -207,6 +207,12 @@ final class PhoneNumberMatcher implements Iterator { /** The next index to start searching at. Undefined in {@link State#DONE}. */ private int searchIndex = 0; + // A cache for frequently used country-specific regular expressions. Set to 32 to cover ~2-3 + // countries being used for the same doc with ~10 patterns for each country. Some pages will have + // a lot more countries in use, but typically fewer numbers for each so expanding the cache for + // that use-case won't have a lot of benefit. + private final RegexCache regexCache = new RegexCache(32); + /** * Creates a new instance. See the factory methods in {@link PhoneNumberUtil} on how to obtain a * new instance. @@ -413,7 +419,7 @@ final class PhoneNumberMatcher implements Iterator { PhoneNumber number = phoneUtil.parseAndKeepRawInput(candidate, preferredRegion); - if (leniency.verify(number, candidate, phoneUtil)) { + if (leniency.verify(number, candidate, phoneUtil, this)) { // We used parseAndKeepRawInput to create this number, but for now we don't return the extra // values parsed. TODO: stop clearing all values here and switch all users over // to using rawInput() rather than the rawString() of PhoneNumberMatch. @@ -527,46 +533,61 @@ final class PhoneNumberMatcher implements Iterator { /** * Helper method to get the national-number part of a number, formatted without any national - * prefix, and return it as a set of digit blocks that would be formatted together. + * prefix, and return it as a set of digit blocks that would be formatted together following + * standard formatting rules. + */ + private static String[] getNationalNumberGroups(PhoneNumberUtil util, PhoneNumber number) { + // This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX represents groups of + // digits. + String rfc3966Format = util.format(number, PhoneNumberFormat.RFC3966); + // We remove the extension part from the formatted string before splitting it into different + // groups. + int endIndex = rfc3966Format.indexOf(';'); + if (endIndex < 0) { + endIndex = rfc3966Format.length(); + } + // The country-code will have a '-' following it. + int startIndex = rfc3966Format.indexOf('-') + 1; + return rfc3966Format.substring(startIndex, endIndex).split("-"); + } + + /** + * Helper method to get the national-number part of a number, formatted without any national + * prefix, and return it as a set of digit blocks that should be formatted together according to + * the formatting pattern passed in. */ private static String[] getNationalNumberGroups(PhoneNumberUtil util, PhoneNumber number, NumberFormat formattingPattern) { - if (formattingPattern == null) { - // This will be in the format +CC-DG;ext=EXT where DG represents groups of digits. - String rfc3966Format = util.format(number, PhoneNumberFormat.RFC3966); - // We remove the extension part from the formatted string before splitting it into different - // groups. - int endIndex = rfc3966Format.indexOf(';'); - if (endIndex < 0) { - endIndex = rfc3966Format.length(); - } - // The country-code will have a '-' following it. - int startIndex = rfc3966Format.indexOf('-') + 1; - return rfc3966Format.substring(startIndex, endIndex).split("-"); - } else { - // We format the NSN only, and split that according to the separator. - String nationalSignificantNumber = util.getNationalSignificantNumber(number); - return util.formatNsnUsingPattern(nationalSignificantNumber, - formattingPattern, PhoneNumberFormat.RFC3966).split("-"); - } + // If a format is provided, we format the NSN only, and split that according to the separator. + String nationalSignificantNumber = util.getNationalSignificantNumber(number); + return util.formatNsnUsingPattern(nationalSignificantNumber, + formattingPattern, PhoneNumberFormat.RFC3966).split("-"); } - static boolean checkNumberGroupingIsValid( + boolean checkNumberGroupingIsValid( PhoneNumber number, CharSequence candidate, PhoneNumberUtil util, NumberGroupingChecker checker) { - // TODO: Evaluate how this works for other locales (testing has been limited to NANPA regions) - // and optimise if necessary. StringBuilder normalizedCandidate = PhoneNumberUtil.normalizeDigits(candidate, true /* keep non-digits */); - String[] formattedNumberGroups = getNationalNumberGroups(util, number, null); + String[] formattedNumberGroups = getNationalNumberGroups(util, number); if (checker.checkGroups(util, number, normalizedCandidate, formattedNumberGroups)) { return true; } - // If this didn't pass, see if there are any alternate formats, and try them instead. + // If this didn't pass, see if there are any alternate formats that match, and try them instead. PhoneMetadata alternateFormats = MetadataManager.getAlternateFormatsForCountry(number.getCountryCode()); + String nationalSignificantNumber = util.getNationalSignificantNumber(number); if (alternateFormats != null) { for (NumberFormat alternateFormat : alternateFormats.numberFormats()) { + if (alternateFormat.leadingDigitsPatternSize() > 0) { + // There is only one leading digits pattern for alternate formats. + Pattern pattern = + regexCache.getPatternForRegex(alternateFormat.getLeadingDigitsPattern(0)); + if (!pattern.matcher(nationalSignificantNumber).lookingAt()) { + // Leading digits don't match; try another one. + continue; + } + } formattedNumberGroups = getNationalNumberGroups(util, number, alternateFormat); if (checker.checkGroups(util, number, normalizedCandidate, formattedNumberGroups)) { return true; diff --git a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java index 954ced6a8..bbdc16edb 100644 --- a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java +++ b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java @@ -481,7 +481,11 @@ public class PhoneNumberUtil { */ POSSIBLE { @Override - boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) { + boolean verify( + PhoneNumber number, + CharSequence candidate, + PhoneNumberUtil util, + PhoneNumberMatcher matcher) { return util.isPossibleNumber(number); } }, @@ -493,7 +497,11 @@ public class PhoneNumberUtil { */ VALID { @Override - boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) { + boolean verify( + PhoneNumber number, + CharSequence candidate, + PhoneNumberUtil util, + PhoneNumberMatcher matcher) { if (!util.isValidNumber(number) || !PhoneNumberMatcher.containsOnlyValidXChars(number, candidate.toString(), util)) { return false; @@ -515,7 +523,11 @@ public class PhoneNumberUtil { */ STRICT_GROUPING { @Override - boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) { + boolean verify( + PhoneNumber number, + CharSequence candidate, + PhoneNumberUtil util, + PhoneNumberMatcher matcher) { String candidateString = candidate.toString(); if (!util.isValidNumber(number) || !PhoneNumberMatcher.containsOnlyValidXChars(number, candidateString, util) @@ -523,7 +535,7 @@ public class PhoneNumberUtil { || !PhoneNumberMatcher.isNationalPrefixPresentIfRequired(number, util)) { return false; } - return PhoneNumberMatcher.checkNumberGroupingIsValid( + return matcher.checkNumberGroupingIsValid( number, candidate, util, new PhoneNumberMatcher.NumberGroupingChecker() { @Override public boolean checkGroups(PhoneNumberUtil util, PhoneNumber number, @@ -548,7 +560,11 @@ public class PhoneNumberUtil { */ EXACT_GROUPING { @Override - boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) { + boolean verify( + PhoneNumber number, + CharSequence candidate, + PhoneNumberUtil util, + PhoneNumberMatcher matcher) { String candidateString = candidate.toString(); if (!util.isValidNumber(number) || !PhoneNumberMatcher.containsOnlyValidXChars(number, candidateString, util) @@ -556,7 +572,7 @@ public class PhoneNumberUtil { || !PhoneNumberMatcher.isNationalPrefixPresentIfRequired(number, util)) { return false; } - return PhoneNumberMatcher.checkNumberGroupingIsValid( + return matcher.checkNumberGroupingIsValid( number, candidate, util, new PhoneNumberMatcher.NumberGroupingChecker() { @Override public boolean checkGroups(PhoneNumberUtil util, PhoneNumber number, @@ -570,7 +586,11 @@ public class PhoneNumberUtil { }; /** Returns true if {@code number} is a verified number according to this leniency. */ - abstract boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util); + abstract boolean verify( + PhoneNumber number, + CharSequence candidate, + PhoneNumberUtil util, + PhoneNumberMatcher matcher); } // A source of metadata for different regions. @@ -974,7 +994,7 @@ public class PhoneNumberUtil { return Collections.unmodifiableSet(countryCodesForNonGeographicalRegion); } - /** + /** * Returns all country calling codes the library has metadata for, covering both non-geographical * entities (global network calling codes) and those used for geographical entities. This could be * used to populate a drop-down box of country calling codes for a phone-number widget, for diff --git a/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java b/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java index 175f52846..d31b33951 100644 --- a/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java +++ b/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java @@ -484,6 +484,8 @@ public class PhoneNumberMatcherTest extends TestMetadataTestCase { new NumberTest("03 0 -3 2 23 12 34", RegionCode.DE), new NumberTest("(0)3 0 -3 2 23 12 34", RegionCode.DE), new NumberTest("0 3 0 -3 2 23 12 34", RegionCode.DE), + // Fits an alternate pattern, but the leading digits don't match. + new NumberTest("+52 332 123 23 23", RegionCode.MX), }; /** diff --git a/pending_code_changes.txt b/pending_code_changes.txt index 8b1378917..a3d3976e9 100644 --- a/pending_code_changes.txt +++ b/pending_code_changes.txt @@ -1 +1,6 @@ - +* Code changes + - Making the application of alternate formats when finding phone numbers in + text in strict-grouping and exact-match mode depend on the leading digits + for each rule. This was always assumed but never actually done. This means + that the false positive rate will decrease but also that more valid numbers + are skipped. A subsequent CL will update patterns to increase recall.