Browse Source

Application of alternate formats respects leading digits. (#2331)

* Adding checking of leading digits before using an alternate format rule
when finding phone numbers in text (previously we thought this was done, hence why the data was there, but it wasn't).

Also some minor comment/formatting changes, and splitting a private helper method
into two.
pull/2332/head
lararennie 7 years ago
committed by GitHub
parent
commit
5824720790
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 217 additions and 130 deletions
  1. +76
    -54
      cpp/src/phonenumbers/phonenumbermatcher.cc
  2. +10
    -0
      cpp/src/phonenumbers/phonenumbermatcher.h
  3. +48
    -41
      cpp/test/phonenumbers/phonenumbermatcher_test.cc
  4. +47
    -26
      java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
  5. +28
    -8
      java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java
  6. +2
    -0
      java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java
  7. +6
    -1
      pending_code_changes.txt

+ 76
- 54
cpp/src/phonenumbers/phonenumbermatcher.cc View File

@ -49,6 +49,7 @@
#include "phonenumbers/phonenumberutil.h"
#include "phonenumbers/regexp_adapter.h"
#include "phonenumbers/regexp_adapter_icu.h"
#include "phonenumbers/regexp_cache.h"
#include "phonenumbers/stringutil.h"
#ifdef I18N_PHONENUMBERS_USE_RE2
@ -58,7 +59,6 @@
using std::map;
using std::numeric_limits;
using std::string;
using std::vector;
namespace i18n {
namespace phonenumbers {
@ -117,7 +117,7 @@ bool AllNumberGroupsRemainGrouped(
const PhoneNumberUtil& util,
const PhoneNumber& number,
const string& normalized_candidate,
const vector<string>& formatted_number_groups) {
const std::vector<string>& formatted_number_groups) {
size_t from_index = 0;
if (number.country_code_source() != PhoneNumber::FROM_DEFAULT_COUNTRY) {
// First skip the country code if the normalized candidate contained it.
@ -227,6 +227,10 @@ class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
scoped_ptr<const AbstractRegExpFactory> regexp_factory_for_pattern_;
scoped_ptr<const AbstractRegExpFactory> regexp_factory_;
// A cache for popular reg-exps of leading digits used to match formatting
// patterns and the factory used to create it.
mutable RegExpCache regexp_cache_;
// Matches strings that look like publication pages. Example:
// Computing Complete Answers to Queries in the Presence of Limited Access
// Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003).
@ -254,7 +258,7 @@ class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
//
// Note that if there is a match, we will always check any text found up to
// the first match as well.
scoped_ptr<vector<const RegExp*> > inner_matches_;
scoped_ptr<std::vector<const RegExp*> > inner_matches_;
scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_;
scoped_ptr<const RegExp> capturing_ascii_digits_pattern_;
// Compiled reg-ex representing lead_class_;
@ -289,6 +293,12 @@ class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
#else
regexp_factory_(new ICURegExpFactory()),
#endif // I18N_PHONENUMBERS_USE_RE2
// A cache for frequently used country-specific regular expressions. Set
// to 32 to cover ~2-3 countries being used for the same doc with ~10
// patterns for each country. Some pages will have a lot more countries
// in use, but typically fewer numbers for each so expanding the cache
// for that use-case won't have a lot of benefit.
regexp_cache_(*regexp_factory_, 32),
pub_pages_(regexp_factory_->CreateRegExp(
"\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")),
slash_separated_dates_(regexp_factory_->CreateRegExp(
@ -300,19 +310,19 @@ class PhoneNumberMatcherRegExps : public Singleton<PhoneNumberMatcherRegExps> {
matching_brackets_(regexp_factory_->CreateRegExp(
StrCat(leading_maybe_matched_bracket_, non_parens_, "+",
bracket_pairs_, non_parens_, "*"))),
inner_matches_(new vector<const RegExp*>()),
inner_matches_(new std::vector<const RegExp*>()),
capture_up_to_second_number_start_pattern_(
regexp_factory_->CreateRegExp(
PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
capturing_ascii_digits_pattern_(
regexp_factory_->CreateRegExp("(\\d+)")),
lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)),
pattern_(regexp_factory_for_pattern_->CreateRegExp(
StrCat("((?:", lead_class_, punctuation_, ")", lead_limit_,
digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
block_limit_, "(?i)(?:",
PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
")?)"))) {
pattern_(regexp_factory_for_pattern_->CreateRegExp(StrCat(
"((?:", lead_class_, punctuation_, ")", lead_limit_,
digit_sequence_, "(?:", punctuation_, digit_sequence_, ")",
block_limit_, "(?i)(?:",
PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(),
")?)"))) {
inner_matches_->push_back(
// Breaks on the slash - e.g. "651-234-2345/332-445-1234"
regexp_factory_->CreateRegExp("/+(.*)"));
@ -521,7 +531,7 @@ bool PhoneNumberMatcher::VerifyAccordingToLeniency(
return false;
}
ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
const string&, const vector<string>&>* callback =
const string&, const std::vector<string>&>* callback =
NewPermanentCallback(&AllNumberGroupsRemainGrouped);
bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
delete(callback);
@ -536,7 +546,7 @@ bool PhoneNumberMatcher::VerifyAccordingToLeniency(
return false;
}
ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
const string&, const vector<string>&>* callback =
const string&, const std::vector<string>&>* callback =
NewPermanentCallback(
this, &PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent);
bool is_valid = CheckNumberGroupingIsValid(number, candidate, callback);
@ -553,7 +563,7 @@ bool PhoneNumberMatcher::VerifyAccordingToLeniency(
bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset,
PhoneNumberMatch* match) {
DCHECK(match);
for (vector<const RegExp*>::const_iterator regex =
for (std::vector<const RegExp*>::const_iterator regex =
reg_exps_->inner_matches_->begin();
regex != reg_exps_->inner_matches_->end(); regex++) {
scoped_ptr<RegExpInput> candidate_input(
@ -668,30 +678,42 @@ bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
const PhoneNumber& phone_number,
const string& candidate,
ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
const string&, const vector<string>&>* checker) const {
const string&, const std::vector<string>&>* checker) const {
DCHECK(checker);
// TODO: Evaluate how this works for other locales (testing has been limited
// to NANPA regions) and optimise if necessary.
string normalized_candidate =
NormalizeUTF8::NormalizeDecimalDigits(candidate);
vector<string> formatted_number_groups;
GetNationalNumberGroups(phone_number, NULL, // Use default formatting pattern
&formatted_number_groups);
std::vector<string> formatted_number_groups;
GetNationalNumberGroups(phone_number, &formatted_number_groups);
if (checker->Run(phone_util_, phone_number, normalized_candidate,
formatted_number_groups)) {
return true;
}
// If this didn't pass, see if there are any alternate formats, and try them
// instead.
// If this didn't pass, see if there are any alternate formats that match, and
// try them instead.
const PhoneMetadata* alternate_formats =
alternate_formats_->GetAlternateFormatsForCountry(
phone_number.country_code());
if (alternate_formats) {
string national_significant_number;
phone_util_.GetNationalSignificantNumber(phone_number,
&national_significant_number);
for (RepeatedPtrField<NumberFormat>::const_iterator it =
alternate_formats->number_format().begin();
it != alternate_formats->number_format().end(); ++it) {
if (it->leading_digits_pattern_size() > 0) {
std::unique_ptr<RegExpInput> nsn_input(
reg_exps_->regexp_factory_->CreateInput(
national_significant_number));
// There is only one leading digits pattern for alternate formats.
if (!reg_exps_->regexp_cache_.GetRegExp(
it->leading_digits_pattern(0)).Consume(nsn_input.get())) {
// Leading digits don't match; try another one.
continue;
}
}
formatted_number_groups.clear();
GetNationalNumberGroups(phone_number, &*it, &formatted_number_groups);
GetNationalNumberGroupsForPattern(phone_number, &*it,
&formatted_number_groups);
if (checker->Run(phone_util_, phone_number, normalized_candidate,
formatted_number_groups)) {
return true;
@ -701,40 +723,40 @@ bool PhoneNumberMatcher::CheckNumberGroupingIsValid(
return false;
}
// Helper method to get the national-number part of a number, formatted without
// any national prefix, and return it as a set of digit blocks that would be
// formatted together.
void PhoneNumberMatcher::GetNationalNumberGroups(
const PhoneNumber& number,
const NumberFormat* formatting_pattern,
vector<string>* digit_blocks) const {
std::vector<string>* digit_blocks) const {
string rfc3966_format;
if (!formatting_pattern) {
// This will be in the format +CC-DG;ext=EXT where DG represents groups of
// digits.
phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
// We remove the extension part from the formatted string before splitting
// it into different groups.
size_t end_index = rfc3966_format.find(';');
if (end_index == string::npos) {
end_index = rfc3966_format.length();
}
// The country-code will have a '-' following it.
size_t start_index = rfc3966_format.find('-') + 1;
SplitStringUsing(rfc3966_format.substr(start_index,
end_index - start_index),
"-", digit_blocks);
} else {
// We format the NSN only, and split that according to the separator.
string national_significant_number;
phone_util_.GetNationalSignificantNumber(number,
&national_significant_number);
phone_util_.FormatNsnUsingPattern(national_significant_number,
*formatting_pattern,
PhoneNumberUtil::RFC3966,
&rfc3966_format);
SplitStringUsing(rfc3966_format, "-", digit_blocks);
// This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX
// represents groups of digits.
phone_util_.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format);
// We remove the extension part from the formatted string before splitting
// it into different groups.
size_t end_index = rfc3966_format.find(';');
if (end_index == string::npos) {
end_index = rfc3966_format.length();
}
// The country-code will have a '-' following it.
size_t start_index = rfc3966_format.find('-') + 1;
SplitStringUsing(rfc3966_format.substr(start_index,
end_index - start_index),
"-", digit_blocks);
}
void PhoneNumberMatcher::GetNationalNumberGroupsForPattern(
const PhoneNumber& number,
const NumberFormat* formatting_pattern,
std::vector<string>* digit_blocks) const {
string rfc3966_format;
// We format the NSN only, and split that according to the separator.
string national_significant_number;
phone_util_.GetNationalSignificantNumber(number,
&national_significant_number);
phone_util_.FormatNsnUsingPattern(national_significant_number,
*formatting_pattern,
PhoneNumberUtil::RFC3966,
&rfc3966_format);
SplitStringUsing(rfc3966_format, "-", digit_blocks);
}
bool PhoneNumberMatcher::IsNationalPrefixPresentIfRequired(
@ -788,10 +810,10 @@ bool PhoneNumberMatcher::AllNumberGroupsAreExactlyPresent(
const PhoneNumberUtil& util,
const PhoneNumber& phone_number,
const string& normalized_candidate,
const vector<string>& formatted_number_groups) const {
const std::vector<string>& formatted_number_groups) const {
const scoped_ptr<RegExpInput> candidate_number(
reg_exps_->regexp_factory_->CreateInput(normalized_candidate));
vector<string> candidate_groups;
std::vector<string> candidate_groups;
string digit_block;
while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume(
candidate_number.get(),


+ 10
- 0
cpp/src/phonenumbers/phonenumbermatcher.h View File

@ -135,7 +135,17 @@ class PhoneNumberMatcher {
ResultCallback4<bool, const PhoneNumberUtil&, const PhoneNumber&,
const string&, const vector<string>&>* checker) const;
// Helper method to get the national-number part of a number, formatted
// without any national prefix, and return it as a set of digit blocks that
// would be formatted together following standard formatting rules.
void GetNationalNumberGroups(
const PhoneNumber& number,
vector<string>* digit_blocks) const;
// Helper method to get the national-number part of a number, formatted
// without any national prefix, and return it as a set of digit blocks that
// should be formatted together according to the formatting pattern passed in.
void GetNationalNumberGroupsForPattern(
const PhoneNumber& number,
const NumberFormat* formatting_pattern,
vector<string>* digit_blocks) const;


+ 48
- 41
cpp/test/phonenumbers/phonenumbermatcher_test.cc View File

@ -35,7 +35,6 @@ namespace i18n {
namespace phonenumbers {
using std::string;
using std::vector;
using icu::UnicodeString;
namespace {
@ -103,10 +102,10 @@ class PhoneNumberMatcherTest : public testing::Test {
// Tests each number in the test cases provided is found in its entirety for
// the specified leniency level.
void DoTestNumberMatchesForLeniency(
const vector<NumberTest>& test_cases,
const std::vector<NumberTest>& test_cases,
PhoneNumberMatcher::Leniency leniency) const {
scoped_ptr<PhoneNumberMatcher> matcher;
for (vector<NumberTest>::const_iterator test = test_cases.begin();
for (std::vector<NumberTest>::const_iterator test = test_cases.begin();
test != test_cases.end(); ++test) {
matcher.reset(GetMatcherWithLeniency(
test->raw_string_, test->region_, leniency));
@ -126,10 +125,10 @@ class PhoneNumberMatcherTest : public testing::Test {
// Tests no number in the test cases provided is found for the specified
// leniency level.
void DoTestNumberNonMatchesForLeniency(
const vector<NumberTest>& test_cases,
const std::vector<NumberTest>& test_cases,
PhoneNumberMatcher::Leniency leniency) const {
scoped_ptr<PhoneNumberMatcher> matcher;
for (vector<NumberTest>::const_iterator test = test_cases.begin();
for (std::vector<NumberTest>::const_iterator test = test_cases.begin();
test != test_cases.end(); ++test) {
matcher.reset(GetMatcherWithLeniency(
test->raw_string_, test->region_, leniency));
@ -183,13 +182,13 @@ class PhoneNumberMatcherTest : public testing::Test {
// -- if is_possible is true, they all find a test number inserted in the
// middle when leniency of matching is set to POSSIBLE; else no test number
// should be extracted at that leniency level
void FindMatchesInContexts(const vector<NumberContext>& contexts,
void FindMatchesInContexts(const std::vector<NumberContext>& contexts,
bool is_valid, bool is_possible,
const string& region, const string& number) {
if (is_valid) {
DoTestInContext(number, region, contexts, PhoneNumberMatcher::VALID);
} else {
for (vector<NumberContext>::const_iterator it = contexts.begin();
for (std::vector<NumberContext>::const_iterator it = contexts.begin();
it != contexts.end(); ++it) {
string text = StrCat(it->leading_text_, number, it->trailing_text_);
PhoneNumberMatcher matcher(text, region);
@ -199,7 +198,7 @@ class PhoneNumberMatcherTest : public testing::Test {
if (is_possible) {
DoTestInContext(number, region, contexts, PhoneNumberMatcher::POSSIBLE);
} else {
for (vector<NumberContext>::const_iterator it = contexts.begin();
for (std::vector<NumberContext>::const_iterator it = contexts.begin();
it != contexts.end(); ++it) {
string text = StrCat(it->leading_text_, number, it->trailing_text_);
PhoneNumberMatcher matcher(phone_util_, text, region,
@ -211,7 +210,7 @@ class PhoneNumberMatcherTest : public testing::Test {
}
// Variant of FindMatchesInContexts that uses a default number and region.
void FindMatchesInContexts(const vector<NumberContext>& contexts,
void FindMatchesInContexts(const std::vector<NumberContext>& contexts,
bool is_valid, bool is_possible) {
const string& region = RegionCode::US();
const string number("415-666-7777");
@ -223,7 +222,7 @@ class PhoneNumberMatcherTest : public testing::Test {
// PhoneNumberMatcher::POSSIBLE.
void FindPossibleInContext(const string& number,
const string& default_country) {
vector<NumberContext> context_pairs;
std::vector<NumberContext> context_pairs;
context_pairs.push_back(NumberContext("", "")); // no context
context_pairs.push_back(NumberContext(" ", "\t")); // whitespace only
context_pairs.push_back(NumberContext("Hello ", "")); // no context at end
@ -267,7 +266,7 @@ class PhoneNumberMatcherTest : public testing::Test {
// Tests valid numbers in contexts that fail for PhoneNumberMatcher::POSSIBLE
// but are valid for PhoneNumberMatcher::VALID.
void FindValidInContext(const string& number, const string& default_country) {
vector<NumberContext> context_pairs;
std::vector<NumberContext> context_pairs;
// With other small numbers.
context_pairs.push_back(NumberContext("It's only 9.99! Call ", " to buy"));
// With a number Day.Month.Year date.
@ -284,9 +283,9 @@ class PhoneNumberMatcherTest : public testing::Test {
}
void DoTestInContext(const string& number, const string& default_country,
const vector<NumberContext>& context_pairs,
const std::vector<NumberContext>& context_pairs,
PhoneNumberMatcher::Leniency leniency) {
for (vector<NumberContext>::const_iterator it = context_pairs.begin();
for (std::vector<NumberContext>::const_iterator it = context_pairs.begin();
it != context_pairs.end(); ++it) {
string prefix = it->leading_text_;
string text = StrCat(prefix, number, it->trailing_text_);
@ -654,7 +653,7 @@ TEST_F(PhoneNumberMatcherTest, IsLatinLetter) {
}
TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingLatinChars) {
vector<NumberContext> possible_only_contexts;
std::vector<NumberContext> possible_only_contexts;
possible_only_contexts.push_back(NumberContext("abc", "def"));
possible_only_contexts.push_back(NumberContext("abc", ""));
possible_only_contexts.push_back(NumberContext("", "def"));
@ -669,7 +668,7 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingLatinChars) {
}
TEST_F(PhoneNumberMatcherTest, MoneyNotSeenAsPhoneNumber) {
vector<NumberContext> possible_only_contexts;
std::vector<NumberContext> possible_only_contexts;
possible_only_contexts.push_back(NumberContext("$", ""));
possible_only_contexts.push_back(NumberContext("", "$"));
possible_only_contexts.push_back(NumberContext("\xC2\xA3" /* "£" */, ""));
@ -678,14 +677,14 @@ TEST_F(PhoneNumberMatcherTest, MoneyNotSeenAsPhoneNumber) {
}
TEST_F(PhoneNumberMatcherTest, PercentageNotSeenAsPhoneNumber) {
vector<NumberContext> possible_only_contexts;
std::vector<NumberContext> possible_only_contexts;
possible_only_contexts.push_back(NumberContext("", "%"));
// Numbers followed by % should be dropped.
FindMatchesInContexts(possible_only_contexts, false, true);
}
TEST_F(PhoneNumberMatcherTest, PhoneNumberWithLeadingOrTrailingMoneyMatches) {
vector<NumberContext> contexts;
std::vector<NumberContext> contexts;
contexts.push_back(NumberContext("$20 ", ""));
contexts.push_back(NumberContext("", " 100$"));
// Because of the space after the 20 (or before the 100) these dollar amounts
@ -695,7 +694,7 @@ TEST_F(PhoneNumberMatcherTest, PhoneNumberWithLeadingOrTrailingMoneyMatches) {
TEST_F(PhoneNumberMatcherTest,
MatchesWithSurroundingLatinCharsAndLeadingPunctuation) {
vector<NumberContext> possible_only_contexts;
std::vector<NumberContext> possible_only_contexts;
// Contexts with trailing characters. Leading characters are okay here since
// the numbers we will insert start with punctuation, but trailing characters
// are still not allowed.
@ -712,7 +711,7 @@ TEST_F(PhoneNumberMatcherTest,
FindMatchesInContexts(possible_only_contexts, false, true, RegionCode::US(),
number_with_brackets);
vector<NumberContext> valid_contexts;
std::vector<NumberContext> valid_contexts;
valid_contexts.push_back(NumberContext("abc", ""));
valid_contexts.push_back(NumberContext("\xC3\x89" /* "É" */, ""));
valid_contexts.push_back(
@ -728,7 +727,7 @@ TEST_F(PhoneNumberMatcherTest,
}
TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingChineseChars) {
vector<NumberContext> valid_contexts;
std::vector<NumberContext> valid_contexts;
valid_contexts.push_back(NumberContext(
/* "我的电话号码是" */
"\xE6\x88\x91\xE7\x9A\x84\xE7\x94\xB5\xE8\xAF\x9D\xE5\x8F\xB7\xE7\xA0\x81"
@ -747,7 +746,7 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingChineseChars) {
}
TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingPunctuation) {
vector<NumberContext> valid_contexts;
std::vector<NumberContext> valid_contexts;
// At end of text.
valid_contexts.push_back(NumberContext("My number-", ""));
// At start of text.
@ -858,6 +857,10 @@ static const NumberTest kValidCases[] = {
NumberTest("03 0 -3 2 23 12 34", RegionCode::DE()),
NumberTest("(0)3 0 -3 2 23 12 34", RegionCode::DE()),
NumberTest("0 3 0 -3 2 23 12 34", RegionCode::DE()),
#ifdef I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS
// Fits an alternate pattern, but the leading digits don't match.
NumberTest("+52 332 123 23 23", RegionCode::MX()),
#endif // I18N_PHONENUMBERS_USE_ALTERNATE_FORMATS
};
// Strings with number-like things that should only be found up to and including
@ -926,38 +929,40 @@ static const NumberTest kExactGroupingCases[] = {
};
TEST_F(PhoneNumberMatcherTest, MatchesWithPossibleLeniency) {
vector<NumberTest> test_cases;
std::vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kPossibleOnlyCases,
kPossibleOnlyCases + arraysize(kPossibleOnlyCases));
test_cases.insert(test_cases.begin(), kValidCases,
kValidCases + arraysize(kValidCases));
test_cases.insert(test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
test_cases.insert(
test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
test_cases.insert(test_cases.begin(), kExactGroupingCases,
kExactGroupingCases + arraysize(kExactGroupingCases));
DoTestNumberMatchesForLeniency(test_cases, PhoneNumberMatcher::POSSIBLE);
}
TEST_F(PhoneNumberMatcherTest, NonMatchesWithPossibleLeniency) {
vector<NumberTest> test_cases;
std::vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kImpossibleCases,
kImpossibleCases + arraysize(kImpossibleCases));
DoTestNumberNonMatchesForLeniency(test_cases, PhoneNumberMatcher::POSSIBLE);
}
TEST_F(PhoneNumberMatcherTest, MatchesWithValidLeniency) {
vector<NumberTest> test_cases;
std::vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kValidCases,
kValidCases + arraysize(kValidCases));
test_cases.insert(test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
test_cases.insert(
test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
test_cases.insert(test_cases.begin(), kExactGroupingCases,
kExactGroupingCases + arraysize(kExactGroupingCases));
DoTestNumberMatchesForLeniency(test_cases, PhoneNumberMatcher::VALID);
}
TEST_F(PhoneNumberMatcherTest, NonMatchesWithValidLeniency) {
vector<NumberTest> test_cases;
std::vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kImpossibleCases,
kImpossibleCases + arraysize(kImpossibleCases));
test_cases.insert(test_cases.begin(), kPossibleOnlyCases,
@ -966,9 +971,10 @@ TEST_F(PhoneNumberMatcherTest, NonMatchesWithValidLeniency) {
}
TEST_F(PhoneNumberMatcherTest, MatchesWithStrictGroupingLeniency) {
vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
std::vector<NumberTest> test_cases;
test_cases.insert(
test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
test_cases.insert(test_cases.begin(), kExactGroupingCases,
kExactGroupingCases + arraysize(kExactGroupingCases));
DoTestNumberMatchesForLeniency(test_cases,
@ -976,7 +982,7 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithStrictGroupingLeniency) {
}
TEST_F(PhoneNumberMatcherTest, NonMatchesWithStrictGroupingLeniency) {
vector<NumberTest> test_cases;
std::vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kImpossibleCases,
kImpossibleCases + arraysize(kImpossibleCases));
test_cases.insert(test_cases.begin(), kPossibleOnlyCases,
@ -988,7 +994,7 @@ TEST_F(PhoneNumberMatcherTest, NonMatchesWithStrictGroupingLeniency) {
}
TEST_F(PhoneNumberMatcherTest, MatchesWithExactGroupingLeniency) {
vector<NumberTest> test_cases;
std::vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kExactGroupingCases,
kExactGroupingCases + arraysize(kExactGroupingCases));
DoTestNumberMatchesForLeniency(test_cases,
@ -996,15 +1002,16 @@ TEST_F(PhoneNumberMatcherTest, MatchesWithExactGroupingLeniency) {
}
TEST_F(PhoneNumberMatcherTest, NonMatchesWithExactGroupingLeniency) {
vector<NumberTest> test_cases;
std::vector<NumberTest> test_cases;
test_cases.insert(test_cases.begin(), kImpossibleCases,
kImpossibleCases + arraysize(kImpossibleCases));
test_cases.insert(test_cases.begin(), kPossibleOnlyCases,
kPossibleOnlyCases + arraysize(kPossibleOnlyCases));
test_cases.insert(test_cases.begin(), kValidCases,
kValidCases + arraysize(kValidCases));
test_cases.insert(test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
test_cases.insert(
test_cases.begin(), kStrictGroupingCases,
kStrictGroupingCases + arraysize(kStrictGroupingCases));
DoTestNumberNonMatchesForLeniency(test_cases,
PhoneNumberMatcher::EXACT_GROUPING);
}
@ -1107,11 +1114,11 @@ TEST_F(PhoneNumberMatcherTest, MaxMatches) {
// Matches all 100. Max only applies to failed cases.
PhoneNumber number;
phone_util_.Parse("+14156667777", RegionCode::US(), &number);
vector<PhoneNumber> expected(100, number);
std::vector<PhoneNumber> expected(100, number);
PhoneNumberMatcher matcher(
phone_util_, numbers, RegionCode::US(), PhoneNumberMatcher::VALID, 10);
vector<PhoneNumber> actual;
std::vector<PhoneNumber> actual;
PhoneNumberMatch match;
while (matcher.HasNext()) {
matcher.Next(&match);
@ -1144,11 +1151,11 @@ TEST_F(PhoneNumberMatcherTest, MaxMatchesMixed) {
PhoneNumber number;
phone_util_.Parse("+14156667777", RegionCode::ZZ(), &number);
vector<PhoneNumber> expected(10, number);
std::vector<PhoneNumber> expected(10, number);
PhoneNumberMatcher matcher(
phone_util_, numbers, RegionCode::US(), PhoneNumberMatcher::VALID, 10);
vector<PhoneNumber> actual;
std::vector<PhoneNumber> actual;
PhoneNumberMatch match;
while (matcher.HasNext()) {
matcher.Next(&match);


+ 47
- 26
java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java View File

@ -23,7 +23,7 @@ import com.google.i18n.phonenumbers.Phonemetadata.NumberFormat;
import com.google.i18n.phonenumbers.Phonemetadata.PhoneMetadata;
import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber.CountryCodeSource;
import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber;
import com.google.i18n.phonenumbers.internal.RegexCache;
import java.lang.Character.UnicodeBlock;
import java.util.Iterator;
import java.util.NoSuchElementException;
@ -207,6 +207,12 @@ final class PhoneNumberMatcher implements Iterator<PhoneNumberMatch> {
/** The next index to start searching at. Undefined in {@link State#DONE}. */
private int searchIndex = 0;
// A cache for frequently used country-specific regular expressions. Set to 32 to cover ~2-3
// countries being used for the same doc with ~10 patterns for each country. Some pages will have
// a lot more countries in use, but typically fewer numbers for each so expanding the cache for
// that use-case won't have a lot of benefit.
private final RegexCache regexCache = new RegexCache(32);
/**
* Creates a new instance. See the factory methods in {@link PhoneNumberUtil} on how to obtain a
* new instance.
@ -413,7 +419,7 @@ final class PhoneNumberMatcher implements Iterator<PhoneNumberMatch> {
PhoneNumber number = phoneUtil.parseAndKeepRawInput(candidate, preferredRegion);
if (leniency.verify(number, candidate, phoneUtil)) {
if (leniency.verify(number, candidate, phoneUtil, this)) {
// We used parseAndKeepRawInput to create this number, but for now we don't return the extra
// values parsed. TODO: stop clearing all values here and switch all users over
// to using rawInput() rather than the rawString() of PhoneNumberMatch.
@ -527,46 +533,61 @@ final class PhoneNumberMatcher implements Iterator<PhoneNumberMatch> {
/**
* Helper method to get the national-number part of a number, formatted without any national
* prefix, and return it as a set of digit blocks that would be formatted together.
* prefix, and return it as a set of digit blocks that would be formatted together following
* standard formatting rules.
*/
private static String[] getNationalNumberGroups(PhoneNumberUtil util, PhoneNumber number) {
// This will be in the format +CC-DG1-DG2-DGX;ext=EXT where DG1..DGX represents groups of
// digits.
String rfc3966Format = util.format(number, PhoneNumberFormat.RFC3966);
// We remove the extension part from the formatted string before splitting it into different
// groups.
int endIndex = rfc3966Format.indexOf(';');
if (endIndex < 0) {
endIndex = rfc3966Format.length();
}
// The country-code will have a '-' following it.
int startIndex = rfc3966Format.indexOf('-') + 1;
return rfc3966Format.substring(startIndex, endIndex).split("-");
}
/**
* Helper method to get the national-number part of a number, formatted without any national
* prefix, and return it as a set of digit blocks that should be formatted together according to
* the formatting pattern passed in.
*/
private static String[] getNationalNumberGroups(PhoneNumberUtil util, PhoneNumber number,
NumberFormat formattingPattern) {
if (formattingPattern == null) {
// This will be in the format +CC-DG;ext=EXT where DG represents groups of digits.
String rfc3966Format = util.format(number, PhoneNumberFormat.RFC3966);
// We remove the extension part from the formatted string before splitting it into different
// groups.
int endIndex = rfc3966Format.indexOf(';');
if (endIndex < 0) {
endIndex = rfc3966Format.length();
}
// The country-code will have a '-' following it.
int startIndex = rfc3966Format.indexOf('-') + 1;
return rfc3966Format.substring(startIndex, endIndex).split("-");
} else {
// We format the NSN only, and split that according to the separator.
String nationalSignificantNumber = util.getNationalSignificantNumber(number);
return util.formatNsnUsingPattern(nationalSignificantNumber,
formattingPattern, PhoneNumberFormat.RFC3966).split("-");
}
// If a format is provided, we format the NSN only, and split that according to the separator.
String nationalSignificantNumber = util.getNationalSignificantNumber(number);
return util.formatNsnUsingPattern(nationalSignificantNumber,
formattingPattern, PhoneNumberFormat.RFC3966).split("-");
}
static boolean checkNumberGroupingIsValid(
boolean checkNumberGroupingIsValid(
PhoneNumber number, CharSequence candidate, PhoneNumberUtil util,
NumberGroupingChecker checker) {
// TODO: Evaluate how this works for other locales (testing has been limited to NANPA regions)
// and optimise if necessary.
StringBuilder normalizedCandidate =
PhoneNumberUtil.normalizeDigits(candidate, true /* keep non-digits */);
String[] formattedNumberGroups = getNationalNumberGroups(util, number, null);
String[] formattedNumberGroups = getNationalNumberGroups(util, number);
if (checker.checkGroups(util, number, normalizedCandidate, formattedNumberGroups)) {
return true;
}
// If this didn't pass, see if there are any alternate formats, and try them instead.
// If this didn't pass, see if there are any alternate formats that match, and try them instead.
PhoneMetadata alternateFormats =
MetadataManager.getAlternateFormatsForCountry(number.getCountryCode());
String nationalSignificantNumber = util.getNationalSignificantNumber(number);
if (alternateFormats != null) {
for (NumberFormat alternateFormat : alternateFormats.numberFormats()) {
if (alternateFormat.leadingDigitsPatternSize() > 0) {
// There is only one leading digits pattern for alternate formats.
Pattern pattern =
regexCache.getPatternForRegex(alternateFormat.getLeadingDigitsPattern(0));
if (!pattern.matcher(nationalSignificantNumber).lookingAt()) {
// Leading digits don't match; try another one.
continue;
}
}
formattedNumberGroups = getNationalNumberGroups(util, number, alternateFormat);
if (checker.checkGroups(util, number, normalizedCandidate, formattedNumberGroups)) {
return true;


+ 28
- 8
java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java View File

@ -481,7 +481,11 @@ public class PhoneNumberUtil {
*/
POSSIBLE {
@Override
boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) {
boolean verify(
PhoneNumber number,
CharSequence candidate,
PhoneNumberUtil util,
PhoneNumberMatcher matcher) {
return util.isPossibleNumber(number);
}
},
@ -493,7 +497,11 @@ public class PhoneNumberUtil {
*/
VALID {
@Override
boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) {
boolean verify(
PhoneNumber number,
CharSequence candidate,
PhoneNumberUtil util,
PhoneNumberMatcher matcher) {
if (!util.isValidNumber(number)
|| !PhoneNumberMatcher.containsOnlyValidXChars(number, candidate.toString(), util)) {
return false;
@ -515,7 +523,11 @@ public class PhoneNumberUtil {
*/
STRICT_GROUPING {
@Override
boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) {
boolean verify(
PhoneNumber number,
CharSequence candidate,
PhoneNumberUtil util,
PhoneNumberMatcher matcher) {
String candidateString = candidate.toString();
if (!util.isValidNumber(number)
|| !PhoneNumberMatcher.containsOnlyValidXChars(number, candidateString, util)
@ -523,7 +535,7 @@ public class PhoneNumberUtil {
|| !PhoneNumberMatcher.isNationalPrefixPresentIfRequired(number, util)) {
return false;
}
return PhoneNumberMatcher.checkNumberGroupingIsValid(
return matcher.checkNumberGroupingIsValid(
number, candidate, util, new PhoneNumberMatcher.NumberGroupingChecker() {
@Override
public boolean checkGroups(PhoneNumberUtil util, PhoneNumber number,
@ -548,7 +560,11 @@ public class PhoneNumberUtil {
*/
EXACT_GROUPING {
@Override
boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util) {
boolean verify(
PhoneNumber number,
CharSequence candidate,
PhoneNumberUtil util,
PhoneNumberMatcher matcher) {
String candidateString = candidate.toString();
if (!util.isValidNumber(number)
|| !PhoneNumberMatcher.containsOnlyValidXChars(number, candidateString, util)
@ -556,7 +572,7 @@ public class PhoneNumberUtil {
|| !PhoneNumberMatcher.isNationalPrefixPresentIfRequired(number, util)) {
return false;
}
return PhoneNumberMatcher.checkNumberGroupingIsValid(
return matcher.checkNumberGroupingIsValid(
number, candidate, util, new PhoneNumberMatcher.NumberGroupingChecker() {
@Override
public boolean checkGroups(PhoneNumberUtil util, PhoneNumber number,
@ -570,7 +586,11 @@ public class PhoneNumberUtil {
};
/** Returns true if {@code number} is a verified number according to this leniency. */
abstract boolean verify(PhoneNumber number, CharSequence candidate, PhoneNumberUtil util);
abstract boolean verify(
PhoneNumber number,
CharSequence candidate,
PhoneNumberUtil util,
PhoneNumberMatcher matcher);
}
// A source of metadata for different regions.
@ -974,7 +994,7 @@ public class PhoneNumberUtil {
return Collections.unmodifiableSet(countryCodesForNonGeographicalRegion);
}
/**
/**
* Returns all country calling codes the library has metadata for, covering both non-geographical
* entities (global network calling codes) and those used for geographical entities. This could be
* used to populate a drop-down box of country calling codes for a phone-number widget, for


+ 2
- 0
java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java View File

@ -484,6 +484,8 @@ public class PhoneNumberMatcherTest extends TestMetadataTestCase {
new NumberTest("03 0 -3 2 23 12 34", RegionCode.DE),
new NumberTest("(0)3 0 -3 2 23 12 34", RegionCode.DE),
new NumberTest("0 3 0 -3 2 23 12 34", RegionCode.DE),
// Fits an alternate pattern, but the leading digits don't match.
new NumberTest("+52 332 123 23 23", RegionCode.MX),
};
/**


+ 6
- 1
pending_code_changes.txt View File

@ -1 +1,6 @@
* Code changes
- Making the application of alternate formats when finding phone numbers in
text in strict-grouping and exact-match mode depend on the leading digits
for each rule. This was always assumed but never actually done. This means
that the false positive rate will decrease but also that more valid numbers
are skipped. A subsequent CL will update patterns to increase recall.

Loading…
Cancel
Save