Browse Source

Increase max length allowed for phone extension based on anbiguity of labels. (#2526)

pull/2423/head
penmetsaa 5 years ago
committed by GitHub
parent
commit
32719cf74e
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 631 additions and 113 deletions
  1. +135
    -44
      cpp/src/phonenumbers/phonenumberutil.cc
  2. +45
    -9
      cpp/src/phonenumbers/regexp_adapter.h
  3. +7
    -4
      cpp/src/phonenumbers/regexp_adapter_icu.cc
  4. +19
    -9
      cpp/src/phonenumbers/regexp_adapter_re2.cc
  5. +163
    -0
      cpp/test/phonenumbers/phonenumberutil_test.cc
  6. +42
    -10
      cpp/test/phonenumbers/regexp_adapter_test.cc
  7. +89
    -37
      java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java
  8. +131
    -0
      java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java

+ 135
- 44
cpp/src/phonenumbers/phonenumberutil.cc View File

@ -118,6 +118,15 @@ const char kDefaultExtnPrefix[] = " ext. ";
const char kSingleExtnSymbolsForMatching[] =
"x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E";
const char kPossibleSeparatorsBetweenNumberAndExtLabel[] =
"[ \xC2\xA0\\t,]*";
// Optional full stop (.) or colon, followed by zero or more
// spaces/tabs/commas.
const char kPossibleCharsAfterExtLabel[] =
"[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*";
const char kOptionalExtSuffix[] = "#?";
bool LoadCompiledInMetadata(PhoneMetadataCollection* metadata) {
if (!metadata->ParseFromArray(metadata_get(), metadata_size())) {
LOG(ERROR) << "Could not parse binary data.";
@ -202,35 +211,109 @@ char32 ToUnicodeCodepoint(const char* unicode_char) {
return codepoint;
}
// Helper method for constructing regular expressions for parsing. Creates an
// expression that captures up to max_length digits.
std::string ExtnDigits(int max_length) {
return StrCat("([", kDigits, "]{1,", max_length, "})");
}
// Helper initialiser method to create the regular-expression pattern to match
// extensions, allowing the one-codepoint extension symbols provided by
// single_extn_symbols.
// Note that there are currently three capturing groups for the extension itself
// - if this number is changed, MaybeStripExtension needs to be updated.
string CreateExtnPattern(const string& single_extn_symbols) {
static const string capturing_extn_digits = StrCat("([", kDigits, "]{1,7})");
// The first regular expression covers RFC 3966 format, where the extension is
// added using ";ext=". The second more generic one starts with optional white
// space and ends with an optional full stop (.), followed by zero or more
// spaces/tabs/commas and then the numbers themselves. The third one covers
// the special case of American numbers where the extension is written with a
// hash at the end, such as "- 503#".
// Note that the only capturing groups should be around the digits that you
// want to capture as part of the extension, or else parsing will fail!
// extensions. Note that:
// - There are currently six capturing groups for the extension itself. If this
// number is changed, MaybeStripExtension needs to be updated.
// - The only capturing groups should be around the digits that you want to
// capture as part of the extension, or else parsing will fail!
std::string CreateExtnPattern(bool for_parsing) {
// We cap the maximum length of an extension based on the ambiguity of the
// way the extension is prefixed. As per ITU, the officially allowed
// length for extensions is actually 40, but we don't support this since we
// haven't seen real examples and this introduces many false interpretations
// as the extension labels are not standardized.
int ext_limit_after_explicit_label = 20;
int ext_limit_after_likely_label = 15;
int ext_limit_after_ambiguous_char = 9;
int ext_limit_when_not_sure = 6;
// Canonical-equivalence doesn't seem to be an option with RE2, so we allow
// two options for representing the ó - the character itself, and one in the
// unicode decomposed form with the combining acute accent.
return (StrCat(
kRfc3966ExtnPrefix, capturing_extn_digits, "|"
/* "[  \\t,]*(?:e?xt(?:ensi(?:ó?|ó))?n?|e?xtn?|single_extn_symbols|"
"int|int|anexo)"
"[:\\..]?[  \\t,-]*", capturing_extn_digits, "#?|" */
"[ \xC2\xA0\\t,]*(?:e?xt(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|"
"(?:\xEF\xBD\x85)?\xEF\xBD\x98\xEF\xBD\x94(?:\xEF\xBD\x8E)?|"
"\xD0\xB4\xD0\xBE\xD0\xB1|[", single_extn_symbols, "]|int|"
"\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)"
"[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits,
"#?|[- ]+([", kDigits, "]{1,5})#"));
// two options for representing any non-ASCII character like ó - the character
// itself, and one in the unicode decomposed form with the combining acute
// accent.
// Here the extension is called out in a more explicit way, i.e mentioning it
// obvious patterns like "ext.".
string explicit_ext_labels =
"(?:e?xt(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|(?:\xEF\xBD\x85)?"
"\xEF\xBD\x98\xEF\xBD\x94(?:\xEF\xBD\x8E)?|\xD0\xB4\xD0\xBE\xD0\xB1|"
"anexo)";
// One-character symbols that can be used to indicate an extension, and less
// commonly used or more ambiguous extension labels.
string ambiguous_ext_labels =
"(?:[x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E]|int|"
"\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94)";
// When extension is not separated clearly.
string ambiguous_separator = "[- ]+";
string rfc_extn = StrCat(kRfc3966ExtnPrefix,
ExtnDigits(ext_limit_after_explicit_label));
string explicit_extn = StrCat(
kPossibleSeparatorsBetweenNumberAndExtLabel,
explicit_ext_labels, kPossibleCharsAfterExtLabel,
ExtnDigits(ext_limit_after_explicit_label),
kOptionalExtSuffix);
string ambiguous_extn = StrCat(
kPossibleSeparatorsBetweenNumberAndExtLabel,
ambiguous_ext_labels, kPossibleCharsAfterExtLabel,
ExtnDigits(ext_limit_after_ambiguous_char),
kOptionalExtSuffix);
string american_style_extn_with_suffix = StrCat(
ambiguous_separator, ExtnDigits(ext_limit_when_not_sure), "#");
// The first regular expression covers RFC 3966 format, where the extension is
// added using ";ext=". The second more generic where extension is mentioned
// with explicit labels like "ext:". In both the above cases we allow more
// numbers in extension than any other extension labels. The third one
// captures when single character extension labels or less commonly used
// labels are present. In such cases we capture fewer extension digits in
// order to reduce the chance of falsely interpreting two numbers beside each
// other as a number + extension. The fourth one covers the special case of
// American numbers where the extension is written with a hash at the end,
// such as "- 503#".
string extension_pattern = StrCat(
rfc_extn, "|",
explicit_extn, "|",
ambiguous_extn, "|",
american_style_extn_with_suffix);
// Additional pattern that is supported when parsing extensions, not when
// matching.
if (for_parsing) {
// ",," is commonly used for auto dialling the extension when connected.
// Semi-colon works in Iphone and also in Android to pop up a button with
// the extension number following.
string auto_dialling_and_ext_labels_found = "(?:,{2}|;)";
// This is same as kPossibleSeparatorsBetweenNumberAndExtLabel, but not
// matching comma as extension label may have it.
string possible_separators_number_extLabel_no_comma = "[ \xC2\xA0\\t]*";
string auto_dialling_extn = StrCat(
possible_separators_number_extLabel_no_comma,
auto_dialling_and_ext_labels_found, kPossibleCharsAfterExtLabel,
ExtnDigits(ext_limit_after_likely_label),
kOptionalExtSuffix);
string only_commas_extn = StrCat(
possible_separators_number_extLabel_no_comma,
"(?:,)+", kPossibleCharsAfterExtLabel,
ExtnDigits(ext_limit_after_ambiguous_char),
kOptionalExtSuffix);
// Here the first pattern is exclusive for extension autodialling formats
// which are used when dialling and in this case we accept longer
// extensions. However, the second pattern is more liberal on number of
// commas that acts as extension labels, so we have strict cap on number of
// digits in such extensions.
return StrCat(extension_pattern, "|",
auto_dialling_extn, "|",
only_commas_extn);
}
return extension_pattern;
}
// Normalizes a string of characters representing a phone number by replacing
@ -683,8 +766,7 @@ class PhoneNumberRegExpsAndMappings {
PhoneNumberUtil::kValidPunctuation, kStarSign, "]*",
kDigits, "){3,}[", PhoneNumberUtil::kValidPunctuation,
kStarSign, kValidAlpha, kDigits, "]*")),
extn_patterns_for_parsing_(
CreateExtnPattern(StrCat(",;", kSingleExtnSymbolsForMatching))),
extn_patterns_for_parsing_(CreateExtnPattern(/* for_parsing= */ true)),
regexp_factory_(new RegExpFactory()),
regexp_cache_(new RegExpCache(*regexp_factory_.get(), 128)),
diallable_char_mappings_(),
@ -710,11 +792,10 @@ class PhoneNumberRegExpsAndMappings {
PhoneNumberUtil::kCaptureUpToSecondNumberStart)),
unwanted_end_char_pattern_(
regexp_factory_->CreateRegExp("[^\\p{N}\\p{L}#]")),
separator_pattern_(
regexp_factory_->CreateRegExp(
StrCat("[", PhoneNumberUtil::kValidPunctuation, "]+"))),
separator_pattern_(regexp_factory_->CreateRegExp(
StrCat("[", PhoneNumberUtil::kValidPunctuation, "]+"))),
extn_patterns_for_matching_(
CreateExtnPattern(kSingleExtnSymbolsForMatching)),
CreateExtnPattern(/* for_parsing= */ false)),
extn_pattern_(regexp_factory_->CreateRegExp(
StrCat("(?i)(?:", extn_patterns_for_parsing_, ")$"))),
valid_phone_number_pattern_(regexp_factory_->CreateRegExp(
@ -730,9 +811,8 @@ class PhoneNumberRegExpsAndMappings {
first_group_capturing_pattern_(
regexp_factory_->CreateRegExp("(\\$\\d)")),
carrier_code_pattern_(regexp_factory_->CreateRegExp("\\$CC")),
plus_chars_pattern_(
regexp_factory_->CreateRegExp(
StrCat("[", PhoneNumberUtil::kPlusChars, "]+"))) {
plus_chars_pattern_(regexp_factory_->CreateRegExp(
StrCat("[", PhoneNumberUtil::kPlusChars, "]+"))) {
InitializeMapsAndSets();
}
@ -2791,28 +2871,33 @@ bool PhoneNumberUtil::MaybeStripNationalPrefixAndCarrierCode(
// Strips any extension (as in, the part of the number dialled after the call is
// connected, usually indicated with extn, ext, x or similar) from the end of
// the number, and returns it. The number passed in should be non-normalized.
bool PhoneNumberUtil::MaybeStripExtension(string* number, string* extension)
bool PhoneNumberUtil::MaybeStripExtension(string* number, std::string* extension)
const {
DCHECK(number);
DCHECK(extension);
// There are three extension capturing groups in the regular expression.
// There are six extension capturing groups in the regular expression.
string possible_extension_one;
string possible_extension_two;
string possible_extension_three;
string possible_extension_four;
string possible_extension_five;
string possible_extension_six;
string number_copy(*number);
const scoped_ptr<RegExpInput> number_copy_as_regexp_input(
reg_exps_->regexp_factory_->CreateInput(number_copy));
if (reg_exps_->extn_pattern_->Consume(number_copy_as_regexp_input.get(),
false,
&possible_extension_one,
&possible_extension_two,
&possible_extension_three)) {
if (reg_exps_->extn_pattern_->Consume(
number_copy_as_regexp_input.get(), false, &possible_extension_one,
&possible_extension_two, &possible_extension_three,
&possible_extension_four, &possible_extension_five,
&possible_extension_six)) {
// Replace the extensions in the original string here.
reg_exps_->extn_pattern_->Replace(&number_copy, "");
// If we find a potential extension, and the number preceding this is a
// viable number, we assume it is an extension.
if ((!possible_extension_one.empty() || !possible_extension_two.empty() ||
!possible_extension_three.empty()) &&
!possible_extension_three.empty() ||
!possible_extension_four.empty() || !possible_extension_five.empty() ||
!possible_extension_six.empty()) &&
IsViablePhoneNumber(number_copy)) {
number->assign(number_copy);
if (!possible_extension_one.empty()) {
@ -2821,6 +2906,12 @@ bool PhoneNumberUtil::MaybeStripExtension(string* number, string* extension)
extension->assign(possible_extension_two);
} else if (!possible_extension_three.empty()) {
extension->assign(possible_extension_three);
} else if (!possible_extension_four.empty()) {
extension->assign(possible_extension_four);
} else if (!possible_extension_five.empty()) {
extension->assign(possible_extension_five);
} else if (!possible_extension_six.empty()) {
extension->assign(possible_extension_six);
}
return true;
}


+ 45
- 9
cpp/src/phonenumbers/regexp_adapter.h View File

@ -56,14 +56,47 @@ class RegExp {
// input_string - string to be searched.
// anchor_at_start - if true, match would be successful only if it appears at
// the beginning of the tested region of the string.
// matched_string1 - the first string extracted from the match. Can be NULL.
// matched_string2 - the second string extracted from the match. Can be NULL.
// matched_string3 - the third string extracted from the match. Can be NULL.
// matched_string1..6 - string extracted from the match in sequential order.
// Can be NULL.
virtual bool Consume(RegExpInput* input_string,
bool anchor_at_start,
string* matched_string1,
string* matched_string2,
string* matched_string3) const = 0;
string* matched_string3,
string* matched_string4,
string* matched_string5,
string* matched_string6) const = 0;
// Helper methods calling the Consume method that assume the match must start
// at the beginning.
inline bool Consume(RegExpInput* input_string, string* matched_string1,
string* matched_string2,
string* matched_string3,
string* matched_string4,
string* matched_string5,
string* matched_string6) const {
return Consume(input_string, true, matched_string1, matched_string2,
matched_string3, matched_string4, matched_string5,
matched_string6);
}
inline bool Consume(RegExpInput* input_string, string* matched_string1,
string* matched_string2,
string* matched_string3,
string* matched_string4,
string* matched_string5) const {
return Consume(input_string, true, matched_string1, matched_string2,
matched_string3, matched_string4, matched_string5, NULL);
}
inline bool Consume(RegExpInput* input_string, string* matched_string1,
string* matched_string2,
string* matched_string3,
string* matched_string4) const {
return Consume(input_string, true, matched_string1, matched_string2,
matched_string3, matched_string4, NULL, NULL);
}
// Helper methods calling the Consume method that assume the match must start
// at the beginning.
@ -72,28 +105,31 @@ class RegExp {
string* matched_string2,
string* matched_string3) const {
return Consume(input_string, true, matched_string1, matched_string2,
matched_string3);
matched_string3, NULL, NULL, NULL);
}
inline bool Consume(RegExpInput* input_string,
string* matched_string1,
string* matched_string2) const {
return Consume(input_string, true, matched_string1, matched_string2, NULL);
return Consume(input_string, true, matched_string1, matched_string2, NULL,
NULL, NULL, NULL);
}
inline bool Consume(RegExpInput* input_string, string* matched_string) const {
return Consume(input_string, true, matched_string, NULL, NULL);
return Consume(input_string, true, matched_string, NULL, NULL, NULL, NULL,
NULL);
}
inline bool Consume(RegExpInput* input_string) const {
return Consume(input_string, true, NULL, NULL, NULL);
return Consume(input_string, true, NULL, NULL, NULL, NULL, NULL, NULL);
}
// Helper method calling the Consume method that assumes the match can start
// at any place in the string.
inline bool FindAndConsume(RegExpInput* input_string,
string* matched_string) const {
return Consume(input_string, false, matched_string, NULL, NULL);
return Consume(input_string, false, matched_string, NULL, NULL, NULL, NULL,
NULL);
}
// Matches string to regular expression, returns true if the expression was


+ 7
- 4
cpp/src/phonenumbers/regexp_adapter_icu.cc View File

@ -120,7 +120,10 @@ class IcuRegExp : public RegExp {
bool anchor_at_start,
string* matched_string1,
string* matched_string2,
string* matched_string3) const {
string* matched_string3,
string* matched_string4,
string* matched_string5,
string* matched_string6) const {
DCHECK(input_string);
if (!utf8_regexp_.get()) {
return false;
@ -135,9 +138,9 @@ class IcuRegExp : public RegExp {
if (!match_succeeded || U_FAILURE(status)) {
return false;
}
string* const matched_strings[] = {
matched_string1, matched_string2, matched_string3
};
string* const matched_strings[] = {matched_string1, matched_string2,
matched_string3, matched_string4,
matched_string5, matched_string6};
// If less matches than expected - fail.
for (size_t i = 0; i < arraysize(matched_strings); ++i) {
if (matched_strings[i]) {


+ 19
- 9
cpp/src/phonenumbers/regexp_adapter_re2.cc View File

@ -60,10 +60,15 @@ bool DispatchRE2Call(Function regex_function,
const RE2& regexp,
string* out1,
string* out2,
string* out3) {
const RE2::Arg outs[] = { out1, out2, out3, };
const RE2::Arg* const args[] = { &outs[0], &outs[1], &outs[2], };
const int argc = out3 ? 3 : out2 ? 2 : out1 ? 1 : 0;
string* out3,
string* out4,
string* out5,
string* out6) {
const RE2::Arg outs[] = { out1, out2, out3, out4, out5, out6};
const RE2::Arg* const args[] = {&outs[0], &outs[1], &outs[2],
&outs[3], &outs[4], &outs[5]};
const int argc =
out6 ? 6 : out5 ? 5 : out4 ? 4 : out3 ? 3 : out2 ? 2 : out1 ? 1 : 0;
return regex_function(input, regexp, args, argc);
}
@ -94,7 +99,10 @@ class RE2RegExp : public RegExp {
bool anchor_at_start,
string* matched_string1,
string* matched_string2,
string* matched_string3) const {
string* matched_string3,
string* matched_string4,
string* matched_string5,
string* matched_string6) const {
DCHECK(input_string);
StringPiece* utf8_input =
static_cast<RE2RegExpInput*>(input_string)->Data();
@ -102,11 +110,13 @@ class RE2RegExp : public RegExp {
if (anchor_at_start) {
return DispatchRE2Call(RE2::ConsumeN, utf8_input, utf8_regexp_,
matched_string1, matched_string2,
matched_string3);
matched_string3, matched_string4,
matched_string5, matched_string6);
} else {
return DispatchRE2Call(RE2::FindAndConsumeN, utf8_input, utf8_regexp_,
matched_string1, matched_string2,
matched_string3);
matched_string3, matched_string4,
matched_string5, matched_string6);
}
}
@ -115,10 +125,10 @@ class RE2RegExp : public RegExp {
string* matched_string) const {
if (full_match) {
return DispatchRE2Call(RE2::FullMatchN, input_string, utf8_regexp_,
matched_string, NULL, NULL);
matched_string, NULL, NULL, NULL, NULL, NULL);
} else {
return DispatchRE2Call(RE2::PartialMatchN, input_string, utf8_regexp_,
matched_string, NULL, NULL);
matched_string, NULL, NULL), NULL, NULL, NULL);
}
}


+ 163
- 0
cpp/test/phonenumbers/phonenumberutil_test.cc View File

@ -4399,6 +4399,169 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) {
EXPECT_EQ(us_with_extension, test_number);
}
TEST_F(PhoneNumberUtilTest, TestParseHandlesLongExtensionsWithExplicitLabels) {
// Test lower and upper limits of extension lengths for each type of label.
PhoneNumber nz_number;
nz_number.set_country_code(64);
nz_number.set_national_number(33316005ULL);
PhoneNumber test_number;
// Firstly, when in RFC format: ext_limit_after_explicit_label
nz_number.set_extension("0");
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("tel:+6433316005;ext=0", RegionCode::NZ(),
&test_number));
EXPECT_EQ(nz_number, test_number);
nz_number.set_extension("01234567890123456789");
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("tel:+6433316005;ext=01234567890123456789",
RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
// Extension too long.
EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER,
phone_util_.Parse("tel:+6433316005;ext=012345678901234567890",
RegionCode::NZ(), &test_number));
// Explicit extension label: ext_limit_after_explicit_label
nz_number.set_extension("1");
EXPECT_EQ(
PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005ext:1", RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
nz_number.set_extension("12345678901234567890");
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 xtn:12345678901234567890",
RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 extension\t12345678901234567890",
RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 xtensio:12345678901234567890",
RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 xtensión, 12345678901234567890#",
RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005extension.12345678901234567890",
RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 доб:12345678901234567890",
RegionCode::NZ(), &test_number));
EXPECT_EQ(nz_number, test_number);
// Extension too long.
EXPECT_EQ(PhoneNumberUtil::TOO_LONG_NSN,
phone_util_.Parse("03 3316005 extension 123456789012345678901",
RegionCode::NZ(), &test_number));
}
TEST_F(PhoneNumberUtilTest,
TestParseHandlesLongExtensionsWithAutoDiallingLabels) {
// Secondly, cases of auto-dialling and other standard extension labels:
// ext_limit_after_likely_label
PhoneNumber us_number_user_input;
us_number_user_input.set_country_code(1);
us_number_user_input.set_national_number(2679000000ULL);
PhoneNumber test_number;
us_number_user_input.set_extension("123456789012345");
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("+12679000000,,123456789012345#",
RegionCode::US(), &test_number));
EXPECT_EQ(us_number_user_input, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("+12679000000;123456789012345#", RegionCode::US(),
&test_number));
EXPECT_EQ(us_number_user_input, test_number);
PhoneNumber uk_number_user_input;
uk_number_user_input.set_country_code(44);
uk_number_user_input.set_national_number(2034000000ULL);
uk_number_user_input.set_extension("123456789");
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("+442034000000,,123456789#", RegionCode::GB(),
&test_number));
// Extension too long.
EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER,
phone_util_.Parse("+12679000000,,1234567890123456#",
RegionCode::US(), &test_number));
}
TEST_F(PhoneNumberUtilTest, TestParseHandlesShortExtensionsWithAmbiguousChar) {
// Thirdly, for single and non-standard cases: ext_limit_after_ambiguous_char
PhoneNumber nz_number;
nz_number.set_country_code(64);
nz_number.set_national_number(33316005ULL);
PhoneNumber test_number;
nz_number.set_extension("123456789");//
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 x 123456789", RegionCode::NZ(),
&test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 x. 123456789", RegionCode::NZ(),
&test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 #123456789#", RegionCode::NZ(),
&test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("03 3316005 ~ 123456789", RegionCode::NZ(),
&test_number));
EXPECT_EQ(nz_number, test_number);
EXPECT_EQ(PhoneNumberUtil::TOO_LONG_NSN,
phone_util_.Parse("03 3316005 ~ 1234567890", RegionCode::NZ(),
&test_number));
}
TEST_F(PhoneNumberUtilTest, TestParseHandlesShortExtensionsWhenNotSureOfLabel) {
// Thirdly, when no explicit extension label present, but denoted by
// tailing #: ext_limit_when_not_sure
PhoneNumber us_number;
us_number.set_country_code(1);
us_number.set_national_number(1234567890ULL);
PhoneNumber test_number;
us_number.set_extension("666666");
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("+1123-456-7890 666666#", RegionCode::US(),
&test_number));
EXPECT_EQ(us_number, test_number);
us_number.set_extension("6");
EXPECT_EQ(
PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("+11234567890-6#", RegionCode::US(), &test_number));
EXPECT_EQ(us_number, test_number);
// Extension too long.
EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER,
phone_util_.Parse("+1123-456-7890 7777777#", RegionCode::US(),
&test_number));
}
TEST_F(PhoneNumberUtilTest, ParseAndKeepRaw) {
PhoneNumber alpha_numeric_number;
alpha_numeric_number.set_country_code(1);


+ 42
- 10
cpp/test/phonenumbers/regexp_adapter_test.cc View File

@ -47,7 +47,9 @@ struct RegExpTestContext {
digits(factory->CreateRegExp("\\d+")),
parentheses_digits(factory->CreateRegExp("\\((\\d+)\\)")),
single_digit(factory->CreateRegExp("\\d")),
two_digit_groups(factory->CreateRegExp("(\\d+)-(\\d+)")) {}
two_digit_groups(factory->CreateRegExp("(\\d+)-(\\d+)")),
six_digit_groups(factory->CreateRegExp(
"(\\d+)-(\\d+)-(\\d+)-(\\d+)-(\\d+)-(\\d+)")) {}
const string name;
const scoped_ptr<const AbstractRegExpFactory> factory;
@ -55,6 +57,7 @@ struct RegExpTestContext {
const scoped_ptr<const RegExp> parentheses_digits;
const scoped_ptr<const RegExp> single_digit;
const scoped_ptr<const RegExp> two_digit_groups;
const scoped_ptr<const RegExp> six_digit_groups;
};
class RegExpAdapterTest : public testing::Test {
@ -89,18 +92,21 @@ TEST_F(RegExpAdapterTest, TestConsumeNoMatch) {
// When 'true' is passed to Consume(), the match occurs from the beginning
// of the input.
ASSERT_FALSE(context.digits->Consume(input.get(), true, NULL, NULL, NULL))
<< ErrorMessage(context);
ASSERT_FALSE(context.digits->Consume(
input.get(), true, NULL, NULL, NULL, NULL, NULL, NULL))
<< ErrorMessage(context);
ASSERT_EQ("+1-123-456-789", input->ToString()) << ErrorMessage(context);
string res1;
ASSERT_FALSE(context.parentheses_digits->Consume(
input.get(), true, &res1, NULL, NULL)) << ErrorMessage(context);
input.get(), true, &res1, NULL, NULL, NULL, NULL, NULL))
<< ErrorMessage(context);
ASSERT_EQ("+1-123-456-789", input->ToString()) << ErrorMessage(context);
ASSERT_EQ("", res1) << ErrorMessage(context);
}
}
TEST_F(RegExpAdapterTest, TestConsumeWithNull) {
for (TestContextIterator it = contexts_.begin(); it != contexts_.end();
++it) {
@ -109,7 +115,8 @@ TEST_F(RegExpAdapterTest, TestConsumeWithNull) {
const scoped_ptr<RegExpInput> input(factory.CreateInput("+123"));
const scoped_ptr<const RegExp> plus_sign(factory.CreateRegExp("(\\+)"));
ASSERT_TRUE(plus_sign->Consume(input.get(), true, NULL, NULL, NULL))
ASSERT_TRUE(plus_sign->Consume(input.get(), true, NULL, NULL, NULL, NULL,
NULL, NULL))
<< ErrorMessage(context);
ASSERT_EQ("123", input->ToString()) << ErrorMessage(context);
}
@ -124,7 +131,8 @@ TEST_F(RegExpAdapterTest, TestConsumeRetainsMatches) {
string res1, res2;
ASSERT_TRUE(context.two_digit_groups->Consume(
input.get(), true, &res1, &res2, NULL)) << ErrorMessage(context);
input.get(), true, &res1, &res2, NULL, NULL, NULL, NULL))
<< ErrorMessage(context);
ASSERT_EQ("-456-789", input->ToString()) << ErrorMessage(context);
ASSERT_EQ("1", res1) << ErrorMessage(context);
ASSERT_EQ("123", res2) << ErrorMessage(context);
@ -137,27 +145,51 @@ TEST_F(RegExpAdapterTest, TestFindAndConsume) {
const RegExpTestContext& context = **it;
const scoped_ptr<RegExpInput> input(
context.factory->CreateInput("+1-123-456-789"));
const scoped_ptr<RegExpInput> input_with_six_digit_groups(
context.factory->CreateInput("111-222-333-444-555-666"));
// When 'false' is passed to Consume(), the match can occur from any place
// in the input.
ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL))
ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL,
NULL, NULL, NULL))
<< ErrorMessage(context);
ASSERT_EQ("-123-456-789", input->ToString()) << ErrorMessage(context);
ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL))
ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL,
NULL, NULL, NULL))
<< ErrorMessage(context);
ASSERT_EQ("-456-789", input->ToString()) << ErrorMessage(context);
ASSERT_FALSE(context.parentheses_digits->Consume(
input.get(), false, NULL, NULL, NULL)) << ErrorMessage(context);
input.get(), false, NULL, NULL, NULL, NULL, NULL, NULL))
<< ErrorMessage(context);
ASSERT_EQ("-456-789", input->ToString()) << ErrorMessage(context);
string res1, res2;
ASSERT_TRUE(context.two_digit_groups->Consume(
input.get(), false, &res1, &res2, NULL)) << ErrorMessage(context);
input.get(), false, &res1, &res2, NULL, NULL, NULL, NULL))
<< ErrorMessage(context);
printf("previous input: %s", input.get()->ToString().c_str());
ASSERT_EQ("", input->ToString()) << ErrorMessage(context);
ASSERT_EQ("456", res1) << ErrorMessage(context);
ASSERT_EQ("789", res2) << ErrorMessage(context);
// Testing maximum no of substrings that can be matched presently, six.
string mat1, mat2, res3, res4, res5, res6;
ASSERT_TRUE(context.six_digit_groups->Consume(
input_with_six_digit_groups.get(), false, &mat1, &mat2, &res3, &res4,
&res5, &res6))
<< ErrorMessage(context);
printf("Present input: %s",
input_with_six_digit_groups.get()->ToString().c_str());
ASSERT_EQ("", input_with_six_digit_groups->ToString())
<< ErrorMessage(context);
ASSERT_EQ("111", mat1) << ErrorMessage(context);
ASSERT_EQ("222", mat2) << ErrorMessage(context);
ASSERT_EQ("333", res3) << ErrorMessage(context);
ASSERT_EQ("444", res4) << ErrorMessage(context);
ASSERT_EQ("555", res5) << ErrorMessage(context);
ASSERT_EQ("666", res6) << ErrorMessage(context);
}
}


+ 89
- 37
java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java View File

@ -303,46 +303,98 @@ public class PhoneNumberUtil {
// as the default extension prefix. This can be overridden by region-specific preferences.
private static final String DEFAULT_EXTN_PREFIX = " ext. ";
// Pattern to capture digits used in an extension. Places a maximum length of "7" for an
// extension.
private static final String CAPTURING_EXTN_DIGITS = "(" + DIGITS + "{1,7})";
// Regexp of all possible ways to write extensions, for use when parsing. This will be run as a
// case-insensitive regexp match. Wide character versions are also provided after each ASCII
// version.
private static final String EXTN_PATTERNS_FOR_PARSING;
static final String EXTN_PATTERNS_FOR_MATCHING;
static {
// One-character symbols that can be used to indicate an extension.
String singleExtnSymbolsForMatching = "x\uFF58#\uFF03~\uFF5E";
// For parsing, we are slightly more lenient in our interpretation than for matching. Here we
// allow "comma" and "semicolon" as possible extension indicators. When matching, these are
// hardly ever used to indicate this.
String singleExtnSymbolsForParsing = ",;" + singleExtnSymbolsForMatching;
EXTN_PATTERNS_FOR_PARSING = createExtnPattern(singleExtnSymbolsForParsing);
EXTN_PATTERNS_FOR_MATCHING = createExtnPattern(singleExtnSymbolsForMatching);
}
/**
* Helper initialiser method to create the regular-expression pattern to match extensions,
* allowing the one-char extension symbols provided by {@code singleExtnSymbols}.
*/
private static String createExtnPattern(String singleExtnSymbols) {
// There are three regular expressions here. The first covers RFC 3966 format, where the
// extension is added using ";ext=". The second more generic one starts with optional white
// space and ends with an optional full stop (.), followed by zero or more spaces/tabs/commas
// and then the numbers themselves. The other one covers the special case of American numbers
// where the extension is written with a hash at the end, such as "- 503#"
// Note that the only capturing groups should be around the digits that you want to capture as
// part of the extension, or else parsing will fail!
// Canonical-equivalence doesn't seem to be an option with Android java, so we allow two options
// for representing the accented o - the character itself, and one in the unicode decomposed
// form with the combining acute accent.
return (RFC3966_EXTN_PREFIX + CAPTURING_EXTN_DIGITS + "|" + "[ \u00A0\\t,]*"
+ "(?:e?xt(?:ensi(?:o\u0301?|\u00F3))?n?|\uFF45?\uFF58\uFF54\uFF4E?|"
+ "\u0434\u043E\u0431|" + "[" + singleExtnSymbols + "]|int|anexo|\uFF49\uFF4E\uFF54)"
+ "[:\\.\uFF0E]?[ \u00A0\\t,-]*" + CAPTURING_EXTN_DIGITS + "#?|"
+ "[- ]+(" + DIGITS + "{1,5})#");
private static final String EXTN_PATTERNS_FOR_PARSING = createExtnPattern(true);
static final String EXTN_PATTERNS_FOR_MATCHING = createExtnPattern(false);
/**
* Helper method for constructing regular expressions for parsing. Creates an expression that
* captures up to maxLength digits.
*/
private static String extnDigits(int maxLength) {
return "(" + DIGITS + "{1," + maxLength + "})";
}
/**
* Helper initialiser method to create the regular-expression pattern to match extensions.
* Note that there are currently six capturing groups for the extension itself. If this number is
* changed, MaybeStripExtension needs to be updated.
*/
private static String createExtnPattern(boolean forParsing) {
// We cap the maximum length of an extension based on the ambiguity of the way the extension is
// prefixed. As per ITU, the officially allowed length for extensions is actually 40, but we
// don't support this since we haven't seen real examples and this introduces many false
// interpretations as the extension labels are not standardized.
int extLimitAfterExplicitLabel = 20;
int extLimitAfterLikelyLabel = 15;
int extLimitAfterAmbiguousChar = 9;
int extLimitWhenNotSure = 6;
String possibleSeparatorsBetweenNumberAndExtLabel = "[ \u00A0\\t,]*";
// Optional full stop (.) or colon, followed by zero or more spaces/tabs/commas.
String possibleCharsAfterExtLabel = "[:\\.\uFF0E]?[ \u00A0\\t,-]*";
String optionalExtnSuffix = "#?";
// Here the extension is called out in more explicit way, i.e mentioning it obvious patterns
// like "ext.". Canonical-equivalence doesn't seem to be an option with Android java, so we
// allow two options for representing the accented o - the character itself, and one in the
// unicode decomposed form with the combining acute accent.
String explicitExtLabels =
"(?:e?xt(?:ensi(?:o\u0301?|\u00F3))?n?|\uFF45?\uFF58\uFF54\uFF4E?|\u0434\u043E\u0431|anexo)";
// One-character symbols that can be used to indicate an extension, and less commonly used
// or more ambiguous extension labels.
String ambiguousExtLabels = "(?:[x\uFF58#\uFF03~\uFF5E]|int|\uFF49\uFF4E\uFF54)";
// When extension is not separated clearly.
String ambiguousSeparator = "[- ]+";
String rfcExtn = RFC3966_EXTN_PREFIX + extnDigits(extLimitAfterExplicitLabel);
String explicitExtn = possibleSeparatorsBetweenNumberAndExtLabel + explicitExtLabels
+ possibleCharsAfterExtLabel + extnDigits(extLimitAfterExplicitLabel)
+ optionalExtnSuffix;
String ambiguousExtn = possibleSeparatorsBetweenNumberAndExtLabel + ambiguousExtLabels
+ possibleCharsAfterExtLabel + extnDigits(extLimitAfterAmbiguousChar) + optionalExtnSuffix;
String americanStyleExtnWithSuffix = ambiguousSeparator + extnDigits(extLimitWhenNotSure) + "#";
// The first regular expression covers RFC 3966 format, where the extension is added using
// ";ext=". The second more generic where extension is mentioned with explicit labels like
// "ext:". In both the above cases we allow more numbers in extension than any other extension
// labels. The third one captures when single character extension labels or less commonly used
// labels are used. In such cases we capture fewer extension digits in order to reduce the
// chance of falsely interpreting two numbers beside each other as a number + extension. The
// fourth one covers the special case of American numbers where the extension is written with a
// hash at the end, such as "- 503#".
String extensionPattern =
rfcExtn + "|"
+ explicitExtn + "|"
+ ambiguousExtn + "|"
+ americanStyleExtnWithSuffix;
// Additional pattern that is supported when parsing extensions, not when matching.
if (forParsing) {
// This is same as possibleSeparatorsBetweenNumberAndExtLabel, but not matching comma as
// extension label may have it.
String possibleSeparatorsNumberExtLabelNoComma = "[ \u00A0\\t]*";
// ",," is commonly used for auto dialling the extension when connected. First comma is matched
// through possibleSeparatorsBetweenNumberAndExtLabel, so we do not repeat it here. Semi-colon
// works in Iphone and Android also to pop up a button with the extension number following.
String autoDiallingAndExtLabelsFound = "(?:,{2}|;)";
String autoDiallingExtn = possibleSeparatorsNumberExtLabelNoComma
+ autoDiallingAndExtLabelsFound + possibleCharsAfterExtLabel
+ extnDigits(extLimitAfterLikelyLabel) + optionalExtnSuffix;
String onlyCommasExtn = possibleSeparatorsNumberExtLabelNoComma
+ "(?:,)+" + possibleCharsAfterExtLabel + extnDigits(extLimitAfterAmbiguousChar)
+ optionalExtnSuffix;
// Here the first pattern is exclusively for extension autodialling formats which are used
// when dialling and in this case we accept longer extensions. However, the second pattern
// is more liberal on the number of commas that acts as extension labels, so we have a strict
// cap on the number of digits in such extensions.
return extensionPattern + "|"
+ autoDiallingExtn + "|"
+ onlyCommasExtn;
}
return extensionPattern;
}
// Regexp of all known extension prefixes used by different regions followed by 1 or more valid


+ 131
- 0
java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java View File

@ -2677,6 +2677,137 @@ public class PhoneNumberUtilTest extends TestMetadataTestCase {
// Retry with the same number in a slightly different format.
assertEquals(usWithExtension, phoneUtil.parse("+1 (645) 123 1234 ext. 910#", RegionCode.US));
}
public void testParseHandlesLongExtensionsWithExplicitLabels() throws Exception {
// Test lower and upper limits of extension lengths for each type of label.
PhoneNumber nzNumber = new PhoneNumber();
nzNumber.setCountryCode(64).setNationalNumber(33316005L);
// Firstly, when in RFC format: PhoneNumberUtil.extLimitAfterExplicitLabel
nzNumber.setExtension("0");
assertEquals(nzNumber, phoneUtil.parse("tel:+6433316005;ext=0", RegionCode.NZ));
nzNumber.setExtension("01234567890123456789");
assertEquals(
nzNumber, phoneUtil.parse("tel:+6433316005;ext=01234567890123456789", RegionCode.NZ));
// Extension too long.
try {
phoneUtil.parse("tel:+6433316005;ext=012345678901234567890", RegionCode.NZ);
fail(
"This should not parse as length of extension is higher than allowed: "
+ "tel:+6433316005;ext=012345678901234567890");
} catch (NumberParseException e) {
// Expected this exception.
assertEquals(
"Wrong error type stored in exception.",
NumberParseException.ErrorType.NOT_A_NUMBER,
e.getErrorType());
}
// Explicit extension label: PhoneNumberUtil.extLimitAfterExplicitLabel
nzNumber.setExtension("1");
assertEquals(nzNumber, phoneUtil.parse("03 3316005ext:1", RegionCode.NZ));
nzNumber.setExtension("12345678901234567890");
assertEquals(nzNumber, phoneUtil.parse("03 3316005 xtn:12345678901234567890", RegionCode.NZ));
assertEquals(
nzNumber, phoneUtil.parse("03 3316005 extension\t12345678901234567890", RegionCode.NZ));
assertEquals(
nzNumber, phoneUtil.parse("03 3316005 xtensio:12345678901234567890", RegionCode.NZ));
assertEquals(
nzNumber, phoneUtil.parse("03 3316005 xtensi\u00F3n, 12345678901234567890#", RegionCode.NZ));
assertEquals(
nzNumber, phoneUtil.parse("03 3316005extension.12345678901234567890", RegionCode.NZ));
assertEquals(nzNumber, phoneUtil.parse("03 3316005 \u0434\u043E\u0431:12345678901234567890", RegionCode.NZ));
// Extension too long.
try {
phoneUtil.parse("03 3316005 extension 123456789012345678901", RegionCode.NZ);
fail(
"This should not parse as length of extension is higher than allowed: "
+ "03 3316005 extension 123456789012345678901");
} catch (NumberParseException e) {
// Expected this exception.
assertEquals(
"Wrong error type stored in exception.",
NumberParseException.ErrorType.TOO_LONG,
e.getErrorType());
}
}
public void testParseHandlesLongExtensionsWithAutoDiallingLabels() throws Exception {
// Secondly, cases of auto-dialling and other standard extension labels,
// PhoneNumberUtil.extLimitAfterLikelyLabel
PhoneNumber usNumberUserInput = new PhoneNumber();
usNumberUserInput.setCountryCode(1).setNationalNumber(2679000000L);
usNumberUserInput.setExtension("123456789012345");
assertEquals(
usNumberUserInput, phoneUtil.parse("+12679000000,,123456789012345#", RegionCode.US));
assertEquals(
usNumberUserInput, phoneUtil.parse("+12679000000;123456789012345#", RegionCode.US));
PhoneNumber ukNumberUserInput = new PhoneNumber();
ukNumberUserInput.setCountryCode(44).setNationalNumber(2034000000L).setExtension("123456789");
assertEquals(ukNumberUserInput, phoneUtil.parse("+442034000000,,123456789#", RegionCode.GB));
// Extension too long.
try {
phoneUtil.parse("+12679000000,,1234567890123456#", RegionCode.US);
fail(
"This should not parse as length of extension is higher than allowed: "
+ "+12679000000,,1234567890123456#");
} catch (NumberParseException e) {
// Expected this exception.
assertEquals(
"Wrong error type stored in exception.",
NumberParseException.ErrorType.NOT_A_NUMBER,
e.getErrorType());
}
}
public void testParseHandlesShortExtensionsWithAmbiguousChar() throws Exception {
PhoneNumber nzNumber = new PhoneNumber();
nzNumber.setCountryCode(64).setNationalNumber(33316005L);
// Thirdly, for single and non-standard cases:
// PhoneNumberUtil.extLimitAfterAmbiguousChar
nzNumber.setExtension("123456789");
assertEquals(nzNumber, phoneUtil.parse("03 3316005 x 123456789", RegionCode.NZ));
assertEquals(nzNumber, phoneUtil.parse("03 3316005 x. 123456789", RegionCode.NZ));
assertEquals(nzNumber, phoneUtil.parse("03 3316005 #123456789#", RegionCode.NZ));
assertEquals(nzNumber, phoneUtil.parse("03 3316005 ~ 123456789", RegionCode.NZ));
// Extension too long.
try {
phoneUtil.parse("03 3316005 ~ 1234567890", RegionCode.NZ);
fail(
"This should not parse as length of extension is higher than allowed: "
+ "03 3316005 ~ 1234567890");
} catch (NumberParseException e) {
// Expected this exception.
assertEquals(
"Wrong error type stored in exception.",
NumberParseException.ErrorType.TOO_LONG,
e.getErrorType());
}
}
public void testParseHandlesShortExtensionsWhenNotSureOfLabel() throws Exception {
// Lastly, when no explicit extension label present, but denoted by tailing #:
// PhoneNumberUtil.extLimitWhenNotSure
PhoneNumber usNumber = new PhoneNumber();
usNumber.setCountryCode(1).setNationalNumber(1234567890L).setExtension("666666");
assertEquals(usNumber, phoneUtil.parse("+1123-456-7890 666666#", RegionCode.US));
usNumber.setExtension("6");
assertEquals(usNumber, phoneUtil.parse("+11234567890-6#", RegionCode.US));
// Extension too long.
try {
phoneUtil.parse("+1123-456-7890 7777777#", RegionCode.US);
fail(
"This should not parse as length of extension is higher than allowed: "
+ "+1123-456-7890 7777777#");
} catch (NumberParseException e) {
// Expected this exception.
assertEquals(
"Wrong error type stored in exception.",
NumberParseException.ErrorType.NOT_A_NUMBER,
e.getErrorType());
}
}
public void testParseAndKeepRaw() throws Exception {
PhoneNumber alphaNumericNumber = new PhoneNumber().mergeFrom(ALPHA_NUMERIC_NUMBER).


Loading…
Cancel
Save