diff --git a/cpp/src/phonenumbers/phonenumberutil.cc b/cpp/src/phonenumbers/phonenumberutil.cc index 6c17861c8..e3b83bf97 100644 --- a/cpp/src/phonenumbers/phonenumberutil.cc +++ b/cpp/src/phonenumbers/phonenumberutil.cc @@ -118,6 +118,15 @@ const char kDefaultExtnPrefix[] = " ext. "; const char kSingleExtnSymbolsForMatching[] = "x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E"; +const char kPossibleSeparatorsBetweenNumberAndExtLabel[] = + "[ \xC2\xA0\\t,]*"; + +// Optional full stop (.) or colon, followed by zero or more +// spaces/tabs/commas. +const char kPossibleCharsAfterExtLabel[] = + "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*"; +const char kOptionalExtSuffix[] = "#?"; + bool LoadCompiledInMetadata(PhoneMetadataCollection* metadata) { if (!metadata->ParseFromArray(metadata_get(), metadata_size())) { LOG(ERROR) << "Could not parse binary data."; @@ -202,35 +211,109 @@ char32 ToUnicodeCodepoint(const char* unicode_char) { return codepoint; } +// Helper method for constructing regular expressions for parsing. Creates an +// expression that captures up to max_length digits. +std::string ExtnDigits(int max_length) { + return StrCat("([", kDigits, "]{1,", max_length, "})"); +} + // Helper initialiser method to create the regular-expression pattern to match -// extensions, allowing the one-codepoint extension symbols provided by -// single_extn_symbols. -// Note that there are currently three capturing groups for the extension itself -// - if this number is changed, MaybeStripExtension needs to be updated. -string CreateExtnPattern(const string& single_extn_symbols) { - static const string capturing_extn_digits = StrCat("([", kDigits, "]{1,7})"); - // The first regular expression covers RFC 3966 format, where the extension is - // added using ";ext=". The second more generic one starts with optional white - // space and ends with an optional full stop (.), followed by zero or more - // spaces/tabs/commas and then the numbers themselves. The third one covers - // the special case of American numbers where the extension is written with a - // hash at the end, such as "- 503#". - // Note that the only capturing groups should be around the digits that you - // want to capture as part of the extension, or else parsing will fail! +// extensions. Note that: +// - There are currently six capturing groups for the extension itself. If this +// number is changed, MaybeStripExtension needs to be updated. +// - The only capturing groups should be around the digits that you want to +// capture as part of the extension, or else parsing will fail! +std::string CreateExtnPattern(bool for_parsing) { + // We cap the maximum length of an extension based on the ambiguity of the + // way the extension is prefixed. As per ITU, the officially allowed + // length for extensions is actually 40, but we don't support this since we + // haven't seen real examples and this introduces many false interpretations + // as the extension labels are not standardized. + int ext_limit_after_explicit_label = 20; + int ext_limit_after_likely_label = 15; + int ext_limit_after_ambiguous_char = 9; + int ext_limit_when_not_sure = 6; + // Canonical-equivalence doesn't seem to be an option with RE2, so we allow - // two options for representing the ó - the character itself, and one in the - // unicode decomposed form with the combining acute accent. - return (StrCat( - kRfc3966ExtnPrefix, capturing_extn_digits, "|" - /* "[  \\t,]*(?:e?xt(?:ensi(?:ó?|ó))?n?|e?xtn?|single_extn_symbols|" - "int|int|anexo)" - "[:\\..]?[  \\t,-]*", capturing_extn_digits, "#?|" */ - "[ \xC2\xA0\\t,]*(?:e?xt(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|" - "(?:\xEF\xBD\x85)?\xEF\xBD\x98\xEF\xBD\x94(?:\xEF\xBD\x8E)?|" - "\xD0\xB4\xD0\xBE\xD0\xB1|[", single_extn_symbols, "]|int|" - "\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94|anexo)" - "[:\\.\xEF\xBC\x8E]?[ \xC2\xA0\\t,-]*", capturing_extn_digits, - "#?|[- ]+([", kDigits, "]{1,5})#")); + // two options for representing any non-ASCII character like ó - the character + // itself, and one in the unicode decomposed form with the combining acute + // accent. + + // Here the extension is called out in a more explicit way, i.e mentioning it + // obvious patterns like "ext.". + string explicit_ext_labels = + "(?:e?xt(?:ensi(?:o\xCC\x81?|\xC3\xB3))?n?|(?:\xEF\xBD\x85)?" + "\xEF\xBD\x98\xEF\xBD\x94(?:\xEF\xBD\x8E)?|\xD0\xB4\xD0\xBE\xD0\xB1|" + "anexo)"; + // One-character symbols that can be used to indicate an extension, and less + // commonly used or more ambiguous extension labels. + string ambiguous_ext_labels = + "(?:[x\xEF\xBD\x98#\xEF\xBC\x83~\xEF\xBD\x9E]|int|" + "\xEF\xBD\x89\xEF\xBD\x8E\xEF\xBD\x94)"; + // When extension is not separated clearly. + string ambiguous_separator = "[- ]+"; + + string rfc_extn = StrCat(kRfc3966ExtnPrefix, + ExtnDigits(ext_limit_after_explicit_label)); + string explicit_extn = StrCat( + kPossibleSeparatorsBetweenNumberAndExtLabel, + explicit_ext_labels, kPossibleCharsAfterExtLabel, + ExtnDigits(ext_limit_after_explicit_label), + kOptionalExtSuffix); + string ambiguous_extn = StrCat( + kPossibleSeparatorsBetweenNumberAndExtLabel, + ambiguous_ext_labels, kPossibleCharsAfterExtLabel, + ExtnDigits(ext_limit_after_ambiguous_char), + kOptionalExtSuffix); + string american_style_extn_with_suffix = StrCat( + ambiguous_separator, ExtnDigits(ext_limit_when_not_sure), "#"); + + // The first regular expression covers RFC 3966 format, where the extension is + // added using ";ext=". The second more generic where extension is mentioned + // with explicit labels like "ext:". In both the above cases we allow more + // numbers in extension than any other extension labels. The third one + // captures when single character extension labels or less commonly used + // labels are present. In such cases we capture fewer extension digits in + // order to reduce the chance of falsely interpreting two numbers beside each + // other as a number + extension. The fourth one covers the special case of + // American numbers where the extension is written with a hash at the end, + // such as "- 503#". + string extension_pattern = StrCat( + rfc_extn, "|", + explicit_extn, "|", + ambiguous_extn, "|", + american_style_extn_with_suffix); + // Additional pattern that is supported when parsing extensions, not when + // matching. + if (for_parsing) { + // ",," is commonly used for auto dialling the extension when connected. + // Semi-colon works in Iphone and also in Android to pop up a button with + // the extension number following. + string auto_dialling_and_ext_labels_found = "(?:,{2}|;)"; + // This is same as kPossibleSeparatorsBetweenNumberAndExtLabel, but not + // matching comma as extension label may have it. + string possible_separators_number_extLabel_no_comma = "[ \xC2\xA0\\t]*"; + + string auto_dialling_extn = StrCat( + possible_separators_number_extLabel_no_comma, + auto_dialling_and_ext_labels_found, kPossibleCharsAfterExtLabel, + ExtnDigits(ext_limit_after_likely_label), + kOptionalExtSuffix); + string only_commas_extn = StrCat( + possible_separators_number_extLabel_no_comma, + "(?:,)+", kPossibleCharsAfterExtLabel, + ExtnDigits(ext_limit_after_ambiguous_char), + kOptionalExtSuffix); + // Here the first pattern is exclusive for extension autodialling formats + // which are used when dialling and in this case we accept longer + // extensions. However, the second pattern is more liberal on number of + // commas that acts as extension labels, so we have strict cap on number of + // digits in such extensions. + return StrCat(extension_pattern, "|", + auto_dialling_extn, "|", + only_commas_extn); + } + return extension_pattern; } // Normalizes a string of characters representing a phone number by replacing @@ -683,8 +766,7 @@ class PhoneNumberRegExpsAndMappings { PhoneNumberUtil::kValidPunctuation, kStarSign, "]*", kDigits, "){3,}[", PhoneNumberUtil::kValidPunctuation, kStarSign, kValidAlpha, kDigits, "]*")), - extn_patterns_for_parsing_( - CreateExtnPattern(StrCat(",;", kSingleExtnSymbolsForMatching))), + extn_patterns_for_parsing_(CreateExtnPattern(/* for_parsing= */ true)), regexp_factory_(new RegExpFactory()), regexp_cache_(new RegExpCache(*regexp_factory_.get(), 128)), diallable_char_mappings_(), @@ -710,11 +792,10 @@ class PhoneNumberRegExpsAndMappings { PhoneNumberUtil::kCaptureUpToSecondNumberStart)), unwanted_end_char_pattern_( regexp_factory_->CreateRegExp("[^\\p{N}\\p{L}#]")), - separator_pattern_( - regexp_factory_->CreateRegExp( - StrCat("[", PhoneNumberUtil::kValidPunctuation, "]+"))), + separator_pattern_(regexp_factory_->CreateRegExp( + StrCat("[", PhoneNumberUtil::kValidPunctuation, "]+"))), extn_patterns_for_matching_( - CreateExtnPattern(kSingleExtnSymbolsForMatching)), + CreateExtnPattern(/* for_parsing= */ false)), extn_pattern_(regexp_factory_->CreateRegExp( StrCat("(?i)(?:", extn_patterns_for_parsing_, ")$"))), valid_phone_number_pattern_(regexp_factory_->CreateRegExp( @@ -730,9 +811,8 @@ class PhoneNumberRegExpsAndMappings { first_group_capturing_pattern_( regexp_factory_->CreateRegExp("(\\$\\d)")), carrier_code_pattern_(regexp_factory_->CreateRegExp("\\$CC")), - plus_chars_pattern_( - regexp_factory_->CreateRegExp( - StrCat("[", PhoneNumberUtil::kPlusChars, "]+"))) { + plus_chars_pattern_(regexp_factory_->CreateRegExp( + StrCat("[", PhoneNumberUtil::kPlusChars, "]+"))) { InitializeMapsAndSets(); } @@ -2791,28 +2871,33 @@ bool PhoneNumberUtil::MaybeStripNationalPrefixAndCarrierCode( // Strips any extension (as in, the part of the number dialled after the call is // connected, usually indicated with extn, ext, x or similar) from the end of // the number, and returns it. The number passed in should be non-normalized. -bool PhoneNumberUtil::MaybeStripExtension(string* number, string* extension) +bool PhoneNumberUtil::MaybeStripExtension(string* number, std::string* extension) const { DCHECK(number); DCHECK(extension); - // There are three extension capturing groups in the regular expression. + // There are six extension capturing groups in the regular expression. string possible_extension_one; string possible_extension_two; string possible_extension_three; + string possible_extension_four; + string possible_extension_five; + string possible_extension_six; string number_copy(*number); const scoped_ptr number_copy_as_regexp_input( reg_exps_->regexp_factory_->CreateInput(number_copy)); - if (reg_exps_->extn_pattern_->Consume(number_copy_as_regexp_input.get(), - false, - &possible_extension_one, - &possible_extension_two, - &possible_extension_three)) { + if (reg_exps_->extn_pattern_->Consume( + number_copy_as_regexp_input.get(), false, &possible_extension_one, + &possible_extension_two, &possible_extension_three, + &possible_extension_four, &possible_extension_five, + &possible_extension_six)) { // Replace the extensions in the original string here. reg_exps_->extn_pattern_->Replace(&number_copy, ""); // If we find a potential extension, and the number preceding this is a // viable number, we assume it is an extension. if ((!possible_extension_one.empty() || !possible_extension_two.empty() || - !possible_extension_three.empty()) && + !possible_extension_three.empty() || + !possible_extension_four.empty() || !possible_extension_five.empty() || + !possible_extension_six.empty()) && IsViablePhoneNumber(number_copy)) { number->assign(number_copy); if (!possible_extension_one.empty()) { @@ -2821,6 +2906,12 @@ bool PhoneNumberUtil::MaybeStripExtension(string* number, string* extension) extension->assign(possible_extension_two); } else if (!possible_extension_three.empty()) { extension->assign(possible_extension_three); + } else if (!possible_extension_four.empty()) { + extension->assign(possible_extension_four); + } else if (!possible_extension_five.empty()) { + extension->assign(possible_extension_five); + } else if (!possible_extension_six.empty()) { + extension->assign(possible_extension_six); } return true; } diff --git a/cpp/src/phonenumbers/regexp_adapter.h b/cpp/src/phonenumbers/regexp_adapter.h index e6e90831c..72d0f762f 100644 --- a/cpp/src/phonenumbers/regexp_adapter.h +++ b/cpp/src/phonenumbers/regexp_adapter.h @@ -56,14 +56,47 @@ class RegExp { // input_string - string to be searched. // anchor_at_start - if true, match would be successful only if it appears at // the beginning of the tested region of the string. - // matched_string1 - the first string extracted from the match. Can be NULL. - // matched_string2 - the second string extracted from the match. Can be NULL. - // matched_string3 - the third string extracted from the match. Can be NULL. + // matched_string1..6 - string extracted from the match in sequential order. + // Can be NULL. virtual bool Consume(RegExpInput* input_string, bool anchor_at_start, string* matched_string1, string* matched_string2, - string* matched_string3) const = 0; + string* matched_string3, + string* matched_string4, + string* matched_string5, + string* matched_string6) const = 0; + + // Helper methods calling the Consume method that assume the match must start + // at the beginning. + inline bool Consume(RegExpInput* input_string, string* matched_string1, + string* matched_string2, + string* matched_string3, + string* matched_string4, + string* matched_string5, + string* matched_string6) const { + return Consume(input_string, true, matched_string1, matched_string2, + matched_string3, matched_string4, matched_string5, + matched_string6); + } + + inline bool Consume(RegExpInput* input_string, string* matched_string1, + string* matched_string2, + string* matched_string3, + string* matched_string4, + string* matched_string5) const { + return Consume(input_string, true, matched_string1, matched_string2, + matched_string3, matched_string4, matched_string5, NULL); + } + + inline bool Consume(RegExpInput* input_string, string* matched_string1, + string* matched_string2, + string* matched_string3, + string* matched_string4) const { + return Consume(input_string, true, matched_string1, matched_string2, + matched_string3, matched_string4, NULL, NULL); + } + // Helper methods calling the Consume method that assume the match must start // at the beginning. @@ -72,28 +105,31 @@ class RegExp { string* matched_string2, string* matched_string3) const { return Consume(input_string, true, matched_string1, matched_string2, - matched_string3); + matched_string3, NULL, NULL, NULL); } inline bool Consume(RegExpInput* input_string, string* matched_string1, string* matched_string2) const { - return Consume(input_string, true, matched_string1, matched_string2, NULL); + return Consume(input_string, true, matched_string1, matched_string2, NULL, + NULL, NULL, NULL); } inline bool Consume(RegExpInput* input_string, string* matched_string) const { - return Consume(input_string, true, matched_string, NULL, NULL); + return Consume(input_string, true, matched_string, NULL, NULL, NULL, NULL, + NULL); } inline bool Consume(RegExpInput* input_string) const { - return Consume(input_string, true, NULL, NULL, NULL); + return Consume(input_string, true, NULL, NULL, NULL, NULL, NULL, NULL); } // Helper method calling the Consume method that assumes the match can start // at any place in the string. inline bool FindAndConsume(RegExpInput* input_string, string* matched_string) const { - return Consume(input_string, false, matched_string, NULL, NULL); + return Consume(input_string, false, matched_string, NULL, NULL, NULL, NULL, + NULL); } // Matches string to regular expression, returns true if the expression was diff --git a/cpp/src/phonenumbers/regexp_adapter_icu.cc b/cpp/src/phonenumbers/regexp_adapter_icu.cc index ba8215cc9..db91d9420 100644 --- a/cpp/src/phonenumbers/regexp_adapter_icu.cc +++ b/cpp/src/phonenumbers/regexp_adapter_icu.cc @@ -120,7 +120,10 @@ class IcuRegExp : public RegExp { bool anchor_at_start, string* matched_string1, string* matched_string2, - string* matched_string3) const { + string* matched_string3, + string* matched_string4, + string* matched_string5, + string* matched_string6) const { DCHECK(input_string); if (!utf8_regexp_.get()) { return false; @@ -135,9 +138,9 @@ class IcuRegExp : public RegExp { if (!match_succeeded || U_FAILURE(status)) { return false; } - string* const matched_strings[] = { - matched_string1, matched_string2, matched_string3 - }; + string* const matched_strings[] = {matched_string1, matched_string2, + matched_string3, matched_string4, + matched_string5, matched_string6}; // If less matches than expected - fail. for (size_t i = 0; i < arraysize(matched_strings); ++i) { if (matched_strings[i]) { diff --git a/cpp/src/phonenumbers/regexp_adapter_re2.cc b/cpp/src/phonenumbers/regexp_adapter_re2.cc index 3fde29063..773416910 100644 --- a/cpp/src/phonenumbers/regexp_adapter_re2.cc +++ b/cpp/src/phonenumbers/regexp_adapter_re2.cc @@ -60,10 +60,15 @@ bool DispatchRE2Call(Function regex_function, const RE2& regexp, string* out1, string* out2, - string* out3) { - const RE2::Arg outs[] = { out1, out2, out3, }; - const RE2::Arg* const args[] = { &outs[0], &outs[1], &outs[2], }; - const int argc = out3 ? 3 : out2 ? 2 : out1 ? 1 : 0; + string* out3, + string* out4, + string* out5, + string* out6) { + const RE2::Arg outs[] = { out1, out2, out3, out4, out5, out6}; + const RE2::Arg* const args[] = {&outs[0], &outs[1], &outs[2], + &outs[3], &outs[4], &outs[5]}; + const int argc = + out6 ? 6 : out5 ? 5 : out4 ? 4 : out3 ? 3 : out2 ? 2 : out1 ? 1 : 0; return regex_function(input, regexp, args, argc); } @@ -94,7 +99,10 @@ class RE2RegExp : public RegExp { bool anchor_at_start, string* matched_string1, string* matched_string2, - string* matched_string3) const { + string* matched_string3, + string* matched_string4, + string* matched_string5, + string* matched_string6) const { DCHECK(input_string); StringPiece* utf8_input = static_cast(input_string)->Data(); @@ -102,11 +110,13 @@ class RE2RegExp : public RegExp { if (anchor_at_start) { return DispatchRE2Call(RE2::ConsumeN, utf8_input, utf8_regexp_, matched_string1, matched_string2, - matched_string3); + matched_string3, matched_string4, + matched_string5, matched_string6); } else { return DispatchRE2Call(RE2::FindAndConsumeN, utf8_input, utf8_regexp_, matched_string1, matched_string2, - matched_string3); + matched_string3, matched_string4, + matched_string5, matched_string6); } } @@ -115,10 +125,10 @@ class RE2RegExp : public RegExp { string* matched_string) const { if (full_match) { return DispatchRE2Call(RE2::FullMatchN, input_string, utf8_regexp_, - matched_string, NULL, NULL); + matched_string, NULL, NULL, NULL, NULL, NULL); } else { return DispatchRE2Call(RE2::PartialMatchN, input_string, utf8_regexp_, - matched_string, NULL, NULL); + matched_string, NULL, NULL), NULL, NULL, NULL); } } diff --git a/cpp/test/phonenumbers/phonenumberutil_test.cc b/cpp/test/phonenumbers/phonenumberutil_test.cc index ad7e7c26b..a296b0289 100644 --- a/cpp/test/phonenumbers/phonenumberutil_test.cc +++ b/cpp/test/phonenumbers/phonenumberutil_test.cc @@ -4399,6 +4399,169 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) { EXPECT_EQ(us_with_extension, test_number); } +TEST_F(PhoneNumberUtilTest, TestParseHandlesLongExtensionsWithExplicitLabels) { + // Test lower and upper limits of extension lengths for each type of label. + PhoneNumber nz_number; + nz_number.set_country_code(64); + nz_number.set_national_number(33316005ULL); + PhoneNumber test_number; + + // Firstly, when in RFC format: ext_limit_after_explicit_label + nz_number.set_extension("0"); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("tel:+6433316005;ext=0", RegionCode::NZ(), + &test_number)); + EXPECT_EQ(nz_number, test_number); + + nz_number.set_extension("01234567890123456789"); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("tel:+6433316005;ext=01234567890123456789", + RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + // Extension too long. + EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER, + phone_util_.Parse("tel:+6433316005;ext=012345678901234567890", + RegionCode::NZ(), &test_number)); + + // Explicit extension label: ext_limit_after_explicit_label + nz_number.set_extension("1"); + EXPECT_EQ( + PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005ext:1", RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + nz_number.set_extension("12345678901234567890"); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 xtn:12345678901234567890", + RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 extension\t12345678901234567890", + RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 xtensio:12345678901234567890", + RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 xtensión, 12345678901234567890#", + RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005extension.12345678901234567890", + RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 доб:12345678901234567890", + RegionCode::NZ(), &test_number)); + EXPECT_EQ(nz_number, test_number); + + // Extension too long. + EXPECT_EQ(PhoneNumberUtil::TOO_LONG_NSN, + phone_util_.Parse("03 3316005 extension 123456789012345678901", + RegionCode::NZ(), &test_number)); +} + +TEST_F(PhoneNumberUtilTest, + TestParseHandlesLongExtensionsWithAutoDiallingLabels) { + // Secondly, cases of auto-dialling and other standard extension labels: + // ext_limit_after_likely_label + PhoneNumber us_number_user_input; + us_number_user_input.set_country_code(1); + us_number_user_input.set_national_number(2679000000ULL); + PhoneNumber test_number; + us_number_user_input.set_extension("123456789012345"); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("+12679000000,,123456789012345#", + RegionCode::US(), &test_number)); + EXPECT_EQ(us_number_user_input, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("+12679000000;123456789012345#", RegionCode::US(), + &test_number)); + EXPECT_EQ(us_number_user_input, test_number); + + PhoneNumber uk_number_user_input; + uk_number_user_input.set_country_code(44); + uk_number_user_input.set_national_number(2034000000ULL); + uk_number_user_input.set_extension("123456789"); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("+442034000000,,123456789#", RegionCode::GB(), + &test_number)); + + // Extension too long. + EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER, + phone_util_.Parse("+12679000000,,1234567890123456#", + RegionCode::US(), &test_number)); +} + +TEST_F(PhoneNumberUtilTest, TestParseHandlesShortExtensionsWithAmbiguousChar) { + // Thirdly, for single and non-standard cases: ext_limit_after_ambiguous_char + PhoneNumber nz_number; + nz_number.set_country_code(64); + nz_number.set_national_number(33316005ULL); + PhoneNumber test_number; + nz_number.set_extension("123456789");// + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 x 123456789", RegionCode::NZ(), + &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 x. 123456789", RegionCode::NZ(), + &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 #123456789#", RegionCode::NZ(), + &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("03 3316005 ~ 123456789", RegionCode::NZ(), + &test_number)); + EXPECT_EQ(nz_number, test_number); + + EXPECT_EQ(PhoneNumberUtil::TOO_LONG_NSN, + phone_util_.Parse("03 3316005 ~ 1234567890", RegionCode::NZ(), + &test_number)); +} + +TEST_F(PhoneNumberUtilTest, TestParseHandlesShortExtensionsWhenNotSureOfLabel) { + // Thirdly, when no explicit extension label present, but denoted by + // tailing #: ext_limit_when_not_sure + PhoneNumber us_number; + us_number.set_country_code(1); + us_number.set_national_number(1234567890ULL); + PhoneNumber test_number; + us_number.set_extension("666666"); + + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("+1123-456-7890 666666#", RegionCode::US(), + &test_number)); + EXPECT_EQ(us_number, test_number); + + us_number.set_extension("6"); + EXPECT_EQ( + PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse("+11234567890-6#", RegionCode::US(), &test_number)); + EXPECT_EQ(us_number, test_number); + + // Extension too long. + EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER, + phone_util_.Parse("+1123-456-7890 7777777#", RegionCode::US(), + &test_number)); +} + TEST_F(PhoneNumberUtilTest, ParseAndKeepRaw) { PhoneNumber alpha_numeric_number; alpha_numeric_number.set_country_code(1); diff --git a/cpp/test/phonenumbers/regexp_adapter_test.cc b/cpp/test/phonenumbers/regexp_adapter_test.cc index 88a930147..913a11633 100644 --- a/cpp/test/phonenumbers/regexp_adapter_test.cc +++ b/cpp/test/phonenumbers/regexp_adapter_test.cc @@ -47,7 +47,9 @@ struct RegExpTestContext { digits(factory->CreateRegExp("\\d+")), parentheses_digits(factory->CreateRegExp("\\((\\d+)\\)")), single_digit(factory->CreateRegExp("\\d")), - two_digit_groups(factory->CreateRegExp("(\\d+)-(\\d+)")) {} + two_digit_groups(factory->CreateRegExp("(\\d+)-(\\d+)")), + six_digit_groups(factory->CreateRegExp( + "(\\d+)-(\\d+)-(\\d+)-(\\d+)-(\\d+)-(\\d+)")) {} const string name; const scoped_ptr factory; @@ -55,6 +57,7 @@ struct RegExpTestContext { const scoped_ptr parentheses_digits; const scoped_ptr single_digit; const scoped_ptr two_digit_groups; + const scoped_ptr six_digit_groups; }; class RegExpAdapterTest : public testing::Test { @@ -89,18 +92,21 @@ TEST_F(RegExpAdapterTest, TestConsumeNoMatch) { // When 'true' is passed to Consume(), the match occurs from the beginning // of the input. - ASSERT_FALSE(context.digits->Consume(input.get(), true, NULL, NULL, NULL)) - << ErrorMessage(context); + ASSERT_FALSE(context.digits->Consume( + input.get(), true, NULL, NULL, NULL, NULL, NULL, NULL)) + << ErrorMessage(context); ASSERT_EQ("+1-123-456-789", input->ToString()) << ErrorMessage(context); string res1; ASSERT_FALSE(context.parentheses_digits->Consume( - input.get(), true, &res1, NULL, NULL)) << ErrorMessage(context); + input.get(), true, &res1, NULL, NULL, NULL, NULL, NULL)) + << ErrorMessage(context); ASSERT_EQ("+1-123-456-789", input->ToString()) << ErrorMessage(context); ASSERT_EQ("", res1) << ErrorMessage(context); } } + TEST_F(RegExpAdapterTest, TestConsumeWithNull) { for (TestContextIterator it = contexts_.begin(); it != contexts_.end(); ++it) { @@ -109,7 +115,8 @@ TEST_F(RegExpAdapterTest, TestConsumeWithNull) { const scoped_ptr input(factory.CreateInput("+123")); const scoped_ptr plus_sign(factory.CreateRegExp("(\\+)")); - ASSERT_TRUE(plus_sign->Consume(input.get(), true, NULL, NULL, NULL)) + ASSERT_TRUE(plus_sign->Consume(input.get(), true, NULL, NULL, NULL, NULL, + NULL, NULL)) << ErrorMessage(context); ASSERT_EQ("123", input->ToString()) << ErrorMessage(context); } @@ -124,7 +131,8 @@ TEST_F(RegExpAdapterTest, TestConsumeRetainsMatches) { string res1, res2; ASSERT_TRUE(context.two_digit_groups->Consume( - input.get(), true, &res1, &res2, NULL)) << ErrorMessage(context); + input.get(), true, &res1, &res2, NULL, NULL, NULL, NULL)) + << ErrorMessage(context); ASSERT_EQ("-456-789", input->ToString()) << ErrorMessage(context); ASSERT_EQ("1", res1) << ErrorMessage(context); ASSERT_EQ("123", res2) << ErrorMessage(context); @@ -137,27 +145,51 @@ TEST_F(RegExpAdapterTest, TestFindAndConsume) { const RegExpTestContext& context = **it; const scoped_ptr input( context.factory->CreateInput("+1-123-456-789")); + const scoped_ptr input_with_six_digit_groups( + context.factory->CreateInput("111-222-333-444-555-666")); // When 'false' is passed to Consume(), the match can occur from any place // in the input. - ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL)) + ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL, + NULL, NULL, NULL)) << ErrorMessage(context); ASSERT_EQ("-123-456-789", input->ToString()) << ErrorMessage(context); - ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL)) + ASSERT_TRUE(context.digits->Consume(input.get(), false, NULL, NULL, NULL, + NULL, NULL, NULL)) << ErrorMessage(context); ASSERT_EQ("-456-789", input->ToString()) << ErrorMessage(context); ASSERT_FALSE(context.parentheses_digits->Consume( - input.get(), false, NULL, NULL, NULL)) << ErrorMessage(context); + input.get(), false, NULL, NULL, NULL, NULL, NULL, NULL)) + << ErrorMessage(context); ASSERT_EQ("-456-789", input->ToString()) << ErrorMessage(context); string res1, res2; ASSERT_TRUE(context.two_digit_groups->Consume( - input.get(), false, &res1, &res2, NULL)) << ErrorMessage(context); + input.get(), false, &res1, &res2, NULL, NULL, NULL, NULL)) + << ErrorMessage(context); + printf("previous input: %s", input.get()->ToString().c_str()); ASSERT_EQ("", input->ToString()) << ErrorMessage(context); ASSERT_EQ("456", res1) << ErrorMessage(context); ASSERT_EQ("789", res2) << ErrorMessage(context); + + // Testing maximum no of substrings that can be matched presently, six. + string mat1, mat2, res3, res4, res5, res6; + ASSERT_TRUE(context.six_digit_groups->Consume( + input_with_six_digit_groups.get(), false, &mat1, &mat2, &res3, &res4, + &res5, &res6)) + << ErrorMessage(context); + printf("Present input: %s", + input_with_six_digit_groups.get()->ToString().c_str()); + ASSERT_EQ("", input_with_six_digit_groups->ToString()) + << ErrorMessage(context); + ASSERT_EQ("111", mat1) << ErrorMessage(context); + ASSERT_EQ("222", mat2) << ErrorMessage(context); + ASSERT_EQ("333", res3) << ErrorMessage(context); + ASSERT_EQ("444", res4) << ErrorMessage(context); + ASSERT_EQ("555", res5) << ErrorMessage(context); + ASSERT_EQ("666", res6) << ErrorMessage(context); } } diff --git a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java index 56e38870a..d8cb9d378 100644 --- a/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java +++ b/java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java @@ -303,46 +303,98 @@ public class PhoneNumberUtil { // as the default extension prefix. This can be overridden by region-specific preferences. private static final String DEFAULT_EXTN_PREFIX = " ext. "; - // Pattern to capture digits used in an extension. Places a maximum length of "7" for an - // extension. - private static final String CAPTURING_EXTN_DIGITS = "(" + DIGITS + "{1,7})"; // Regexp of all possible ways to write extensions, for use when parsing. This will be run as a // case-insensitive regexp match. Wide character versions are also provided after each ASCII // version. - private static final String EXTN_PATTERNS_FOR_PARSING; - static final String EXTN_PATTERNS_FOR_MATCHING; - static { - // One-character symbols that can be used to indicate an extension. - String singleExtnSymbolsForMatching = "x\uFF58#\uFF03~\uFF5E"; - // For parsing, we are slightly more lenient in our interpretation than for matching. Here we - // allow "comma" and "semicolon" as possible extension indicators. When matching, these are - // hardly ever used to indicate this. - String singleExtnSymbolsForParsing = ",;" + singleExtnSymbolsForMatching; - - EXTN_PATTERNS_FOR_PARSING = createExtnPattern(singleExtnSymbolsForParsing); - EXTN_PATTERNS_FOR_MATCHING = createExtnPattern(singleExtnSymbolsForMatching); - } - - /** - * Helper initialiser method to create the regular-expression pattern to match extensions, - * allowing the one-char extension symbols provided by {@code singleExtnSymbols}. - */ - private static String createExtnPattern(String singleExtnSymbols) { - // There are three regular expressions here. The first covers RFC 3966 format, where the - // extension is added using ";ext=". The second more generic one starts with optional white - // space and ends with an optional full stop (.), followed by zero or more spaces/tabs/commas - // and then the numbers themselves. The other one covers the special case of American numbers - // where the extension is written with a hash at the end, such as "- 503#" - // Note that the only capturing groups should be around the digits that you want to capture as - // part of the extension, or else parsing will fail! - // Canonical-equivalence doesn't seem to be an option with Android java, so we allow two options - // for representing the accented o - the character itself, and one in the unicode decomposed - // form with the combining acute accent. - return (RFC3966_EXTN_PREFIX + CAPTURING_EXTN_DIGITS + "|" + "[ \u00A0\\t,]*" - + "(?:e?xt(?:ensi(?:o\u0301?|\u00F3))?n?|\uFF45?\uFF58\uFF54\uFF4E?|" - + "\u0434\u043E\u0431|" + "[" + singleExtnSymbols + "]|int|anexo|\uFF49\uFF4E\uFF54)" - + "[:\\.\uFF0E]?[ \u00A0\\t,-]*" + CAPTURING_EXTN_DIGITS + "#?|" - + "[- ]+(" + DIGITS + "{1,5})#"); + private static final String EXTN_PATTERNS_FOR_PARSING = createExtnPattern(true); + static final String EXTN_PATTERNS_FOR_MATCHING = createExtnPattern(false); + + /** + * Helper method for constructing regular expressions for parsing. Creates an expression that + * captures up to maxLength digits. + */ + private static String extnDigits(int maxLength) { + return "(" + DIGITS + "{1," + maxLength + "})"; + } + + /** + * Helper initialiser method to create the regular-expression pattern to match extensions. + * Note that there are currently six capturing groups for the extension itself. If this number is + * changed, MaybeStripExtension needs to be updated. + */ + private static String createExtnPattern(boolean forParsing) { + // We cap the maximum length of an extension based on the ambiguity of the way the extension is + // prefixed. As per ITU, the officially allowed length for extensions is actually 40, but we + // don't support this since we haven't seen real examples and this introduces many false + // interpretations as the extension labels are not standardized. + int extLimitAfterExplicitLabel = 20; + int extLimitAfterLikelyLabel = 15; + int extLimitAfterAmbiguousChar = 9; + int extLimitWhenNotSure = 6; + + String possibleSeparatorsBetweenNumberAndExtLabel = "[ \u00A0\\t,]*"; + // Optional full stop (.) or colon, followed by zero or more spaces/tabs/commas. + String possibleCharsAfterExtLabel = "[:\\.\uFF0E]?[ \u00A0\\t,-]*"; + String optionalExtnSuffix = "#?"; + + // Here the extension is called out in more explicit way, i.e mentioning it obvious patterns + // like "ext.". Canonical-equivalence doesn't seem to be an option with Android java, so we + // allow two options for representing the accented o - the character itself, and one in the + // unicode decomposed form with the combining acute accent. + String explicitExtLabels = + "(?:e?xt(?:ensi(?:o\u0301?|\u00F3))?n?|\uFF45?\uFF58\uFF54\uFF4E?|\u0434\u043E\u0431|anexo)"; + // One-character symbols that can be used to indicate an extension, and less commonly used + // or more ambiguous extension labels. + String ambiguousExtLabels = "(?:[x\uFF58#\uFF03~\uFF5E]|int|\uFF49\uFF4E\uFF54)"; + // When extension is not separated clearly. + String ambiguousSeparator = "[- ]+"; + + String rfcExtn = RFC3966_EXTN_PREFIX + extnDigits(extLimitAfterExplicitLabel); + String explicitExtn = possibleSeparatorsBetweenNumberAndExtLabel + explicitExtLabels + + possibleCharsAfterExtLabel + extnDigits(extLimitAfterExplicitLabel) + + optionalExtnSuffix; + String ambiguousExtn = possibleSeparatorsBetweenNumberAndExtLabel + ambiguousExtLabels + + possibleCharsAfterExtLabel + extnDigits(extLimitAfterAmbiguousChar) + optionalExtnSuffix; + String americanStyleExtnWithSuffix = ambiguousSeparator + extnDigits(extLimitWhenNotSure) + "#"; + + // The first regular expression covers RFC 3966 format, where the extension is added using + // ";ext=". The second more generic where extension is mentioned with explicit labels like + // "ext:". In both the above cases we allow more numbers in extension than any other extension + // labels. The third one captures when single character extension labels or less commonly used + // labels are used. In such cases we capture fewer extension digits in order to reduce the + // chance of falsely interpreting two numbers beside each other as a number + extension. The + // fourth one covers the special case of American numbers where the extension is written with a + // hash at the end, such as "- 503#". + String extensionPattern = + rfcExtn + "|" + + explicitExtn + "|" + + ambiguousExtn + "|" + + americanStyleExtnWithSuffix; + // Additional pattern that is supported when parsing extensions, not when matching. + if (forParsing) { + // This is same as possibleSeparatorsBetweenNumberAndExtLabel, but not matching comma as + // extension label may have it. + String possibleSeparatorsNumberExtLabelNoComma = "[ \u00A0\\t]*"; + // ",," is commonly used for auto dialling the extension when connected. First comma is matched + // through possibleSeparatorsBetweenNumberAndExtLabel, so we do not repeat it here. Semi-colon + // works in Iphone and Android also to pop up a button with the extension number following. + String autoDiallingAndExtLabelsFound = "(?:,{2}|;)"; + + String autoDiallingExtn = possibleSeparatorsNumberExtLabelNoComma + + autoDiallingAndExtLabelsFound + possibleCharsAfterExtLabel + + extnDigits(extLimitAfterLikelyLabel) + optionalExtnSuffix; + String onlyCommasExtn = possibleSeparatorsNumberExtLabelNoComma + + "(?:,)+" + possibleCharsAfterExtLabel + extnDigits(extLimitAfterAmbiguousChar) + + optionalExtnSuffix; + // Here the first pattern is exclusively for extension autodialling formats which are used + // when dialling and in this case we accept longer extensions. However, the second pattern + // is more liberal on the number of commas that acts as extension labels, so we have a strict + // cap on the number of digits in such extensions. + return extensionPattern + "|" + + autoDiallingExtn + "|" + + onlyCommasExtn; + } + return extensionPattern; } // Regexp of all known extension prefixes used by different regions followed by 1 or more valid diff --git a/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java b/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java index f3cd078c4..f219fdf25 100644 --- a/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java +++ b/java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java @@ -2677,6 +2677,137 @@ public class PhoneNumberUtilTest extends TestMetadataTestCase { // Retry with the same number in a slightly different format. assertEquals(usWithExtension, phoneUtil.parse("+1 (645) 123 1234 ext. 910#", RegionCode.US)); } + + public void testParseHandlesLongExtensionsWithExplicitLabels() throws Exception { + // Test lower and upper limits of extension lengths for each type of label. + PhoneNumber nzNumber = new PhoneNumber(); + nzNumber.setCountryCode(64).setNationalNumber(33316005L); + + // Firstly, when in RFC format: PhoneNumberUtil.extLimitAfterExplicitLabel + nzNumber.setExtension("0"); + assertEquals(nzNumber, phoneUtil.parse("tel:+6433316005;ext=0", RegionCode.NZ)); + nzNumber.setExtension("01234567890123456789"); + assertEquals( + nzNumber, phoneUtil.parse("tel:+6433316005;ext=01234567890123456789", RegionCode.NZ)); + // Extension too long. + try { + phoneUtil.parse("tel:+6433316005;ext=012345678901234567890", RegionCode.NZ); + fail( + "This should not parse as length of extension is higher than allowed: " + + "tel:+6433316005;ext=012345678901234567890"); + } catch (NumberParseException e) { + // Expected this exception. + assertEquals( + "Wrong error type stored in exception.", + NumberParseException.ErrorType.NOT_A_NUMBER, + e.getErrorType()); + } + + // Explicit extension label: PhoneNumberUtil.extLimitAfterExplicitLabel + nzNumber.setExtension("1"); + assertEquals(nzNumber, phoneUtil.parse("03 3316005ext:1", RegionCode.NZ)); + nzNumber.setExtension("12345678901234567890"); + assertEquals(nzNumber, phoneUtil.parse("03 3316005 xtn:12345678901234567890", RegionCode.NZ)); + assertEquals( + nzNumber, phoneUtil.parse("03 3316005 extension\t12345678901234567890", RegionCode.NZ)); + assertEquals( + nzNumber, phoneUtil.parse("03 3316005 xtensio:12345678901234567890", RegionCode.NZ)); + assertEquals( + nzNumber, phoneUtil.parse("03 3316005 xtensi\u00F3n, 12345678901234567890#", RegionCode.NZ)); + assertEquals( + nzNumber, phoneUtil.parse("03 3316005extension.12345678901234567890", RegionCode.NZ)); + assertEquals(nzNumber, phoneUtil.parse("03 3316005 \u0434\u043E\u0431:12345678901234567890", RegionCode.NZ)); + // Extension too long. + try { + phoneUtil.parse("03 3316005 extension 123456789012345678901", RegionCode.NZ); + fail( + "This should not parse as length of extension is higher than allowed: " + + "03 3316005 extension 123456789012345678901"); + } catch (NumberParseException e) { + // Expected this exception. + assertEquals( + "Wrong error type stored in exception.", + NumberParseException.ErrorType.TOO_LONG, + e.getErrorType()); + } + } + + public void testParseHandlesLongExtensionsWithAutoDiallingLabels() throws Exception { + // Secondly, cases of auto-dialling and other standard extension labels, + // PhoneNumberUtil.extLimitAfterLikelyLabel + PhoneNumber usNumberUserInput = new PhoneNumber(); + usNumberUserInput.setCountryCode(1).setNationalNumber(2679000000L); + usNumberUserInput.setExtension("123456789012345"); + assertEquals( + usNumberUserInput, phoneUtil.parse("+12679000000,,123456789012345#", RegionCode.US)); + assertEquals( + usNumberUserInput, phoneUtil.parse("+12679000000;123456789012345#", RegionCode.US)); + PhoneNumber ukNumberUserInput = new PhoneNumber(); + ukNumberUserInput.setCountryCode(44).setNationalNumber(2034000000L).setExtension("123456789"); + assertEquals(ukNumberUserInput, phoneUtil.parse("+442034000000,,123456789#", RegionCode.GB)); + // Extension too long. + try { + phoneUtil.parse("+12679000000,,1234567890123456#", RegionCode.US); + fail( + "This should not parse as length of extension is higher than allowed: " + + "+12679000000,,1234567890123456#"); + } catch (NumberParseException e) { + // Expected this exception. + assertEquals( + "Wrong error type stored in exception.", + NumberParseException.ErrorType.NOT_A_NUMBER, + e.getErrorType()); + } + } + + public void testParseHandlesShortExtensionsWithAmbiguousChar() throws Exception { + PhoneNumber nzNumber = new PhoneNumber(); + nzNumber.setCountryCode(64).setNationalNumber(33316005L); + + // Thirdly, for single and non-standard cases: + // PhoneNumberUtil.extLimitAfterAmbiguousChar + nzNumber.setExtension("123456789"); + assertEquals(nzNumber, phoneUtil.parse("03 3316005 x 123456789", RegionCode.NZ)); + assertEquals(nzNumber, phoneUtil.parse("03 3316005 x. 123456789", RegionCode.NZ)); + assertEquals(nzNumber, phoneUtil.parse("03 3316005 #123456789#", RegionCode.NZ)); + assertEquals(nzNumber, phoneUtil.parse("03 3316005 ~ 123456789", RegionCode.NZ)); + // Extension too long. + try { + phoneUtil.parse("03 3316005 ~ 1234567890", RegionCode.NZ); + fail( + "This should not parse as length of extension is higher than allowed: " + + "03 3316005 ~ 1234567890"); + } catch (NumberParseException e) { + // Expected this exception. + assertEquals( + "Wrong error type stored in exception.", + NumberParseException.ErrorType.TOO_LONG, + e.getErrorType()); + } + } + + public void testParseHandlesShortExtensionsWhenNotSureOfLabel() throws Exception { + // Lastly, when no explicit extension label present, but denoted by tailing #: + // PhoneNumberUtil.extLimitWhenNotSure + PhoneNumber usNumber = new PhoneNumber(); + usNumber.setCountryCode(1).setNationalNumber(1234567890L).setExtension("666666"); + assertEquals(usNumber, phoneUtil.parse("+1123-456-7890 666666#", RegionCode.US)); + usNumber.setExtension("6"); + assertEquals(usNumber, phoneUtil.parse("+11234567890-6#", RegionCode.US)); + // Extension too long. + try { + phoneUtil.parse("+1123-456-7890 7777777#", RegionCode.US); + fail( + "This should not parse as length of extension is higher than allowed: " + + "+1123-456-7890 7777777#"); + } catch (NumberParseException e) { + // Expected this exception. + assertEquals( + "Wrong error type stored in exception.", + NumberParseException.ErrorType.NOT_A_NUMBER, + e.getErrorType()); + } + } public void testParseAndKeepRaw() throws Exception { PhoneNumber alphaNumericNumber = new PhoneNumber().mergeFrom(ALPHA_NUMERIC_NUMBER).