Browse Source

Support semicolon as extension character while parsing phone numbers. (#1458)

* Support semicolon as extension character while parsing

* Add notes to pending_code_changes.txt

* JS port: Support semicolon as extension character while parsing

* Update comments in phonenumberutil.js
pull/1455/merge
penmetsaa 9 years ago
committed by GitHub
parent
commit
6347995ee6
11 changed files with 55 additions and 20 deletions
  1. +6
    -6
      cpp/src/phonenumbers/phonenumberutil.cc
  2. +4
    -0
      cpp/test/phonenumbers/phonenumbermatcher_test.cc
  3. +12
    -0
      cpp/test/phonenumbers/phonenumberutil_test.cc
  4. BIN
      java/carrier/src/com/google/i18n/phonenumbers/carrier/data/config
  5. BIN
      java/geocoder/src/com/google/i18n/phonenumbers/geocoding/data/config
  6. +6
    -6
      java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java
  7. +3
    -0
      java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java
  8. +5
    -0
      java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java
  9. +2
    -0
      java/pending_code_changes.txt
  10. +9
    -8
      javascript/i18n/phonenumbers/phonenumberutil.js
  11. +8
    -0
      javascript/i18n/phonenumbers/phonenumberutil_test.js

+ 6
- 6
cpp/src/phonenumbers/phonenumberutil.cc View File

@ -208,9 +208,9 @@ string CreateExtnPattern(const string& single_extn_symbols) {
// The first regular expression covers RFC 3966 format, where the extension is // The first regular expression covers RFC 3966 format, where the extension is
// added using ";ext=". The second more generic one starts with optional white // added using ";ext=". The second more generic one starts with optional white
// space and ends with an optional full stop (.), followed by zero or more // space and ends with an optional full stop (.), followed by zero or more
// spaces/tabs and then the numbers themselves. The third one covers the
// special case of American numbers where the extension is written with a hash
// at the end, such as "- 503#".
// spaces/tabs/commas and then the numbers themselves. The third one covers
// the special case of American numbers where the extension is written with a
// hash at the end, such as "- 503#".
// Note that the only capturing groups should be around the digits that you // Note that the only capturing groups should be around the digits that you
// want to capture as part of the extension, or else parsing will fail! // want to capture as part of the extension, or else parsing will fail!
// Canonical-equivalence doesn't seem to be an option with RE2, so we allow // Canonical-equivalence doesn't seem to be an option with RE2, so we allow
@ -451,8 +451,8 @@ class PhoneNumberRegExpsAndMappings {
// will be run as a case-insensitive regexp match. Wide character versions are // will be run as a case-insensitive regexp match. Wide character versions are
// also provided after each ASCII version. // also provided after each ASCII version.
// For parsing, we are slightly more lenient in our interpretation than for // For parsing, we are slightly more lenient in our interpretation than for
// matching. Here we allow a "comma" as a possible extension indicator. When
// matching, this is hardly ever used to indicate this.
// matching. Here we allow "comma" and "semicolon" as possible extension
// indicators. When matching, these are hardly ever used to indicate this.
const string extn_patterns_for_parsing_; const string extn_patterns_for_parsing_;
public: public:
@ -570,7 +570,7 @@ class PhoneNumberRegExpsAndMappings {
punctuation_and_star_sign_, kDigits, punctuation_and_star_sign_, kDigits,
"]*")), "]*")),
extn_patterns_for_parsing_( extn_patterns_for_parsing_(
CreateExtnPattern(StrCat(",", kSingleExtnSymbolsForMatching))),
CreateExtnPattern(StrCat(",;", kSingleExtnSymbolsForMatching))),
regexp_factory_(new RegExpFactory()), regexp_factory_(new RegExpFactory()),
regexp_cache_(new RegExpCache(*regexp_factory_.get(), 128)), regexp_cache_(new RegExpCache(*regexp_factory_.get(), 128)),
diallable_char_mappings_(), diallable_char_mappings_(),


+ 4
- 0
cpp/test/phonenumbers/phonenumbermatcher_test.cc View File

@ -253,6 +253,10 @@ class PhoneNumberMatcherTest : public testing::Test {
// With trailing numbers after a comma. The 45 should not be considered an // With trailing numbers after a comma. The 45 should not be considered an
// extension. // extension.
context_pairs.push_back(NumberContext("", ", 45 days a year")); context_pairs.push_back(NumberContext("", ", 45 days a year"));
// When matching, a semicolon followed by a legitimate extension symbol is not
// taken to indicate an extension. The 7246433 should not be considered an
// extension.
context_pairs.push_back(NumberContext("", ";x 7246433"));
// With a postfix stripped off as it looks like the start of another number. // With a postfix stripped off as it looks like the start of another number.
context_pairs.push_back(NumberContext("Call ", "/x12 more")); context_pairs.push_back(NumberContext("Call ", "/x12 more"));


+ 12
- 0
cpp/test/phonenumbers/phonenumberutil_test.cc View File

@ -2824,6 +2824,9 @@ TEST_F(PhoneNumberUtilTest, IsNumberMatchMatches) {
EXPECT_EQ(PhoneNumberUtil::EXACT_MATCH, EXPECT_EQ(PhoneNumberUtil::EXACT_MATCH,
phone_util_.IsNumberMatchWithTwoStrings("+64 3 331-6005 extn 1234", phone_util_.IsNumberMatchWithTwoStrings("+64 3 331-6005 extn 1234",
"+6433316005#1234")); "+6433316005#1234"));
EXPECT_EQ(PhoneNumberUtil::EXACT_MATCH,
phone_util_.IsNumberMatchWithTwoStrings("+64 3 331-6005 extn 1234",
"+6433316005;1234"));
// Test proto buffers. // Test proto buffers.
PhoneNumber nz_number; PhoneNumber nz_number;
nz_number.set_country_code(64); nz_number.set_country_code(64);
@ -3792,6 +3795,15 @@ TEST_F(PhoneNumberUtilTest, ParseExtensions) {
phone_util_.Parse("(800) 901-3355 , ext 7246433", RegionCode::US(), phone_util_.Parse("(800) 901-3355 , ext 7246433", RegionCode::US(),
&test_number)); &test_number));
EXPECT_EQ(us_with_extension, test_number); EXPECT_EQ(us_with_extension, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("(800) 901-3355 ; 7246433", RegionCode::US(),
&test_number));
EXPECT_EQ(us_with_extension, test_number);
// To test an extension character without surrounding spaces.
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("(800) 901-3355;7246433", RegionCode::US(),
&test_number));
EXPECT_EQ(us_with_extension, test_number);
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse("(800) 901-3355 ,extension 7246433", phone_util_.Parse("(800) 901-3355 ,extension 7246433",
RegionCode::US(), RegionCode::US(),


BIN
java/carrier/src/com/google/i18n/phonenumbers/carrier/data/config View File


BIN
java/geocoder/src/com/google/i18n/phonenumbers/geocoding/data/config View File


+ 6
- 6
java/libphonenumber/src/com/google/i18n/phonenumbers/PhoneNumberUtil.java View File

@ -313,9 +313,9 @@ public class PhoneNumberUtil {
// One-character symbols that can be used to indicate an extension. // One-character symbols that can be used to indicate an extension.
String singleExtnSymbolsForMatching = "x\uFF58#\uFF03~\uFF5E"; String singleExtnSymbolsForMatching = "x\uFF58#\uFF03~\uFF5E";
// For parsing, we are slightly more lenient in our interpretation than for matching. Here we // For parsing, we are slightly more lenient in our interpretation than for matching. Here we
// allow a "comma" as a possible extension indicator. When matching, this is hardly ever used to
// indicate this.
String singleExtnSymbolsForParsing = "," + singleExtnSymbolsForMatching;
// allow "comma" and "semicolon" as possible extension indicators. When matching, these are
// hardly ever used to indicate this.
String singleExtnSymbolsForParsing = ",;" + singleExtnSymbolsForMatching;
EXTN_PATTERNS_FOR_PARSING = createExtnPattern(singleExtnSymbolsForParsing); EXTN_PATTERNS_FOR_PARSING = createExtnPattern(singleExtnSymbolsForParsing);
EXTN_PATTERNS_FOR_MATCHING = createExtnPattern(singleExtnSymbolsForMatching); EXTN_PATTERNS_FOR_MATCHING = createExtnPattern(singleExtnSymbolsForMatching);
@ -328,9 +328,9 @@ public class PhoneNumberUtil {
private static String createExtnPattern(String singleExtnSymbols) { private static String createExtnPattern(String singleExtnSymbols) {
// There are three regular expressions here. The first covers RFC 3966 format, where the // There are three regular expressions here. The first covers RFC 3966 format, where the
// extension is added using ";ext=". The second more generic one starts with optional white // extension is added using ";ext=". The second more generic one starts with optional white
// space and ends with an optional full stop (.), followed by zero or more spaces/tabs and then
// the numbers themselves. The other one covers the special case of American numbers where the
// extension is written with a hash at the end, such as "- 503#".
// space and ends with an optional full stop (.), followed by zero or more spaces/tabs/commas
// and then the numbers themselves. The other one covers the special case of American numbers
// where the extension is written with a hash at the end, such as "- 503#".
// Note that the only capturing groups should be around the digits that you want to capture as // Note that the only capturing groups should be around the digits that you want to capture as
// part of the extension, or else parsing will fail! // part of the extension, or else parsing will fail!
// Canonical-equivalence doesn't seem to be an option with Android java, so we allow two options // Canonical-equivalence doesn't seem to be an option with Android java, so we allow two options


+ 3
- 0
java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberMatcherTest.java View File

@ -981,6 +981,9 @@ public class PhoneNumberMatcherTest extends TestMetadataTestCase {
"As I said on 03/10/2011, you may call me at ", "")); "As I said on 03/10/2011, you may call me at ", ""));
// With trailing numbers after a comma. The 45 should not be considered an extension. // With trailing numbers after a comma. The 45 should not be considered an extension.
contextPairs.add(new NumberContext("", ", 45 days a year")); contextPairs.add(new NumberContext("", ", 45 days a year"));
// When matching, a semicolon followed by a legitimate extension symbol is not taken to indicate
// an extension. The 7246433 should not be considered an extension.
contextPairs.add(new NumberContext("", ";x 7246433"));
// With a postfix stripped off as it looks like the start of another number. // With a postfix stripped off as it looks like the start of another number.
contextPairs.add(new NumberContext("Call ", "/x12 more")); contextPairs.add(new NumberContext("Call ", "/x12 more"));


+ 5
- 0
java/libphonenumber/test/com/google/i18n/phonenumbers/PhoneNumberUtilTest.java View File

@ -2262,6 +2262,9 @@ public class PhoneNumberUtilTest extends TestMetadataTestCase {
usWithExtension.setCountryCode(1).setNationalNumber(8009013355L).setExtension("7246433"); usWithExtension.setCountryCode(1).setNationalNumber(8009013355L).setExtension("7246433");
assertEquals(usWithExtension, phoneUtil.parse("(800) 901-3355 x 7246433", RegionCode.US)); assertEquals(usWithExtension, phoneUtil.parse("(800) 901-3355 x 7246433", RegionCode.US));
assertEquals(usWithExtension, phoneUtil.parse("(800) 901-3355 , ext 7246433", RegionCode.US)); assertEquals(usWithExtension, phoneUtil.parse("(800) 901-3355 , ext 7246433", RegionCode.US));
assertEquals(usWithExtension, phoneUtil.parse("(800) 901-3355 ; 7246433", RegionCode.US));
// To test an extension character without surrounding spaces.
assertEquals(usWithExtension, phoneUtil.parse("(800) 901-3355;7246433", RegionCode.US));
assertEquals(usWithExtension, assertEquals(usWithExtension,
phoneUtil.parse("(800) 901-3355 ,extension 7246433", RegionCode.US)); phoneUtil.parse("(800) 901-3355 ,extension 7246433", RegionCode.US));
assertEquals(usWithExtension, assertEquals(usWithExtension,
@ -2405,6 +2408,8 @@ public class PhoneNumberUtilTest extends TestMetadataTestCase {
// Test numbers with extensions. // Test numbers with extensions.
assertEquals(PhoneNumberUtil.MatchType.EXACT_MATCH, assertEquals(PhoneNumberUtil.MatchType.EXACT_MATCH,
phoneUtil.isNumberMatch("+64 3 331-6005 extn 1234", "+6433316005#1234")); phoneUtil.isNumberMatch("+64 3 331-6005 extn 1234", "+6433316005#1234"));
assertEquals(PhoneNumberUtil.MatchType.EXACT_MATCH,
phoneUtil.isNumberMatch("+64 3 331-6005 ext. 1234", "+6433316005;1234"));
// Test proto buffers. // Test proto buffers.
assertEquals(PhoneNumberUtil.MatchType.EXACT_MATCH, assertEquals(PhoneNumberUtil.MatchType.EXACT_MATCH,
phoneUtil.isNumberMatch(NZ_NUMBER, "+6403 331 6005")); phoneUtil.isNumberMatch(NZ_NUMBER, "+6403 331 6005"));


+ 2
- 0
java/pending_code_changes.txt View File

@ -3,3 +3,5 @@ Code changes:
of phone number objects. These have been marked deprecated for months. Any of phone number objects. These have been marked deprecated for months. Any
users of these methods should call PhoneNumberUtil.parse first to create a users of these methods should call PhoneNumberUtil.parse first to create a
PhoneNumber object, and pass this in. PhoneNumber object, and pass this in.
- Support semicolon as an extension character while parsing phone numbers. This
does not apply when finding phone numbers in text (matching).

+ 9
- 8
javascript/i18n/phonenumbers/phonenumberutil.js View File

@ -763,13 +763,14 @@ i18n.phonenumbers.PhoneNumberUtil.CAPTURING_EXTN_DIGITS_ =
* also provided after each ASCII version. There are three regular expressions * also provided after each ASCII version. There are three regular expressions
* here. The first covers RFC 3966 format, where the extension is added using * here. The first covers RFC 3966 format, where the extension is added using
* ';ext='. The second more generic one starts with optional white space and * ';ext='. The second more generic one starts with optional white space and
* ends with an optional full stop (.), followed by zero or more spaces/tabs and
* then the numbers themselves. The other one covers the special case of
* American numbers where the extension is written with a hash at the end, such
* as '- 503#'. Note that the only capturing groups should be around the digits
* that you want to capture as part of the extension, or else parsing will fail!
* We allow two options for representing the accented o - the character itself,
* and one in the unicode decomposed form with the combining acute accent.
* ends with an optional full stop (.), followed by zero or more
* spaces/tabs/commas and then the numbers themselves. The other one covers the
* special case of American numbers where the extension is written with a hash
* at the end, such as '- 503#'. Note that the only capturing groups should be
* around the digits that you want to capture as part of the extension, or else
* parsing will fail! We allow two options for representing the accented o - the
* character itself, and one in the unicode decomposed form with the combining
* acute accent.
* *
* @const * @const
* @type {string} * @type {string}
@ -780,7 +781,7 @@ i18n.phonenumbers.PhoneNumberUtil.EXTN_PATTERNS_FOR_PARSING_ =
i18n.phonenumbers.PhoneNumberUtil.CAPTURING_EXTN_DIGITS_ + '|' + i18n.phonenumbers.PhoneNumberUtil.CAPTURING_EXTN_DIGITS_ + '|' +
'[ \u00A0\\t,]*' + '[ \u00A0\\t,]*' +
'(?:e?xt(?:ensi(?:o\u0301?|\u00F3))?n?|\uFF45?\uFF58\uFF54\uFF4E?|' + '(?:e?xt(?:ensi(?:o\u0301?|\u00F3))?n?|\uFF45?\uFF58\uFF54\uFF4E?|' +
'[,x\uFF58#\uFF03~\uFF5E]|int|anexo|\uFF49\uFF4E\uFF54)' +
'[;,x\uFF58#\uFF03~\uFF5E]|int|anexo|\uFF49\uFF4E\uFF54)' +
'[:\\.\uFF0E]?[ \u00A0\\t,-]*' + '[:\\.\uFF0E]?[ \u00A0\\t,-]*' +
i18n.phonenumbers.PhoneNumberUtil.CAPTURING_EXTN_DIGITS_ + '#?|' + i18n.phonenumbers.PhoneNumberUtil.CAPTURING_EXTN_DIGITS_ + '#?|' +
'[- ]+([' + i18n.phonenumbers.PhoneNumberUtil.VALID_DIGITS_ + ']{1,5})#'; '[- ]+([' + i18n.phonenumbers.PhoneNumberUtil.VALID_DIGITS_ + ']{1,5})#';


+ 8
- 0
javascript/i18n/phonenumbers/phonenumberutil_test.js View File

@ -2928,6 +2928,11 @@ function testParseExtensions() {
phoneUtil.parse('(800) 901-3355 x 7246433', RegionCode.US))); phoneUtil.parse('(800) 901-3355 x 7246433', RegionCode.US)));
assertTrue(usWithExtension.equals( assertTrue(usWithExtension.equals(
phoneUtil.parse('(800) 901-3355 , ext 7246433', RegionCode.US))); phoneUtil.parse('(800) 901-3355 , ext 7246433', RegionCode.US)));
assertTrue(usWithExtension.equals(
phoneUtil.parse('(800) 901-3355 ; 7246433', RegionCode.US)));
// To test an extension character without surrounding spaces.
assertTrue(usWithExtension.equals(
phoneUtil.parse('(800) 901-3355;7246433', RegionCode.US)));
assertTrue(usWithExtension.equals( assertTrue(usWithExtension.equals(
phoneUtil.parse('(800) 901-3355 ,extension 7246433', RegionCode.US))); phoneUtil.parse('(800) 901-3355 ,extension 7246433', RegionCode.US)));
assertTrue(usWithExtension.equals( assertTrue(usWithExtension.equals(
@ -3120,6 +3125,9 @@ function testIsNumberMatchMatches() {
assertEquals(i18n.phonenumbers.PhoneNumberUtil.MatchType.EXACT_MATCH, assertEquals(i18n.phonenumbers.PhoneNumberUtil.MatchType.EXACT_MATCH,
phoneUtil.isNumberMatch('+64 3 331-6005 extn 1234', phoneUtil.isNumberMatch('+64 3 331-6005 extn 1234',
'+6433316005#1234')); '+6433316005#1234'));
assertEquals(i18n.phonenumbers.PhoneNumberUtil.MatchType.EXACT_MATCH,
phoneUtil.isNumberMatch('+64 3 331-6005 ext. 1234',
'+6433316005;1234'));
// Test proto buffers. // Test proto buffers.
assertEquals(i18n.phonenumbers.PhoneNumberUtil.MatchType.EXACT_MATCH, assertEquals(i18n.phonenumbers.PhoneNumberUtil.MatchType.EXACT_MATCH,
phoneUtil.isNumberMatch(NZ_NUMBER, '+6403 331 6005')); phoneUtil.isNumberMatch(NZ_NUMBER, '+6403 331 6005'));


Loading…
Cancel
Save