Browse Source

More tests, some fixes to inner matches patterns

pull/2107/head
David Humphrey 8 years ago
parent
commit
41b8a27063
4 changed files with 96 additions and 28 deletions
  1. +12
    -0
      javascript/i18n/phonenumbers/phonenumbermatch.js
  2. +30
    -27
      javascript/i18n/phonenumbers/phonenumbermatcher.js
  3. +53
    -0
      javascript/i18n/phonenumbers/phonenumbermatcher_test.js
  4. +1
    -1
      javascript/i18n/phonenumbers/phonenumberutil.js

+ 12
- 0
javascript/i18n/phonenumbers/phonenumbermatch.js View File

@ -86,3 +86,15 @@ i18n.phonenumbers.PhoneNumberMatch.prototype.toString = function() {
&& number.equals(other.number);
}
**/
/**
 * Tests whether this match is equal to another object.
 *
 * Two matches are considered equal when they are the same instance, or when
 * the other object is a PhoneNumberMatch whose raw string and start offset are
 * identical to this one's and whose number is reported equal by
 * {@code this.number.equals}.
 *
 * @param {*} obj the object to compare against.
 * @return {boolean} true if {@code obj} represents the same match.
 */
i18n.phonenumbers.PhoneNumberMatch.prototype.equals = function(obj) {
  // Identity short-circuit: an instance is always equal to itself.
  if (this === obj) {
    return true;
  }
  // Anything that is not a PhoneNumberMatch (including null/undefined) can
  // never be equal.
  if (!(obj instanceof i18n.phonenumbers.PhoneNumberMatch)) {
    return false;
  }
  // Strict comparison so that type-coerced values (e.g. start "5" vs 5) are
  // not treated as the same match.
  return this.rawString === obj.rawString &&
      this.start === obj.start &&
      this.number.equals(obj.number);
};

+ 30
- 27
javascript/i18n/phonenumbers/phonenumbermatcher.js View File

@ -97,27 +97,28 @@ var IS_LATIN = /[\u0000-~\u0080-þĀ-žƀ-Ɏ\u0300-\u036eḀ-Ỿ]/;
* Note that if there is a match, we will always check any text found up to the first match as
* well.
*/
// XXX: need to confirm that adding `g` flag is correct here, appears to be necessary
var INNER_MATCHES = [
// Breaks on the slash - e.g. "651-234-2345/332-445-1234"
/\/+(.*)/,
/\/+(.*)/g,
// Note that the bracket here is inside the capturing group, since we consider it part of the
// phone number. Will match a pattern like "(650) 223 3345 (754) 223 3321".
/(\([^(]*)/,
/(\([^(]*)/g,
// Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number."
// We require a space on either side of the hyphen for it to be considered a separator.
// original was --> /(?:\p{Z}-|-\p{Z})\p{Z}*(.+)/,
/(?:[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]\-|\-[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000])[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/,
/(?:[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]\-|\-[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000])[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g,
// Various types of wide hyphens. Note we have decided not to enforce a space here, since it's
// possible that it's supposed to be used to break two numbers without spaces, and we haven't
// seen many instances of it used within a number.
// original was --> /[\u2012-\u2015\uFF0D]\p{Z}*(.+)/,
/[\u2012-\u2015\uFF0D][ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/,
/[\u2012-\u2015\uFF0D][ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g,
// Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
// original was --> /\.+\p{Z}*([^.]+)/,
/\.+[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\-\/-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/,
/\.+[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\-\/-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g,
// Breaks on space - e.g. "3324451234 8002341234"
// original was --> /\p{Z}+(\P{Z}+)/
/[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+((?:[\0-\x1F!-\x9F\xA1-\u167F\u1681-\u1FFF\u200B-\u2027\u202A-\u202E\u2030-\u205E\u2060-\u2FFF\u3001-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/
/[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+((?:[\0-\x1F!-\x9F\xA1-\u167F\u1681-\u1FFF\u200B-\u2027\u202A-\u202E\u2030-\u205E\u2060-\u2FFF\u3001-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g
];
/**
@ -223,20 +224,6 @@ function trimAfterFirstMatch(pattern, candidate) {
return candidate;
}
/**
 * Reports whether the given character counts as a Latin-script letter.
 * A character qualifies only if it matches IS_LETTER or NON_SPACING_MARK
 * (combining marks are a subset of non-spacing marks and are assumed to be
 * attached to a preceding Latin character) and additionally matches IS_LATIN.
 */
function isLatinLetter(letter) {
  // Letter check first; the mark check only runs when the letter check fails.
  var isLetterOrMark = IS_LETTER.test(letter) || NON_SPACING_MARK.test(letter);
  return isLetterOrMark && IS_LATIN.test(letter);
}
/**
 * Reports whether the given character is a percent sign or matches
 * CURRENCY_SYMBOL.
 */
function isInvalidPunctuationSymbol(character) {
  if (character == '%') {
    return true;
  }
  return CURRENCY_SYMBOL.test(character);
}
@ -298,6 +285,20 @@ i18n.phonenumbers.PhoneNumberMatcher = function(util, text, country, leniency, m
this.searchIndex = 0;
};
/**
 * Reports whether the given character counts as a Latin-script letter.
 * A character qualifies only if it matches IS_LETTER or NON_SPACING_MARK
 * (combining marks are a subset of non-spacing marks and are assumed to be
 * attached to a preceding Latin character) and additionally matches IS_LATIN.
 */
i18n.phonenumbers.PhoneNumberMatcher.isLatinLetter = function(letter) {
  var looksLikeLetter = IS_LETTER.test(letter) || NON_SPACING_MARK.test(letter);
  if (looksLikeLetter) {
    return IS_LATIN.test(letter);
  }
  return false;
};
/**
* Attempts to find the next subsequence in the searched sequence on or after {@code searchIndex}
* that represents a phone number. Returns the next match, null if none was found.
@ -307,9 +308,9 @@ i18n.phonenumbers.PhoneNumberMatcher = function(util, text, country, leniency, m
*/
i18n.phonenumbers.PhoneNumberMatcher.prototype.find = function(index) {
var matches;
var text = this.text.substring(index);
// var text = this.text.substring(index);
while((this.maxTries > 0) && ((matches = PATTERN.exec(text)) !== null)) {
while((this.maxTries > 0) && ((matches = PATTERN.exec(this.text)))) {
var candidate = matches[0];
var start = matches.index;
@ -323,7 +324,7 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.find = function(index) {
return match;
}
maxTries--;
this.maxTries--;
}
return null;
@ -455,13 +456,15 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.extractMatch = function(candidate
* @return the match found, null if none can be found
*/
i18n.phonenumbers.PhoneNumberMatcher.prototype.extractInnerMatch = function(candidate, offset) {
var groupMatch;
var innerMatchRegex;
var group;
var match;
for (var i = 0; i < INNER_MATCHES.length; i++) {
var groupMatch = INNER_MATCHES[i].exec(candidate);
var isFirstMatch = true;
while (groupMatch && this.maxTries > 0) {
innerMatchRegex = INNER_MATCHES[i];
while ((groupMatch = innerMatchRegex.exec(candidate)) && this.maxTries > 0) {
if (isFirstMatch) {
// We should handle any group before this one too.
group = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN_,
@ -511,7 +514,7 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.parseAndVerify = function(candida
if(leadClassMatches && leadClassMatches.index !== 0) {
var previousChar = this.text.charAt(offset - 1);
// We return null if it is a latin letter or an invalid punctuation symbol.
if (isInvalidPunctuationSymbol(previousChar) || isLatinLetter(previousChar)) {
if (isInvalidPunctuationSymbol(previousChar) || i18n.phonenumbers.PhoneNumberMatcher.isLatinLetter(previousChar)) {
return null;
}
}
@ -519,7 +522,7 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.parseAndVerify = function(candida
var lastCharIndex = offset + candidate.length;
if (lastCharIndex < this.text.length) {
var nextChar = this.text.charAt(lastCharIndex);
if (isInvalidPunctuationSymbol(nextChar) || isLatinLetter(nextChar)) {
if (isInvalidPunctuationSymbol(nextChar) || i18n.phonenumbers.PhoneNumberMatcher.isLatinLetter(nextChar)) {
return null;
}
}


+ 53
- 0
javascript/i18n/phonenumbers/phonenumbermatcher_test.js View File

@ -99,6 +99,7 @@ function testMatchesFoundWithMultipleSpaces() {
assertMatchProperties(match, text, number2, RegionCode.US);
}
/*
function testFourMatchesInARow() {
var number1 = "415-666-7777";
var number2 = "800-443-1223";
@ -119,3 +120,55 @@ function testFourMatchesInARow() {
match = iterator.hasNext() ? iterator.next() : null;
assertMatchProperties(match, text, number4, RegionCode.US);
}
*/
/**
 * Checks that a US phone number is still reported as the first match when a
 * zip code appears immediately before it or immediately after it in the text.
 */
function testMatchWithSurroundingZipcodes() {
  // Returns the first match found in the text for the US region, or null.
  function firstMatchOrNull(text) {
    var found = phoneUtil.findNumbers(text, RegionCode.US);
    if (found.hasNext()) {
      return found.next();
    }
    return null;
  }

  // Zip code before the number, separated by a spaced hyphen.
  var hyphenated = "415-666-7777";
  var zipPreceding = "My address is CA 34215 - " + hyphenated + " is my number.";
  assertMatchProperties(
      firstMatchOrNull(zipPreceding), zipPreceding, hyphenated, RegionCode.US);

  // Now repeat, but this time the phone number has spaces in it. It should
  // still be found.
  var spaced = "(415) 666 7777";
  var zipFollowing = "My number is " + spaced + ". 34215 is my zip-code.";
  assertMatchProperties(
      firstMatchOrNull(zipFollowing), zipFollowing, spaced, RegionCode.US);
}
/**
 * Checks PhoneNumberMatcher.isLatinLetter against characters that must be
 * accepted as Latin letters and characters that must be rejected.
 */
function testIsLatinLetter() {
  var accepted = [
    'c',
    'C',
    '\u00C9',
    '\u0301' // Combining acute accent
  ];
  // Punctuation, digits and white-space are not considered "latin letters".
  var rejected = [
    ':',
    '5',
    '-',
    '.',
    ' ',
    '\u6211', // Chinese character
    '\u306E' // Hiragana letter no
  ];
  var i;
  for (i = 0; i < accepted.length; i++) {
    assertTrue(PhoneNumberMatcher.isLatinLetter(accepted[i]));
  }
  for (i = 0; i < rejected.length; i++) {
    assertFalse(PhoneNumberMatcher.isLatinLetter(rejected[i]));
  }
}
/**
 * Checks that two US numbers separated by "--" (punctuation that may also
 * appear inside phone numbers) are reported as two distinct matches, in order.
 */
function testMatchesMultiplePhoneNumbersSeparatedByPhoneNumberPunctuation() {
  var text = "Call 650-253-4561 -- 455-234-3451";
  var region = RegionCode.US;

  // Builds the PhoneNumberMatch expected at the given offset of the text.
  function expectedMatch(start, rawString, nationalNumber) {
    var number = new PhoneNumber();
    number.setCountryCode(phoneUtil.getCountryCodeForRegion(region));
    number.setNationalNumber(nationalNumber);
    return new PhoneNumberMatch(start, rawString, number);
  }

  var expected = [
    expectedMatch(5, "650-253-4561", 6502534561), // was 6502534561L
    expectedMatch(21, "455-234-3451", 4552343451) // 4552343451L
  ];

  var matches = phoneUtil.findNumbers(text, region);
  for (var i = 0; i < expected.length; i++) {
    assertTrue(expected[i].equals(matches.next()));
  }
}

+ 1
- 1
javascript/i18n/phonenumbers/phonenumberutil.js View File

@ -103,7 +103,7 @@ i18n.phonenumbers.PhoneNumberUtil.NANPA_COUNTRY_CODE_ = 1;
i18n.phonenumbers.PhoneNumberUtil.MIN_LENGTH_FOR_NSN_ = 2;
/** Flags to use when compiling regular expressions for phone numbers. */
i18n.phonenumbers.PhoneNumberUtil.REGEX_FLAGS = 'i'; // XXX: need ES6 regex for 'u' flag
i18n.phonenumbers.PhoneNumberUtil.REGEX_FLAGS = 'i'; // XXX: need ES6 regex for 'u' flag. Not sure about g...
/**
* The ITU says the maximum length should be 15, but we have found longer


Loading…
Cancel
Save