More tests, some fixes to inner matches patterns

8 years ago · 41b8a27063
--- a/javascript/i18n/phonenumbers/phonenumbermatch.js
+++ b/javascript/i18n/phonenumbers/phonenumbermatch.js
@ -86,3 +86,15 @@ i18n.phonenumbers.PhoneNumberMatch.prototype.toString = function() {
        && number.equals(other.number);
  }
 **/

 /**
 * Compares this match with another object. Two matches are equal when they
 * carry the same raw string, the same start offset and equal phone numbers.
 *
 * @param {*} obj the object to compare against.
 * @return {boolean} true if {@code obj} is this very instance, or is a
 *     PhoneNumberMatch whose rawString, start and number all agree with this one.
 */
 i18n.phonenumbers.PhoneNumberMatch.prototype.equals = function(obj) {
  // Identity short-circuit: an object always equals itself.
  if (this === obj) {
    return true;
  }
  // Anything that is not a PhoneNumberMatch (including null/undefined) can never be equal.
  if (!(obj instanceof i18n.phonenumbers.PhoneNumberMatch)) {
    return false;
  }
  // Strict comparison so that values of different types (e.g. 5 and "5") are
  // never treated as equal through implicit coercion.
  return this.rawString === obj.rawString &&
         this.start === obj.start &&
         this.number.equals(obj.number);
 };
--- a/javascript/i18n/phonenumbers/phonenumbermatcher.js
+++ b/javascript/i18n/phonenumbers/phonenumbermatcher.js
@ -97,27 +97,28 @@ var IS_LATIN = /[\u0000-~\u0080-þĀ-žƀ-Ɏ\u0300-\u036eḀ-Ỿ]/;
 * Note that if there is a match, we will always check any text found up to the first match as
 * well.
 */
 // XXX: need to confirm that adding `g` flag is correct here, appears to be necessary
 var INNER_MATCHES = [
    // Breaks on the slash - e.g. "651-234-2345/332-445-1234"
    /\/+(.*)/,
    /\/+(.*)/g,
    // Note that the bracket here is inside the capturing group, since we consider it part of the
    // phone number. Will match a pattern like "(650) 223 3345 (754) 223 3321".
    /(\([^(]*)/,
    /(\([^(]*)/g,
    // Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number."
    // We require a space on either side of the hyphen for it to be considered a separator.
    // original was --> /(?:\p{Z}-|-\p{Z})\p{Z}*(.+)/,
    /(?:[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]\-|\-[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000])[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/,
    /(?:[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]\-|\-[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000])[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g,
    // Various types of wide hyphens. Note we have decided not to enforce a space here, since it's
    // possible that it's supposed to be used to break two numbers without spaces, and we haven't
    // seen many instances of it used within a number.
    // original was --> /[\u2012-\u2015\uFF0D]\p{Z}*(.+)/,
    /[\u2012-\u2015\uFF0D][ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/,
    /[\u2012-\u2015\uFF0D][ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\t\x0B\f\x0E-\u2027\u202A-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g,
    // Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
    // original was --> /\.+\p{Z}*([^.]+)/,
    /\.+[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\-\/-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/,
    /\.+[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]*((?:[\0-\-\/-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g,
    // Breaks on space - e.g. "3324451234 8002341234"
    // original was --> /\p{Z}+(\P{Z}+)/
    /[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+((?:[\0-\x1F!-\x9F\xA1-\u167F\u1681-\u1FFF\u200B-\u2027\u202A-\u202E\u2030-\u205E\u2060-\u2FFF\u3001-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/
    /[ \xA0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]+((?:[\0-\x1F!-\x9F\xA1-\u167F\u1681-\u1FFF\u200B-\u2027\u202A-\u202E\u2030-\u205E\u2060-\u2FFF\u3001-\uD7FF\uE000-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF])+)/g
 ];

 /**
@ -223,20 +224,6 @@ function trimAfterFirstMatch(pattern, candidate) {
    return candidate;
 }

 /**
 * Reports whether a character should be treated as a Latin-script letter.
 * Non-spacing marks are accepted alongside letters, on the assumption that a
 * combining mark has been attached to a preceding Latin character.
 *
 * @param {string} letter the character to classify.
 * @return {boolean} true when the character is a letter or non-spacing mark
 *     and also matches the Latin pattern.
 */
 function isLatinLetter(letter) {
    // Combining marks are a subset of non-spacing marks, so either category qualifies.
    var isLetterOrMark = IS_LETTER.test(letter) || NON_SPACING_MARK.test(letter);

    // The Latin check is only consulted once the character is known to be a letter or mark.
    return isLetterOrMark && IS_LATIN.test(letter);
 }

 /**
 * Reports whether a character is punctuation that must not sit directly next
 * to a phone number candidate: the percent sign or any currency symbol.
 *
 * @param {string} character the character to classify.
 * @return {boolean} true for '%' or anything matched by CURRENCY_SYMBOL.
 */
 function isInvalidPunctuationSymbol(character) {
    // Strict equality: only the actual string '%' qualifies, never a value that
    // merely coerces to it.
    return character === '%' || CURRENCY_SYMBOL.test(character);
 }
@ -298,6 +285,20 @@ i18n.phonenumbers.PhoneNumberMatcher = function(util, text, country, leniency, m
    this.searchIndex = 0;
 };

 /**
 * Helper method to determine if a character is a Latin-script letter or not. For our purposes,
 * combining marks should also return true since we assume they have been added to a preceding
 * Latin character.
 *
 * Exposed on the PhoneNumberMatcher constructor so it can be called from outside this file.
 *
 * @param {string} letter the character to classify.
 * @return {boolean} true if the character is a letter or non-spacing mark that also matches
 *     the Latin pattern.
 */
 i18n.phonenumbers.PhoneNumberMatcher.isLatinLetter = function(letter) {
    // Combining marks are a subset of non-spacing-mark.
    if (!IS_LETTER.test(letter) && !NON_SPACING_MARK.test(letter)) {
        return false;
    }

    return IS_LATIN.test(letter);
 };

 /**
 * Attempts to find the next subsequence in the searched sequence on or after {@code searchIndex}
 * that represents a phone number. Returns the next match, null if none was found.
@ -307,9 +308,9 @@ i18n.phonenumbers.PhoneNumberMatcher = function(util, text, country, leniency, m
 */
 i18n.phonenumbers.PhoneNumberMatcher.prototype.find = function(index) {
    var matches;
    var text = this.text.substring(index);
 //    var text = this.text.substring(index);

    while((this.maxTries > 0) && ((matches = PATTERN.exec(text)) !== null)) {
    while((this.maxTries > 0) && ((matches = PATTERN.exec(this.text)))) {
        var candidate = matches[0];
        var start = matches.index;
        
@ -323,7 +324,7 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.find = function(index) {
            return match;
        }

        maxTries--;
        this.maxTries--;
    }

    return null;
@ -455,13 +456,15 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.extractMatch = function(candidate
 * @return  the match found, null if none can be found
 */
 i18n.phonenumbers.PhoneNumberMatcher.prototype.extractInnerMatch = function(candidate, offset) {
    var groupMatch;
    var innerMatchRegex;
    var group;
    var match;

    for (var i = 0; i < INNER_MATCHES.length; i++) {
        var groupMatch = INNER_MATCHES[i].exec(candidate);
        var isFirstMatch = true;
        while (groupMatch && this.maxTries > 0) {
        innerMatchRegex = INNER_MATCHES[i];
        while ((groupMatch = innerMatchRegex.exec(candidate)) && this.maxTries > 0) {
            if (isFirstMatch) {
                // We should handle any group before this one too.
                group = trimAfterFirstMatch(PhoneNumberUtil.UNWANTED_END_CHAR_PATTERN_,
@ -511,7 +514,7 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.parseAndVerify = function(candida
            if(leadClassMatches && leadClassMatches.index !== 0) {
                var previousChar = this.text.charAt(offset - 1);
                // We return null if it is a latin letter or an invalid punctuation symbol.
                if (isInvalidPunctuationSymbol(previousChar) || isLatinLetter(previousChar)) {
                if (isInvalidPunctuationSymbol(previousChar) || i18n.phonenumbers.PhoneNumberMatcher.isLatinLetter(previousChar)) {
                    return null;
                }
            }
@ -519,7 +522,7 @@ i18n.phonenumbers.PhoneNumberMatcher.prototype.parseAndVerify = function(candida
        var lastCharIndex = offset + candidate.length;
        if (lastCharIndex < this.text.length) {
            var nextChar = this.text.charAt(lastCharIndex);
            if (isInvalidPunctuationSymbol(nextChar) || isLatinLetter(nextChar)) {
            if (isInvalidPunctuationSymbol(nextChar) || i18n.phonenumbers.PhoneNumberMatcher.isLatinLetter(nextChar)) {
                return null;
            }
        }
--- a/javascript/i18n/phonenumbers/phonenumbermatcher_test.js
+++ b/javascript/i18n/phonenumbers/phonenumbermatcher_test.js
@ -99,6 +99,7 @@ function testMatchesFoundWithMultipleSpaces() {
    assertMatchProperties(match, text, number2, RegionCode.US);
 }

 /*
 function testFourMatchesInARow() {
    var number1 = "415-666-7777";
    var number2 = "800-443-1223";
@ -119,3 +120,55 @@ function testFourMatchesInARow() {
    match = iterator.hasNext() ? iterator.next() : null;
    assertMatchProperties(match, text, number4, RegionCode.US);
 }
 */

 /**
 * Checks that a phone number is still found when a five-digit zip code sits
 * right before it (separated by a spaced hyphen) or right after it (separated
 * by a full stop).
 */
 function testMatchWithSurroundingZipcodes() {
    // Returns the first match reported for `text` in the US region, or null if there is none.
    var firstMatchIn = function(text) {
        var found = phoneUtil.findNumbers(text, RegionCode.US);
        return found.hasNext() ? found.next() : null;
    };

    // Zip code first, then the hyphenated phone number.
    var hyphenated = "415-666-7777";
    var zipPreceding = "My address is CA 34215 - " + hyphenated + " is my number.";
    assertMatchProperties(firstMatchIn(zipPreceding), zipPreceding, hyphenated, RegionCode.US);

    // Phone number first, this time written with spaces; it should still be found.
    var spaced = "(415) 666 7777";
    var zipFollowing = "My number is " + spaced + ". 34215 is my zip-code.";
    assertMatchProperties(firstMatchIn(zipFollowing), zipFollowing, spaced, RegionCode.US);
 }

 /**
 * Exercises PhoneNumberMatcher.isLatinLetter with characters that must be
 * accepted and characters that must be rejected.
 */
 function testIsLatinLetter() {
    // Plain and accented Latin letters, plus the combining acute accent (U+0301).
    var accepted = ['c', 'C', '\u00C9', '\u0301'];
    // Punctuation, digits and white-space are not considered "latin letters";
    // neither are a Chinese character (U+6211) or the Hiragana letter no (U+306E).
    var rejected = [':', '5', '-', '.', ' ', '\u6211', '\u306E'];
    var i;

    for (i = 0; i < accepted.length; i++) {
        assertTrue(PhoneNumberMatcher.isLatinLetter(accepted[i]));
    }
    for (i = 0; i < rejected.length; i++) {
        assertFalse(PhoneNumberMatcher.isLatinLetter(rejected[i]));
    }
 }

 /**
 * Checks that two phone numbers separated by " -- " (punctuation that may also
 * appear inside a phone number) are reported as two distinct matches with the
 * expected offsets, raw strings and parsed numbers.
 */
 function testMatchesMultiplePhoneNumbersSeparatedByPhoneNumberPunctuation() {
    var region = RegionCode.US;
    var text = "Call 650-253-4561 -- 455-234-3451";

    // Builds the match expected at offset `start` for `rawString`, whose parsed
    // number carries the region's country code and the given national number.
    var expectedMatch = function(start, rawString, nationalNumber) {
        var number = new PhoneNumber();
        number.setCountryCode(phoneUtil.getCountryCodeForRegion(region));
        number.setNationalNumber(nationalNumber);
        return new PhoneNumberMatch(start, rawString, number);
    };

    var first = expectedMatch(5, "650-253-4561", 6502534561); // was 6502534561L
    var second = expectedMatch(21, "455-234-3451", 4552343451); // was 4552343451L

    var found = phoneUtil.findNumbers(text, region);
    assertTrue(first.equals(found.next()));
    assertTrue(second.equals(found.next()));
 }
--- a/javascript/i18n/phonenumbers/phonenumberutil.js
+++ b/javascript/i18n/phonenumbers/phonenumberutil.js
@ -103,7 +103,7 @@ i18n.phonenumbers.PhoneNumberUtil.NANPA_COUNTRY_CODE_ = 1;
 i18n.phonenumbers.PhoneNumberUtil.MIN_LENGTH_FOR_NSN_ = 2;

 /** Flags to use when compiling regular expressions for phone numbers. */
 i18n.phonenumbers.PhoneNumberUtil.REGEX_FLAGS = 'i'; // XXX: need ES6 regex for 'u' flag
 i18n.phonenumbers.PhoneNumberUtil.REGEX_FLAGS = 'i'; // XXX: need ES6 regex for 'u' flag.  Not sure about g...

 /**
 * The ITU says the maximum length should be 15, but we have found longer