From 7b30af6d59b8a4b79eb0de4721003b79af5c592c Mon Sep 17 00:00:00 2001 From: Philippe Liard Date: Wed, 21 Sep 2011 17:43:54 +0000 Subject: [PATCH] CPP: Add phonenumbermatcher. --- cpp/CMakeLists.txt | 38 +- cpp/README | 5 +- cpp/src/base/logging.h | 7 +- cpp/src/phonenumbers/encoding_utils.h | 11 + cpp/src/phonenumbers/phonenumbermatch.cc | 91 ++ cpp/src/phonenumbers/phonenumbermatch.h | 125 ++ cpp/src/phonenumbers/phonenumbermatcher.cc | 626 ++++++++++ cpp/src/phonenumbers/phonenumbermatcher.h | 158 +++ cpp/src/phonenumbers/phonenumberutil.h | 3 + cpp/src/phonenumbers/stringutil.cc | 58 + cpp/src/phonenumbers/stringutil.h | 19 + .../phonenumbers/asyoutypeformatter_test.cc | 42 +- .../phonenumbers/phonenumbermatch_test.cc | 91 ++ .../phonenumbers/phonenumbermatcher_test.cc | 1022 +++++++++++++++++ cpp/test/phonenumbers/phonenumberutil_test.cc | 141 +-- cpp/test/phonenumbers/stringutil_test.cc | 61 +- cpp/test/phonenumbers/test_util.cc | 63 + cpp/test/phonenumbers/test_util.h | 162 +++ tools/script/continuous-integration.sh | 16 +- 19 files changed, 2565 insertions(+), 174 deletions(-) create mode 100644 cpp/src/phonenumbers/phonenumbermatch.cc create mode 100644 cpp/src/phonenumbers/phonenumbermatch.h create mode 100644 cpp/src/phonenumbers/phonenumbermatcher.cc create mode 100644 cpp/src/phonenumbers/phonenumbermatcher.h create mode 100644 cpp/test/phonenumbers/phonenumbermatch_test.cc create mode 100644 cpp/test/phonenumbers/phonenumbermatcher_test.cc create mode 100644 cpp/test/phonenumbers/test_util.cc create mode 100644 cpp/test/phonenumbers/test_util.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f15261351..835d63b42 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -67,8 +67,9 @@ function (find_required_program NAME FILENAME DESCRIPTION) endfunction (find_required_program) # Options that can be passed to CMake using 'cmake -DKEY=VALUE'. 
+option ("USE_ICU_REGEXP" "Use ICU regexp engine" "ON") option ("USE_LITE_METADATA" "Use lite metadata" "OFF") -option ("USE_RE2" "Use RE2 instead of ICU" "OFF") +option ("USE_RE2" "Use RE2" "OFF") option ("USE_STD_MAP" "Force the use of std::map" "OFF") # Find all the required libraries and programs. @@ -94,7 +95,7 @@ check_library_version (PC_ICU_UC icu-uc>=4.4) set (ICU_INCLUDE_DIR ${ICU_UC_INCLUDE_DIR}) set (ICU_LIB ${ICU_UC_LIB}) # If ICU regexp engine is used, use icui18n as well. -if (${USE_RE2} STREQUAL "OFF") +if (${USE_ICU_REGEXP} STREQUAL "ON") find_required_library (ICU_I18N unicode/regex.h icui18n "ICU") check_library_version (PC_ICU_I18N icu-i18n>=4.4) list (APPEND ICU_INCLUDE_DIR ${ICU_I18N_INCLUDE_DIR}) @@ -169,14 +170,22 @@ set ( "src/phonenumbers/utf/unilib.cc" ) -# Add regexp engine sources. ICU is used by default. +# Add regexp engine-dependent sources. ICU is used by default. if (${USE_RE2} STREQUAL "ON") # Add a flag to select the right regexp factory implementation used by # regexp_factory.h and regexp_adapter_test.cc. + # When both ICU regexp and RE2 are defined, the regexp engine adapter defaults + # to RE2 unless the ICU implementation is instantiated explicitly. add_definitions (-DUSE_RE2) list (APPEND SOURCES "src/phonenumbers/regexp_adapter_re2.cc") -else () +endif () + +if (${USE_ICU_REGEXP} STREQUAL "ON") + add_definitions (-DUSE_ICU_REGEXP) list (APPEND SOURCES "src/phonenumbers/regexp_adapter_icu.cc") + # The phone number matcher needs ICU. 
+ list (APPEND SOURCES "src/phonenumbers/phonenumbermatch.cc") + list (APPEND SOURCES "src/phonenumbers/phonenumbermatcher.cc") endif () # Library sources excluding the metadata files, since special metadata is used @@ -300,9 +309,15 @@ set (TEST_SOURCES "test/phonenumbers/regexp_cache_test.cc" "test/phonenumbers/run_tests.cc" "test/phonenumbers/stringutil_test.cc" + "test/phonenumbers/test_util.cc" "test/phonenumbers/unicodestring_test.cc" "test/phonenumbers/utf/unicodetext_test.cc" ) +if (${USE_ICU_REGEXP} STREQUAL "ON") + # Add the phone number matcher tests. + list (APPEND TEST_SOURCES "test/phonenumbers/phonenumbermatch_test.cc") + list (APPEND TEST_SOURCES "test/phonenumbers/phonenumbermatcher_test.cc") +endif () # Build the testing binary. include_directories ("test") @@ -329,8 +344,17 @@ install (FILES install (FILES "src/phonenumbers/utf/unicodetext.h" DESTINATION include/phonenumbers/utf/) -install (FILES src/base/basictypes.h - DESTINATION include/base/) +if (${USE_ICU_REGEXP} STREQUAL "ON") + # Install the phone number matcher headers. + install (FILES + "src/phonenumbers/phonenumbermatch.h" + "src/phonenumbers/phonenumbermatcher.h" + "src/phonenumbers/regexp_adapter.h" + DESTINATION include/phonenumbers/ + ) +endif () + +install (FILES "src/base/basictypes.h" DESTINATION include/base/) install (FILES "src/base/memory/scoped_ptr.h" @@ -338,7 +362,7 @@ install (FILES DESTINATION include/base/memory/ ) -install (FILES src/base/synchronization/lock.h +install (FILES "src/base/synchronization/lock.h" DESTINATION include/base/synchronization/) install (TARGETS phonenumber LIBRARY DESTINATION lib/ ARCHIVE DESTINATION lib/) diff --git a/cpp/README b/cpp/README index 5cb35e2b8..4825d1aeb 100644 --- a/cpp/README +++ b/cpp/README @@ -80,12 +80,15 @@ How to build libphonenumber C++: $ cd libphonenumber $ mkdir build $ cd build - $ cmake ../cpp/ + $ cmake .. 
$ make Supported build parameters: Build parameters can be specified invoking CMake with '-DKEY=VALUE' or using a CMake user interface (ccmake or cmake-gui). + USE_ICU_REGEXP = ON | OFF [ON] -- Use ICU regexp engine. USE_LITE_METADATA = ON | OFF [OFF] -- Generates smaller metadata that doesn't include example numbers. + USE_RE2 = ON | OFF [OFF] -- Use RE2. + USE_STD_MAP = ON | OFF [OFF] -- Force the use of std::map. diff --git a/cpp/src/base/logging.h b/cpp/src/base/logging.h index ce31e3494..a8ffa03b7 100644 --- a/cpp/src/base/logging.h +++ b/cpp/src/base/logging.h @@ -23,8 +23,11 @@ #define CHECK_EQ(X, Y) assert((X) == (Y)) -#define DCHECK(X) assert(X) -#define DCHECK_EQ(X, Y) CHECK_EQ((X), (Y)) +# define DCHECK(X) assert(X) +# define DCHECK_EQ(X, Y) CHECK_EQ((X), (Y)) +# define DCHECK_GE(X, Y) assert((X) >= (Y)) +# define DCHECK_GT(X, Y) assert((X) > (Y)) +# define DCHECK_LT(X, Y) assert((X) < (Y)) template T* CHECK_NOTNULL(T* ptr) { assert(ptr); diff --git a/cpp/src/phonenumbers/encoding_utils.h b/cpp/src/phonenumbers/encoding_utils.h index 85819a55e..415ce6230 100644 --- a/cpp/src/phonenumbers/encoding_utils.h +++ b/cpp/src/phonenumbers/encoding_utils.h @@ -16,6 +16,7 @@ #define I18N_PHONENUMBERS_ENCODING_UTILS_H_ #include "base/basictypes.h" +#include "phonenumbers/utf/unilib.h" #include "phonenumbers/utf/utf.h" namespace i18n { @@ -32,6 +33,16 @@ class EncodingUtils { *out = r; return len; } + + static const char* AdvanceOneUTF8Character(const char* buf_utf8) { + return buf_utf8 + UniLib::OneCharLen(buf_utf8); + } + + static const char* BackUpOneUTF8Character(const char* start, + const char* end) { + while (start < end && UniLib::IsTrailByte(*--end)) {} + return end; + } }; } // namespace phonenumbers diff --git a/cpp/src/phonenumbers/phonenumbermatch.cc b/cpp/src/phonenumbers/phonenumbermatch.cc new file mode 100644 index 000000000..bcf5efbc5 --- /dev/null +++ b/cpp/src/phonenumbers/phonenumbermatch.cc @@ -0,0 +1,91 @@ +// Copyright (C) 2011 The 
Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: Tao Huang +// +// Implementation of a mutable match of a phone number within a piece of +// text. Matches may be found using PhoneNumberUtil::FindNumbers. + +#include "phonenumbers/phonenumbermatch.h" + +#include + +#include "phonenumbers/phonenumber.h" +#include "phonenumbers/phonenumber.pb.h" +#include "phonenumbers/stringutil.h" + +namespace i18n { +namespace phonenumbers { + +PhoneNumberMatch::PhoneNumberMatch(int start, + const string& raw_string, + const PhoneNumber& number) + : start_(start), raw_string_(raw_string), number_(number) { +} + +PhoneNumberMatch::PhoneNumberMatch() + : start_(-1), raw_string_(""), number_(PhoneNumber::default_instance()) { +} + +const PhoneNumber& PhoneNumberMatch::number() const { + return number_; +} + +int PhoneNumberMatch::start() const { + return start_; +} + +int PhoneNumberMatch::end() const { + return start_ + raw_string_.length(); +} + +int PhoneNumberMatch::length() const { + return raw_string_.length(); +} + +const string& PhoneNumberMatch::raw_string() const { + return raw_string_; +} + +void PhoneNumberMatch::set_start(int start) { + start_ = start; +} + +void PhoneNumberMatch::set_raw_string(const string& raw_string) { + raw_string_ = raw_string; +} + +void PhoneNumberMatch::set_number(const PhoneNumber& number) { + number_.CopyFrom(number); +} + +string PhoneNumberMatch::ToString() const { + return 
StrCat("PhoneNumberMatch [", start(), ",", end(), ") ", + raw_string_.c_str()); +} + +bool PhoneNumberMatch::Equals(const PhoneNumberMatch& match) const { + return ExactlySameAs(match.number_, number_) && + match.raw_string_.compare(raw_string_) == 0 && + match.start_ == start_; +} + +void PhoneNumberMatch::CopyFrom(const PhoneNumberMatch& match) { + raw_string_ = match.raw_string(); + start_ = match.start(); + number_ = match.number(); +} + +} // namespace phonenumbers +} // namespace i18n diff --git a/cpp/src/phonenumbers/phonenumbermatch.h b/cpp/src/phonenumbers/phonenumbermatch.h new file mode 100644 index 000000000..5ebfd9d5b --- /dev/null +++ b/cpp/src/phonenumbers/phonenumbermatch.h @@ -0,0 +1,125 @@ +// Copyright (C) 2011 The Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: Tao Huang +// +// A mutable match of a phone number within a piece of text. +// Matches may be found using PhoneNumberUtil::FindNumbers. +// +// A match consists of the phone number as well as the start and end offsets of +// the corresponding subsequence of the searched text. Use raw_string() to +// obtain a copy of the matched subsequence. 
+// +// The following annotated example clarifies the relationship between the +// searched text, the match offsets, and the parsed number: +// +// string text = "Call me at +1 425 882-8080 for details."; +// const string country = "US"; +// +// // Find the first phone number match: +// PhoneNumberMatcher matcher(text, country); +// if (matcher.HasNext()) { +// PhoneNumberMatch match; +// matcher.Next(&match); +// } +// +// // raw_string() contains the phone number as it appears in the text. +// "+1 425 882-8080" == match.raw_string(); +// +// // start() and length() locate the matched subsequence in the text. +// string subsequence = text.substr(match.start(), match.length()); +// "+1 425 882-8080" == subsequence; +// +// // number() returns the same result as PhoneNumberUtil::Parse() +// // invoked on raw_string(). +// const PhoneNumberUtil& util = *PhoneNumberUtil::GetInstance(); +// util.Parse(match.raw_string(), country).Equals(match.number()); +// +// This class is a port of PhoneNumberMatch.java + +#ifndef I18N_PHONENUMBERS_PHONENUMBERMATCH_H_ +#define I18N_PHONENUMBERS_PHONENUMBERMATCH_H_ + +#include + +#include "base/basictypes.h" +#include "phonenumbers/phonenumber.pb.h" + +namespace i18n { +namespace phonenumbers { + +using std::string; + +class PhoneNumberMatch { + public: + // Creates a new match. + // - start is the index into the target text. + // - raw_string is the matched substring of the target text. + // - number is the matched phone number. + PhoneNumberMatch(int start, + const string& raw_string, + const PhoneNumber& number); + + // Default constructor. + PhoneNumberMatch(); + + ~PhoneNumberMatch() {} + + // Returns the phone number matched by the receiver. + const PhoneNumber& number() const; + + // Returns the start index of the matched phone number within the searched + // text. + int start() const; + + // Returns the exclusive end index of the matched phone number within the + // searched text. 
+ int end() const; + + // Returns the length of the text matched in the searched text. + int length() const; + + // Returns the raw string matched as a phone number in the searched text. + const string& raw_string() const; + + // Returns a string containing debug information. + string ToString() const; + + void set_start(int start); + + void set_raw_string(const string& raw_string); + + void set_number(const PhoneNumber& number); + + bool Equals(const PhoneNumberMatch& number) const; + + void CopyFrom(const PhoneNumberMatch& number); + + private: + // The start index into the text. + int start_; + + // The raw substring matched. + string raw_string_; + + // The matched phone number. + PhoneNumber number_; + + DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatch); +}; + +} // namespace phonenumbers +} // namespace i18n + +#endif // I18N_PHONENUMBERS_PHONENUMBERMATCH_H_ diff --git a/cpp/src/phonenumbers/phonenumbermatcher.cc b/cpp/src/phonenumbers/phonenumbermatcher.cc new file mode 100644 index 000000000..a3d528f27 --- /dev/null +++ b/cpp/src/phonenumbers/phonenumbermatcher.cc @@ -0,0 +1,626 @@ +// Copyright (C) 2011 The Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: Lara Rennie +// Author: Tao Huang +// +// Implementation of a stateful class that finds and extracts telephone numbers +// from text. + +#include "phonenumbers/phonenumbermatcher.h" + +#ifndef USE_ICU_REGEXP +#error phonenumbermatcher depends on ICU (i.e. 
USE_ICU_REGEXP must be set) +#endif // USE_ICU_REGEXP + +#include +#include +#include + +#include "base/logging.h" +#include "base/memory/scoped_ptr.h" +#include "base/memory/singleton.h" +#include "phonenumbers/default_logger.h" +#include "phonenumbers/encoding_utils.h" +#include "phonenumbers/normalize_utf8.h" +#include "phonenumbers/phonenumber.pb.h" +#include "phonenumbers/phonenumbermatch.h" +#include "phonenumbers/phonenumberutil.h" +#include "phonenumbers/regexp_adapter.h" +#include "phonenumbers/regexp_adapter_icu.h" +#include "phonenumbers/stringutil.h" + +#ifdef USE_RE2 +#include "phonenumbers/regexp_adapter_re2.h" +#endif // USE_RE2 + +using std::numeric_limits; +using std::string; +using std::vector; + +namespace i18n { +namespace phonenumbers { + +namespace { +// Returns a regular expression quantifier with an upper and lower limit. +string Limit(int lower, int upper) { + DCHECK_GE(lower, 0); + DCHECK_GT(upper, 0); + DCHECK_LT(lower, upper); + return StrCat("{", lower, ",", upper, "}"); +} + +bool IsCurrencySymbol(char32 character) { + return (u_charType(character) == U_CURRENCY_SYMBOL); +} + +// Helper method to get the national-number part of a number, formatted without +// any national prefix, and return it as a set of digit blocks that would be +// formatted together. +void GetNationalNumberGroups(const PhoneNumberUtil& util, + const PhoneNumber& number, + vector* digit_blocks) { + // This will be in the format +CC-DG;ext=EXT where DG represents groups of + // digits. + string rfc3966_format; + util.Format(number, PhoneNumberUtil::RFC3966, &rfc3966_format); + // We remove the extension part from the formatted string before splitting it + // into different groups. + size_t end_index = rfc3966_format.find(';'); + if (end_index == string::npos) { + end_index = rfc3966_format.length(); + } + // The country-code will have a '-' following it. 
+ size_t start_index = rfc3966_format.find('-') + 1; + SplitStringUsing(rfc3966_format.substr(start_index, end_index - start_index), + "-", digit_blocks); +} + +bool ContainsOnlyValidXChars(const PhoneNumber& number, const string& candidate, + const PhoneNumberUtil& util) { + // The characters 'x' and 'X' can be (1) a carrier code, in which case they + // always precede the national significant number or (2) an extension sign, + // in which case they always precede the extension number. We assume a + // carrier code is more than 1 digit, so the first case has to have more than + // 1 consecutive 'x' or 'X', whereas the second case can only have exactly 1 + // 'x' or 'X'. + size_t found; + found = candidate.find_first_of("xX"); + // We ignore the character if 'x' or 'X' appears as the last character of + // the string. + while (found != string::npos && found < candidate.length() - 1) { + // We only look for 'x' or 'X' in ASCII form. + char next_char = candidate[found + 1]; + if (next_char == 'x' || next_char == 'X') { + // This is the carrier code case, in which the 'X's always precede the + // national significant number. + ++found; + if (util.IsNumberMatchWithOneString( + number, candidate.substr(found, candidate.length() - found)) + != PhoneNumberUtil::NSN_MATCH) { + return false; + } + } else { + string normalized_extension(candidate.substr(found, + candidate.length() - found)); + util.NormalizeDigitsOnly(&normalized_extension); + if (normalized_extension != number.extension()) { + return false; + } + } + found = candidate.find_first_of("xX", found + 1); + } + return true; +} +} // namespace + +#ifdef USE_GOOGLE_BASE +class PhoneNumberMatcherRegExps { + friend struct DefaultSingletonTraits; +#else +class PhoneNumberMatcherRegExps : public Singleton { + friend class Singleton; +#endif // USE_GOOGLE_BASE + private: + string opening_parens_; + string closing_parens_; + string non_parens_; + // Limit on the number of pairs of brackets in a phone number. 
+ string bracket_pair_limit_; + // Helper strings for the matching_brackets_ pattern. + // An opening bracket at the beginning may not be closed, but subsequent ones + // should be. It's also possible that the leading bracket was dropped, so we + // shouldn't be surprised if we see a closing bracket first. + string leading_maybe_matched_bracket_; + string bracket_pairs_; + // Limit on the number of leading (plus) characters. + string lead_limit_; + // Limit on the number of consecutive punctuation characters. + string punctuation_limit_; + // The maximum number of digits allowed in a digit-separated block. As we + // allow all digits in a single block, this should be set high enough to + // accommodate the entire national number and the international country code. + int digit_block_limit_; + // Limit on the number of blocks separated by punctuation. Uses + // kDigitBlockLimit since some formats use spaces to separate each digit. + string block_limit_; + // A punctuation sequence allowing white space. + string punctuation_; + // A digits block without punctuation. + string digit_sequence_; + // Punctuation that may be at the start of a phone number - brackets and plus + // signs. + string lead_class_chars_; + // Same as lead_class_chars_, but enclosed as a character class. + string lead_class_; + // Extra helper strings that form part of pattern_. These are stored + // separately since StrCat has a limit of 12 args. + string opening_punctuation_; + string optional_extn_pattern_; + + public: + // We use two different reg-ex factories here for performance reasons. RE2 is + // much faster for smaller reg-ex patterns, but the main pattern cannot be + // handled by RE2 in an efficient way. + scoped_ptr regexp_factory_for_pattern_; + scoped_ptr regexp_factory_; + + // Matches strings that look like publication pages. Example: + // Computing Complete Answers to Queries in the Presence of Limited Access + // Patterns. Chen Li. VLDB J. 12(3): 211-227 (2003). 
+ // + // The string "211-227 (2003)" is not a telephone number. + scoped_ptr pub_pages_; + // Matches strings that look like dates using "/" as a separator. Examples: + // 3/10/2011, 31/10/96 or 08/31/95. + scoped_ptr slash_separated_dates_; + // Pattern to check that brackets match. Opening brackets should be closed + // within a phone number. This also checks that there is something inside the + // brackets. Having no brackets at all is also fine. + scoped_ptr matching_brackets_; + // Matches white-space, which may indicate the end of a phone number and the + // start of something else (such as a neighbouring zip-code). If white-space + // is found, continues to match all characters that are not typically used to + // start a phone number. + scoped_ptr group_separator_; + scoped_ptr capture_up_to_second_number_start_pattern_; + scoped_ptr capturing_ascii_digits_pattern_; + // Compiled reg-ex representing lead_class_; + scoped_ptr lead_class_pattern_; + // Phone number pattern allowing optional punctuation. 
+ scoped_ptr pattern_; + +#ifdef USE_GOOGLE_BASE + PhoneNumberMatcherRegExps* PhoneNumberMatcherRegExps::GetInstance() { + return Singleton::get(); + } +#endif // USE_GOOGLE_BASE + + PhoneNumberMatcherRegExps() + : opening_parens_("(\\[\xEF\xBC\x88\xEF\xBC\xBB" /* "(\\[([" */), + closing_parens_(")\\]\xEF\xBC\x89\xEF\xBC\xBD" /* ")\\])]" */), + non_parens_(StrCat("[^", opening_parens_, closing_parens_, "]")), + bracket_pair_limit_(Limit(0, 3)), + leading_maybe_matched_bracket_(StrCat( + "(?:[", opening_parens_, "])?", + "(?:", non_parens_, "+[", closing_parens_, "])?")), + bracket_pairs_(StrCat( + "(?:[", opening_parens_, "]", non_parens_, "+", + "[", closing_parens_, "])", bracket_pair_limit_)), + lead_limit_(Limit(0, 2)), + punctuation_limit_(Limit(0, 4)), + digit_block_limit_(PhoneNumberUtil::kMaxLengthForNsn + + PhoneNumberUtil::kMaxLengthCountryCode), + block_limit_(Limit(0, digit_block_limit_)), + punctuation_(StrCat("[", PhoneNumberUtil::kValidPunctuation, "]", + punctuation_limit_)), + digit_sequence_(StrCat("\\p{Nd}", Limit(1, digit_block_limit_))), + lead_class_chars_(StrCat(opening_parens_, PhoneNumberUtil::kPlusChars)), + lead_class_(StrCat("[", lead_class_chars_, "]")), + opening_punctuation_(StrCat("(?:", lead_class_, punctuation_, ")")), + optional_extn_pattern_(StrCat( + "(?i)(?:", + PhoneNumberUtil::GetInstance()->GetExtnPatternsForMatching(), + ")?")), + regexp_factory_for_pattern_(new ICURegExpFactory()), +#ifdef USE_RE2 + regexp_factory_(new RE2RegExpFactory()), +#else + regexp_factory_(new ICURegExpFactory()), +#endif // USE_RE2 + pub_pages_(regexp_factory_->CreateRegExp( + "\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}")), + slash_separated_dates_(regexp_factory_->CreateRegExp( + "(?:(?:[0-3]?\\d/[01]?\\d)|" + "(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}")), + matching_brackets_(regexp_factory_->CreateRegExp( + StrCat(leading_maybe_matched_bracket_, non_parens_, "+", + bracket_pairs_, non_parens_, "*"))), + 
group_separator_(regexp_factory_->CreateRegExp( + StrCat("\\p{Z}", "[^", lead_class_chars_, "\\p{Nd}]*"))), + capture_up_to_second_number_start_pattern_( + regexp_factory_->CreateRegExp( + PhoneNumberUtil::kCaptureUpToSecondNumberStart)), + capturing_ascii_digits_pattern_( + regexp_factory_->CreateRegExp("(\\d+)")), + lead_class_pattern_(regexp_factory_->CreateRegExp(lead_class_)), + pattern_(regexp_factory_for_pattern_->CreateRegExp( + StrCat("(", opening_punctuation_, lead_limit_, + digit_sequence_, "(?:", punctuation_, digit_sequence_, ")", + block_limit_, optional_extn_pattern_, ")"))) { + } + + private: + DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcherRegExps); +}; + +PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util, + const string& text, + const string& region_code, + PhoneNumberMatcher::Leniency leniency, + int max_tries) + : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), + phone_util_(util), + text_(text), + preferred_region_(region_code), + leniency_(leniency), + max_tries_(max_tries), + state_(NOT_READY), + last_match_(NULL), + search_index_(0) { +} + +PhoneNumberMatcher::PhoneNumberMatcher(const string& text, + const string& region_code) + : reg_exps_(PhoneNumberMatcherRegExps::GetInstance()), + phone_util_(*PhoneNumberUtil::GetInstance()), + text_(text), + preferred_region_(region_code), + leniency_(VALID), + max_tries_(numeric_limits::max()), + state_(NOT_READY), + last_match_(NULL), + search_index_(0) { +} + +PhoneNumberMatcher::~PhoneNumberMatcher() { +} + +// static +bool PhoneNumberMatcher::IsLatinLetter(char32 letter) { + // Combining marks are a subset of non-spacing-mark. 
+ if (!u_isalpha(letter) && (u_charType(letter) != U_NON_SPACING_MARK)) { + return false; + } + UBlockCode block = ublock_getCode(letter); + return ((block == UBLOCK_BASIC_LATIN) || + (block == UBLOCK_LATIN_1_SUPPLEMENT) || + (block == UBLOCK_LATIN_EXTENDED_A) || + (block == UBLOCK_LATIN_EXTENDED_ADDITIONAL) || + (block == UBLOCK_LATIN_EXTENDED_B) || + (block == UBLOCK_COMBINING_DIACRITICAL_MARKS)); +} + +bool PhoneNumberMatcher::ParseAndVerify(const string& candidate, int offset, + PhoneNumberMatch* match) { + DCHECK(match); + // Check the candidate doesn't contain any formatting which would indicate + // that it really isn't a phone number. + if (!reg_exps_->matching_brackets_->FullMatch(candidate)) { + return false; + } + + // If leniency is set to VALID or stricter, we also want to skip numbers that + // are surrounded by Latin alphabetic characters, to skip cases like + // abc8005001234 or 8005001234def. + if (leniency_ >= VALID) { + // If the candidate is not at the start of the text, and does not start with + // phone-number punctuation, check the previous character. + scoped_ptr candidate_input( + reg_exps_->regexp_factory_->CreateInput(candidate)); + if (offset > 0 && + !reg_exps_->lead_class_pattern_->Consume(candidate_input.get())) { + char32 previous_char; + const char* previous_char_ptr = + EncodingUtils::BackUpOneUTF8Character(text_.c_str(), + text_.c_str() + offset); + EncodingUtils::DecodeUTF8Char(previous_char_ptr, &previous_char); + // We return false if it is a latin letter or a currency symbol. 
+ if (IsCurrencySymbol(previous_char) || IsLatinLetter(previous_char)) { + return false; + } + } + size_t lastCharIndex = offset + candidate.length(); + if (lastCharIndex < text_.length()) { + char32 next_char; + const char* next_char_ptr = + EncodingUtils::AdvanceOneUTF8Character( + text_.c_str() + lastCharIndex - 1); + EncodingUtils::DecodeUTF8Char(next_char_ptr, &next_char); + if (IsCurrencySymbol(next_char) || IsLatinLetter(next_char)) { + return false; + } + } + } + + PhoneNumber number; + if (phone_util_.Parse(candidate, preferred_region_, &number) != + PhoneNumberUtil::NO_PARSING_ERROR) { + return false; + } + if (VerifyAccordingToLeniency(leniency_, number, candidate)) { + match->set_start(offset); + match->set_raw_string(candidate); + match->set_number(number); + return true; + } + return false; +} + +// Helper method to replace the verification method for each enum in the Java +// version. +bool PhoneNumberMatcher::VerifyAccordingToLeniency( + Leniency leniency, const PhoneNumber& number, + const string& candidate) const { + switch (leniency) { + case PhoneNumberMatcher::POSSIBLE: + return phone_util_.IsPossibleNumber(number); + case PhoneNumberMatcher::VALID: + if (!phone_util_.IsValidNumber(number)) { + return false; + } + return ContainsOnlyValidXChars(number, candidate, phone_util_); + case PhoneNumberMatcher::STRICT_GROUPING: { + if (!phone_util_.IsValidNumber(number) || + !ContainsOnlyValidXChars(number, candidate, phone_util_) || + // Two or more slashes were present. + FindNth(candidate, '/', 2) != string::npos) { + return false; + } + // TODO(lararennie,shaopengjia): Evaluate how this works for other locales + // (testing has been limited to NANPA regions) and optimise if necessary. 
+ string normalized_candidate = + NormalizeUTF8::NormalizeDecimalDigits(candidate); + vector formatted_number_groups; + GetNationalNumberGroups(phone_util_, number, &formatted_number_groups); + size_t from_index = 0; + // Check each group of consecutive digits are not broken into separate + // groups in the normalized_candidate string. + for (size_t i = 0; i < formatted_number_groups.size(); ++i) { + // Fails if the substring of normalized_candidate starting from + // from_index doesn't contain the consecutive digits in digit_group. + from_index = normalized_candidate.find(formatted_number_groups.at(i), + from_index); + if (from_index == string::npos) { + return false; + } + // Moves from_index forward. + from_index += formatted_number_groups.at(i).length(); + if (i == 0 && from_index < normalized_candidate.length()) { + // We are at the position right after the NDC. Note although + // normalized_candidate might contain non-ASCII formatting characters, + // they won't be treated as ASCII digits when converted to a char. + if (isdigit(normalized_candidate.at(from_index))) { + // This means there is no formatting symbol after the NDC. In this + // case, we only accept the number if there is no formatting + // symbol at all in the number, except for extensions. + string national_significant_number; + phone_util_.GetNationalSignificantNumber( + number, &national_significant_number); + return HasPrefixString( + normalized_candidate.substr( + from_index - formatted_number_groups.at(i).length()), + national_significant_number); + } + } + } + // The check here makes sure that we haven't mistakenly already used the + // extension to match the last group of the subscriber number. Note the + // extension cannot have formatting in-between digits. 
+ return + normalized_candidate.substr(from_index).find(number.extension()) != + string::npos; + } + case PhoneNumberMatcher::EXACT_GROUPING: { + if (!phone_util_.IsValidNumber(number) || + !ContainsOnlyValidXChars(number, candidate, phone_util_) || + // Two or more slashes were present. + FindNth(candidate, '/', 2) != string::npos) { + return false; + } + // TODO(lararennie,shaopengjia): Evaluate how this works for other locales + // (testing has been limited to NANPA regions) and optimise if necessary. + vector candidate_groups; + string normalized_candidate = + NormalizeUTF8::NormalizeDecimalDigits(candidate); + const scoped_ptr candidate_number( + reg_exps_->regexp_factory_->CreateInput(normalized_candidate)); + string digit_block; + while (reg_exps_->capturing_ascii_digits_pattern_->FindAndConsume( + candidate_number.get(), + &digit_block)) { + candidate_groups.push_back(digit_block); + } + + // Set this to the last group, skipping it if the number has an extension. + int candidate_number_group_index = + number.has_extension() ? candidate_groups.size() - 2 + : candidate_groups.size() - 1; + // First we check if the national significant number is formatted as a + // block. We use contains and not equals, since the national significant + // number may be present with a prefix such as a national number prefix, + // or the country code itself. + string national_significant_number; + phone_util_.GetNationalSignificantNumber(number, + &national_significant_number); + if (candidate_groups.size() == 1 || + candidate_groups.at(candidate_number_group_index).find( + national_significant_number) != string::npos) { + return true; + } + vector formatted_number_groups; + GetNationalNumberGroups(phone_util_, number, &formatted_number_groups); + // Starting from the end, go through in reverse, excluding the first + // group, and check the candidate and number groups are the same. 
+ for (int formatted_number_group_index = + (formatted_number_groups.size() - 1); + formatted_number_group_index > 0 && + candidate_number_group_index >= 0; + --formatted_number_group_index, --candidate_number_group_index) { + if (candidate_groups.at(candidate_number_group_index) != + formatted_number_groups.at(formatted_number_group_index)) { + return false; + } + } + // Now check the first group. There may be a national prefix at the start, + // so we only check that the candidate group ends with the formatted + // number group. + return (candidate_number_group_index >= 0 && + HasSuffixString(candidate_groups.at(candidate_number_group_index), + formatted_number_groups.at(0))); + } + default: + LOG(ERROR) << "No implementation defined for verification for leniency " + << static_cast(leniency); + return false; + } +} + +bool PhoneNumberMatcher::ExtractInnerMatch(const string& candidate, int offset, + PhoneNumberMatch* match) { + DCHECK(match); + // Try removing either the first or last "group" in the number and see if this + // gives a result. We consider white space to be a possible indication of + // the start or end of the phone number. + scoped_ptr candidate_input( + reg_exps_->regexp_factory_->CreateInput(candidate)); + if (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), + NULL)) { + // Try the first group by itself. + int group_start_index = + candidate.length() - candidate_input->ToString().length(); + string first_group_only = candidate.substr(0, group_start_index); + phone_util_.TrimUnwantedEndChars(&first_group_only); + bool success = ParseAndVerify(first_group_only, offset, match); + if (success) { + return true; + } + --max_tries_; + + // Try the rest of the candidate without the first group. 
+ string without_first_group(candidate_input->ToString()); + phone_util_.TrimUnwantedEndChars(&without_first_group); + success = + ParseAndVerify(without_first_group, offset + group_start_index, match); + if (success) { + return true; + } + --max_tries_; + + if (max_tries_ > 0) { + while (reg_exps_->group_separator_->FindAndConsume(candidate_input.get(), + NULL)) { + // Find the last group. + } + int last_group_start = + candidate.length() - candidate_input->ToString().length(); + string without_last_group = candidate.substr(0, last_group_start); + phone_util_.TrimUnwantedEndChars(&without_last_group); + if (without_last_group == first_group_only) { + // If there are only two groups, then the group "without the last group" + // is the same as the first group. In these cases, we don't want to + // re-check the number group, so we exit already. + return false; + } + success = ParseAndVerify(without_last_group, offset, match); + if (success) { + return true; + } + --max_tries_; + } + } + return false; +} + +bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset, + PhoneNumberMatch* match) { + DCHECK(match); + // Skip a match that is more likely a publication page reference or a date. + if (reg_exps_->pub_pages_->PartialMatch(candidate) || + reg_exps_->slash_separated_dates_->PartialMatch(candidate)) { + return false; + } + + // Try to come up with a valid match given the entire candidate. + if (ParseAndVerify(candidate, offset, match)) { + return true; + } + + // If that failed, try to find an "inner match" - there might be a phone + // number within this candidate. 
+ return ExtractInnerMatch(candidate, offset, match); +} + +bool PhoneNumberMatcher::HasNext() { + if (state_ == NOT_READY) { + PhoneNumberMatch temp_match; + if (!Find(search_index_, &temp_match)) { + state_ = DONE; + } else { + last_match_.reset(new PhoneNumberMatch(temp_match.start(), + temp_match.raw_string(), + temp_match.number())); + search_index_ = last_match_->end(); + state_ = READY; + } + } + return state_ == READY; +} + +bool PhoneNumberMatcher::Next(PhoneNumberMatch* match) { + DCHECK(match); + // Check the state and find the next match as a side-effect if necessary. + if (!HasNext()) { + return false; + } + match->CopyFrom(*last_match_); + state_ = NOT_READY; + last_match_.reset(NULL); + return true; +} + +bool PhoneNumberMatcher::Find(int index, PhoneNumberMatch* match) { + DCHECK(match); + + // Scans text_ from index for candidates found by pattern_ and returns the + // first one that ExtractMatch() validates. Every rejected candidate + // decrements max_tries_. + scoped_ptr text( + reg_exps_->regexp_factory_for_pattern_->CreateInput(text_.substr(index))); + string candidate; + while ((max_tries_ > 0) && + reg_exps_->pattern_->FindAndConsume(text.get(), &candidate)) { + // Position of the candidate within text_. Assumes text->ToString() yields + // the part of the input that has not been consumed yet -- TODO confirm. + int start = text_.length() - text->ToString().length() - candidate.length(); + // Check for extra numbers at the end. + reg_exps_->capture_up_to_second_number_start_pattern_-> + PartialMatch(candidate, &candidate); + if (ExtractMatch(candidate, start, match)) { + return true; + } + + // NOTE(review): index is a by-value parameter that is not read again in + // this function, so this assignment has no visible effect on the scan. + index = start + candidate.length(); + --max_tries_; + } + return false; +} + +} // namespace phonenumbers +} // namespace i18n diff --git a/cpp/src/phonenumbers/phonenumbermatcher.h b/cpp/src/phonenumbers/phonenumbermatcher.h new file mode 100644 index 000000000..0eb013d98 --- /dev/null +++ b/cpp/src/phonenumbers/phonenumbermatcher.h @@ -0,0 +1,158 @@ +// Copyright (C) 2011 The Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: Lara Rennie +// Author: Tao Huang +// +// This is a direct port from PhoneNumberMatcher.java. +// Changes to this class should also happen to the Java version, whenever it +// makes sense. + +#ifndef I18N_PHONENUMBERS_PHONENUMBERMATCHER_H_ +#define I18N_PHONENUMBERS_PHONENUMBERMATCHER_H_ + +#include + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "phonenumbers/regexp_adapter.h" + +namespace i18n { +namespace phonenumbers { + +using std::string; + +class PhoneNumber; +class PhoneNumberMatch; +class PhoneNumberMatcherRegExps; +class PhoneNumberUtil; + +class PhoneNumberMatcher { + friend class PhoneNumberMatcherTest; + public: + // Leniency when finding potential phone numbers in text segments. The levels + // here are ordered in increasing strictness. + enum Leniency { + // Phone numbers accepted are possible, but not necessarily valid. + POSSIBLE, + // Phone numbers accepted are possible and valid. + VALID, + // Phone numbers accepted are valid and are grouped in a possible way for + // this locale. For example, a US number written as "65 02 53 00 00" is not + // accepted at this leniency level, whereas "650 253 0000" or "6502530000" + // are. Numbers with more than one '/' symbol are also dropped at this + // level. + // Warning: This and the next level might result in lower coverage + // especially for regions outside of country code "+1". + STRICT_GROUPING, + // Phone numbers accepted are valid and are grouped in the same way that we + // would have formatted it, or as a single block. 
For example, a US number + // written as "650 2530000" is not accepted at this leniency level, whereas + // "650 253 0000" or "6502530000" are. + EXACT_GROUPING, + }; + + // Constructs a phone number matcher. + PhoneNumberMatcher(const PhoneNumberUtil& util, + const string& text, + const string& region_code, + Leniency leniency, + int max_tries); + + // Wrapper to construct a phone number matcher, with no limitation on the + // number of retries and VALID Leniency. + PhoneNumberMatcher(const string& text, + const string& region_code); + + ~PhoneNumberMatcher(); + + // Returns true if the text sequence has another match. + bool HasNext(); + + // Gets next match from text sequence. Returns false if there is none. + bool Next(PhoneNumberMatch* match); + + private: + // The potential states of a PhoneNumberMatcher. + enum State { + NOT_READY, + READY, + DONE, + }; + + // Attempts to find the next phone number in the searched text on or after + // "index". Returns true if a match is found, otherwise returns false. + bool Find(int index, PhoneNumberMatch* match); + + // Attempts to extract a match from candidate. Returns true if the match was + // found, otherwise returns false. The value "offset" refers to the start + // index of the candidate string within the overall text. + bool ExtractMatch(const string& candidate, int offset, + PhoneNumberMatch* match); + + // Attempts to extract a match from a candidate string if the whole candidate + // does not qualify as a match. Returns true if a match is found, otherwise + // returns false. + bool ExtractInnerMatch(const string& candidate, int offset, + PhoneNumberMatch* match); + + // Parses a phone number from the candidate using PhoneNumberUtil::Parse() and + // verifies it matches the requested leniency. 
If parsing and verification + // succeed, returns true, otherwise this method returns false. + bool ParseAndVerify(const string& candidate, int offset, + PhoneNumberMatch* match); + + // Returns true if number, as written in candidate, passes the checks for the + // given leniency level. An unhandled level is logged and yields false. + bool VerifyAccordingToLeniency(Leniency leniency, const PhoneNumber& number, + const string& candidate) const; + + // Helper method to determine if a character is a Latin-script letter or not. + // For our purposes, combining marks should also return true since we assume + // they have been added to a preceding Latin character. + static bool IsLatinLetter(char32 letter); + + // Helper class holding useful regular expressions. + const PhoneNumberMatcherRegExps* reg_exps_; + + // The phone number utility. + const PhoneNumberUtil& phone_util_; + + // The text searched for phone numbers. + const string text_; + + // The region (country) to assume for phone numbers without an international + // prefix. + const string preferred_region_; + + // The degree of validation requested. + Leniency leniency_; + + // The maximum number of retries after matching an invalid number. + int max_tries_; + + // The iteration tristate. + State state_; + + // The last successful match, NULL unless in State.READY. + scoped_ptr last_match_; + + // The next index to start searching at. Undefined in State.DONE. 
+ int search_index_; + + DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcher); +}; + +} // namespace phonenumbers +} // namespace i18n + +#endif // I18N_PHONENUMBERS_PHONENUMBERMATCHER_H_ diff --git a/cpp/src/phonenumbers/phonenumberutil.h b/cpp/src/phonenumbers/phonenumberutil.h index bd9cfb7e6..6bc802250 100644 --- a/cpp/src/phonenumbers/phonenumberutil.h +++ b/cpp/src/phonenumbers/phonenumberutil.h @@ -67,6 +67,9 @@ class PhoneNumberUtil : public Singleton { friend class Singleton; #endif friend class AsYouTypeFormatter; + friend class PhoneNumberMatcher; + friend class PhoneNumberMatcherRegExps; + friend class PhoneNumberMatcherTest; friend class PhoneNumberUtilTest; public: ~PhoneNumberUtil(); diff --git a/cpp/src/phonenumbers/stringutil.cc b/cpp/src/phonenumbers/stringutil.cc index e9d7a8816..8d021a455 100644 --- a/cpp/src/phonenumbers/stringutil.cc +++ b/cpp/src/phonenumbers/stringutil.cc @@ -14,6 +14,7 @@ // Author: Philippe Liard +#include #include #include #include @@ -23,6 +24,7 @@ namespace i18n { namespace phonenumbers { +using std::equal; using std::stringstream; string operator+(const string& s, int n) { @@ -54,6 +56,43 @@ string SimpleItoa(uint64 n) { return GenericSimpleItoa(n); } +bool HasPrefixString(const string& s, const string& prefix) { + return s.size() >= prefix.size() && + equal(s.begin(), s.begin() + prefix.size(), prefix.begin()); +} + +size_t FindNth(const string& s, char c, int n) { + size_t pos = string::npos; + + for (int i = 0; i < n; ++i) { + pos = s.find_first_of(c, pos + 1); + if (pos == string::npos) { + break; + } + } + return pos; +} + +void SplitStringUsing(const string& s, const string& delimiter, + vector* result) { + assert(result); + size_t start_pos = 0; + size_t find_pos = string::npos; + if (delimiter.empty()) { + return; + } + while ((find_pos = s.find(delimiter, start_pos)) != string::npos) { + const string substring = s.substr(start_pos, find_pos - start_pos); + if (!substring.empty()) { + result->push_back(substring); + } 
+ start_pos = find_pos + delimiter.length(); + } + if (start_pos != s.length()) { + result->push_back(s.substr(start_pos)); + } +} + void StripString(string* s, const char* remove, char replacewith) { const char* str_start = s->c_str(); const char* str = str_start; @@ -252,6 +291,25 @@ string StrCat(const StringHolder& s1, const StringHolder& s2, return result; } +string StrCat(const StringHolder& s1, const StringHolder& s2, + const StringHolder& s3, const StringHolder& s4, + const StringHolder& s5, const StringHolder& s6, + const StringHolder& s7, const StringHolder& s8) { + string result; + result.reserve(s1.Length() + s2.Length() + s3.Length() + s4.Length() + + s5.Length() + s6.Length() + s7.Length() + s8.Length() + 1); + result += s1; + result += s2; + result += s3; + result += s4; + result += s5; + result += s6; + result += s7; + result += s8; + + return result; +} + string StrCat(const StringHolder& s1, const StringHolder& s2, const StringHolder& s3, const StringHolder& s4, const StringHolder& s5, const StringHolder& s6, diff --git a/cpp/src/phonenumbers/stringutil.h b/cpp/src/phonenumbers/stringutil.h index ee46bb2ad..6d90d003b 100644 --- a/cpp/src/phonenumbers/stringutil.h +++ b/cpp/src/phonenumbers/stringutil.h @@ -19,6 +19,7 @@ #include #include +#include #include "base/basictypes.h" @@ -26,6 +27,7 @@ namespace i18n { namespace phonenumbers { using std::string; +using std::vector; // Supports string("hello") + 10. string operator+(const string& s, int n); @@ -34,6 +36,18 @@ string operator+(const string& s, int n); string SimpleItoa(uint64 n); string SimpleItoa(int n); +// Returns whether the provided string starts with the supplied prefix. +bool HasPrefixString(const string& s, const string& prefix); + +// Returns the index of the nth occurrence of c in s or string::npos if fewer +// than n occurrences are present. +size_t FindNth(const string& s, char c, int n); + +// Splits a string using a string delimiter. 
Appends the components to the +// provided vector. Note that empty tokens are ignored. +void SplitStringUsing(const string& s, const string& delimiter, + vector* result); + // Replaces any occurrence of the character 'remove' (or the characters // in 'remove') with the character 'replacewith'. void StripString(string* s, const char* remove, char replacewith); @@ -113,6 +127,11 @@ string StrCat(const StringHolder& s1, const StringHolder& s2, const StringHolder& s5, const StringHolder& s6, const StringHolder& s7); +string StrCat(const StringHolder& s1, const StringHolder& s2, + const StringHolder& s3, const StringHolder& s4, + const StringHolder& s5, const StringHolder& s6, + const StringHolder& s7, const StringHolder& s8); + string StrCat(const StringHolder& s1, const StringHolder& s2, const StringHolder& s3, const StringHolder& s4, const StringHolder& s5, const StringHolder& s6, diff --git a/cpp/test/phonenumbers/asyoutypeformatter_test.cc b/cpp/test/phonenumbers/asyoutypeformatter_test.cc index 0d538084f..7f7a436fa 100644 --- a/cpp/test/phonenumbers/asyoutypeformatter_test.cc +++ b/cpp/test/phonenumbers/asyoutypeformatter_test.cc @@ -21,7 +21,7 @@ #include "base/logging.h" #include "base/memory/scoped_ptr.h" #include "phonenumbers/phonenumberutil.h" -#include "phonenumbers/region_code.h" +#include "phonenumbers/test_util.h" namespace i18n { namespace phonenumbers { @@ -61,7 +61,7 @@ TEST_F(AsYouTypeFormatterTest, ConvertUnicodeStringPosition) { } TEST_F(AsYouTypeFormatterTest, Constructor) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("US")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::US())); EXPECT_TRUE(GetCurrentMetadata() != NULL); } @@ -107,7 +107,7 @@ TEST_F(AsYouTypeFormatterTest, TooLongNumberMatchingMultipleLeadingDigits) { } TEST_F(AsYouTypeFormatterTest, AYTF_US) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("US")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::US())); EXPECT_EQ("6", 
formatter_->InputDigit('6', &result_)); EXPECT_EQ("65", formatter_->InputDigit('5', &result_)); @@ -203,7 +203,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_US) { } TEST_F(AsYouTypeFormatterTest, AYTF_USFullWidthCharacters) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("US")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::US())); EXPECT_EQ("\xEF\xBC\x96" /* "6" */, formatter_->InputDigit(UnicodeString("\xEF\xBC\x96" /* "6" */)[0], @@ -238,7 +238,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_USFullWidthCharacters) { } TEST_F(AsYouTypeFormatterTest, AYTF_USMobileShortCode) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("US")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::US())); EXPECT_EQ("*", formatter_->InputDigit('*', &result_)); EXPECT_EQ("*1", formatter_->InputDigit('1', &result_)); @@ -248,7 +248,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_USMobileShortCode) { } TEST_F(AsYouTypeFormatterTest, AYTF_USVanityNumber) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("US")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::US())); EXPECT_EQ("8", formatter_->InputDigit('8', &result_)); EXPECT_EQ("80", formatter_->InputDigit('0', &result_)); @@ -265,7 +265,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_USVanityNumber) { } TEST_F(AsYouTypeFormatterTest, AYTFAndRememberPositionUS) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("US")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::US())); EXPECT_EQ("1", formatter_->InputDigitAndRememberPosition('1', &result_)); EXPECT_EQ(1, formatter_->GetRememberedPosition()); @@ -407,7 +407,7 @@ TEST_F(AsYouTypeFormatterTest, AYTFAndRememberPositionUS) { } TEST_F(AsYouTypeFormatterTest, AYTF_GBFixedLine) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("GB")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::GB())); EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("02", formatter_->InputDigit('2', &result_)); @@ 
-425,7 +425,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_GBFixedLine) { } TEST_F(AsYouTypeFormatterTest, AYTF_GBTollFree) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("GB")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::GB())); EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("08", formatter_->InputDigit('8', &result_)); @@ -441,7 +441,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_GBTollFree) { } TEST_F(AsYouTypeFormatterTest, AYTF_GBPremiumRate) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("GB")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::GB())); EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("09", formatter_->InputDigit('9', &result_)); @@ -457,7 +457,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_GBPremiumRate) { } TEST_F(AsYouTypeFormatterTest, AYTF_NZMobile) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("NZ")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::NZ())); EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("02", formatter_->InputDigit('2', &result_)); @@ -473,7 +473,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_NZMobile) { } TEST_F(AsYouTypeFormatterTest, AYTF_DE) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("DE")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::DE())); EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("03", formatter_->InputDigit('3', &result_)); @@ -513,7 +513,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_DE) { } TEST_F(AsYouTypeFormatterTest, AYTF_AR) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("AR")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::AR())); EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("01", formatter_->InputDigit('1', &result_)); @@ -529,7 +529,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_AR) { } TEST_F(AsYouTypeFormatterTest, AYTF_ARMobile) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("AR")); + 
formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::AR())); EXPECT_EQ("+", formatter_->InputDigit('+', &result_)); EXPECT_EQ("+5", formatter_->InputDigit('5', &result_)); @@ -548,7 +548,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_ARMobile) { } TEST_F(AsYouTypeFormatterTest, AYTF_KR) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("KR")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::KR())); // +82 51 234 5678 EXPECT_EQ("+", formatter_->InputDigit('+', &result_)); @@ -639,7 +639,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_KR) { } TEST_F(AsYouTypeFormatterTest, AYTF_MX) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("MX")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::MX())); // +52 800 123 4567 EXPECT_EQ("+", formatter_->InputDigit('+', &result_)); @@ -724,7 +724,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_MX) { } TEST_F(AsYouTypeFormatterTest, AYTF_MultipleLeadingDigitPatterns) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("JP")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::JP())); // +81 50 2345 6789 EXPECT_EQ("+", formatter_->InputDigit('+', &result_)); @@ -773,7 +773,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_MultipleLeadingDigitPatterns) { } TEST_F(AsYouTypeFormatterTest, AYTF_LongIDD_AU) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("AU")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::AU())); // 0011 1 650 253 2250 EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("00", formatter_->InputDigit('0', &result_)); @@ -830,7 +830,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_LongIDD_AU) { } TEST_F(AsYouTypeFormatterTest, AYTF_LongIDD_KR) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("KR")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::KR())); // 00300 1 650 253 2250 EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("00", formatter_->InputDigit('0', &result_)); @@ -851,7 +851,7 @@ 
TEST_F(AsYouTypeFormatterTest, AYTF_LongIDD_KR) { } TEST_F(AsYouTypeFormatterTest, AYTF_LongNDD_KR) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("KR")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::KR())); // 08811-9876-7890 EXPECT_EQ("0", formatter_->InputDigit('0', &result_)); EXPECT_EQ("08", formatter_->InputDigit('8', &result_)); @@ -887,7 +887,7 @@ TEST_F(AsYouTypeFormatterTest, AYTF_LongNDD_KR) { } TEST_F(AsYouTypeFormatterTest, AYTF_LongNDD_SG) { - formatter_.reset(phone_util_.GetAsYouTypeFormatter("SG")); + formatter_.reset(phone_util_.GetAsYouTypeFormatter(RegionCode::SG())); // 777777 9876 7890 EXPECT_EQ("7", formatter_->InputDigit('7', &result_)); EXPECT_EQ("77", formatter_->InputDigit('7', &result_)); diff --git a/cpp/test/phonenumbers/phonenumbermatch_test.cc b/cpp/test/phonenumbers/phonenumbermatch_test.cc new file mode 100644 index 000000000..006411503 --- /dev/null +++ b/cpp/test/phonenumbers/phonenumbermatch_test.cc @@ -0,0 +1,91 @@ +// Copyright (C) 2011 The Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Author: Tao Huang +// +// Basic test cases for PhoneNumberMatch. 
+ +#include "phonenumbers/phonenumber.h" +#include "phonenumbers/phonenumbermatch.h" + +#include + +#include "phonenumbers/phonenumber.pb.h" + +namespace i18n { +namespace phonenumbers { + +TEST(PhoneNumberMatch, TestGetterMethods) { + PhoneNumber number; + const int start_index = 10; + const string raw_phone_number("1 800 234 45 67"); + PhoneNumberMatch match1(start_index, raw_phone_number, number); + + EXPECT_EQ(start_index, match1.start()); + EXPECT_EQ(start_index + raw_phone_number.length(), match1.end()); + EXPECT_EQ(raw_phone_number.length(), match1.length()); + EXPECT_EQ(raw_phone_number, match1.raw_string()); + + EXPECT_EQ("PhoneNumberMatch [10,25) 1 800 234 45 67", match1.ToString()); +} + +TEST(PhoneNumberMatch, TestEquals) { + PhoneNumber number; + PhoneNumberMatch match1(10, "1 800 234 45 67", number); + PhoneNumberMatch match2(10, "1 800 234 45 67", number); + + match2.set_start(11); + ASSERT_FALSE(match1.Equals(match2)); + match2.set_start(match1.start()); + EXPECT_TRUE(match1.Equals(match2)); + + PhoneNumber number2; + number2.set_raw_input("123"); + match2.set_number(number2); + ASSERT_FALSE(match1.Equals(match2)); + match2.set_number(match1.number()); + EXPECT_TRUE(ExactlySameAs(match1.number(), match2.number())); + EXPECT_TRUE(match1.Equals(match2)); + + match2.set_raw_string("123"); + ASSERT_FALSE(match1.Equals(match2)); +} + +TEST(PhoneNumberMatch, TestAssignmentOverload) { + PhoneNumber number; + PhoneNumberMatch match1(10, "1 800 234 45 67", number); + PhoneNumberMatch match2; + ASSERT_FALSE(match1.Equals(match2)); + + match2.CopyFrom(match1); + ASSERT_TRUE(match1.Equals(match2)); + + PhoneNumberMatch match3; + PhoneNumberMatch match4; + match4.CopyFrom(match2); + match3.CopyFrom(match2); + ASSERT_TRUE(match3.Equals(match4)); + ASSERT_TRUE(match4.Equals(match2)); +} + +TEST(PhoneNumberMatch, TestCopyConstructor) { + PhoneNumber number; + PhoneNumberMatch match1(10, "1 800 234 45 67", number); + PhoneNumberMatch match2; + 
match2.CopyFrom(match1); + ASSERT_TRUE(match1.Equals(match2)); +} + +} // namespace phonenumbers +} // namespace i18n diff --git a/cpp/test/phonenumbers/phonenumbermatcher_test.cc b/cpp/test/phonenumbers/phonenumbermatcher_test.cc new file mode 100644 index 000000000..cd58811a4 --- /dev/null +++ b/cpp/test/phonenumbers/phonenumbermatcher_test.cc @@ -0,0 +1,1022 @@ +// Copyright (C) 2011 The Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Lara Rennie + +#include "phonenumbers/phonenumbermatcher.h" + +#include +#include + +#include +#include + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "base/memory/singleton.h" +#include "phonenumbers/phonenumber.h" +#include "phonenumbers/phonenumber.pb.h" +#include "phonenumbers/phonenumbermatch.h" +#include "phonenumbers/phonenumberutil.h" +#include "phonenumbers/stringutil.h" +#include "phonenumbers/test_util.h" + +namespace i18n { +namespace phonenumbers { + +using std::string; +using std::vector; +using icu::UnicodeString; + +namespace { +// Small class that holds the context of the number we are testing against. The +// test will insert the phone number to be found between leading_text and +// trailing_text. 
+struct NumberContext { + string leading_text_; + string trailing_text_; + NumberContext(const string& leading_text, const string& trailing_text) + : leading_text_(leading_text), + trailing_text_(trailing_text) { + } +}; + +// Small class that holds the number we want to test and the region for which it +// should be valid. +struct NumberTest { + string raw_string_; + string region_; + + string ToString() const { + return StrCat(raw_string_, " (", region_, ")"); + } + + NumberTest(const string& raw_string, const string& region) + : raw_string_(raw_string), + region_(region) { + } +}; +} // namespace + +class PhoneNumberMatcherTest : public testing::Test { + protected: + PhoneNumberMatcherTest() + : phone_util_(*PhoneNumberUtil::GetInstance()), + matcher_(phone_util_, "", + RegionCode::US(), + PhoneNumberMatcher::VALID, 5), + offset_(0) { + } + + bool IsLatinLetter(char32 letter) { + return PhoneNumberMatcher::IsLatinLetter(letter); + } + + bool ExtractMatch(const string& text, PhoneNumberMatch* match) { + return matcher_.ExtractMatch(text, offset_, match); + } + + PhoneNumberMatcher* GetMatcherWithLeniency( + const string& text, const string& region, + PhoneNumberMatcher::Leniency leniency) const { + return new PhoneNumberMatcher(phone_util_, text, region, leniency, + 100 /* max_tries */); + } + + // Tests each number in the test cases provided is found in its entirety for + // the specified leniency level. 
+ void DoTestNumberMatchesForLeniency( + const vector& test_cases, + PhoneNumberMatcher::Leniency leniency) const { + scoped_ptr matcher; + for (vector::const_iterator test = test_cases.begin(); + test != test_cases.end(); ++test) { + matcher.reset(GetMatcherWithLeniency( + test->raw_string_, test->region_, leniency)); + EXPECT_TRUE(matcher->HasNext()) + << "No match found in " << test->ToString() + << " for leniency: " << leniency; + if (matcher->HasNext()) { + PhoneNumberMatch match; + matcher->Next(&match); + EXPECT_EQ(test->raw_string_, match.raw_string()) + << "Found wrong match in test " << test->ToString() + << ". Found " << match.raw_string(); + } + } + } + + // Tests no number in the test cases provided is found for the specified + // leniency level. + void DoTestNumberNonMatchesForLeniency( + const vector& test_cases, + PhoneNumberMatcher::Leniency leniency) const { + scoped_ptr matcher; + for (vector::const_iterator test = test_cases.begin(); + test != test_cases.end(); ++test) { + matcher.reset(GetMatcherWithLeniency( + test->raw_string_, test->region_, leniency)); + EXPECT_FALSE(matcher->HasNext()) << "Match found in " << test->ToString() + << " for leniency: " << leniency; + } + } + + // Asserts that another number can be found in "text" starting at "index", and + // that its corresponding range is [start, end). + void AssertEqualRange(const string& text, int index, int start, int end) { + string sub = text.substr(index); + PhoneNumberMatcher matcher(phone_util_, sub, RegionCode::NZ(), + PhoneNumberMatcher::POSSIBLE, + 1000000 /* max_tries */); + PhoneNumberMatch match; + ASSERT_TRUE(matcher.HasNext()); + matcher.Next(&match); + EXPECT_EQ(start - index, match.start()); + EXPECT_EQ(end - index, match.end()); + EXPECT_EQ(sub.substr(match.start(), match.length()), match.raw_string()); + } + + // Tests numbers found by the PhoneNumberMatcher in various textual contexts. 
+ void DoTestFindInContext(const string& number, + const string& default_country) { + FindPossibleInContext(number, default_country); + + PhoneNumber parsed; + phone_util_.Parse(number, default_country, &parsed); + if (phone_util_.IsValidNumber(parsed)) { + FindValidInContext(number, default_country); + } + } + + // Helper method which tests the contexts provided and ensures that: + // -- if is_valid is true, they all find a test number inserted in the middle + // when leniency of matching is set to VALID; else no test number should be + // extracted at that leniency level + // -- if is_possible is true, they all find a test number inserted in the + // middle when leniency of matching is set to POSSIBLE; else no test number + // should be extracted at that leniency level + void FindMatchesInContexts(const vector& contexts, + bool is_valid, bool is_possible, + const string& region, const string& number) { + if (is_valid) { + DoTestInContext(number, region, contexts, PhoneNumberMatcher::VALID); + } else { + for (vector::const_iterator it = contexts.begin(); + it != contexts.end(); ++it) { + string text = StrCat(it->leading_text_, number, it->trailing_text_); + // The (text, region) constructor searches with VALID leniency. + PhoneNumberMatcher matcher(text, region); + EXPECT_FALSE(matcher.HasNext()); + } + } + if (is_possible) { + DoTestInContext(number, region, contexts, PhoneNumberMatcher::POSSIBLE); + } else { + for (vector::const_iterator it = contexts.begin(); + it != contexts.end(); ++it) { + string text = StrCat(it->leading_text_, number, it->trailing_text_); + PhoneNumberMatcher matcher(phone_util_, text, region, + PhoneNumberMatcher::POSSIBLE, + 10000); // max_tries. + EXPECT_FALSE(matcher.HasNext()); + } + } + } + + // Variant of FindMatchesInContexts that uses a default number and region. 
+ void FindMatchesInContexts(const vector& contexts, + bool is_valid, bool is_possible) { + const string& region = RegionCode::US(); + const string number("415-666-7777"); + + FindMatchesInContexts(contexts, is_valid, is_possible, region, number); + } + + // Tests valid numbers in contexts that should pass for + // PhoneNumberMatcher::POSSIBLE. + void FindPossibleInContext(const string& number, + const string& default_country) { + vector context_pairs; + context_pairs.push_back(NumberContext("", "")); // no context + context_pairs.push_back(NumberContext(" ", "\t")); // whitespace only + context_pairs.push_back(NumberContext("Hello ", "")); // no context at end + // No context at start. + context_pairs.push_back(NumberContext("", " to call me!")); + context_pairs.push_back(NumberContext("Hi there, call ", " to reach me!")); + // With commas. + context_pairs.push_back(NumberContext("Hi there, call ", ", or don't")); + // Three examples without whitespace around the number. + context_pairs.push_back(NumberContext("Hi call", "")); + context_pairs.push_back(NumberContext("", "forme")); + context_pairs.push_back(NumberContext("Hi call", "forme")); + // With other small numbers. + context_pairs.push_back(NumberContext("It's cheap! Call ", " before 6:30")); + // With a second number later. + context_pairs.push_back(NumberContext("Call ", " or +1800-123-4567!")); + // With a Month-Day date. + context_pairs.push_back(NumberContext("Call me on June 21 at", "")); + // With publication pages. + context_pairs.push_back(NumberContext( + "As quoted by Alfonso 12-15 (2009), you may call me at ", "")); + context_pairs.push_back(NumberContext( + "As quoted by Alfonso et al. 12-15 (2009), you may call me at ", "")); + // With dates, written in the American style. + context_pairs.push_back(NumberContext( + "As I said on 03/10/2011, you may call me at ", "")); + // With trailing numbers after a comma. The 45 should not be considered an + // extension. 
+ context_pairs.push_back(NumberContext("", ", 45 days a year")); + // With a postfix stripped off as it looks like the start of another number. + context_pairs.push_back(NumberContext("Call ", "/x12 more")); + + DoTestInContext(number, default_country, context_pairs, + PhoneNumberMatcher::POSSIBLE); + } + + // Tests valid numbers in contexts that fail for PhoneNumberMatcher::POSSIBLE + // but are valid for PhoneNumberMatcher::VALID. + void FindValidInContext(const string& number, const string& default_country) { + vector context_pairs; + // With other small numbers. + context_pairs.push_back(NumberContext("It's only 9.99! Call ", " to buy")); + // With a number Day.Month.Year date. + context_pairs.push_back(NumberContext("Call me on 21.6.1984 at ", "")); + // With a number Month/Day date. + context_pairs.push_back(NumberContext("Call me on 06/21 at ", "")); + // With a number Day.Month date. + context_pairs.push_back(NumberContext("Call me on 21.6. at ", "")); + // With a number Month/Day/Year date. 
+ context_pairs.push_back(NumberContext("Call me on 06/21/84 at ", "")); + + DoTestInContext(number, default_country, context_pairs, + PhoneNumberMatcher::VALID); + } + + void DoTestInContext(const string& number, const string& default_country, + const vector& context_pairs, + PhoneNumberMatcher::Leniency leniency) { + for (vector::const_iterator it = context_pairs.begin(); + it != context_pairs.end(); ++it) { + string prefix = it->leading_text_; + string text = StrCat(prefix, number, it->trailing_text_); + + int start = prefix.length(); + int end = start + number.length(); + PhoneNumberMatcher matcher(phone_util_, text, default_country, leniency, + 1000000 /* max_tries */); + PhoneNumberMatch match; + ASSERT_TRUE(matcher.HasNext()) + << "Did not find a number in '" << text << "'; expected '" + << number << "'"; + matcher.Next(&match); + + string extracted = text.substr(match.start(), match.length()); + EXPECT_EQ(start, match.start()); + EXPECT_EQ(end, match.end()); + EXPECT_EQ(number, extracted); + EXPECT_EQ(extracted, match.raw_string()) + << "Unexpected phone region in '" << text << "'; extracted '" + << extracted << "'"; + EnsureTermination(text, default_country, leniency); + } + } + + // Exhaustively searches for phone numbers from each index within "text" to + // test that finding matches always terminates. + void EnsureTermination(const string& text, const string& default_country, + PhoneNumberMatcher::Leniency leniency) { + for (size_t index = 0; index <= text.length(); ++index) { + string sub = text.substr(index); + // Iterates over all matches. 
+ PhoneNumberMatcher matcher(phone_util_, text, default_country, leniency, + 1000000 /* max_tries */); + string matches; + PhoneNumberMatch match; + int match_count = 0; + while (matcher.HasNext()) { + matcher.Next(&match); + StrAppend(&matches, ",", match.ToString()); + ++match_count; + } + // We should not ever find more than 10 matches in a single candidate text + // in these test cases, so we check here that the matcher was limited by + // the number of matches, rather than by max_tries. + ASSERT_LT(match_count, 10); + } + } + + const PhoneNumberUtil& phone_util_; + + private: + PhoneNumberMatcher matcher_; + int offset_; +}; + +// See PhoneNumberUtilTest::ParseNationalNumber. +TEST_F(PhoneNumberMatcherTest, FindNationalNumber) { + // Same cases as in ParseNationalNumber. + DoTestFindInContext("033316005", RegionCode::NZ()); + DoTestFindInContext("33316005", RegionCode::NZ()); + // National prefix attached and some formatting present. + DoTestFindInContext("03-331 6005", RegionCode::NZ()); + DoTestFindInContext("03 331 6005", RegionCode::NZ()); + // Testing international prefixes. + // Should strip country code. + DoTestFindInContext("0064 3 331 6005", RegionCode::NZ()); + // Try again, but this time we have an international number with Region Code + // US. It should recognize the country code and parse accordingly. + DoTestFindInContext("01164 3 331 6005", RegionCode::US()); + DoTestFindInContext("+64 3 331 6005", RegionCode::US()); + + DoTestFindInContext("64(0)64123456", RegionCode::NZ()); + // Check that using a "/" is fine in a phone number. + DoTestFindInContext("123/45678", RegionCode::DE()); + DoTestFindInContext("123-456-7890", RegionCode::US()); +} + +// See PhoneNumberUtilTest::ParseWithInternationalPrefixes. 
+TEST_F(PhoneNumberMatcherTest, FindWithInternationalPrefixes) {
+  DoTestFindInContext("+1 (650) 333-6000", RegionCode::NZ());
+  DoTestFindInContext("1-650-333-6000", RegionCode::US());
+  // Calling the US number from Singapore by using different service providers
+  // 1st test: calling using SingTel IDD service (IDD is 001)
+  DoTestFindInContext("0011-650-333-6000", RegionCode::SG());
+  // 2nd test: calling using StarHub IDD service (IDD is 008)
+  DoTestFindInContext("0081-650-333-6000", RegionCode::SG());
+  // 3rd test: calling using SingTel V019 service (IDD is 019)
+  DoTestFindInContext("0191-650-333-6000", RegionCode::SG());
+  // Calling the US number from Poland
+  DoTestFindInContext("0~01-650-333-6000", RegionCode::PL());
+  // Using "++" at the start.
+  DoTestFindInContext("++1 (650) 333-6000", RegionCode::PL());
+  // Using a full-width plus sign.
+  DoTestFindInContext(
+      "\xEF\xBC\x8B""1 (650) 333-6000" /* "+1 (650) 333-6000" */,
+      RegionCode::SG());
+  // The whole number, including punctuation, is here represented in full-width
+  // form.
+  DoTestFindInContext(
+      /* "+1 (650) 333-6000" */
+      "\xEF\xBC\x8B\xEF\xBC\x91\xE3\x80\x80\xEF\xBC\x88\xEF\xBC\x96\xEF\xBC\x95"
+      "\xEF\xBC\x90\xEF\xBC\x89\xE3\x80\x80\xEF\xBC\x93\xEF\xBC\x93\xEF\xBC\x93"
+      "\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x90\xEF\xBC\x90\xEF\xBC\x90",
+      RegionCode::SG());
+}
+
+// See PhoneNumberUtilTest::ParseWithLeadingZero.
+TEST_F(PhoneNumberMatcherTest, FindWithLeadingZero) {
+  DoTestFindInContext("+39 02-36618 300", RegionCode::NZ());
+  DoTestFindInContext("02-36618 300", RegionCode::IT());
+  DoTestFindInContext("312 345 678", RegionCode::IT());
+}
+
+// See PhoneNumberUtilTest::ParseNationalNumberArgentina.
+TEST_F(PhoneNumberMatcherTest, FindNationalNumberArgentina) {
+  // Test parsing mobile numbers of Argentina.
+  DoTestFindInContext("+54 9 343 555 1212", RegionCode::AR());
+  DoTestFindInContext("0343 15 555 1212", RegionCode::AR());
+
+  DoTestFindInContext("+54 9 3715 65 4320", RegionCode::AR());
+  DoTestFindInContext("03715 15 65 4320", RegionCode::AR());
+
+  // Test parsing fixed-line numbers of Argentina.
+  DoTestFindInContext("+54 11 3797 0000", RegionCode::AR());
+  DoTestFindInContext("011 3797 0000", RegionCode::AR());
+
+  DoTestFindInContext("+54 3715 65 4321", RegionCode::AR());
+  DoTestFindInContext("03715 65 4321", RegionCode::AR());
+
+  DoTestFindInContext("+54 23 1234 0000", RegionCode::AR());
+  DoTestFindInContext("023 1234 0000", RegionCode::AR());
+}
+
+// See PhoneNumberUtilTest::ParseWithXInNumber.
+TEST_F(PhoneNumberMatcherTest, FindWithXInNumber) {
+  DoTestFindInContext("(0xx) 123456789", RegionCode::AR());
+  // A case where x denotes both carrier codes and extension symbol.
+  DoTestFindInContext("(0xx) 123456789 x 1234", RegionCode::AR());
+
+  // This test is intentionally constructed such that the number of digit after
+  // xx is larger than 7, so that the number won't be mistakenly treated as an
+  // extension, as we allow extensions up to 7 digits. This assumption is okay
+  // for now as all the countries where a carrier selection code is written in
+  // the form of xx have a national significant number of length larger than 7.
+  DoTestFindInContext("011xx5481429712", RegionCode::US());
+}
+
+// See PhoneNumberUtilTest::ParseNumbersMexico.
+TEST_F(PhoneNumberMatcherTest, FindNumbersMexico) {
+  // Test parsing fixed-line numbers of Mexico.
+  DoTestFindInContext("+52 (449)978-0001", RegionCode::MX());
+  DoTestFindInContext("01 (449)978-0001", RegionCode::MX());
+  DoTestFindInContext("(449)978-0001", RegionCode::MX());
+
+  // Test parsing mobile numbers of Mexico.
+  DoTestFindInContext("+52 1 33 1234-5678", RegionCode::MX());
+  DoTestFindInContext("044 (33) 1234-5678", RegionCode::MX());
+  DoTestFindInContext("045 33 1234-5678", RegionCode::MX());
+}
+
+// See PhoneNumberUtilTest::ParseNumbersWithPlusWithNoRegion.
+TEST_F(PhoneNumberMatcherTest, FindNumbersWithPlusWithNoRegion) {
+  // RegionCode::ZZ() is allowed only if the number starts with a '+' - then the
+  // country code can be calculated.
+  DoTestFindInContext("+64 3 331 6005", RegionCode::ZZ());
+}
+
+// See PhoneNumberUtilTest::ParseExtensions.
+TEST_F(PhoneNumberMatcherTest, FindExtensions) {
+  DoTestFindInContext("03 331 6005 ext 3456", RegionCode::NZ());
+  DoTestFindInContext("03-3316005x3456", RegionCode::NZ());
+  DoTestFindInContext("03-3316005 int.3456", RegionCode::NZ());
+  DoTestFindInContext("03 3316005 #3456", RegionCode::NZ());
+  DoTestFindInContext("0~0 1800 7493 524", RegionCode::PL());
+  DoTestFindInContext("(1800) 7493.524", RegionCode::US());
+  // Check that the last instance of an extension token is matched.
+  DoTestFindInContext("0~0 1800 7493 524 ~1234", RegionCode::PL());
+  // Verifying bug-fix where the last digit of a number was previously omitted
+  // if it was a 0 when extracting the extension. Also verifying a few different
+  // cases of extensions.
+  DoTestFindInContext("+44 2034567890x456", RegionCode::NZ());
+  DoTestFindInContext("+44 2034567890x456", RegionCode::GB());
+  DoTestFindInContext("+44 2034567890 x456", RegionCode::GB());
+  DoTestFindInContext("+44 2034567890 X456", RegionCode::GB());
+  // NOTE(review): the next three cases read identically here; presumably they
+  // were meant to differ in the amount of whitespace around the "X" - confirm.
+  DoTestFindInContext("+44 2034567890 X 456", RegionCode::GB());
+  DoTestFindInContext("+44 2034567890 X 456", RegionCode::GB());
+  DoTestFindInContext("+44 2034567890 X 456", RegionCode::GB());
+
+  DoTestFindInContext("(800) 901-3355 x 7246433", RegionCode::US());
+  DoTestFindInContext("(800) 901-3355 , ext 7246433", RegionCode::US());
+  DoTestFindInContext("(800) 901-3355 ,extension 7246433", RegionCode::US());
+  // The next test differs from PhoneNumberUtil -> when matching we don't
+  // consider a lone comma to indicate an extension, although we accept it when
+  // parsing.
+  DoTestFindInContext("(800) 901-3355 ,x 7246433", RegionCode::US());
+  DoTestFindInContext("(800) 901-3355 ext: 7246433", RegionCode::US());
+}
+
+TEST_F(PhoneNumberMatcherTest, FindInterspersedWithSpace) {
+  DoTestFindInContext("0 3 3 3 1 6 0 0 5", RegionCode::NZ());
+}
+
+// Test matching behavior when starting in the middle of a phone number.
+TEST_F(PhoneNumberMatcherTest, IntermediateParsePositions) {
+  string text = "Call 033316005 or 032316005!";
+  //             |    |    |    |    |    |
+  //             0    5   10   15   20   25
+
+  // Iterate over all possible indices.
+  for (int i = 0; i <= 5; ++i) {
+    AssertEqualRange(text, i, 5, 14);
+  }
+  // 7 and 8 digits in a row are still parsed as number.
+  AssertEqualRange(text, 6, 6, 14);
+  AssertEqualRange(text, 7, 7, 14);
+  // Anything smaller is skipped to the second instance.
+  for (int i = 8; i <= 19; ++i) {
+    AssertEqualRange(text, i, 19, 28);
+  }
+}
+
+TEST_F(PhoneNumberMatcherTest, MatchWithSurroundingZipcodes) {
+  string number = "415-666-7777";
+  string zip_preceding =
+      StrCat("My address is CA 34215 - ", number, " is my number.");
+  PhoneNumber expected_result;
+  phone_util_.Parse(number, RegionCode::US(), &expected_result);
+
+  scoped_ptr<PhoneNumberMatcher> matcher(
+      GetMatcherWithLeniency(zip_preceding, RegionCode::US(),
+                             PhoneNumberMatcher::VALID));
+
+  PhoneNumberMatch match;
+  EXPECT_TRUE(matcher->HasNext());
+  EXPECT_TRUE(matcher->Next(&match));
+  EXPECT_EQ(expected_result, match.number());
+  EXPECT_EQ(number, match.raw_string());
+
+  // Now repeat, but this time the phone number has spaces in it. It should
+  // still be found.
+  number = "(415) 666 7777";
+
+  string zip_following =
+      StrCat("My number is ", number, ". 34215 is my zip-code.");
+  matcher.reset(
+      GetMatcherWithLeniency(zip_following, RegionCode::US(),
+                             PhoneNumberMatcher::VALID));
+
+  PhoneNumberMatch match_with_spaces;
+  EXPECT_TRUE(matcher->HasNext());
+  EXPECT_TRUE(matcher->Next(&match_with_spaces));
+  EXPECT_EQ(expected_result, match_with_spaces.number());
+  EXPECT_EQ(number, match_with_spaces.raw_string());
+}
+
+TEST_F(PhoneNumberMatcherTest, IsLatinLetter) {
+  EXPECT_TRUE(IsLatinLetter('c'));
+  EXPECT_TRUE(IsLatinLetter('C'));
+  EXPECT_TRUE(IsLatinLetter(UnicodeString("\xC3\x89" /* "É" */)[0]));
+  // Combining acute accent.
+  EXPECT_TRUE(IsLatinLetter(UnicodeString("\xCC\x81")[0]));
+  EXPECT_FALSE(IsLatinLetter(':'));
+  EXPECT_FALSE(IsLatinLetter('5'));
+  EXPECT_FALSE(IsLatinLetter('-'));
+  EXPECT_FALSE(IsLatinLetter('.'));
+  EXPECT_FALSE(IsLatinLetter(' '));
+  EXPECT_FALSE(IsLatinLetter(UnicodeString("\xE6\x88\x91" /* "我" */)[0]));
+}
+
+TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingLatinChars) {
+  vector<NumberContext> possible_only_contexts;
+  possible_only_contexts.push_back(NumberContext("abc", "def"));
+  possible_only_contexts.push_back(NumberContext("abc", ""));
+  possible_only_contexts.push_back(NumberContext("", "def"));
+  possible_only_contexts.push_back(NumberContext("\xC3\x89" /* "É" */, ""));
+  // e with an acute accent decomposed (with combining mark).
+  possible_only_contexts.push_back(
+      NumberContext("\x20\x22\xCC\x81""e\xCC\x81" /* "́e\xCC\x81" */, ""));
+
+  // Numbers should not be considered valid, if they are surrounded by Latin
+  // characters, but should be considered possible.
+  FindMatchesInContexts(possible_only_contexts, false, true);
+}
+
+TEST_F(PhoneNumberMatcherTest, MoneyNotSeenAsPhoneNumber) {
+  vector<NumberContext> possible_only_contexts;
+  possible_only_contexts.push_back(NumberContext("$", ""));
+  possible_only_contexts.push_back(NumberContext("", "$"));
+  possible_only_contexts.push_back(NumberContext("\xC2\xA3" /* "£" */, ""));
+  possible_only_contexts.push_back(NumberContext("\xC2\xA5" /* "¥" */, ""));
+  FindMatchesInContexts(possible_only_contexts, false, true);
+}
+
+TEST_F(PhoneNumberMatcherTest, PhoneNumberWithLeadingOrTrailingMoneyMatches) {
+  vector<NumberContext> contexts;
+  contexts.push_back(NumberContext("$20 ", ""));
+  contexts.push_back(NumberContext("", " 100$"));
+  // Because of the space after the 20 (or before the 100) these dollar amounts
+  // should not stop the actual number from being found.
+  FindMatchesInContexts(contexts, true, true);
+}
+
+TEST_F(PhoneNumberMatcherTest,
+       MatchesWithSurroundingLatinCharsAndLeadingPunctuation) {
+  vector<NumberContext> possible_only_contexts;
+  // Contexts with trailing characters. Leading characters are okay here since
+  // the numbers we will insert start with punctuation, but trailing characters
+  // are still not allowed.
+  possible_only_contexts.push_back(NumberContext("abc", "def"));
+  possible_only_contexts.push_back(NumberContext("", "def"));
+  possible_only_contexts.push_back(NumberContext("", "\xC3\x89" /* "É" */));
+
+  // Numbers should not be considered valid, if they have trailing Latin
+  // characters, but should be considered possible.
+  string number_with_plus = "+14156667777";
+  string number_with_brackets = "(415)6667777";
+  FindMatchesInContexts(possible_only_contexts, false, true, RegionCode::US(),
+                        number_with_plus);
+  FindMatchesInContexts(possible_only_contexts, false, true, RegionCode::US(),
+                        number_with_brackets);
+
+  vector<NumberContext> valid_contexts;
+  valid_contexts.push_back(NumberContext("abc", ""));
+  valid_contexts.push_back(NumberContext("\xC3\x89" /* "É" */, ""));
+  valid_contexts.push_back(
+      NumberContext("\xC3\x89" /* "É" */, "."));  // Trailing punctuation.
+  // Trailing white-space.
+  valid_contexts.push_back(NumberContext("\xC3\x89" /* "É" */, " def"));
+
+  // Numbers should be considered valid, since they start with punctuation.
+  FindMatchesInContexts(valid_contexts, true, true, RegionCode::US(),
+                        number_with_plus);
+  FindMatchesInContexts(valid_contexts, true, true, RegionCode::US(),
+                        number_with_brackets);
+}
+
+TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingChineseChars) {
+  vector<NumberContext> valid_contexts;
+  valid_contexts.push_back(NumberContext(
+      /* "我的电话号码是" */
+      "\xE6\x88\x91\xE7\x9A\x84\xE7\x94\xB5\xE8\xAF\x9D\xE5\x8F\xB7\xE7\xA0\x81"
+      "\xE6\x98\xAF", ""));
+  valid_contexts.push_back(NumberContext(
+      "",
+      /* "是我的电话号码" */
+      "\xE6\x98\xAF\xE6\x88\x91\xE7\x9A\x84\xE7\x94\xB5\xE8\xAF\x9D\xE5\x8F\xB7"
+      "\xE7\xA0\x81"));
+  valid_contexts.push_back(NumberContext(
+      "\xE8\xAF\xB7\xE6\x8B\xA8\xE6\x89\x93" /* "请拨打" */,
+      "\xE6\x88\x91\xE5\x9C\xA8\xE6\x98\x8E\xE5\xA4\xA9" /* "我在明天" */));
+
+  // Numbers should be considered valid, since they are surrounded by Chinese.
+  FindMatchesInContexts(valid_contexts, true, true);
+}
+
+TEST_F(PhoneNumberMatcherTest, MatchesWithSurroundingPunctuation) {
+  vector<NumberContext> valid_contexts;
+  // At end of text.
+  valid_contexts.push_back(NumberContext("My number-", ""));
+  // At start of text.
+  valid_contexts.push_back(NumberContext("", ".Nice day."));
+  // Punctuation surround number.
+  valid_contexts.push_back(NumberContext("Tel:", "."));
+  // White-space is also fine.
+  valid_contexts.push_back(NumberContext("Tel: ", " on Saturdays."));
+
+  // Numbers should be considered valid, since they are surrounded by
+  // punctuation.
+  FindMatchesInContexts(valid_contexts, true, true);
+}
+
+TEST_F(PhoneNumberMatcherTest,
+       MatchesMultiplePhoneNumbersSeparatedByPhoneNumberPunctuation) {
+  const string text = "Call 650-253-4561 -- 455-234-3451";
+  const string& region = RegionCode::US();
+  PhoneNumber number1;
+  number1.set_country_code(phone_util_.GetCountryCodeForRegion(region));
+  // The ULL suffix keeps the literal well-formed where "long" is 32 bits.
+  number1.set_national_number(6502534561ULL);
+  PhoneNumberMatch match1(5, "650-253-4561", number1);
+
+  PhoneNumber number2;
+  number2.set_country_code(phone_util_.GetCountryCodeForRegion(region));
+  number2.set_national_number(4552343451ULL);
+  PhoneNumberMatch match2(21, "455-234-3451", number2);
+
+  PhoneNumberMatcher matcher(
+      phone_util_, text, region, PhoneNumberMatcher::VALID, 100);
+
+  PhoneNumberMatch actual_match1;
+  PhoneNumberMatch actual_match2;
+  // Fail right here if fewer than two numbers are found, rather than comparing
+  // against matches that were never filled in.
+  ASSERT_TRUE(matcher.Next(&actual_match1));
+  ASSERT_TRUE(matcher.Next(&actual_match2));
+  EXPECT_TRUE(match1.Equals(actual_match1))
+      << "Got: " << actual_match1.ToString();
+  EXPECT_TRUE(match2.Equals(actual_match2))
+      << "Got: " << actual_match2.ToString();
+}
+
+TEST_F(PhoneNumberMatcherTest,
+       DoesNotMatchMultiplePhoneNumbersSeparatedWithNoWhiteSpace) {
+  const string text = "Call 650-253-4561--455-234-3451";
+  const string& region = RegionCode::US();
+  PhoneNumberMatcher matcher(
+      phone_util_, text, region, PhoneNumberMatcher::VALID, 100);
+  EXPECT_FALSE(matcher.HasNext());
+}
+
+// Strings with number-like things that shouldn't be found under any level.
+static const NumberTest IMPOSSIBLE_CASES[] = {
+  NumberTest("12345", RegionCode::US()),
+  NumberTest("23456789", RegionCode::US()),
+  NumberTest("234567890112", RegionCode::US()),
+  NumberTest("650+253+1234", RegionCode::US()),
+  NumberTest("3/10/1984", RegionCode::CA()),
+  NumberTest("03/27/2011", RegionCode::US()),
+  NumberTest("31/8/2011", RegionCode::US()),
+  NumberTest("1/12/2011", RegionCode::US()),
+  NumberTest("10/12/82", RegionCode::DE()),
+};
+
+// Strings with number-like things that should only be found under "possible".
+static const NumberTest POSSIBLE_ONLY_CASES[] = {
+  NumberTest("abc8002345678", RegionCode::US()),
+  // US numbers cannot start with 7 in the test metadata to be valid.
+  NumberTest("7121115678", RegionCode::US()),
+  // 'X' should not be found in numbers at leniencies stricter than POSSIBLE,
+  // unless it represents a carrier code or extension.
+  NumberTest("1650 x 253 - 1234", RegionCode::US()),
+  NumberTest("650 x 253 - 1234", RegionCode::US()),
+  NumberTest("650x2531234", RegionCode::US()),
+};
+
+// Strings with number-like things that should only be found up to and including
+// the "valid" leniency level.
+static const NumberTest VALID_CASES[] = {
+  NumberTest("65 02 53 00 00.", RegionCode::US()),
+  NumberTest("6502 538365", RegionCode::US()),
+  // 2 slashes are illegal at higher levels.
+  NumberTest("650//253-1234", RegionCode::US()),
+  NumberTest("650/253/1234", RegionCode::US()),
+  NumberTest("9002309. 158", RegionCode::US()),
+  NumberTest("21 7/8 - 14 12/34 - 5", RegionCode::US()),
+  NumberTest("12.1 - 23.71 - 23.45", RegionCode::US()),
+  NumberTest("1979-2011 100%", RegionCode::US()),
+  NumberTest("800 234 1 111x1111", RegionCode::US()),
+  // National number in wrong format.
+  NumberTest("+494949-4-94", RegionCode::DE()),
+  NumberTest(
+      /* "415666-7777" */
+      "\xEF\xBC\x94\xEF\xBC\x91\xEF\xBC\x95\xEF\xBC\x96\xEF\xBC\x96\xEF\xBC\x96"
+      "\x2D\xEF\xBC\x97\xEF\xBC\x97\xEF\xBC\x97\xEF\xBC\x97", RegionCode::US()),
+};
+
+// Strings with number-like things that should only be found up to and including
+// the "strict_grouping" leniency level.
+static const NumberTest STRICT_GROUPING_CASES[] = {
+  NumberTest("(415) 6667777", RegionCode::US()),
+  NumberTest("415-6667777", RegionCode::US()),
+  // Should be found by strict grouping but not exact grouping, as the last two
+  // groups are formatted together as a block.
+  NumberTest("800-2491234", RegionCode::DE()),
+};
+
+// Strings with number-like things that should be found at all levels.
+static const NumberTest EXACT_GROUPING_CASES[] = { + NumberTest( + /* "4156667777" */ + "\xEF\xBC\x94\xEF\xBC\x91\xEF\xBC\x95\xEF\xBC\x96\xEF\xBC\x96\xEF\xBC\x96" + "\xEF\xBC\x97\xEF\xBC\x97\xEF\xBC\x97\xEF\xBC\x97", RegionCode::US()), + NumberTest( + /* "415-666-7777" */ + "\xEF\xBC\x94\xEF\xBC\x91\xEF\xBC\x95\xEF\xBC\x8D\xEF\xBC\x96\xEF\xBC\x96" + "\xEF\xBC\x96\xEF\xBC\x8D\xEF\xBC\x97\xEF\xBC\x97\xEF\xBC\x97" + "\xEF\xBC\x97", RegionCode::US()), + NumberTest("4156667777", RegionCode::US()), + NumberTest("4156667777 x 123", RegionCode::US()), + NumberTest("415-666-7777", RegionCode::US()), + NumberTest("415/666-7777", RegionCode::US()), + NumberTest("415-666-7777 ext. 503", RegionCode::US()), + NumberTest("1 415 666 7777 x 123", RegionCode::US()), + NumberTest("+1 415-666-7777", RegionCode::US()), + NumberTest("+494949 49", RegionCode::DE()), + NumberTest("+49-49-34", RegionCode::DE()), + NumberTest("+49-4931-49", RegionCode::DE()), + NumberTest("04931-49", RegionCode::DE()), // With National Prefix + NumberTest("+49-494949", RegionCode::DE()), // One group with country code + NumberTest("+49-494949 ext. 49", RegionCode::DE()), + NumberTest("+49494949 ext. 49", RegionCode::DE()), + NumberTest("0494949", RegionCode::DE()), + NumberTest("0494949 ext. 
49", RegionCode::DE()), +}; + +TEST_F(PhoneNumberMatcherTest, MatchesWithStrictGroupingLeniency) { + vector test_cases; + test_cases.insert(test_cases.begin(), STRICT_GROUPING_CASES, + STRICT_GROUPING_CASES + arraysize(STRICT_GROUPING_CASES)); + test_cases.insert(test_cases.begin(), EXACT_GROUPING_CASES, + EXACT_GROUPING_CASES + arraysize(EXACT_GROUPING_CASES)); + DoTestNumberMatchesForLeniency(test_cases, + PhoneNumberMatcher::STRICT_GROUPING); +} + +TEST_F(PhoneNumberMatcherTest, NonMatchesWithStrictGroupingLeniency) { + vector test_cases; + test_cases.insert(test_cases.begin(), IMPOSSIBLE_CASES, + IMPOSSIBLE_CASES + arraysize(IMPOSSIBLE_CASES)); + test_cases.insert(test_cases.begin(), POSSIBLE_ONLY_CASES, + POSSIBLE_ONLY_CASES + arraysize(POSSIBLE_ONLY_CASES)); + test_cases.insert(test_cases.begin(), VALID_CASES, + VALID_CASES + arraysize(VALID_CASES)); + DoTestNumberNonMatchesForLeniency(test_cases, + PhoneNumberMatcher::STRICT_GROUPING); +} + +TEST_F(PhoneNumberMatcherTest, MatchesWithExactGroupingLeniency) { + vector test_cases; + test_cases.insert(test_cases.begin(), EXACT_GROUPING_CASES, + EXACT_GROUPING_CASES + arraysize(EXACT_GROUPING_CASES)); + DoTestNumberMatchesForLeniency(test_cases, + PhoneNumberMatcher::EXACT_GROUPING); +} + +TEST_F(PhoneNumberMatcherTest, NonMatchesWithExactGroupingLeniency) { + vector test_cases; + test_cases.insert(test_cases.begin(), IMPOSSIBLE_CASES, + IMPOSSIBLE_CASES + arraysize(IMPOSSIBLE_CASES)); + test_cases.insert(test_cases.begin(), POSSIBLE_ONLY_CASES, + POSSIBLE_ONLY_CASES + arraysize(POSSIBLE_ONLY_CASES)); + test_cases.insert(test_cases.begin(), VALID_CASES, + VALID_CASES + arraysize(VALID_CASES)); + test_cases.insert(test_cases.begin(), STRICT_GROUPING_CASES, + STRICT_GROUPING_CASES + arraysize(STRICT_GROUPING_CASES)); + DoTestNumberNonMatchesForLeniency(test_cases, + PhoneNumberMatcher::EXACT_GROUPING); +} + +TEST_F(PhoneNumberMatcherTest, ExtractMatchIgnoresAmericanDates) { + PhoneNumberMatch match; + string text = 
"As I said on 03/10/2011, you may call me at "; + EXPECT_FALSE(ExtractMatch(text, &match)); + text = "As I said on 03/27/2011, you may call me at "; + EXPECT_FALSE(ExtractMatch(text, &match)); + text = "As I said on 31/8/2011, you may call me at "; + EXPECT_FALSE(ExtractMatch(text, &match)); + text = "As I said on 1/12/2011, you may call me at "; + EXPECT_FALSE(ExtractMatch(text, &match)); + text = "I was born on 10/12/82. Please call me at "; + EXPECT_FALSE(ExtractMatch(text, &match)); +} + +TEST_F(PhoneNumberMatcherTest, NonMatchingBracketsAreInvalid) { + // The digits up to the ", " form a valid US number, but it shouldn't be + // matched as one since there was a non-matching bracket present. + scoped_ptr matcher(GetMatcherWithLeniency( + "80.585 [79.964, 81.191]", RegionCode::US(), + PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); + + // The trailing "]" is thrown away before parsing, so the resultant number, + // while a valid US number, does not have matching brackets. + matcher.reset(GetMatcherWithLeniency( + "80.585 [79.964]", RegionCode::US(), PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); + + matcher.reset(GetMatcherWithLeniency( + "80.585 ((79.964)", RegionCode::US(), PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); + + // This case has too many sets of brackets to be valid. + matcher.reset(GetMatcherWithLeniency( + "(80).(585) (79).(9)64", RegionCode::US(), PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); +} + +TEST_F(PhoneNumberMatcherTest, NoMatchIfRegionIsUnknown) { + // Fail on non-international prefix if region code is ZZ. 
+ scoped_ptr matcher(GetMatcherWithLeniency( + "Random text body - number is 0331 6005, see you there", + RegionCode::ZZ(), PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); +} + +TEST_F(PhoneNumberMatcherTest, NoMatchInEmptyString) { + scoped_ptr matcher(GetMatcherWithLeniency( + "", RegionCode::US(), PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); + matcher.reset(GetMatcherWithLeniency(" ", RegionCode::US(), + PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); +} + +TEST_F(PhoneNumberMatcherTest, NoMatchIfNoNumber) { + scoped_ptr matcher(GetMatcherWithLeniency( + "Random text body - number is foobar, see you there", RegionCode::US(), + PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); +} + +TEST_F(PhoneNumberMatcherTest, Sequences) { + // Test multiple occurrences. + const string text = "Call 033316005 or 032316005!"; + const string& region = RegionCode::NZ(); + + PhoneNumber number1; + number1.set_country_code(phone_util_.GetCountryCodeForRegion(region)); + number1.set_national_number(33316005); + PhoneNumberMatch match1(5, "033316005", number1); + + PhoneNumber number2; + number2.set_country_code(phone_util_.GetCountryCodeForRegion(region)); + number2.set_national_number(32316005); + PhoneNumberMatch match2(19, "032316005", number2); + + PhoneNumberMatcher matcher( + phone_util_, text, region, PhoneNumberMatcher::POSSIBLE, 100); + + PhoneNumberMatch actual_match1; + PhoneNumberMatch actual_match2; + matcher.Next(&actual_match1); + matcher.Next(&actual_match2); + EXPECT_TRUE(match1.Equals(actual_match1)); + EXPECT_TRUE(match2.Equals(actual_match2)); +} + +TEST_F(PhoneNumberMatcherTest, MaxMatches) { + // Set up text with 100 valid phone numbers. + string numbers; + for (int i = 0; i < 100; ++i) { + numbers.append("My info: 415-666-7777,"); + } + + // Matches all 100. Max only applies to failed cases. 
+ PhoneNumber number; + phone_util_.Parse("+14156667777", RegionCode::US(), &number); + vector expected(100, number); + + PhoneNumberMatcher matcher( + phone_util_, numbers, RegionCode::US(), PhoneNumberMatcher::VALID, 10); + vector actual; + PhoneNumberMatch match; + while (matcher.HasNext()) { + matcher.Next(&match); + actual.push_back(match.number()); + } + EXPECT_EQ(expected, actual); +} + +TEST_F(PhoneNumberMatcherTest, MaxMatchesInvalid) { + // Set up text with 10 invalid phone numbers followed by 100 valid. + string numbers; + for (int i = 0; i < 10; ++i) { + numbers.append("My address 949-8945-0"); + } + for (int i = 0; i < 100; ++i) { + numbers.append("My info: 415-666-7777,"); + } + + PhoneNumberMatcher matcher( + phone_util_, numbers, RegionCode::US(), PhoneNumberMatcher::VALID, 10); + EXPECT_FALSE(matcher.HasNext()); +} + +TEST_F(PhoneNumberMatcherTest, MaxMatchesMixed) { + // Set up text with 100 valid numbers inside an invalid number. + string numbers; + for (int i = 0; i < 100; ++i) { + numbers.append("My info: 415-666-7777 123 fake street"); + } + + PhoneNumber number; + phone_util_.Parse("+14156667777", RegionCode::ZZ(), &number); + vector expected(10, number); + + PhoneNumberMatcher matcher( + phone_util_, numbers, RegionCode::US(), PhoneNumberMatcher::VALID, 10); + vector actual; + PhoneNumberMatch match; + while (matcher.HasNext()) { + matcher.Next(&match); + actual.push_back(match.number()); + } + EXPECT_EQ(expected, actual); +} + +TEST_F(PhoneNumberMatcherTest, TestEmptyIteration) { + PhoneNumberMatch match; + scoped_ptr matcher( + GetMatcherWithLeniency("", RegionCode::GetUnknown(), + PhoneNumberMatcher::VALID)); + EXPECT_FALSE(matcher->HasNext()); + EXPECT_FALSE(matcher->HasNext()); + EXPECT_FALSE(matcher->Next(&match)); + EXPECT_FALSE(matcher->HasNext()); +} + +TEST_F(PhoneNumberMatcherTest, TestSingleIteration) { + PhoneNumberMatch match; + scoped_ptr matcher( + GetMatcherWithLeniency("+14156667777", RegionCode::GetUnknown(), + 
PhoneNumberMatcher::VALID)); + + // Try HasNext() twice to ensure it does not advance. + EXPECT_TRUE(matcher->HasNext()); + EXPECT_TRUE(matcher->HasNext()); + EXPECT_TRUE(matcher->Next(&match)); + + EXPECT_FALSE(matcher->HasNext()); + EXPECT_FALSE(matcher->Next(&match)); +} + +TEST_F(PhoneNumberMatcherTest, TestSingleIteration_WithNextOnly) { + PhoneNumberMatch match; + scoped_ptr matcher( + GetMatcherWithLeniency("+14156667777", RegionCode::GetUnknown(), + PhoneNumberMatcher::VALID)); + EXPECT_TRUE(matcher->Next(&match)); + EXPECT_FALSE(matcher->Next(&match)); +} + +TEST_F(PhoneNumberMatcherTest, TestDoubleIteration) { + PhoneNumberMatch match; + scoped_ptr matcher( + GetMatcherWithLeniency("+14156667777 foobar +14156667777 ", + RegionCode::GetUnknown(), + PhoneNumberMatcher::VALID)); + + // Double HasNext() to ensure it does not advance. + EXPECT_TRUE(matcher->HasNext()); + EXPECT_TRUE(matcher->HasNext()); + EXPECT_TRUE(matcher->Next(&match)); + EXPECT_TRUE(matcher->HasNext()); + EXPECT_TRUE(matcher->HasNext()); + EXPECT_TRUE(matcher->Next(&match)); + + EXPECT_FALSE(matcher->HasNext()); + EXPECT_FALSE(matcher->Next(&match)); + EXPECT_FALSE(matcher->HasNext()); +} + +TEST_F(PhoneNumberMatcherTest, TestDoubleIteration_WithNextOnly) { + PhoneNumberMatch match; + scoped_ptr matcher( + GetMatcherWithLeniency("+14156667777 foobar +14156667777 ", + RegionCode::GetUnknown(), + PhoneNumberMatcher::VALID)); + + EXPECT_TRUE(matcher->Next(&match)); + EXPECT_TRUE(matcher->Next(&match)); + EXPECT_FALSE(matcher->Next(&match)); +} + +} // namespace phonenumbers +} // namespace i18n diff --git a/cpp/test/phonenumbers/phonenumberutil_test.cc b/cpp/test/phonenumbers/phonenumberutil_test.cc index f8a2b4a21..f35035057 100644 --- a/cpp/test/phonenumbers/phonenumberutil_test.cc +++ b/cpp/test/phonenumbers/phonenumberutil_test.cc @@ -26,6 +26,7 @@ #include "phonenumbers/phonenumber.h" #include "phonenumbers/phonenumber.pb.h" #include "phonenumbers/phonenumberutil.h" +#include 
"phonenumbers/test_util.h" namespace i18n { namespace phonenumbers { @@ -36,112 +37,6 @@ using std::ostream; using google::protobuf::RepeatedPtrField; -namespace { - -// Class containing string constants of region codes for easier testing. This is -// intended to replace region_code.h for testing in this file, with more -// constants defined. -class RegionCode { - public: - static const string& AD() { - static const string s = "AD"; - return s; - } - - static const string& AO() { - static const string s = "AO"; - return s; - } - - static const string& AR() { - static const string s = "AR"; - return s; - } - - static const string& AU() { - static const string s = "AU"; - return s; - } - - static const string& BS() { - static const string s = "BS"; - return s; - } - - static const string& CN() { - static const string s = "CN"; - return s; - } - - static const string& CS() { - static const string s = "CS"; - return s; - } - - static const string& DE() { - static const string s = "DE"; - return s; - } - - static const string& GB() { - static const string s = "GB"; - return s; - } - - static const string& IT() { - static const string s = "IT"; - return s; - } - - static const string& KR() { - static const string s = "KR"; - return s; - } - - static const string& MX() { - static const string s = "MX"; - return s; - } - - static const string& NZ() { - static const string s = "NZ"; - return s; - } - - static const string& PL() { - static const string s = "PL"; - return s; - } - - static const string& RE() { - static const string s = "RE"; - return s; - } - - static const string& SG() { - static const string s = "SG"; - return s; - } - - static const string& US() { - static const string s = "US"; - return s; - } - - static const string& YT() { - static const string s = "YT"; - return s; - } - - // Returns a region code string representing the "unknown" region. 
- static const string& GetUnknown() { - static const string s = "ZZ"; - return s; - } -}; - -} // namespace - class PhoneNumberUtilTest : public testing::Test { protected: PhoneNumberUtilTest() : phone_util_(*PhoneNumberUtil::GetInstance()) { @@ -219,40 +114,6 @@ class PhoneNumberUtilTest : public testing::Test { const PhoneNumberUtil& phone_util_; }; -// Provides PhoneNumber comparison operators to support the use of EXPECT_EQ and -// EXPECT_NE in the unittests. -bool operator==(const PhoneNumber& number1, const PhoneNumber& number2) { - return ExactlySameAs(number1, number2); -} - -bool operator!=(const PhoneNumber& number1, const PhoneNumber& number2) { - return !(number1 == number2); -} - -// Needed by Google Test to display errors. -ostream& operator<<(ostream& os, const PhoneNumber& number) { - os << endl - << "country_code: " << number.country_code() << endl - << "national_number: " << number.national_number() << endl; - if (number.has_extension()) { - os << "extension: " << number.extension() << endl; - } - if (number.has_italian_leading_zero()) { - os << "italian_leading_zero: " << number.italian_leading_zero() << endl; - } - if (number.has_raw_input()) { - os << "raw_input: " << number.raw_input() << endl; - } - if (number.has_country_code_source()) { - os << "country_code_source: " << number.country_code_source() << endl; - } - if (number.has_preferred_domestic_carrier_code()) { - os << "preferred_domestic_carrier_code: " - << number.preferred_domestic_carrier_code() << endl; - } - return os; -} - TEST_F(PhoneNumberUtilTest, GetSupportedRegions) { set regions; diff --git a/cpp/test/phonenumbers/stringutil_test.cc b/cpp/test/phonenumbers/stringutil_test.cc index 3f68719cc..3231fb36a 100644 --- a/cpp/test/phonenumbers/stringutil_test.cc +++ b/cpp/test/phonenumbers/stringutil_test.cc @@ -14,9 +14,15 @@ // Author: Philippe Liard +#include "phonenumbers/stringutil.h" + +#include +#include + #include -#include "phonenumbers/stringutil.h" +using std::string; 
+using std::vector; namespace i18n { namespace phonenumbers { @@ -31,6 +37,55 @@ TEST(StringUtilTest, SimpleItoa) { EXPECT_EQ("10", SimpleItoa(10)); } +TEST(StringUtilTest, HasPrefixString) { + EXPECT_TRUE(HasPrefixString("hello world", "hello")); + EXPECT_FALSE(HasPrefixString("hello world", "hellO")); // The comparison is case-sensitive. +} + +TEST(StringUtilTest, FindNthWithEmptyString) { + EXPECT_EQ(string::npos, FindNth("", 'a', 1)); +} + +TEST(StringUtilTest, FindNthWithNNegative) { + EXPECT_EQ(string::npos, FindNth("hello world", 'o', -1)); +} + +TEST(StringUtilTest, FindNthWithNTooHigh) { + EXPECT_EQ(string::npos, FindNth("hello world", 'o', 3)); +} + +TEST(StringUtilTest, FindNth) { + EXPECT_EQ(7U, FindNth("hello world", 'o', 2)); // Index of the second 'o'; unsigned literal avoids a sign-compare warning. +} + +TEST(StringUtilTest, SplitStringUsingWithEmptyString) { + vector result; + SplitStringUsing("", ":", &result); + EXPECT_EQ(0U, result.size()); // size() is unsigned, so compare with an unsigned literal. +} + +TEST(StringUtilTest, SplitStringUsingWithEmptyDelimiter) { + vector result; + SplitStringUsing("hello", "", &result); + EXPECT_EQ(0U, result.size()); // An empty delimiter is expected to yield no tokens. +} + +TEST(StringUtilTest, SplitStringUsing) { + vector result; + SplitStringUsing(":hello:world:", ":", &result); + EXPECT_EQ(2U, result.size()); // Leading and trailing delimiters add no empty tokens. + EXPECT_EQ("hello", result[0]); + EXPECT_EQ("world", result[1]); +} + +TEST(StringUtilTest, SplitStringUsingIgnoresEmptyToken) { + vector result; + SplitStringUsing("hello::world", ":", &result); + EXPECT_EQ(2U, result.size()); // The empty token between the two delimiters is dropped. + EXPECT_EQ("hello", result[0]); + EXPECT_EQ("world", result[1]); +} + // Test TryStripPrefixString. TEST(StringUtilTest, TryStripPrefixString) { string s; @@ -205,6 +260,10 @@ TEST(StringUtilTest, StrCat) { s = StrCat("a", "b", "c", "d", "e", "f", "g"); EXPECT_EQ("abcdefg", s); + // Test with 8 arguments. + s = StrCat("a", "b", "c", "d", "e", "f", "g", "h"); + EXPECT_EQ("abcdefgh", s); + // Test with 9 arguments. 
s = StrCat("a", "b", "c", "d", "e", "f", "g", "h", "i"); EXPECT_EQ("abcdefghi", s); diff --git a/cpp/test/phonenumbers/test_util.cc b/cpp/test/phonenumbers/test_util.cc new file mode 100644 index 000000000..e4182186c --- /dev/null +++ b/cpp/test/phonenumbers/test_util.cc @@ -0,0 +1,63 @@ +// Copyright (C) 2011 The Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Philippe Liard + +#include +#include + +#include "phonenumbers/phonenumber.pb.h" +#include "phonenumbers/test_util.h" + +using std::cout; +using std::endl; + +namespace i18n { +namespace phonenumbers { + +ostream& operator<<(ostream& os, const PhoneNumber& number) { // Writes country code and national number, then every optional field that is set, one per line. + os << endl + << "country_code: " << number.country_code() << endl + << "national_number: " << number.national_number() << endl; + if (number.has_extension()) { + os << "extension: " << number.extension() << endl; + } + if (number.has_italian_leading_zero()) { + os << "italian_leading_zero: " << number.italian_leading_zero() << endl; + } + if (number.has_raw_input()) { + os << "raw_input: " << number.raw_input() << endl; + } + if (number.has_country_code_source()) { + os << "country_code_source: " << number.country_code_source() << endl; + } + if (number.has_preferred_domestic_carrier_code()) { + os << "preferred_domestic_carrier_code: " + << number.preferred_domestic_carrier_code() << endl; + } + return os; +} + +ostream& operator<<(ostream& os, const vector& numbers) { // Writes every element of numbers between "[" and "]". + os << "[" << endl; + for 
(vector::const_iterator it = numbers.begin(); + it != numbers.end(); ++it) { + os << *it; + } + os << endl << "]" << endl; + return os; +} + +} // namespace phonenumbers +} // namespace i18n diff --git a/cpp/test/phonenumbers/test_util.h b/cpp/test/phonenumbers/test_util.h new file mode 100644 index 000000000..561ebc7b8 --- /dev/null +++ b/cpp/test/phonenumbers/test_util.h @@ -0,0 +1,167 @@ +// Copyright (C) 2011 The Libphonenumber Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Philippe Liard + +#ifndef I18N_PHONENUMBERS_TEST_UTIL_H_ +#define I18N_PHONENUMBERS_TEST_UTIL_H_ + +#include +#include +#include + +#include "phonenumbers/phonenumber.h" + +namespace i18n { +namespace phonenumbers { + +using std::string; +using std::ostream; +using std::vector; + +class PhoneNumber; + +// Provides PhoneNumber comparison operators to support the use of EXPECT_EQ and +// EXPECT_NE in the unittests. +inline bool operator==(const PhoneNumber& number1, const PhoneNumber& number2) { + return ExactlySameAs(number1, number2); +} + +inline bool operator!=(const PhoneNumber& number1, const PhoneNumber& number2) { + return !(number1 == number2); +} + +// Needed by Google Test to display errors. +ostream& operator<<(ostream& os, const PhoneNumber& number); + +ostream& operator<<(ostream& os, const vector& numbers); + +// Class containing string constants of region codes for easier testing. 
+class RegionCode { + public: + static const string& AD() { + static const string s = "AD"; + return s; + } + + static const string& AO() { + static const string s = "AO"; + return s; + } + + static const string& AR() { + static const string s = "AR"; + return s; + } + + static const string& AU() { + static const string s = "AU"; + return s; + } + + static const string& BS() { + static const string s = "BS"; + return s; + } + + static const string& CA() { + static const string s = "CA"; + return s; + } + + static const string& CN() { + static const string s = "CN"; + return s; + } + + static const string& CS() { + static const string s = "CS"; + return s; + } + + static const string& DE() { + static const string s = "DE"; + return s; + } + + static const string& GB() { + static const string s = "GB"; + return s; + } + + static const string& IT() { + static const string s = "IT"; + return s; + } + + static const string& JP() { + static const string s = "JP"; + return s; + } + + static const string& KR() { + static const string s = "KR"; + return s; + } + + static const string& MX() { + static const string s = "MX"; + return s; + } + + static const string& NZ() { + static const string s = "NZ"; + return s; + } + + static const string& PL() { + static const string s = "PL"; + return s; + } + + static const string& RE() { + static const string s = "RE"; + return s; + } + + static const string& SG() { + static const string s = "SG"; + return s; + } + + static const string& US() { + static const string s = "US"; + return s; + } + + static const string& YT() { + static const string s = "YT"; + return s; + } + + // Returns a region code string representing the "unknown" region. 
+ static const string& GetUnknown() { + static const string s = "ZZ"; + return s; + } + + static const string& ZZ() { // Shorthand alias for GetUnknown(). + return GetUnknown(); + } +}; + +} // namespace phonenumbers +} // namespace i18n + +#endif // I18N_PHONENUMBERS_TEST_UTIL_H_ diff --git a/tools/script/continuous-integration.sh b/tools/script/continuous-integration.sh index 6b2d04c86..0fb674186 100755 --- a/tools/script/continuous-integration.sh +++ b/tools/script/continuous-integration.sh @@ -30,11 +30,19 @@ test_cpp_version() { CC_TEST_FILE=`mktemp`.cc CC_TEST_BINARY=`mktemp` CMAKE_FLAGS="$1" + # Write the program that tests the installation of the library to a temporary # source file. > $CC_TEST_FILE echo ' + #include + #include + + // Include all the public headers. #include + #include + #include + #include #include using i18n::phonenumbers::AsYouTypeFormatter; @@ -44,8 +52,11 @@ test_cpp_version() { PhoneNumberUtil* const phone_util = PhoneNumberUtil::GetInstance(); const scoped_ptr asytf( phone_util->GetAsYouTypeFormatter("US")); - return !(phone_util != NULL && asytf != NULL); + + assert(phone_util != NULL); + assert(asytf != NULL); }' + # Run the build and tests. ( rm -rf cpp/build /tmp/libphonenumber && @@ -64,8 +75,9 @@ test_cpp_version() { [ $STATUS -ne 0 ] && exit $STATUS } test_cpp_version '' -test_cpp_version '-DUSE_RE2=ON' +test_cpp_version '-DUSE_ICU_REGEXP=ON' test_cpp_version '-DUSE_LITE_METADATA=ON' +test_cpp_version '-DUSE_RE2=ON' test_cpp_version '-DUSE_STD_MAP=ON' # Test Java version using Ant.