From 3285ff2a4031c681aac1307b4760a82838df36c1 Mon Sep 17 00:00:00 2001 From: penmetsaa Date: Fri, 7 Jan 2022 16:20:17 +0530 Subject: [PATCH] Disallow non-utf8 chars as input to phonenumbermatcher API (#2707) --- cpp/src/phonenumbers/phonenumbermatcher.cc | 21 ++++++++++++++++--- cpp/src/phonenumbers/phonenumbermatcher.h | 9 +++++++- .../phonenumbers/phonenumbermatcher_test.cc | 17 +++++++++++++++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/cpp/src/phonenumbers/phonenumbermatcher.cc b/cpp/src/phonenumbers/phonenumbermatcher.cc index 0ce8fd9f4..5621c65d3 100644 --- a/cpp/src/phonenumbers/phonenumbermatcher.cc +++ b/cpp/src/phonenumbers/phonenumbermatcher.cc @@ -33,7 +33,6 @@ #include #include #include - #include #include "phonenumbers/alternate_format.h" @@ -52,6 +51,7 @@ #include "phonenumbers/regexp_adapter_icu.h" #include "phonenumbers/regexp_cache.h" #include "phonenumbers/stringutil.h" +#include "phonenumbers/utf/unicodetext.h" #ifdef I18N_PHONENUMBERS_USE_RE2 #include "phonenumbers/regexp_adapter_re2.h" @@ -407,7 +407,9 @@ PhoneNumberMatcher::PhoneNumberMatcher(const PhoneNumberUtil& util, max_tries_(max_tries), state_(NOT_READY), last_match_(NULL), - search_index_(0) { + search_index_(0), + is_input_valid_utf8_(true) { + is_input_valid_utf8_ = IsInputUtf8(); } PhoneNumberMatcher::PhoneNumberMatcher(const string& text, @@ -421,12 +423,20 @@ PhoneNumberMatcher::PhoneNumberMatcher(const string& text, max_tries_(numeric_limits::max()), state_(NOT_READY), last_match_(NULL), - search_index_(0) { + search_index_(0), + is_input_valid_utf8_(true) { + is_input_valid_utf8_ = IsInputUtf8(); } PhoneNumberMatcher::~PhoneNumberMatcher() { } +bool PhoneNumberMatcher::IsInputUtf8() { + UnicodeText number_as_unicode; + number_as_unicode.PointToUTF8(text_.c_str(), text_.size()); + return number_as_unicode.UTF8WasValid(); +} + // static bool PhoneNumberMatcher::IsLatinLetter(char32 letter) { // Combining marks are a subset of non-spacing-mark. 
@@ -626,6 +636,11 @@ bool PhoneNumberMatcher::ExtractMatch(const string& candidate, int offset, } bool PhoneNumberMatcher::HasNext() { + // Input should contain only UTF-8 characters. + if (!is_input_valid_utf8_) { + state_ = DONE; + return false; + } if (state_ == NOT_READY) { PhoneNumberMatch temp_match; if (!Find(search_index_, &temp_match)) { diff --git a/cpp/src/phonenumbers/phonenumbermatcher.h b/cpp/src/phonenumbers/phonenumbermatcher.h index 90cea9c15..b680d1b07 100644 --- a/cpp/src/phonenumbers/phonenumbermatcher.h +++ b/cpp/src/phonenumbers/phonenumbermatcher.h @@ -87,7 +87,8 @@ class PhoneNumberMatcher { ~PhoneNumberMatcher(); - // Returns true if the text sequence has another match. + // Returns true if the text sequence has another match. Returns false if not. + // Always returns false when input contains non-UTF-8 characters. bool HasNext(); // Gets next match from text sequence. @@ -101,6 +102,9 @@ class PhoneNumberMatcher { DONE, }; + // Checks whether the provided text_ is valid UTF-8. + bool IsInputUtf8(); + // Attempts to extract a match from a candidate string. Returns true if a // match is found, otherwise returns false. The value "offset" refers to the // start index of the candidate string within the overall text. @@ -202,6 +206,9 @@ class PhoneNumberMatcher { // The next index to start searching at. Undefined in State.DONE. int search_index_; + // True if the input text is valid UTF-8, false otherwise. 
+ bool is_input_valid_utf8_; + DISALLOW_COPY_AND_ASSIGN(PhoneNumberMatcher); }; diff --git a/cpp/test/phonenumbers/phonenumbermatcher_test.cc b/cpp/test/phonenumbers/phonenumbermatcher_test.cc index 78520608f..4b0f12767 100644 --- a/cpp/test/phonenumbers/phonenumbermatcher_test.cc +++ b/cpp/test/phonenumbers/phonenumbermatcher_test.cc @@ -1078,6 +1078,23 @@ TEST_F(PhoneNumberMatcherTest, NoMatchIfNoNumber) { EXPECT_FALSE(matcher->HasNext()); } +TEST_F(PhoneNumberMatcherTest, NoErrorWithSpecialCharacters) { + string stringWithSpecialCharacters = + "Myfuzzvar1152: \"My info:%415-666-7777 123 fake street\"\nfuzzvar1155: " + "47\nfuzzvar1158: %415-666-1234 " + "i18n_phonenumbers_Pho\356eNumberMatcher_Leniency_VALID_1" + "\nfuzzvar1159: 20316 info:%415-666-7777 123 fake str79ee\nt"; + string Numbers; + for (int i = 0; i < 100; ++i) + Numbers.append(stringWithSpecialCharacters); + scoped_ptr<PhoneNumberMatcher> matcher( + GetMatcherWithLeniency(Numbers, RegionCode::US(), + PhoneNumberMatcher::POSSIBLE)); + // Since the input text contains invalid UTF-8, we do not return + // any matches. + EXPECT_FALSE(matcher->HasNext()); +} + TEST_F(PhoneNumberMatcherTest, Sequences) { // Test multiple occurrences. const string text = "Call 033316005 or 032316005!";