From 96293eec3bcc985e15db8952dca43b1046a27e2d Mon Sep 17 00:00:00 2001 From: Neil Mayhew Date: Fri, 7 Jan 2022 01:52:02 -0700 Subject: [PATCH] Be stricter about invalid UTF-8 input (#2698) Keep track of whether unicode strings were created from valid UTF-8 and use this to abort operations that use these strings later. Typically, this is done by returning an empty string when a modified string is being produced, since an empty string is never a valid phone number. --- cpp/src/phonenumbers/normalize_utf8.h | 2 ++ cpp/src/phonenumbers/phonenumberutil.cc | 17 ++++++++++- cpp/src/phonenumbers/unicodestring.h | 3 ++ cpp/src/phonenumbers/utf/unicodetext.cc | 9 ++++-- cpp/src/phonenumbers/utf/unicodetext.h | 9 ++++-- cpp/test/phonenumbers/phonenumberutil_test.cc | 28 +++++++++++++++++++ cpp/test/phonenumbers/utf/unicodetext_test.cc | 1 + 7 files changed, 63 insertions(+), 6 deletions(-) diff --git a/cpp/src/phonenumbers/normalize_utf8.h b/cpp/src/phonenumbers/normalize_utf8.h index 08d44ea3c..ee1ea2f64 100644 --- a/cpp/src/phonenumbers/normalize_utf8.h +++ b/cpp/src/phonenumbers/normalize_utf8.h @@ -27,6 +27,8 @@ struct NormalizeUTF8 { string normalized; UnicodeText number_as_unicode; number_as_unicode.PointToUTF8(number.data(), static_cast(number.size())); + if (!number_as_unicode.UTF8WasValid()) + return normalized; // Return an empty result to indicate an error for (UnicodeText::const_iterator it = number_as_unicode.begin(); it != number_as_unicode.end(); ++it) { diff --git a/cpp/src/phonenumbers/phonenumberutil.cc b/cpp/src/phonenumbers/phonenumberutil.cc index 48366e082..a4a5173c2 100644 --- a/cpp/src/phonenumbers/phonenumberutil.cc +++ b/cpp/src/phonenumbers/phonenumberutil.cc @@ -331,6 +331,11 @@ void NormalizeHelper(const std::map& normalization_replacements, DCHECK(number); UnicodeText number_as_unicode; number_as_unicode.PointToUTF8(number->data(), static_cast(number->size())); + if (!number_as_unicode.UTF8WasValid()) { + // The input wasn't valid UTF-8. Produce an empty string to indicate an error. + number->clear(); + return; + } string normalized_number; char unicode_char[5]; for (UnicodeText::const_iterator it = number_as_unicode.begin(); @@ -973,6 +978,11 @@ void PhoneNumberUtil::TrimUnwantedEndChars(string* number) const { DCHECK(number); UnicodeText number_as_unicode; number_as_unicode.PointToUTF8(number->data(), static_cast(number->size())); + if (!number_as_unicode.UTF8WasValid()) { + // The input wasn't valid UTF-8. Produce an empty string to indicate an error. + number->clear(); + return; + } char current_char[5]; int len; UnicodeText::const_reverse_iterator reverse_it(number_as_unicode.end()); @@ -2312,6 +2322,11 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number, UnicodeText number_as_unicode; number_as_unicode.PointToUTF8(number.data(), static_cast(number.size())); + if (!number_as_unicode.UTF8WasValid()) { + // The input wasn't valid UTF-8. Produce an empty string to indicate an error. + extracted_number->clear(); + return; + } char current_char[5]; int len; UnicodeText::const_iterator it; @@ -2326,7 +2341,7 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number, if (it == number_as_unicode.end()) { // No valid start character was found. extracted_number should be set to // empty string. - extracted_number->assign(""); + extracted_number->clear(); return; } diff --git a/cpp/src/phonenumbers/unicodestring.h b/cpp/src/phonenumbers/unicodestring.h index de756d6f8..301dd82e1 100644 --- a/cpp/src/phonenumbers/unicodestring.h +++ b/cpp/src/phonenumbers/unicodestring.h @@ -92,6 +92,9 @@ class UnicodeString { text_.CopyUTF8(s, static_cast(len)); } + // Was this UnicodeString created from valid UTF-8? + bool UTF8WasValid() const { return text_.UTF8WasValid(); } + // Returns the substring located at [ start, start + length - 1 ] without // copying the underlying C string. If one of the provided parameters is out // of range, the function returns an empty unicode string. diff --git a/cpp/src/phonenumbers/utf/unicodetext.cc b/cpp/src/phonenumbers/utf/unicodetext.cc index 53f055af4..5050abf41 100644 --- a/cpp/src/phonenumbers/utf/unicodetext.cc +++ b/cpp/src/phonenumbers/utf/unicodetext.cc @@ -231,7 +231,8 @@ UnicodeText& UnicodeText::Copy(const UnicodeText& src) { UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { repr_.Copy(buffer, byte_length); - if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length); + if (!repr_.utf8_was_valid_) { LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); } @@ -250,7 +251,8 @@ UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, int byte_length, int byte_capacity) { repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); - if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { + repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length); + if (!repr_.utf8_was_valid_) { LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); } @@ -267,7 +269,8 @@ UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, // ----- PointTo ----- UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { - if (UniLib:: IsInterchangeValid(buffer, byte_length)) { + repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length); + if (repr_.utf8_was_valid_) { repr_.PointTo(buffer, byte_length); } else { LOG(WARNING) << "UTF-8 buffer is not interchange-valid."; diff --git a/cpp/src/phonenumbers/utf/unicodetext.h b/cpp/src/phonenumbers/utf/unicodetext.h index d2673213a..b66b376dc 100644 --- a/cpp/src/phonenumbers/utf/unicodetext.h +++ b/cpp/src/phonenumbers/utf/unicodetext.h @@ -295,7 +295,8 @@ class UnicodeText { // the data is tested for interchange-validity. If it is not // interchange-valid, a LOG(WARNING) is issued, and each // structurally invalid byte and each interchange-invalid codepoint - // is replaced with a space. + // is replaced with a space. The `utf8_was_valid_` status is set + // appropriately and may be queried afterwards. // x.CopyUTF8(buf, len) copies buf into x. UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); @@ -312,6 +313,9 @@ class UnicodeText { // CopyUTF8(utf8_buffer, byte_length). UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); + // Was this UnicodeText created from valid UTF-8? + bool UTF8WasValid() const { return repr_.utf8_was_valid_; } + // Occasionally it is necessary to use functions that operate on the // pointer returned by utf8_data(). MakeIterator(p) provides a way // to get back to the UnicodeText level. It uses CHECK to ensure @@ -331,8 +335,9 @@ class UnicodeText { int size_; int capacity_; bool ours_; // Do we own data_? + bool utf8_was_valid_; // Were we created from valid UTF-8? - Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} + Repr() : data_(NULL), size_(0), capacity_(0), ours_(true), utf8_was_valid_(true) {} ~Repr() { if (ours_) delete[] data_; } void clear(); diff --git a/cpp/test/phonenumbers/phonenumberutil_test.cc b/cpp/test/phonenumbers/phonenumberutil_test.cc index 1986be8a2..fecca4205 100644 --- a/cpp/test/phonenumbers/phonenumberutil_test.cc +++ b/cpp/test/phonenumbers/phonenumberutil_test.cc @@ -29,8 +29,10 @@ #include #include +#include #include "phonenumbers/default_logger.h" +#include "phonenumbers/normalize_utf8.h" #include "phonenumbers/phonemetadata.pb.h" #include "phonenumbers/phonenumber.h" #include "phonenumbers/phonenumber.pb.h" @@ -124,6 +126,32 @@ TEST_F(PhoneNumberUtilTest, ContainsOnlyValidDigits) { EXPECT_FALSE(ContainsOnlyValidDigits("2a")); } +TEST_F(PhoneNumberUtilTest, InterchangeInvalidCodepoints) { + PhoneNumber phone_number; + + std::vector valid_inputs = { + "+44" "\xE2\x80\x93" "2087654321", // U+2013, EN DASH + }; + for (auto input : valid_inputs) { + EXPECT_EQ(input, NormalizeUTF8::NormalizeDecimalDigits(input)); + EXPECT_TRUE(IsViablePhoneNumber(input)); + EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR, + phone_util_.Parse(input, RegionCode::GB(), &phone_number)); + } + + std::vector invalid_inputs = { + "+44" "\x96" "2087654321", // Invalid sequence + "+44" "\xC2\x96" "2087654321", // U+0096 + "+44" "\xEF\xBF\xBE" "2087654321", // U+FFFE + }; + for (auto input : invalid_inputs) { + EXPECT_TRUE(NormalizeUTF8::NormalizeDecimalDigits(input).empty()); + EXPECT_FALSE(IsViablePhoneNumber(input)); + EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER, + phone_util_.Parse(input, RegionCode::GB(), &phone_number)); + } +} + TEST_F(PhoneNumberUtilTest, GetSupportedRegions) { std::set regions; diff --git a/cpp/test/phonenumbers/utf/unicodetext_test.cc b/cpp/test/phonenumbers/utf/unicodetext_test.cc index 5e5147620..8a1a165b7 100644 --- a/cpp/test/phonenumbers/utf/unicodetext_test.cc +++ b/cpp/test/phonenumbers/utf/unicodetext_test.cc @@ -36,6 +36,7 @@ TEST(UnicodeTextTest, Iterator) { string number(values[i].utf8); UnicodeText number_as_unicode; number_as_unicode.PointToUTF8(number.data(), number.size()); + EXPECT_TRUE(number_as_unicode.UTF8WasValid()); UnicodeText::const_iterator it = number_as_unicode.begin(); EXPECT_EQ(values[i].code_point, *it); }