Be stricter about invalid UTF-8 input (#2698)

Keep track of whether unicode strings were created from valid UTF-8 and use this to abort operations that use these strings later. Typically, this is done by returning an empty string when a modified string is being produced, since an empty string is never a valid phone number.
4 years ago · 96293eec3b
--- a/cpp/src/phonenumbers/normalize_utf8.h
+++ b/cpp/src/phonenumbers/normalize_utf8.h
@ -27,6 +27,8 @@ struct NormalizeUTF8 {
    string normalized;
    UnicodeText number_as_unicode;
    number_as_unicode.PointToUTF8(number.data(), static_cast<int>(number.size()));
    if (!number_as_unicode.UTF8WasValid())
      return normalized; // Return an empty result to indicate an error
    for (UnicodeText::const_iterator it = number_as_unicode.begin();
         it != number_as_unicode.end();
         ++it) {
--- a/cpp/src/phonenumbers/phonenumberutil.cc
+++ b/cpp/src/phonenumbers/phonenumberutil.cc
@ -331,6 +331,11 @@ void NormalizeHelper(const std::map<char32, char>& normalization_replacements,
  DCHECK(number);
  UnicodeText number_as_unicode;
  number_as_unicode.PointToUTF8(number->data(), static_cast<int>(number->size()));
  if (!number_as_unicode.UTF8WasValid()) {
    // The input wasn't valid UTF-8. Produce an empty string to indicate an error.
    number->clear();
    return;
  }
  string normalized_number;
  char unicode_char[5];
  for (UnicodeText::const_iterator it = number_as_unicode.begin();
@ -973,6 +978,11 @@ void PhoneNumberUtil::TrimUnwantedEndChars(string* number) const {
  DCHECK(number);
  UnicodeText number_as_unicode;
  number_as_unicode.PointToUTF8(number->data(), static_cast<int>(number->size()));
  if (!number_as_unicode.UTF8WasValid()) {
    // The input wasn't valid UTF-8. Produce an empty string to indicate an error.
    number->clear();
    return;
  }
  char current_char[5];
  int len;
  UnicodeText::const_reverse_iterator reverse_it(number_as_unicode.end());
@ -2312,6 +2322,11 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number,

  UnicodeText number_as_unicode;
  number_as_unicode.PointToUTF8(number.data(), static_cast<int>(number.size()));
  if (!number_as_unicode.UTF8WasValid()) {
    // The input wasn't valid UTF-8. Produce an empty string to indicate an error.
    extracted_number->clear();
    return;
  }
  char current_char[5];
  int len;
  UnicodeText::const_iterator it;
@ -2326,7 +2341,7 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number,
  if (it == number_as_unicode.end()) {
    // No valid start character was found. extracted_number should be set to
    // empty string.
    extracted_number->assign("");
    extracted_number->clear();
    return;
  }

--- a/cpp/src/phonenumbers/unicodestring.h
+++ b/cpp/src/phonenumbers/unicodestring.h
@ -92,6 +92,9 @@ class UnicodeString {
    text_.CopyUTF8(s, static_cast<int>(len));
  }

  // Was this UnicodeString created from valid UTF-8?
  bool UTF8WasValid() const { return text_.UTF8WasValid(); }

  // Returns the substring located at [ start, start + length - 1 ] without
  // copying the underlying C string. If one of the provided parameters is out
  // of range, the function returns an empty unicode string.
--- a/cpp/src/phonenumbers/utf/unicodetext.cc
+++ b/cpp/src/phonenumbers/utf/unicodetext.cc
@ -231,7 +231,8 @@ UnicodeText& UnicodeText::Copy(const UnicodeText& src) {

 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
  repr_.Copy(buffer, byte_length);
  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
  repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
  if (!repr_.utf8_was_valid_) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  }
@ -250,7 +251,8 @@ UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
                                              int byte_length,
                                              int byte_capacity) {
  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
  repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
  if (!repr_.utf8_was_valid_) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  }
@ -267,7 +269,8 @@ UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
 // ----- PointTo -----

 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
  repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
  if (repr_.utf8_was_valid_) {
    repr_.PointTo(buffer, byte_length);
  } else {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
--- a/cpp/src/phonenumbers/utf/unicodetext.h
+++ b/cpp/src/phonenumbers/utf/unicodetext.h
@ -295,7 +295,8 @@ class UnicodeText {
  // the data is tested for interchange-validity. If it is not
  // interchange-valid, a LOG(WARNING) is issued, and each
  // structurally invalid byte and each interchange-invalid codepoint
  // is replaced with a space.
  // is replaced with a space. The `utf8_was_valid_` status is set
  // appropriately and may be queried afterwards.

  // x.CopyUTF8(buf, len) copies buf into x.
  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
@ -312,6 +313,9 @@ class UnicodeText {
  // CopyUTF8(utf8_buffer, byte_length).
  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);

  // Was this UnicodeText created from valid UTF-8?
  bool UTF8WasValid() const { return repr_.utf8_was_valid_; }

  // Occasionally it is necessary to use functions that operate on the
  // pointer returned by utf8_data(). MakeIterator(p) provides a way
  // to get back to the UnicodeText level. It uses CHECK to ensure
@ -331,8 +335,9 @@ class UnicodeText {
    int size_;
    int capacity_;
    bool ours_;  // Do we own data_?
    bool utf8_was_valid_; // Were we created from valid UTF-8?

    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true), utf8_was_valid_(true) {}
    ~Repr() { if (ours_) delete[] data_; }

    void clear();
--- a/cpp/test/phonenumbers/phonenumberutil_test.cc
+++ b/cpp/test/phonenumbers/phonenumberutil_test.cc
@ -29,8 +29,10 @@
 #include <string>

 #include <gtest/gtest.h>
 #include <unicode/uchar.h>

 #include "phonenumbers/default_logger.h"
 #include "phonenumbers/normalize_utf8.h"
 #include "phonenumbers/phonemetadata.pb.h"
 #include "phonenumbers/phonenumber.h"
 #include "phonenumbers/phonenumber.pb.h"
@ -124,6 +126,32 @@ TEST_F(PhoneNumberUtilTest, ContainsOnlyValidDigits) {
  EXPECT_FALSE(ContainsOnlyValidDigits("2a"));
 }

 TEST_F(PhoneNumberUtilTest, InterchangeInvalidCodepoints) {
  PhoneNumber phone_number;

  std::vector<string> valid_inputs = {
    "+44" "\xE2\x80\x93" "2087654321", // U+2013, EN DASH
  };
  for (auto input : valid_inputs) {
    EXPECT_EQ(input, NormalizeUTF8::NormalizeDecimalDigits(input));
    EXPECT_TRUE(IsViablePhoneNumber(input));
    EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
              phone_util_.Parse(input, RegionCode::GB(), &phone_number));
  }

  std::vector<string> invalid_inputs = {
    "+44" "\x96"         "2087654321", // Invalid sequence
    "+44" "\xC2\x96"     "2087654321", // U+0096
    "+44" "\xEF\xBF\xBE" "2087654321", // U+FFFE
  };
  for (auto input : invalid_inputs) {
    EXPECT_TRUE(NormalizeUTF8::NormalizeDecimalDigits(input).empty());
    EXPECT_FALSE(IsViablePhoneNumber(input));
    EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER,
              phone_util_.Parse(input, RegionCode::GB(), &phone_number));
  }
 }

 TEST_F(PhoneNumberUtilTest, GetSupportedRegions) {
  std::set<string> regions;

--- a/cpp/test/phonenumbers/utf/unicodetext_test.cc
+++ b/cpp/test/phonenumbers/utf/unicodetext_test.cc
@ -36,6 +36,7 @@ TEST(UnicodeTextTest, Iterator) {
    string number(values[i].utf8);
    UnicodeText number_as_unicode;
    number_as_unicode.PointToUTF8(number.data(), number.size());
    EXPECT_TRUE(number_as_unicode.UTF8WasValid());
    UnicodeText::const_iterator it = number_as_unicode.begin();
    EXPECT_EQ(values[i].code_point, *it);
  }