From 96293eec3bcc985e15db8952dca43b1046a27e2d Mon Sep 17 00:00:00 2001
From: Neil Mayhew <neil@mayhew.name>
Date: Fri, 7 Jan 2022 01:52:02 -0700
Subject: [PATCH] Be stricter about invalid UTF-8 input (#2698)

Keep track of whether unicode strings were created from valid UTF-8 and
use this to abort operations that use these strings later.

Typically, this is done by returning an empty string when a modified
string is being produced, since an empty string is never a valid phone
number.
---
 cpp/src/phonenumbers/normalize_utf8.h         |  2 ++
 cpp/src/phonenumbers/phonenumberutil.cc       | 17 ++++++++++-
 cpp/src/phonenumbers/unicodestring.h          |  3 ++
 cpp/src/phonenumbers/utf/unicodetext.cc       |  9 ++++--
 cpp/src/phonenumbers/utf/unicodetext.h        |  9 ++++--
 cpp/test/phonenumbers/phonenumberutil_test.cc | 28 +++++++++++++++++++
 cpp/test/phonenumbers/utf/unicodetext_test.cc |  1 +
 7 files changed, 63 insertions(+), 6 deletions(-)
diff --git a/cpp/src/phonenumbers/normalize_utf8.h b/cpp/src/phonenumbers/normalize_utf8.h
index 08d44ea3c..ee1ea2f64 100644
--- a/cpp/src/phonenumbers/normalize_utf8.h
+++ b/cpp/src/phonenumbers/normalize_utf8.h
@@ -27,6 +27,8 @@ struct NormalizeUTF8 {
     string normalized;
     UnicodeText number_as_unicode;
     number_as_unicode.PointToUTF8(number.data(), static_cast<int>(number.size()));
+    if (!number_as_unicode.UTF8WasValid())
+      return normalized; // Return an empty result to indicate an error
     for (UnicodeText::const_iterator it = number_as_unicode.begin();
          it != number_as_unicode.end();
          ++it) {
diff --git a/cpp/src/phonenumbers/phonenumberutil.cc b/cpp/src/phonenumbers/phonenumberutil.cc
index 48366e082..a4a5173c2 100644
--- a/cpp/src/phonenumbers/phonenumberutil.cc
+++ b/cpp/src/phonenumbers/phonenumberutil.cc
@@ -331,6 +331,11 @@ void NormalizeHelper(const std::map<char32, char>& normalization_replacements,
   DCHECK(number);
   UnicodeText number_as_unicode;
   number_as_unicode.PointToUTF8(number->data(), static_cast<int>(number->size()));
+  if (!number_as_unicode.UTF8WasValid()) {
+    // The input wasn't valid UTF-8. Produce an empty string to indicate an error.
+    number->clear();
+    return;
+  }
   string normalized_number;
   char unicode_char[5];
   for (UnicodeText::const_iterator it = number_as_unicode.begin();
@@ -973,6 +978,11 @@ void PhoneNumberUtil::TrimUnwantedEndChars(string* number) const {
   DCHECK(number);
   UnicodeText number_as_unicode;
   number_as_unicode.PointToUTF8(number->data(), static_cast<int>(number->size()));
+  if (!number_as_unicode.UTF8WasValid()) {
+    // The input wasn't valid UTF-8. Produce an empty string to indicate an error.
+    number->clear();
+    return;
+  }
   char current_char[5];
   int len;
   UnicodeText::const_reverse_iterator reverse_it(number_as_unicode.end());
@@ -2312,6 +2322,11 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number,
 
   UnicodeText number_as_unicode;
   number_as_unicode.PointToUTF8(number.data(), static_cast<int>(number.size()));
+  if (!number_as_unicode.UTF8WasValid()) {
+    // The input wasn't valid UTF-8. Produce an empty string to indicate an error.
+    extracted_number->clear();
+    return;
+  }
   char current_char[5];
   int len;
   UnicodeText::const_iterator it;
@@ -2326,7 +2341,7 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number,
   if (it == number_as_unicode.end()) {
     // No valid start character was found. extracted_number should be set to
     // empty string.
-    extracted_number->assign("");
+    extracted_number->clear();
     return;
   }
 
diff --git a/cpp/src/phonenumbers/unicodestring.h b/cpp/src/phonenumbers/unicodestring.h
index de756d6f8..301dd82e1 100644
--- a/cpp/src/phonenumbers/unicodestring.h
+++ b/cpp/src/phonenumbers/unicodestring.h
@@ -92,6 +92,9 @@ class UnicodeString {
     text_.CopyUTF8(s, static_cast<int>(len));
   }
 
+  // Was this UnicodeString created from valid UTF-8?
+  bool UTF8WasValid() const { return text_.UTF8WasValid(); }
+
   // Returns the substring located at [ start, start + length - 1 ] without
   // copying the underlying C string. If one of the provided parameters is out
   // of range, the function returns an empty unicode string.
diff --git a/cpp/src/phonenumbers/utf/unicodetext.cc b/cpp/src/phonenumbers/utf/unicodetext.cc
index 53f055af4..5050abf41 100644
--- a/cpp/src/phonenumbers/utf/unicodetext.cc
+++ b/cpp/src/phonenumbers/utf/unicodetext.cc
@@ -231,7 +231,8 @@ UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
 
 UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
   repr_.Copy(buffer, byte_length);
-  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
+  repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
+  if (!repr_.utf8_was_valid_) {
     LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
   }
@@ -250,7 +251,8 @@ UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
                                               int byte_length,
                                               int byte_capacity) {
   repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
-  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
+  repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
+  if (!repr_.utf8_was_valid_) {
     LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
     repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
   }
@@ -267,7 +269,8 @@ UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
 // ----- PointTo -----
 
 UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
-  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
+  repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
+  if (repr_.utf8_was_valid_) {
     repr_.PointTo(buffer, byte_length);
   } else {
     LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
diff --git a/cpp/src/phonenumbers/utf/unicodetext.h b/cpp/src/phonenumbers/utf/unicodetext.h
index d2673213a..b66b376dc 100644
--- a/cpp/src/phonenumbers/utf/unicodetext.h
+++ b/cpp/src/phonenumbers/utf/unicodetext.h
@@ -295,7 +295,8 @@ class UnicodeText {
   // the data is tested for interchange-validity. If it is not
   // interchange-valid, a LOG(WARNING) is issued, and each
   // structurally invalid byte and each interchange-invalid codepoint
-  // is replaced with a space.
+  // is replaced with a space. The `utf8_was_valid_` status is set
+  // appropriately and may be queried afterwards.
 
   // x.CopyUTF8(buf, len) copies buf into x.
   UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
@@ -312,6 +313,9 @@ class UnicodeText {
   // CopyUTF8(utf8_buffer, byte_length).
   UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
 
+  // Was this UnicodeText created from valid UTF-8?
+  bool UTF8WasValid() const { return repr_.utf8_was_valid_; }
+
   // Occasionally it is necessary to use functions that operate on the
   // pointer returned by utf8_data(). MakeIterator(p) provides a way
   // to get back to the UnicodeText level. It uses CHECK to ensure
@@ -331,8 +335,9 @@ class UnicodeText {
     int size_;
     int capacity_;
     bool ours_;  // Do we own data_?
+    bool utf8_was_valid_; // Were we created from valid UTF-8?
 
-    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
+    Repr() : data_(NULL), size_(0), capacity_(0), ours_(true), utf8_was_valid_(true) {}
     ~Repr() { if (ours_) delete[] data_; }
 
     void clear();
diff --git a/cpp/test/phonenumbers/phonenumberutil_test.cc b/cpp/test/phonenumbers/phonenumberutil_test.cc
index 1986be8a2..fecca4205 100644
--- a/cpp/test/phonenumbers/phonenumberutil_test.cc
+++ b/cpp/test/phonenumbers/phonenumberutil_test.cc
@@ -29,8 +29,10 @@
 #include <string>
 
 #include <gtest/gtest.h>
+#include <unicode/uchar.h>
 
 #include "phonenumbers/default_logger.h"
+#include "phonenumbers/normalize_utf8.h"
 #include "phonenumbers/phonemetadata.pb.h"
 #include "phonenumbers/phonenumber.h"
 #include "phonenumbers/phonenumber.pb.h"
@@ -124,6 +126,32 @@ TEST_F(PhoneNumberUtilTest, ContainsOnlyValidDigits) {
   EXPECT_FALSE(ContainsOnlyValidDigits("2a"));
 }
 
+TEST_F(PhoneNumberUtilTest, InterchangeInvalidCodepoints) {
+  PhoneNumber phone_number;
+
+  std::vector<string> valid_inputs = {
+    "+44" "\xE2\x80\x93" "2087654321", // U+2013, EN DASH
+  };
+  for (auto input : valid_inputs) {
+    EXPECT_EQ(input, NormalizeUTF8::NormalizeDecimalDigits(input));
+    EXPECT_TRUE(IsViablePhoneNumber(input));
+    EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
+              phone_util_.Parse(input, RegionCode::GB(), &phone_number));
+  }
+
+  std::vector<string> invalid_inputs = {
+    "+44" "\x96"         "2087654321", // Invalid sequence
+    "+44" "\xC2\x96"     "2087654321", // U+0096
+    "+44" "\xEF\xBF\xBE" "2087654321", // U+FFFE
+  };
+  for (auto input : invalid_inputs) {
+    EXPECT_TRUE(NormalizeUTF8::NormalizeDecimalDigits(input).empty());
+    EXPECT_FALSE(IsViablePhoneNumber(input));
+    EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER,
+              phone_util_.Parse(input, RegionCode::GB(), &phone_number));
+  }
+}
+
 TEST_F(PhoneNumberUtilTest, GetSupportedRegions) {
   std::set<string> regions;
 
diff --git a/cpp/test/phonenumbers/utf/unicodetext_test.cc b/cpp/test/phonenumbers/utf/unicodetext_test.cc
index 5e5147620..8a1a165b7 100644
--- a/cpp/test/phonenumbers/utf/unicodetext_test.cc
+++ b/cpp/test/phonenumbers/utf/unicodetext_test.cc
@@ -36,6 +36,7 @@ TEST(UnicodeTextTest, Iterator) {
     string number(values[i].utf8);
     UnicodeText number_as_unicode;
     number_as_unicode.PointToUTF8(number.data(), number.size());
+    EXPECT_TRUE(number_as_unicode.UTF8WasValid());
     UnicodeText::const_iterator it = number_as_unicode.begin();
     EXPECT_EQ(values[i].code_point, *it);
   }