Browse Source

Be stricter about invalid UTF-8 input (#2698)

Keep track of whether unicode strings were created from valid UTF-8 and
use this to abort operations that use these strings later.

Typically, this is done by returning an empty string when a modified
string is being produced, since an empty string is never a valid phone
number.
pull/2716/head
Neil Mayhew 4 years ago
committed by GitHub
parent
commit
96293eec3b
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 63 additions and 6 deletions
  1. +2
    -0
      cpp/src/phonenumbers/normalize_utf8.h
  2. +16
    -1
      cpp/src/phonenumbers/phonenumberutil.cc
  3. +3
    -0
      cpp/src/phonenumbers/unicodestring.h
  4. +6
    -3
      cpp/src/phonenumbers/utf/unicodetext.cc
  5. +7
    -2
      cpp/src/phonenumbers/utf/unicodetext.h
  6. +28
    -0
      cpp/test/phonenumbers/phonenumberutil_test.cc
  7. +1
    -0
      cpp/test/phonenumbers/utf/unicodetext_test.cc

+ 2
- 0
cpp/src/phonenumbers/normalize_utf8.h View File

@ -27,6 +27,8 @@ struct NormalizeUTF8 {
string normalized;
UnicodeText number_as_unicode;
number_as_unicode.PointToUTF8(number.data(), static_cast<int>(number.size()));
if (!number_as_unicode.UTF8WasValid())
return normalized; // Return an empty result to indicate an error
for (UnicodeText::const_iterator it = number_as_unicode.begin();
it != number_as_unicode.end();
++it) {


+ 16
- 1
cpp/src/phonenumbers/phonenumberutil.cc View File

@ -331,6 +331,11 @@ void NormalizeHelper(const std::map<char32, char>& normalization_replacements,
DCHECK(number);
UnicodeText number_as_unicode;
number_as_unicode.PointToUTF8(number->data(), static_cast<int>(number->size()));
if (!number_as_unicode.UTF8WasValid()) {
// The input wasn't valid UTF-8. Produce an empty string to indicate an error.
number->clear();
return;
}
string normalized_number;
char unicode_char[5];
for (UnicodeText::const_iterator it = number_as_unicode.begin();
@ -973,6 +978,11 @@ void PhoneNumberUtil::TrimUnwantedEndChars(string* number) const {
DCHECK(number);
UnicodeText number_as_unicode;
number_as_unicode.PointToUTF8(number->data(), static_cast<int>(number->size()));
if (!number_as_unicode.UTF8WasValid()) {
// The input wasn't valid UTF-8. Produce an empty string to indicate an error.
number->clear();
return;
}
char current_char[5];
int len;
UnicodeText::const_reverse_iterator reverse_it(number_as_unicode.end());
@ -2312,6 +2322,11 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number,
UnicodeText number_as_unicode;
number_as_unicode.PointToUTF8(number.data(), static_cast<int>(number.size()));
if (!number_as_unicode.UTF8WasValid()) {
// The input wasn't valid UTF-8. Produce an empty string to indicate an error.
extracted_number->clear();
return;
}
char current_char[5];
int len;
UnicodeText::const_iterator it;
@ -2326,7 +2341,7 @@ void PhoneNumberUtil::ExtractPossibleNumber(const string& number,
if (it == number_as_unicode.end()) {
// No valid start character was found. extracted_number should be set to
// empty string.
extracted_number->assign("");
extracted_number->clear();
return;
}


+ 3
- 0
cpp/src/phonenumbers/unicodestring.h View File

@ -92,6 +92,9 @@ class UnicodeString {
text_.CopyUTF8(s, static_cast<int>(len));
}
// Was this UnicodeString created from valid UTF-8?
bool UTF8WasValid() const { return text_.UTF8WasValid(); }
// Returns the substring located at [ start, start + length - 1 ] without
// copying the underlying C string. If one of the provided parameters is out
// of range, the function returns an empty unicode string.


+ 6
- 3
cpp/src/phonenumbers/utf/unicodetext.cc View File

@ -231,7 +231,8 @@ UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
repr_.Copy(buffer, byte_length);
if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
if (!repr_.utf8_was_valid_) {
LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
}
@ -250,7 +251,8 @@ UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
int byte_length,
int byte_capacity) {
repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
if (!repr_.utf8_was_valid_) {
LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
}
@ -267,7 +269,8 @@ UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
// ----- PointTo -----
UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
repr_.utf8_was_valid_ = UniLib:: IsInterchangeValid(buffer, byte_length);
if (repr_.utf8_was_valid_) {
repr_.PointTo(buffer, byte_length);
} else {
LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";


+ 7
- 2
cpp/src/phonenumbers/utf/unicodetext.h View File

@ -295,7 +295,8 @@ class UnicodeText {
// the data is tested for interchange-validity. If it is not
// interchange-valid, a LOG(WARNING) is issued, and each
// structurally invalid byte and each interchange-invalid codepoint
// is replaced with a space.
// is replaced with a space. The `utf8_was_valid_` status is set
// appropriately and may be queried afterwards.
// x.CopyUTF8(buf, len) copies buf into x.
UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
@ -312,6 +313,9 @@ class UnicodeText {
// CopyUTF8(utf8_buffer, byte_length).
UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
// Was this UnicodeText created from valid UTF-8?
bool UTF8WasValid() const { return repr_.utf8_was_valid_; }
// Occasionally it is necessary to use functions that operate on the
// pointer returned by utf8_data(). MakeIterator(p) provides a way
// to get back to the UnicodeText level. It uses CHECK to ensure
@ -331,8 +335,9 @@ class UnicodeText {
int size_;
int capacity_;
bool ours_; // Do we own data_?
bool utf8_was_valid_; // Were we created from valid UTF-8?
Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {}
Repr() : data_(NULL), size_(0), capacity_(0), ours_(true), utf8_was_valid_(true) {}
~Repr() { if (ours_) delete[] data_; }
void clear();


+ 28
- 0
cpp/test/phonenumbers/phonenumberutil_test.cc View File

@ -29,8 +29,10 @@
#include <string>
#include <gtest/gtest.h>
#include <unicode/uchar.h>
#include "phonenumbers/default_logger.h"
#include "phonenumbers/normalize_utf8.h"
#include "phonenumbers/phonemetadata.pb.h"
#include "phonenumbers/phonenumber.h"
#include "phonenumbers/phonenumber.pb.h"
@ -124,6 +126,32 @@ TEST_F(PhoneNumberUtilTest, ContainsOnlyValidDigits) {
EXPECT_FALSE(ContainsOnlyValidDigits("2a"));
}
TEST_F(PhoneNumberUtilTest, InterchangeInvalidCodepoints) {
PhoneNumber phone_number;
std::vector<string> valid_inputs = {
"+44" "\xE2\x80\x93" "2087654321", // U+2013, EN DASH
};
for (auto input : valid_inputs) {
EXPECT_EQ(input, NormalizeUTF8::NormalizeDecimalDigits(input));
EXPECT_TRUE(IsViablePhoneNumber(input));
EXPECT_EQ(PhoneNumberUtil::NO_PARSING_ERROR,
phone_util_.Parse(input, RegionCode::GB(), &phone_number));
}
std::vector<string> invalid_inputs = {
"+44" "\x96" "2087654321", // Invalid sequence
"+44" "\xC2\x96" "2087654321", // U+0096
"+44" "\xEF\xBF\xBE" "2087654321", // U+FFFE
};
for (auto input : invalid_inputs) {
EXPECT_TRUE(NormalizeUTF8::NormalizeDecimalDigits(input).empty());
EXPECT_FALSE(IsViablePhoneNumber(input));
EXPECT_EQ(PhoneNumberUtil::NOT_A_NUMBER,
phone_util_.Parse(input, RegionCode::GB(), &phone_number));
}
}
TEST_F(PhoneNumberUtilTest, GetSupportedRegions) {
std::set<string> regions;


+ 1
- 0
cpp/test/phonenumbers/utf/unicodetext_test.cc View File

@ -36,6 +36,7 @@ TEST(UnicodeTextTest, Iterator) {
string number(values[i].utf8);
UnicodeText number_as_unicode;
number_as_unicode.PointToUTF8(number.data(), number.size());
EXPECT_TRUE(number_as_unicode.UTF8WasValid());
UnicodeText::const_iterator it = number_as_unicode.begin();
EXPECT_EQ(values[i].code_point, *it);
}


Loading…
Cancel
Save