diff --git a/cpp/src/phonenumbers/utf/rune.c b/cpp/src/phonenumbers/utf/rune.c index c268489ee..b4aa93b5d 100644 --- a/cpp/src/phonenumbers/utf/rune.c +++ b/cpp/src/phonenumbers/utf/rune.c @@ -1,6 +1,7 @@ /* * The authors of this software are Rob Pike and Ken Thompson. * Copyright (c) 2002 by Lucent Technologies. + * Portions Copyright (c) 2009 The Go Authors. All rights reserved. * Permission to use, copy, modify, and distribute this software for any * purpose without fee is hereby granted, provided that this entire notice * is included in all copies of any software which is or includes a copy @@ -11,8 +12,6 @@ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. */ -#include -#include #include "phonenumbers/utf/utf.h" #include "phonenumbers/utf/utfdef.h" @@ -35,12 +34,14 @@ enum Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ - Rune4 = (1<<(Bit4+3*Bitx))-1, - /* 0001 1111 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */ Maskx = (1< Runemax) goto bad; - *rune = l; + *rune = (Rune)l; return 4; } @@ -175,7 +178,7 @@ chartorune(Rune *rune, const char *str) */ c = *(uchar*)str; if(c < Tx) { - *rune = c; + *rune = (Rune)c; return 1; } @@ -192,7 +195,7 @@ chartorune(Rune *rune, const char *str) l = ((c << Bitx) | c1) & Rune2; if(l <= Rune1) goto bad; - *rune = l; + *rune = (Rune)l; return 2; } @@ -207,7 +210,9 @@ chartorune(Rune *rune, const char *str) l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; if(l <= Rune2) goto bad; - *rune = l; + if (SurrogateMin <= l && l <= SurrogateMax) + goto bad; + *rune = (Rune)l; return 3; } @@ -220,9 +225,9 @@ chartorune(Rune *rune, const char *str) goto bad; if (c < T5) { l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; - if (l <= Rune3) + if (l <= Rune3 || l > Runemax) goto bad; - *rune = l; + *rune = (Rune)l; return 4; } @@ -240,7 +245,8 @@ bad: } int -isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { +isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) +{ *consumed = charntorune(rune, str, length); return *rune != Runeerror || *consumed == 3; } @@ -257,7 +263,7 @@ runetochar(char *str, const Rune *rune) */ c = *rune; if(c <= Rune1) { - str[0] = c; + str[0] = (char)c; return 1; } @@ -266,28 +272,30 @@ runetochar(char *str, const Rune *rune) * 0080-07FF => T2 Tx */ if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); - str[1] = Tx | (c & Maskx); + str[0] = (char)(T2 | (c >> 1*Bitx)); + str[1] = (char)(Tx | (c & Maskx)); return 2; } /* - * If the Rune is out of range, convert it to the error rune. + * If the Rune is out of range or a surrogate half, convert it to the error rune. * Do this test here because the error rune encodes to three bytes. * Doing it earlier would duplicate work, since an out of range * Rune wouldn't have fit in one or two bytes. */ if (c > Runemax) c = Runeerror; + if (SurrogateMin <= c && c <= SurrogateMax) + c = Runeerror; /* * three character sequence * 0800-FFFF => T3 Tx Tx */ if (c <= Rune3) { - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); + str[0] = (char)(T3 | (c >> 2*Bitx)); + str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx)); + str[2] = (char)(Tx | (c & Maskx)); return 3; } @@ -295,10 +303,10 @@ runetochar(char *str, const Rune *rune) * four character sequence (21-bit value) * 10000-1FFFFF => T4 Tx Tx Tx */ - str[0] = T4 | (c >> 3*Bitx); - str[1] = Tx | ((c >> 2*Bitx) & Maskx); - str[2] = Tx | ((c >> 1*Bitx) & Maskx); - str[3] = Tx | (c & Maskx); + str[0] = (char)(T4 | (c >> 3*Bitx)); + str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx)); + str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx)); + str[3] = (char)(Tx | (c & Maskx)); return 4; } @@ -317,7 +325,7 @@ runenlen(const Rune *r, int nrune) nb = 0; while(nrune--) { - c = *r++; + c = (int)*r++; if (c <= Rune1) nb++; else if (c <= Rune2) diff --git a/cpp/src/phonenumbers/utf/unicodetext.cc b/cpp/src/phonenumbers/utf/unicodetext.cc index 55ffedf1b..ecd323069 100644 --- a/cpp/src/phonenumbers/utf/unicodetext.cc +++ b/cpp/src/phonenumbers/utf/unicodetext.cc @@ -85,7 +85,7 @@ static int ConvertToInterchangeValid(char* start, int len) { } } // Is the current string invalid UTF8 or just non-interchange UTF8? - char32 rune; + Rune rune; int n; if (isvalidcharntorune(start, end - start, &rune, &n)) { // structurally valid UTF8, but not interchange valid @@ -362,7 +362,8 @@ UnicodeText::~UnicodeText() {} void UnicodeText::push_back(char32 c) { if (UniLib::IsValidCodepoint(c)) { char buf[UTFmax]; - int len = runetochar(buf, &c); + Rune rune = c; + int len = runetochar(buf, &rune); if (UniLib::IsInterchangeValid(buf, len)) { repr_.append(buf, len); } else { diff --git a/cpp/src/phonenumbers/utf/unilib.cc b/cpp/src/phonenumbers/utf/unilib.cc index ffcb8b056..918134e8c 100644 --- a/cpp/src/phonenumbers/utf/unilib.cc +++ b/cpp/src/phonenumbers/utf/unilib.cc @@ -46,7 +46,7 @@ inline bool IsInterchangeValidCodepoint(char32 c) { } // namespace int SpanInterchangeValid(const char* begin, int byte_length) { - char32 rune; + Rune rune; const char* p = begin; const char* end = begin + byte_length; while (p < end) { diff --git a/cpp/src/phonenumbers/utf/utf.h b/cpp/src/phonenumbers/utf/utf.h index f4fd482a4..72d01ed63 100644 --- a/cpp/src/phonenumbers/utf/utf.h +++ b/cpp/src/phonenumbers/utf/utf.h @@ -1,27 +1,22 @@ /* * The authors of this software are Rob Pike and Ken Thompson. - * Copyright (c) 1998-2002 by Lucent Technologies. - * Portions Copyright (c) 2009 The Go Authors. All rights reserved. + * Copyright (c) 1998-2002 by Lucent Technologies. + * Portions Copyright (c) 2009 The Go Authors. All rights reserved. * Permission to use, copy, modify, and distribute this software for any * purpose without fee is hereby granted, provided that this entire notice * is included in all copies of any software which is or includes a copy * or modification of this software and in all copies of the supporting * documentation for such software. * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY + * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. - */ + */ #ifndef _UTFH_ #define _UTFH_ 1 -// stdint.h content doesn't seem to be used in this file and doesn't exist on -// Windows, therefore we comment it out here so that the code could be compiled -// on Windows. -//#include - -typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ +typedef unsigned int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ enum { @@ -71,7 +66,7 @@ int chartorune(Rune* r, const char* s); // n bytes of s. If the UTF sequence is incomplete within n bytes, // charntorune will set *r to Runeerror and return 0. If it is complete // but not in UTF format, it will set *r to Runeerror and return 1. -// +// // Added 2004-09-24 by Wei-Hwa Huang int charntorune(Rune* r, const char* s, int n); @@ -126,7 +121,7 @@ int utfnlen(const char* s, long n); // byte terminating a string is considered to be part of the string s. // (cf. strchr) -const char* utfrune(const char* s, Rune r); +/*const*/ char* utfrune(const char* s, Rune r); // utfrrune returns a pointer to the last occurrence of rune r in the @@ -134,7 +129,7 @@ const char* utfrune(const char* s, Rune r); // byte terminating a string is considered to be part of the string s. // (cf. strrchr) -const char* utfrrune(const char* s, Rune r); +/*const*/ char* utfrrune(const char* s, Rune r); // utfutf returns a pointer to the first occurrence of the UTF string @@ -155,7 +150,7 @@ char* utfecpy(char *s1, char *es1, const char *s2); // These functions are rune-string analogues of the corresponding // functions in strcat (3). -// +// // These routines first appeared in Plan 9. // SEE ALSO // memmove (3) @@ -208,8 +203,8 @@ Rune totitlerune(Rune r); // isupperrune tests for upper case characters, including Unicode // upper case letters and targets of the toupper mapping. islowerrune -// and istitlerune are defined analogously. - +// and istitlerune are defined analogously. + int isupperrune(Rune r); int islowerrune(Rune r); int istitlerune(Rune r); @@ -227,12 +222,6 @@ int isalpharune(Rune r); int isdigitrune(Rune r); -// isideographicrune tests for ideographic characters and numbers, as -// defined by the Unicode standard. - -int isideographicrune(Rune r); - - // isspacerune tests for whitespace characters, including "C" locale // whitespace, Unicode defined whitespace, and the "zero-width // non-break space" character. diff --git a/cpp/src/phonenumbers/utf/utfdef.h b/cpp/src/phonenumbers/utf/utfdef.h index adc6d95fb..4bbdfc643 100644 --- a/cpp/src/phonenumbers/utf/utfdef.h +++ b/cpp/src/phonenumbers/utf/utfdef.h @@ -25,4 +25,3 @@ typedef unsigned int uint; typedef unsigned long ulong; #define nelem(x) (sizeof(x)/sizeof((x)[0])) -#define nil ((void*)0)