Copy updated lib9/utf files from the upstream repository.

This also requires updating code that previously took a shortcut and depended on the Rune and char32 typedefs being identical (which is no longer the case, as Rune is now unsigned). R=jia.shao.peng@gmail.com BUG= Review URL: https://codereview.appspot.com/136920043
11 years ago · d66fa8d928
--- a/cpp/src/phonenumbers/utf/rune.c
+++ b/cpp/src/phonenumbers/utf/rune.c
@ -1,6 +1,7 @@
 /*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
 *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
@ -11,8 +12,6 @@
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
 #include <stdarg.h>
 #include <string.h>
 #include "phonenumbers/utf/utf.h"
 #include "phonenumbers/utf/utfdef.h"

@ -35,12 +34,14 @@ enum
 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
 	Rune4	= (1<<(Bit4+3*Bitx))-1,
                                        /* 0001 1111 1111 1111 1111 1111 */
 	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */

 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */

 	SurrogateMin	= 0xD800,
 	SurrogateMax	= 0xDFFF,

 	Bad	= Runeerror,
 };

@ -79,7 +80,7 @@ charntorune(Rune *rune, const char *str, int length)
 	 */
 	c = *(uchar*)str;
 	if(c < Tx) {
 		*rune = c;
 		*rune = (Rune)c;
 		return 1;
 	}

@ -101,7 +102,7 @@ charntorune(Rune *rune, const char *str, int length)
 		l = ((c << Bitx) | c1) & Rune2;
 		if(l <= Rune1)
 			goto bad;
 		*rune = l;
 		*rune = (Rune)l;
 		return 2;
 	}

@ -121,7 +122,9 @@ charntorune(Rune *rune, const char *str, int length)
 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
 		if(l <= Rune2)
 			goto bad;
 		*rune = l;
 		if (SurrogateMin <= l && l <= SurrogateMax)
 			goto bad;
 		*rune = (Rune)l;
 		return 3;
 	}

@ -137,9 +140,9 @@ charntorune(Rune *rune, const char *str, int length)
 		goto bad;
 	if (c < T5) {
 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
 		if (l <= Rune3)
 		if (l <= Rune3 || l > Runemax)
 			goto bad;
 		*rune = l;
 		*rune = (Rune)l;
 		return 4;
 	}

@ -175,7 +178,7 @@ chartorune(Rune *rune, const char *str)
 	 */
 	c = *(uchar*)str;
 	if(c < Tx) {
 		*rune = c;
 		*rune = (Rune)c;
 		return 1;
 	}

@ -192,7 +195,7 @@ chartorune(Rune *rune, const char *str)
 		l = ((c << Bitx) | c1) & Rune2;
 		if(l <= Rune1)
 			goto bad;
 		*rune = l;
 		*rune = (Rune)l;
 		return 2;
 	}

@ -207,7 +210,9 @@ chartorune(Rune *rune, const char *str)
 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
 		if(l <= Rune2)
 			goto bad;
 		*rune = l;
 		if (SurrogateMin <= l && l <= SurrogateMax)
 			goto bad;
 		*rune = (Rune)l;
 		return 3;
 	}

@ -220,9 +225,9 @@ chartorune(Rune *rune, const char *str)
 		goto bad;
 	if (c < T5) {
 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
 		if (l <= Rune3)
 		if (l <= Rune3 || l > Runemax)
 			goto bad;
 		*rune = l;
 		*rune = (Rune)l;
 		return 4;
 	}

@ -240,7 +245,8 @@ bad:
 }

 int
 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
 {
 	*consumed = charntorune(rune, str, length);
 	return *rune != Runeerror || *consumed == 3;
 }
@ -257,7 +263,7 @@ runetochar(char *str, const Rune *rune)
 	 */
 	c = *rune;
 	if(c <= Rune1) {
 		str[0] = c;
 		str[0] = (char)c;
 		return 1;
 	}

@ -266,28 +272,30 @@ runetochar(char *str, const Rune *rune)
 	 *	0080-07FF => T2 Tx
 	 */
 	if(c <= Rune2) {
 		str[0] = T2 | (c >> 1*Bitx);
 		str[1] = Tx | (c & Maskx);
 		str[0] = (char)(T2 | (c >> 1*Bitx));
 		str[1] = (char)(Tx | (c & Maskx));
 		return 2;
 	}

 	/*
 	 * If the Rune is out of range, convert it to the error rune.
 	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
 	 * Do this test here because the error rune encodes to three bytes.
 	 * Doing it earlier would duplicate work, since an out of range
 	 * Rune wouldn't have fit in one or two bytes.
 	 */
 	if (c > Runemax)
 		c = Runeerror;
 	if (SurrogateMin <= c && c <= SurrogateMax)
 		c = Runeerror;

 	/*
 	 * three character sequence
 	 *	0800-FFFF => T3 Tx Tx
 	 */
 	if (c <= Rune3) {
 		str[0] = T3 |  (c >> 2*Bitx);
 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
 		str[2] = Tx |  (c & Maskx);
 		str[0] = (char)(T3 |  (c >> 2*Bitx));
 		str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
 		str[2] = (char)(Tx |  (c & Maskx));
 		return 3;
 	}

@ -295,10 +303,10 @@ runetochar(char *str, const Rune *rune)
 	 * four character sequence (21-bit value)
 	 *     10000-1FFFFF => T4 Tx Tx Tx
 	 */
 	str[0] = T4 | (c >> 3*Bitx);
 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
 	str[3] = Tx | (c & Maskx);
 	str[0] = (char)(T4 | (c >> 3*Bitx));
 	str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
 	str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
 	str[3] = (char)(Tx | (c & Maskx));
 	return 4;
 }

@ -317,7 +325,7 @@ runenlen(const Rune *r, int nrune)

 	nb = 0;
 	while(nrune--) {
 		c = *r++;
 		c = (int)*r++;
 		if (c <= Rune1)
 			nb++;
 		else if (c <= Rune2)
--- a/cpp/src/phonenumbers/utf/unicodetext.cc
+++ b/cpp/src/phonenumbers/utf/unicodetext.cc
@ -85,7 +85,7 @@ static int ConvertToInterchangeValid(char* start, int len) {
      }
    }
    // Is the current string invalid UTF8 or just non-interchange UTF8?
    char32 rune;
    Rune rune;
    int n;
    if (isvalidcharntorune(start, end - start, &rune, &n)) {
      // structurally valid UTF8, but not interchange valid
@ -362,7 +362,8 @@ UnicodeText::~UnicodeText() {}
 void UnicodeText::push_back(char32 c) {
  if (UniLib::IsValidCodepoint(c)) {
    char buf[UTFmax];
    int len = runetochar(buf, &c);
    Rune rune = c;
    int len = runetochar(buf, &rune);
    if (UniLib::IsInterchangeValid(buf, len)) {
      repr_.append(buf, len);
    } else {
--- a/cpp/src/phonenumbers/utf/unilib.cc
+++ b/cpp/src/phonenumbers/utf/unilib.cc
@ -46,7 +46,7 @@ inline bool IsInterchangeValidCodepoint(char32 c) {
 }  // namespace

 int SpanInterchangeValid(const char* begin, int byte_length) {
  char32 rune;
  Rune rune;
  const char* p = begin;
  const char* end = begin + byte_length;
  while (p < end) {
--- a/cpp/src/phonenumbers/utf/utf.h
+++ b/cpp/src/phonenumbers/utf/utf.h
@ -1,27 +1,22 @@
 /*
 * The authors of this software are Rob Pike and Ken Thompson.
 * Copyright (c) 1998-2002 by Lucent Technologies.
 * Portions Copyright (c) 2009 The Go Authors. All rights reserved.
 *              Copyright (c) 1998-2002 by Lucent Technologies.
 *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */ 
 */

 #ifndef _UTFH_
 #define _UTFH_ 1

 // stdint.h content doesn't seem to be used in this file and doesn't exist on
 // Windows, therefore we comment it out here so that the code could be compiled
 // on Windows.
 //#include <stdint.h>

 typedef signed int Rune;	/* Code-point values in Unicode 4.0 are 21 bits wide.*/
 typedef unsigned int Rune;	/* Code-point values in Unicode 4.0 are 21 bits wide.*/

 enum
 {
@ -71,7 +66,7 @@ int chartorune(Rune* r, const char* s);
 // n bytes of s.  If the UTF sequence is incomplete within n bytes,
 // charntorune will set *r to Runeerror and return 0. If it is complete
 // but not in UTF format, it will set *r to Runeerror and return 1.
 // 
 //
 // Added 2004-09-24 by Wei-Hwa Huang

 int charntorune(Rune* r, const char* s, int n);
@ -126,7 +121,7 @@ int utfnlen(const char* s, long n);
 // byte terminating a string is considered to be part of the string s.
 // (cf. strchr)

 const char* utfrune(const char* s, Rune r);
 /*const*/ char* utfrune(const char* s, Rune r);


 // utfrrune returns a pointer to the last occurrence of rune r in the
@ -134,7 +129,7 @@ const char* utfrune(const char* s, Rune r);
 // byte terminating a string is considered to be part of the string s.
 // (cf. strrchr)

 const char* utfrrune(const char* s, Rune r);
 /*const*/ char* utfrrune(const char* s, Rune r);


 // utfutf returns a pointer to the first occurrence of the UTF string
@ -155,7 +150,7 @@ char* utfecpy(char *s1, char *es1, const char *s2);

 // These functions are rune-string analogues of the corresponding
 // functions in strcat (3).
 // 
 //
 // These routines first appeared in Plan 9.
 // SEE ALSO
 // memmove (3)
@ -208,8 +203,8 @@ Rune totitlerune(Rune r);

 // isupperrune tests for upper case characters, including Unicode
 // upper case letters and targets of the toupper mapping. islowerrune
 // and istitlerune are defined analogously. 
 
 // and istitlerune are defined analogously.

 int isupperrune(Rune r);
 int islowerrune(Rune r);
 int istitlerune(Rune r);
@ -227,12 +222,6 @@ int isalpharune(Rune r);
 int isdigitrune(Rune r);


 // isideographicrune tests for ideographic characters and numbers, as
 // defined by the Unicode standard.

 int isideographicrune(Rune r);


 // isspacerune tests for whitespace characters, including "C" locale
 // whitespace, Unicode defined whitespace, and the "zero-width
 // non-break space" character.
--- a/cpp/src/phonenumbers/utf/utfdef.h
+++ b/cpp/src/phonenumbers/utf/utfdef.h
@ -25,4 +25,3 @@ typedef unsigned int		uint;
 typedef unsigned long		ulong;

 #define nelem(x) (sizeof(x)/sizeof((x)[0]))
 #define nil ((void*)0)