Browse Source

Copy updated files from lib9/utf from upstream repository.

This also requires updating code that previously took a shortcut and
depended on the Rune and char32 typedefs being identical (which is no
longer the case, as Rune is now unsigned).

R=jia.shao.peng@gmail.com
BUG=

Review URL: https://codereview.appspot.com/136920043
pull/567/head
Fredrik Roubert 11 years ago
committed by Mihaela Rosca
parent
commit
d66fa8d928
5 changed files with 50 additions and 53 deletions
  1. +35
    -27
      cpp/src/phonenumbers/utf/rune.c
  2. +3
    -2
      cpp/src/phonenumbers/utf/unicodetext.cc
  3. +1
    -1
      cpp/src/phonenumbers/utf/unilib.cc
  4. +11
    -22
      cpp/src/phonenumbers/utf/utf.h
  5. +0
    -1
      cpp/src/phonenumbers/utf/utfdef.h

+ 35
- 27
cpp/src/phonenumbers/utf/rune.c View File

@ -1,6 +1,7 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
@ -11,8 +12,6 @@
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "phonenumbers/utf/utf.h"
#include "phonenumbers/utf/utfdef.h"
@ -35,12 +34,14 @@ enum
Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1,
/* 0001 1111 1111 1111 1111 1111 */
Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
Maskx = (1<<Bitx)-1, /* 0011 1111 */
Testx = Maskx ^ 0xFF, /* 1100 0000 */
SurrogateMin = 0xD800,
SurrogateMax = 0xDFFF,
Bad = Runeerror,
};
@ -79,7 +80,7 @@ charntorune(Rune *rune, const char *str, int length)
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
*rune = (Rune)c;
return 1;
}
@ -101,7 +102,7 @@ charntorune(Rune *rune, const char *str, int length)
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
*rune = (Rune)l;
return 2;
}
@ -121,7 +122,9 @@ charntorune(Rune *rune, const char *str, int length)
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
if (SurrogateMin <= l && l <= SurrogateMax)
goto bad;
*rune = (Rune)l;
return 3;
}
@ -137,9 +140,9 @@ charntorune(Rune *rune, const char *str, int length)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
if (l <= Rune3 || l > Runemax)
goto bad;
*rune = l;
*rune = (Rune)l;
return 4;
}
@ -175,7 +178,7 @@ chartorune(Rune *rune, const char *str)
*/
c = *(uchar*)str;
if(c < Tx) {
*rune = c;
*rune = (Rune)c;
return 1;
}
@ -192,7 +195,7 @@ chartorune(Rune *rune, const char *str)
l = ((c << Bitx) | c1) & Rune2;
if(l <= Rune1)
goto bad;
*rune = l;
*rune = (Rune)l;
return 2;
}
@ -207,7 +210,9 @@ chartorune(Rune *rune, const char *str)
l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
if(l <= Rune2)
goto bad;
*rune = l;
if (SurrogateMin <= l && l <= SurrogateMax)
goto bad;
*rune = (Rune)l;
return 3;
}
@ -220,9 +225,9 @@ chartorune(Rune *rune, const char *str)
goto bad;
if (c < T5) {
l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
if (l <= Rune3)
if (l <= Rune3 || l > Runemax)
goto bad;
*rune = l;
*rune = (Rune)l;
return 4;
}
@ -240,7 +245,8 @@ bad:
}
int
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) {
isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
{
*consumed = charntorune(rune, str, length);
return *rune != Runeerror || *consumed == 3;
}
@ -257,7 +263,7 @@ runetochar(char *str, const Rune *rune)
*/
c = *rune;
if(c <= Rune1) {
str[0] = c;
str[0] = (char)c;
return 1;
}
@ -266,28 +272,30 @@ runetochar(char *str, const Rune *rune)
* 0080-07FF => T2 Tx
*/
if(c <= Rune2) {
str[0] = T2 | (c >> 1*Bitx);
str[1] = Tx | (c & Maskx);
str[0] = (char)(T2 | (c >> 1*Bitx));
str[1] = (char)(Tx | (c & Maskx));
return 2;
}
/*
* If the Rune is out of range, convert it to the error rune.
* If the Rune is out of range or a surrogate half, convert it to the error rune.
* Do this test here because the error rune encodes to three bytes.
* Doing it earlier would duplicate work, since an out of range
* Rune wouldn't have fit in one or two bytes.
*/
if (c > Runemax)
c = Runeerror;
if (SurrogateMin <= c && c <= SurrogateMax)
c = Runeerror;
/*
* three character sequence
* 0800-FFFF => T3 Tx Tx
*/
if (c <= Rune3) {
str[0] = T3 | (c >> 2*Bitx);
str[1] = Tx | ((c >> 1*Bitx) & Maskx);
str[2] = Tx | (c & Maskx);
str[0] = (char)(T3 | (c >> 2*Bitx));
str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
str[2] = (char)(Tx | (c & Maskx));
return 3;
}
@ -295,10 +303,10 @@ runetochar(char *str, const Rune *rune)
* four character sequence (21-bit value)
* 10000-1FFFFF => T4 Tx Tx Tx
*/
str[0] = T4 | (c >> 3*Bitx);
str[1] = Tx | ((c >> 2*Bitx) & Maskx);
str[2] = Tx | ((c >> 1*Bitx) & Maskx);
str[3] = Tx | (c & Maskx);
str[0] = (char)(T4 | (c >> 3*Bitx));
str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
str[3] = (char)(Tx | (c & Maskx));
return 4;
}
@ -317,7 +325,7 @@ runenlen(const Rune *r, int nrune)
nb = 0;
while(nrune--) {
c = *r++;
c = (int)*r++;
if (c <= Rune1)
nb++;
else if (c <= Rune2)


+ 3
- 2
cpp/src/phonenumbers/utf/unicodetext.cc View File

@ -85,7 +85,7 @@ static int ConvertToInterchangeValid(char* start, int len) {
}
}
// Is the current string invalid UTF8 or just non-interchange UTF8?
char32 rune;
Rune rune;
int n;
if (isvalidcharntorune(start, end - start, &rune, &n)) {
// structurally valid UTF8, but not interchange valid
@ -362,7 +362,8 @@ UnicodeText::~UnicodeText() {}
void UnicodeText::push_back(char32 c) {
if (UniLib::IsValidCodepoint(c)) {
char buf[UTFmax];
int len = runetochar(buf, &c);
Rune rune = c;
int len = runetochar(buf, &rune);
if (UniLib::IsInterchangeValid(buf, len)) {
repr_.append(buf, len);
} else {


+ 1
- 1
cpp/src/phonenumbers/utf/unilib.cc View File

@ -46,7 +46,7 @@ inline bool IsInterchangeValidCodepoint(char32 c) {
} // namespace
int SpanInterchangeValid(const char* begin, int byte_length) {
char32 rune;
Rune rune;
const char* p = begin;
const char* end = begin + byte_length;
while (p < end) {


+ 11
- 22
cpp/src/phonenumbers/utf/utf.h View File

@ -1,27 +1,22 @@
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 1998-2002 by Lucent Technologies.
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Copyright (c) 1998-2002 by Lucent Technologies.
* Portions Copyright (c) 2009 The Go Authors. All rights reserved.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
*/
#ifndef _UTFH_
#define _UTFH_ 1
// stdint.h content doesn't seem to be used in this file and doesn't exist on
// Windows, therefore we comment it out here so that the code could be compiled
// on Windows.
//#include <stdint.h>
typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
typedef unsigned int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
enum
{
@ -71,7 +66,7 @@ int chartorune(Rune* r, const char* s);
// n bytes of s. If the UTF sequence is incomplete within n bytes,
// charntorune will set *r to Runeerror and return 0. If it is complete
// but not in UTF format, it will set *r to Runeerror and return 1.
//
//
// Added 2004-09-24 by Wei-Hwa Huang
int charntorune(Rune* r, const char* s, int n);
@ -126,7 +121,7 @@ int utfnlen(const char* s, long n);
// byte terminating a string is considered to be part of the string s.
// (cf. strchr)
const char* utfrune(const char* s, Rune r);
/*const*/ char* utfrune(const char* s, Rune r);
// utfrrune returns a pointer to the last occurrence of rune r in the
@ -134,7 +129,7 @@ const char* utfrune(const char* s, Rune r);
// byte terminating a string is considered to be part of the string s.
// (cf. strrchr)
const char* utfrrune(const char* s, Rune r);
/*const*/ char* utfrrune(const char* s, Rune r);
// utfutf returns a pointer to the first occurrence of the UTF string
@ -155,7 +150,7 @@ char* utfecpy(char *s1, char *es1, const char *s2);
// These functions are rune-string analogues of the corresponding
// functions in strcat (3).
//
//
// These routines first appeared in Plan 9.
// SEE ALSO
// memmove (3)
@ -208,8 +203,8 @@ Rune totitlerune(Rune r);
// isupperrune tests for upper case characters, including Unicode
// upper case letters and targets of the toupper mapping. islowerrune
// and istitlerune are defined analogously.
// and istitlerune are defined analogously.
int isupperrune(Rune r);
int islowerrune(Rune r);
int istitlerune(Rune r);
@ -227,12 +222,6 @@ int isalpharune(Rune r);
int isdigitrune(Rune r);
// isideographicrune tests for ideographic characters and numbers, as
// defined by the Unicode standard.
int isideographicrune(Rune r);
// isspacerune tests for whitespace characters, including "C" locale
// whitespace, Unicode defined whitespace, and the "zero-width
// non-break space" character.


+ 0
- 1
cpp/src/phonenumbers/utf/utfdef.h View File

@ -25,4 +25,3 @@ typedef unsigned int uint;
typedef unsigned long ulong;
#define nelem(x) (sizeof(x)/sizeof((x)[0]))
#define nil ((void*)0)

Loading…
Cancel
Save