// Copyright (C) 2011 The Libphonenumber Authors
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
// Author: George Yakovlev
|
|
// Philippe Liard
|
|
|
|
// Note that we don't use features of ICU that depend on std::string (e.g.
|
|
// UnicodeString::toUTF8String()) to support clients that build ICU without
|
|
// -DU_HAVE_STD_STRING.
|
|
|
|
#include "phonenumbers/regexp_adapter_icu.h"
|
|
|
|
#include <stddef.h>
|
|
#include <string>
|
|
|
|
#include <unicode/regex.h>
|
|
#include <unicode/stringpiece.h>
|
|
#include <unicode/unistr.h>
|
|
|
|
#include "phonenumbers/base/basictypes.h"
|
|
#include "phonenumbers/base/logging.h"
|
|
#include "phonenumbers/base/memory/scoped_ptr.h"
|
|
#include "phonenumbers/default_logger.h"
|
|
#include "phonenumbers/string_byte_sink.h"
|
|
|
|
namespace i18n {
|
|
namespace phonenumbers {
|
|
|
|
using icu::RegexMatcher;
|
|
using icu::RegexPattern;
|
|
using icu::UnicodeString;
|
|
|
|
namespace {
|
|
|
|
// Converts UnicodeString 'source' to a UTF8-formatted std::string.
|
|
string UnicodeStringToUtf8String(const UnicodeString& source) {
|
|
string data;
|
|
source.toUTF8String(data);
|
|
return data;
|
|
}
|
|
|
|
// Converts UTF8-formatted std::string 'source' to a UnicodeString.
|
|
UnicodeString Utf8StringToUnicodeString(const string& source) {
|
|
// Note that we don't use icu::StringPiece(const string&).
|
|
return UnicodeString::fromUTF8(
|
|
icu::StringPiece(source.c_str(), static_cast<int>(source.size())));
|
|
}
|
|
|
|
} // namespace
|
|
|
|
// Implementation of the abstract classes RegExpInput and RegExp using ICU
|
|
// regular expression capabilities.
|
|
|
|
// ICU implementation of the RegExpInput abstract class.
|
|
class IcuRegExpInput : public RegExpInput {
|
|
public:
|
|
explicit IcuRegExpInput(const string& utf8_input)
|
|
: utf8_input_(Utf8StringToUnicodeString(utf8_input)),
|
|
position_(0) {}
|
|
|
|
virtual ~IcuRegExpInput() {}
|
|
|
|
virtual string ToString() const {
|
|
return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_));
|
|
}
|
|
|
|
UnicodeString* Data() {
|
|
return &utf8_input_;
|
|
}
|
|
|
|
// The current start position. For a newly created input, position is 0. Each
|
|
// call to ConsumeRegExp() or RegExp::Consume() advances the position in the
|
|
// case of the successful match to be after the match.
|
|
int position() const {
|
|
return position_;
|
|
}
|
|
|
|
void set_position(int position) {
|
|
DCHECK(position >= 0 && position <= utf8_input_.length());
|
|
position_ = position;
|
|
}
|
|
|
|
private:
|
|
UnicodeString utf8_input_;
|
|
int position_;
|
|
|
|
DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
|
|
};
|
|
|
|
// ICU implementation of the RegExp abstract class.
|
|
class IcuRegExp : public RegExp {
|
|
public:
|
|
explicit IcuRegExp(const string& utf8_regexp) {
|
|
UParseError parse_error;
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
utf8_regexp_.reset(RegexPattern::compile(
|
|
Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));
|
|
if (U_FAILURE(status)) {
|
|
// The provided regular expressions should compile correctly.
|
|
LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;
|
|
utf8_regexp_.reset(NULL);
|
|
}
|
|
}
|
|
|
|
virtual ~IcuRegExp() {}
|
|
|
|
virtual bool Consume(RegExpInput* input_string,
|
|
bool anchor_at_start,
|
|
string* matched_string1,
|
|
string* matched_string2,
|
|
string* matched_string3,
|
|
string* matched_string4,
|
|
string* matched_string5,
|
|
string* matched_string6) const {
|
|
DCHECK(input_string);
|
|
if (!utf8_regexp_.get()) {
|
|
return false;
|
|
}
|
|
IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
const scoped_ptr<RegexMatcher> matcher(
|
|
utf8_regexp_->matcher(*input->Data(), status));
|
|
bool match_succeeded = anchor_at_start
|
|
? matcher->lookingAt(input->position(), status)
|
|
: matcher->find(input->position(), status);
|
|
if (!match_succeeded || U_FAILURE(status)) {
|
|
return false;
|
|
}
|
|
string* const matched_strings[] = {matched_string1, matched_string2,
|
|
matched_string3, matched_string4,
|
|
matched_string5, matched_string6};
|
|
// If less matches than expected - fail.
|
|
for (size_t i = 0; i < arraysize(matched_strings); ++i) {
|
|
if (matched_strings[i]) {
|
|
// Groups are counted from 1 rather than 0.
|
|
const int group_index = static_cast<int>(i + 1);
|
|
if (group_index > matcher->groupCount()) {
|
|
return false;
|
|
}
|
|
*matched_strings[i] =
|
|
UnicodeStringToUtf8String(matcher->group(group_index, status));
|
|
}
|
|
}
|
|
input->set_position(matcher->end(status));
|
|
return !U_FAILURE(status);
|
|
}
|
|
|
|
bool Match(const string& input_string,
|
|
bool full_match,
|
|
string* matched_string) const {
|
|
if (!utf8_regexp_.get()) {
|
|
return false;
|
|
}
|
|
IcuRegExpInput input(input_string);
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
const scoped_ptr<RegexMatcher> matcher(
|
|
utf8_regexp_->matcher(*input.Data(), status));
|
|
bool match_succeeded = full_match
|
|
? matcher->matches(input.position(), status)
|
|
: matcher->find(input.position(), status);
|
|
if (!match_succeeded || U_FAILURE(status)) {
|
|
return false;
|
|
}
|
|
if (matcher->groupCount() > 0 && matched_string) {
|
|
*matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
|
|
}
|
|
return !U_FAILURE(status);
|
|
}
|
|
|
|
bool Replace(string* string_to_process,
|
|
bool global,
|
|
const string& replacement_string) const {
|
|
DCHECK(string_to_process);
|
|
if (!utf8_regexp_.get()) {
|
|
return false;
|
|
}
|
|
IcuRegExpInput input(*string_to_process);
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
const scoped_ptr<RegexMatcher> matcher(
|
|
utf8_regexp_->matcher(*input.Data(), status));
|
|
if (U_FAILURE(status)) {
|
|
return false;
|
|
}
|
|
|
|
UnicodeString output;
|
|
// We reimplement ReplaceFirst and ReplaceAll such that their behaviour is
|
|
// consistent with the RE2 reg-ex matcher.
|
|
if (!matcher->find()) {
|
|
return false;
|
|
}
|
|
matcher->appendReplacement(output,
|
|
Utf8StringToUnicodeString(replacement_string),
|
|
status);
|
|
if (global) {
|
|
// Continue and look for more matches.
|
|
while (matcher->find()) {
|
|
matcher->appendReplacement(
|
|
output,
|
|
Utf8StringToUnicodeString(replacement_string),
|
|
status);
|
|
}
|
|
}
|
|
|
|
matcher->appendTail(output);
|
|
if (U_FAILURE(status)) {
|
|
return false;
|
|
}
|
|
const string replaced_string = UnicodeStringToUtf8String(output);
|
|
*string_to_process = replaced_string;
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
scoped_ptr<RegexPattern> utf8_regexp_;
|
|
|
|
DISALLOW_COPY_AND_ASSIGN(IcuRegExp);
|
|
};
|
|
|
|
RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
|
|
return new IcuRegExpInput(utf8_input);
|
|
}
|
|
|
|
RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
|
|
return new IcuRegExp(utf8_regexp);
|
|
}
|
|
|
|
} // namespace phonenumbers
|
|
} // namespace i18n
|