Browse Source

CPP: ICU regex adaptor class. Contributed by philip.liard.

pull/567/head
Lara Scheidegger 15 years ago
committed by Mihaela Rosca
parent
commit
5ec881ddc6
1 changed files with 210 additions and 0 deletions
  1. +210
    -0
      cpp/src/regexp_adapter_icu.cc

+ 210
- 0
cpp/src/regexp_adapter_icu.cc View File

@ -0,0 +1,210 @@
// Copyright (C) 2011 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Author: George Yakovlev
// Philippe Liard
#include "regexp_adapter.h"
#include <string>
#include <unicode/regex.h>
#include <unicode/unistr.h>
#include "base/basictypes.h"
#include "base/logging.h"
#include "base/scoped_ptr.h"
#include "default_logger.h"
namespace i18n {
namespace phonenumbers {
using icu::RegexMatcher;
using icu::RegexPattern;
using icu::UnicodeString;
namespace {
// Returns the contents of the ICU string 'source' re-encoded as UTF-8 in a
// std::string.
string UnicodeStringToUtf8String(const UnicodeString& source) {
  string utf8_bytes;
  // toUTF8String() appends to its argument; 'utf8_bytes' starts out empty, so
  // the result is exactly the converted text.
  source.toUTF8String(utf8_bytes);
  return utf8_bytes;
}
} // namespace
// Implementation of the abstract classes RegExpInput and RegExp using ICU
// regular expression capabilities.
// ICU implementation of the RegExpInput abstract class.
//
// Holds the input text as an ICU UnicodeString together with a cursor
// ('position') marking where the next match attempt should start.
class IcuRegExpInput : public RegExpInput {
 public:
  // Decodes 'utf8_input' from UTF-8 into a UnicodeString and places the cursor
  // at the very beginning.
  explicit IcuRegExpInput(const string& utf8_input)
      : utf8_input_(UnicodeString::fromUTF8(utf8_input)),
        position_(0) {}

  virtual ~IcuRegExpInput() {}

  // Returns, as UTF-8, the part of the input starting at the current position.
  virtual string ToString() const {
    const UnicodeString& remaining = utf8_input_.tempSubString(position_);
    return UnicodeStringToUtf8String(remaining);
  }

  // Gives access to the complete underlying string (not just the part after
  // the current position). The pointer refers to a member of this object.
  UnicodeString* Data() {
    return &utf8_input_;
  }

  // The current start position. For a newly created input, position is 0. Each
  // call to ConsumeRegExp() or RegExp::Consume() advances the position in the
  // case of the successful match to be after the match.
  int position() const {
    return position_;
  }

  // Moves the cursor. 'position' is an index into the underlying
  // UnicodeString and must lie within [0, length].
  void set_position(int position) {
    DCHECK(position >= 0 && position <= utf8_input_.length());
    position_ = position;
  }

 private:
  // Despite the name, this is the input after decoding from UTF-8 into an ICU
  // UnicodeString.
  UnicodeString utf8_input_;
  // Index into utf8_input_ at which the next match attempt starts.
  int position_;

  DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
};
// ICU implementation of the RegExp abstract class.
class IcuRegExp : public RegExp {
public:
explicit IcuRegExp(const string& utf8_regexp) {
UParseError parse_error;
UErrorCode status = U_ZERO_ERROR;
utf8_regexp_.reset(RegexPattern::compile(
UnicodeString::fromUTF8(utf8_regexp), 0, parse_error, status));
if (U_FAILURE(status)) {
// The provided regular expressions should compile correctly.
logger_.Error("Error compiling regular expression: " + utf8_regexp);
utf8_regexp_.reset(NULL);
}
}
virtual ~IcuRegExp() {}
virtual bool Consume(RegExpInput* input_string,
bool anchor_at_start,
string* matched_string1,
string* matched_string2,
string* matched_string3) const {
DCHECK(input_string);
if (!utf8_regexp_.get()) {
return false;
}
IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);
UErrorCode status = U_ZERO_ERROR;
const scoped_ptr<RegexMatcher> matcher(
utf8_regexp_->matcher(*input->Data(), status));
bool match_succeeded = anchor_at_start
? matcher->lookingAt(input->position(), status)
: matcher->find(input->position(), status);
if (!match_succeeded || U_FAILURE(status)) {
return false;
}
string* const matched_strings[] = {
matched_string1, matched_string2, matched_string3
};
// If less matches than expected - fail.
for (size_t i = 0; i < arraysize(matched_strings); ++i) {
if (matched_strings[i]) {
// Groups are counted from 1 rather than 0.
const int group_index = i + 1;
if (group_index > matcher->groupCount()) {
return false;
}
*matched_strings[i] =
UnicodeStringToUtf8String(matcher->group(group_index, status));
}
}
input->set_position(matcher->end(status));
return !U_FAILURE(status);
}
bool Match(const string& input_string,
bool full_match,
string* matched_string) const {
if (!utf8_regexp_.get()) {
return false;
}
IcuRegExpInput input(input_string);
UErrorCode status = U_ZERO_ERROR;
const scoped_ptr<RegexMatcher> matcher(
utf8_regexp_->matcher(*input.Data(), status));
bool match_succeeded = full_match
? matcher->matches(input.position(), status)
: matcher->find(input.position(), status);
if (!match_succeeded || U_FAILURE(status)) {
return false;
}
if (matcher->groupCount() > 0 && matched_string) {
*matched_string = UnicodeStringToUtf8String(matcher->group(1, status));
}
return !U_FAILURE(status);
}
bool Replace(string* string_to_process,
bool global,
const string& replacement_string) const {
DCHECK(string_to_process);
if (!utf8_regexp_.get()) {
return false;
}
IcuRegExpInput input(*string_to_process);
UErrorCode status = U_ZERO_ERROR;
const scoped_ptr<RegexMatcher> matcher(
utf8_regexp_->matcher(*input.Data(), status));
if (U_FAILURE(status)) {
return false;
}
UnicodeString result = global
? matcher->replaceAll(
UnicodeString::fromUTF8(replacement_string), status)
: matcher->replaceFirst(
UnicodeString::fromUTF8(replacement_string), status);
if (U_FAILURE(status)) {
return false;
}
const string replaced_string = UnicodeStringToUtf8String(result);
if (replaced_string == *string_to_process) {
return false;
}
*string_to_process = replaced_string;
return true;
}
private:
DefaultLogger logger_;
scoped_ptr<RegexPattern> utf8_regexp_;
DISALLOW_COPY_AND_ASSIGN(IcuRegExp);
};
// Factory for the ICU-backed RegExpInput. The object is allocated with 'new'
// and returned as a raw pointer, so the caller is responsible for deleting it.
RegExpInput* RegExpInput::Create(const string& utf8_input) {
  RegExpInput* const icu_input = new IcuRegExpInput(utf8_input);
  return icu_input;
}
// Factory for the ICU-backed RegExp. The object is allocated with 'new' and
// returned as a raw pointer, so the caller is responsible for deleting it.
RegExp* RegExp::Create(const string& utf8_regexp) {
  RegExp* const icu_regexp = new IcuRegExp(utf8_regexp);
  return icu_regexp;
}
} // namespace phonenumbers
} // namespace i18n

Loading…
Cancel
Save