| @ -0,0 +1,210 @@ | |||
| // Copyright (C) 2011 Google Inc. | |||
| // | |||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||
| // you may not use this file except in compliance with the License. | |||
| // You may obtain a copy of the License at | |||
| // | |||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software | |||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| // See the License for the specific language governing permissions and | |||
| // limitations under the License. | |||
| // Author: George Yakovlev | |||
| // Philippe Liard | |||
| #include "regexp_adapter.h" | |||
| #include <string> | |||
| #include <unicode/regex.h> | |||
| #include <unicode/unistr.h> | |||
| #include "base/basictypes.h" | |||
| #include "base/logging.h" | |||
| #include "base/scoped_ptr.h" | |||
| #include "default_logger.h" | |||
| namespace i18n { | |||
| namespace phonenumbers { | |||
| using icu::RegexMatcher; | |||
| using icu::RegexPattern; | |||
| using icu::UnicodeString; | |||
| namespace { | |||
| // Converts UnicodeString 'source' to a UTF8-formatted std::string. | |||
| string UnicodeStringToUtf8String(const UnicodeString& source) { | |||
| string data; | |||
| source.toUTF8String<string>(data); | |||
| return data; | |||
| } | |||
| } // namespace | |||
| // Implementation of the abstract classes RegExpInput and RegExp using ICU | |||
| // regular expression capabilities. | |||
| // ICU implementation of the RegExpInput abstract class. | |||
| class IcuRegExpInput : public RegExpInput { | |||
| public: | |||
| explicit IcuRegExpInput(const string& utf8_input) | |||
| : utf8_input_(UnicodeString::fromUTF8(utf8_input)), | |||
| position_(0) {} | |||
| virtual ~IcuRegExpInput() {} | |||
| virtual string ToString() const { | |||
| return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); | |||
| } | |||
| UnicodeString* Data() { | |||
| return &utf8_input_; | |||
| } | |||
| // The current start position. For a newly created input, position is 0. Each | |||
| // call to ConsumeRegExp() or RegExp::Consume() advances the position in the | |||
| // case of the successful match to be after the match. | |||
| int position() const { | |||
| return position_; | |||
| } | |||
| void set_position(int position) { | |||
| DCHECK(position >= 0 && position <= utf8_input_.length()); | |||
| position_ = position; | |||
| } | |||
| private: | |||
| UnicodeString utf8_input_; | |||
| int position_; | |||
| DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); | |||
| }; | |||
| // ICU implementation of the RegExp abstract class. | |||
| class IcuRegExp : public RegExp { | |||
| public: | |||
| explicit IcuRegExp(const string& utf8_regexp) { | |||
| UParseError parse_error; | |||
| UErrorCode status = U_ZERO_ERROR; | |||
| utf8_regexp_.reset(RegexPattern::compile( | |||
| UnicodeString::fromUTF8(utf8_regexp), 0, parse_error, status)); | |||
| if (U_FAILURE(status)) { | |||
| // The provided regular expressions should compile correctly. | |||
| logger_.Error("Error compiling regular expression: " + utf8_regexp); | |||
| utf8_regexp_.reset(NULL); | |||
| } | |||
| } | |||
| virtual ~IcuRegExp() {} | |||
| virtual bool Consume(RegExpInput* input_string, | |||
| bool anchor_at_start, | |||
| string* matched_string1, | |||
| string* matched_string2, | |||
| string* matched_string3) const { | |||
| DCHECK(input_string); | |||
| if (!utf8_regexp_.get()) { | |||
| return false; | |||
| } | |||
| IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string); | |||
| UErrorCode status = U_ZERO_ERROR; | |||
| const scoped_ptr<RegexMatcher> matcher( | |||
| utf8_regexp_->matcher(*input->Data(), status)); | |||
| bool match_succeeded = anchor_at_start | |||
| ? matcher->lookingAt(input->position(), status) | |||
| : matcher->find(input->position(), status); | |||
| if (!match_succeeded || U_FAILURE(status)) { | |||
| return false; | |||
| } | |||
| string* const matched_strings[] = { | |||
| matched_string1, matched_string2, matched_string3 | |||
| }; | |||
| // If less matches than expected - fail. | |||
| for (size_t i = 0; i < arraysize(matched_strings); ++i) { | |||
| if (matched_strings[i]) { | |||
| // Groups are counted from 1 rather than 0. | |||
| const int group_index = i + 1; | |||
| if (group_index > matcher->groupCount()) { | |||
| return false; | |||
| } | |||
| *matched_strings[i] = | |||
| UnicodeStringToUtf8String(matcher->group(group_index, status)); | |||
| } | |||
| } | |||
| input->set_position(matcher->end(status)); | |||
| return !U_FAILURE(status); | |||
| } | |||
| bool Match(const string& input_string, | |||
| bool full_match, | |||
| string* matched_string) const { | |||
| if (!utf8_regexp_.get()) { | |||
| return false; | |||
| } | |||
| IcuRegExpInput input(input_string); | |||
| UErrorCode status = U_ZERO_ERROR; | |||
| const scoped_ptr<RegexMatcher> matcher( | |||
| utf8_regexp_->matcher(*input.Data(), status)); | |||
| bool match_succeeded = full_match | |||
| ? matcher->matches(input.position(), status) | |||
| : matcher->find(input.position(), status); | |||
| if (!match_succeeded || U_FAILURE(status)) { | |||
| return false; | |||
| } | |||
| if (matcher->groupCount() > 0 && matched_string) { | |||
| *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); | |||
| } | |||
| return !U_FAILURE(status); | |||
| } | |||
| bool Replace(string* string_to_process, | |||
| bool global, | |||
| const string& replacement_string) const { | |||
| DCHECK(string_to_process); | |||
| if (!utf8_regexp_.get()) { | |||
| return false; | |||
| } | |||
| IcuRegExpInput input(*string_to_process); | |||
| UErrorCode status = U_ZERO_ERROR; | |||
| const scoped_ptr<RegexMatcher> matcher( | |||
| utf8_regexp_->matcher(*input.Data(), status)); | |||
| if (U_FAILURE(status)) { | |||
| return false; | |||
| } | |||
| UnicodeString result = global | |||
| ? matcher->replaceAll( | |||
| UnicodeString::fromUTF8(replacement_string), status) | |||
| : matcher->replaceFirst( | |||
| UnicodeString::fromUTF8(replacement_string), status); | |||
| if (U_FAILURE(status)) { | |||
| return false; | |||
| } | |||
| const string replaced_string = UnicodeStringToUtf8String(result); | |||
| if (replaced_string == *string_to_process) { | |||
| return false; | |||
| } | |||
| *string_to_process = replaced_string; | |||
| return true; | |||
| } | |||
| private: | |||
| DefaultLogger logger_; | |||
| scoped_ptr<RegexPattern> utf8_regexp_; | |||
| DISALLOW_COPY_AND_ASSIGN(IcuRegExp); | |||
| }; | |||
| RegExpInput* RegExpInput::Create(const string& utf8_input) { | |||
| return new IcuRegExpInput(utf8_input); | |||
| } | |||
| RegExp* RegExp::Create(const string& utf8_regexp) { | |||
| return new IcuRegExp(utf8_regexp); | |||
| } | |||
| } // namespace phonenumbers | |||
| } // namespace i18n | |||