| @ -0,0 +1,210 @@ | |||||
| // Copyright (C) 2011 Google Inc. | |||||
| // | |||||
| // Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| // you may not use this file except in compliance with the License. | |||||
| // You may obtain a copy of the License at | |||||
| // | |||||
| // http://www.apache.org/licenses/LICENSE-2.0 | |||||
| // | |||||
| // Unless required by applicable law or agreed to in writing, software | |||||
| // distributed under the License is distributed on an "AS IS" BASIS, | |||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| // See the License for the specific language governing permissions and | |||||
| // limitations under the License. | |||||
| // Author: George Yakovlev | |||||
| // Philippe Liard | |||||
| #include "regexp_adapter.h" | |||||
| #include <string> | |||||
| #include <unicode/regex.h> | |||||
| #include <unicode/unistr.h> | |||||
| #include "base/basictypes.h" | |||||
| #include "base/logging.h" | |||||
| #include "base/scoped_ptr.h" | |||||
| #include "default_logger.h" | |||||
| namespace i18n { | |||||
| namespace phonenumbers { | |||||
| using icu::RegexMatcher; | |||||
| using icu::RegexPattern; | |||||
| using icu::UnicodeString; | |||||
| namespace { | |||||
| // Converts UnicodeString 'source' to a UTF8-formatted std::string. | |||||
| string UnicodeStringToUtf8String(const UnicodeString& source) { | |||||
| string data; | |||||
| source.toUTF8String<string>(data); | |||||
| return data; | |||||
| } | |||||
| } // namespace | |||||
| // Implementation of the abstract classes RegExpInput and RegExp using ICU | |||||
| // regular expression capabilities. | |||||
| // ICU implementation of the RegExpInput abstract class. | |||||
| class IcuRegExpInput : public RegExpInput { | |||||
| public: | |||||
| explicit IcuRegExpInput(const string& utf8_input) | |||||
| : utf8_input_(UnicodeString::fromUTF8(utf8_input)), | |||||
| position_(0) {} | |||||
| virtual ~IcuRegExpInput() {} | |||||
| virtual string ToString() const { | |||||
| return UnicodeStringToUtf8String(utf8_input_.tempSubString(position_)); | |||||
| } | |||||
| UnicodeString* Data() { | |||||
| return &utf8_input_; | |||||
| } | |||||
| // The current start position. For a newly created input, position is 0. Each | |||||
| // call to ConsumeRegExp() or RegExp::Consume() advances the position in the | |||||
| // case of the successful match to be after the match. | |||||
| int position() const { | |||||
| return position_; | |||||
| } | |||||
| void set_position(int position) { | |||||
| DCHECK(position >= 0 && position <= utf8_input_.length()); | |||||
| position_ = position; | |||||
| } | |||||
| private: | |||||
| UnicodeString utf8_input_; | |||||
| int position_; | |||||
| DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput); | |||||
| }; | |||||
| // ICU implementation of the RegExp abstract class. | |||||
| class IcuRegExp : public RegExp { | |||||
| public: | |||||
| explicit IcuRegExp(const string& utf8_regexp) { | |||||
| UParseError parse_error; | |||||
| UErrorCode status = U_ZERO_ERROR; | |||||
| utf8_regexp_.reset(RegexPattern::compile( | |||||
| UnicodeString::fromUTF8(utf8_regexp), 0, parse_error, status)); | |||||
| if (U_FAILURE(status)) { | |||||
| // The provided regular expressions should compile correctly. | |||||
| logger_.Error("Error compiling regular expression: " + utf8_regexp); | |||||
| utf8_regexp_.reset(NULL); | |||||
| } | |||||
| } | |||||
| virtual ~IcuRegExp() {} | |||||
| virtual bool Consume(RegExpInput* input_string, | |||||
| bool anchor_at_start, | |||||
| string* matched_string1, | |||||
| string* matched_string2, | |||||
| string* matched_string3) const { | |||||
| DCHECK(input_string); | |||||
| if (!utf8_regexp_.get()) { | |||||
| return false; | |||||
| } | |||||
| IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string); | |||||
| UErrorCode status = U_ZERO_ERROR; | |||||
| const scoped_ptr<RegexMatcher> matcher( | |||||
| utf8_regexp_->matcher(*input->Data(), status)); | |||||
| bool match_succeeded = anchor_at_start | |||||
| ? matcher->lookingAt(input->position(), status) | |||||
| : matcher->find(input->position(), status); | |||||
| if (!match_succeeded || U_FAILURE(status)) { | |||||
| return false; | |||||
| } | |||||
| string* const matched_strings[] = { | |||||
| matched_string1, matched_string2, matched_string3 | |||||
| }; | |||||
| // If less matches than expected - fail. | |||||
| for (size_t i = 0; i < arraysize(matched_strings); ++i) { | |||||
| if (matched_strings[i]) { | |||||
| // Groups are counted from 1 rather than 0. | |||||
| const int group_index = i + 1; | |||||
| if (group_index > matcher->groupCount()) { | |||||
| return false; | |||||
| } | |||||
| *matched_strings[i] = | |||||
| UnicodeStringToUtf8String(matcher->group(group_index, status)); | |||||
| } | |||||
| } | |||||
| input->set_position(matcher->end(status)); | |||||
| return !U_FAILURE(status); | |||||
| } | |||||
| bool Match(const string& input_string, | |||||
| bool full_match, | |||||
| string* matched_string) const { | |||||
| if (!utf8_regexp_.get()) { | |||||
| return false; | |||||
| } | |||||
| IcuRegExpInput input(input_string); | |||||
| UErrorCode status = U_ZERO_ERROR; | |||||
| const scoped_ptr<RegexMatcher> matcher( | |||||
| utf8_regexp_->matcher(*input.Data(), status)); | |||||
| bool match_succeeded = full_match | |||||
| ? matcher->matches(input.position(), status) | |||||
| : matcher->find(input.position(), status); | |||||
| if (!match_succeeded || U_FAILURE(status)) { | |||||
| return false; | |||||
| } | |||||
| if (matcher->groupCount() > 0 && matched_string) { | |||||
| *matched_string = UnicodeStringToUtf8String(matcher->group(1, status)); | |||||
| } | |||||
| return !U_FAILURE(status); | |||||
| } | |||||
| bool Replace(string* string_to_process, | |||||
| bool global, | |||||
| const string& replacement_string) const { | |||||
| DCHECK(string_to_process); | |||||
| if (!utf8_regexp_.get()) { | |||||
| return false; | |||||
| } | |||||
| IcuRegExpInput input(*string_to_process); | |||||
| UErrorCode status = U_ZERO_ERROR; | |||||
| const scoped_ptr<RegexMatcher> matcher( | |||||
| utf8_regexp_->matcher(*input.Data(), status)); | |||||
| if (U_FAILURE(status)) { | |||||
| return false; | |||||
| } | |||||
| UnicodeString result = global | |||||
| ? matcher->replaceAll( | |||||
| UnicodeString::fromUTF8(replacement_string), status) | |||||
| : matcher->replaceFirst( | |||||
| UnicodeString::fromUTF8(replacement_string), status); | |||||
| if (U_FAILURE(status)) { | |||||
| return false; | |||||
| } | |||||
| const string replaced_string = UnicodeStringToUtf8String(result); | |||||
| if (replaced_string == *string_to_process) { | |||||
| return false; | |||||
| } | |||||
| *string_to_process = replaced_string; | |||||
| return true; | |||||
| } | |||||
| private: | |||||
| DefaultLogger logger_; | |||||
| scoped_ptr<RegexPattern> utf8_regexp_; | |||||
| DISALLOW_COPY_AND_ASSIGN(IcuRegExp); | |||||
| }; | |||||
| RegExpInput* RegExpInput::Create(const string& utf8_input) { | |||||
| return new IcuRegExpInput(utf8_input); | |||||
| } | |||||
| RegExp* RegExp::Create(const string& utf8_regexp) { | |||||
| return new IcuRegExp(utf8_regexp); | |||||
| } | |||||
| } // namespace phonenumbers | |||||
| } // namespace i18n | |||||