// RuhNetConsulting / libphonenumber: mirror of https://github.com/google/libphonenumber


								// Copyright (C) 2011 The Libphonenumber Authors

								//

								// Licensed under the Apache License, Version 2.0 (the "License");

								// you may not use this file except in compliance with the License.

								// You may obtain a copy of the License at

								//

								// http://www.apache.org/licenses/LICENSE-2.0

								//

								// Unless required by applicable law or agreed to in writing, software

								// distributed under the License is distributed on an "AS IS" BASIS,

								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

								// See the License for the specific language governing permissions and

								// limitations under the License.


								// Author: George Yakovlev

								//         Philippe Liard


								// Note that we don't use features of ICU that depend on std::string (e.g.

								// UnicodeString::toUTF8String()) to support clients that build ICU without

								// -DU_HAVE_STD_STRING.


								#include "phonenumbers/regexp_adapter_icu.h"


								#include <stddef.h>

								#include <string>


								#include <unicode/regex.h>

								#include <unicode/stringpiece.h>

								#include <unicode/unistr.h>


								#include "phonenumbers/base/basictypes.h"

								#include "phonenumbers/base/logging.h"

								#include "phonenumbers/base/memory/scoped_ptr.h"

								#include "phonenumbers/default_logger.h"

								#include "phonenumbers/string_byte_sink.h"


								namespace i18n {

								namespace phonenumbers {


								using icu::RegexMatcher;

								using icu::RegexPattern;

								using icu::UnicodeString;


								namespace {


								// Converts UnicodeString 'source' to a UTF8-formatted std::string.
								//
								// The conversion goes through a StringByteSink and UnicodeString::toUTF8()
								// instead of UnicodeString::toUTF8String(): the latter depends on ICU being
								// built with std::string support (-DU_HAVE_STD_STRING), which this file must
								// not rely on.
								string UnicodeStringToUtf8String(const UnicodeString& source) {
								  string data;
								  // The sink receives the UTF-8 bytes produced by toUTF8() and stores them in
								  // 'data'.
								  StringByteSink sink(&data);
								  source.toUTF8(sink);
								  return data;
								}


								// Builds a UnicodeString from the UTF8-formatted std::string 'source'.
								UnicodeString Utf8StringToUnicodeString(const string& source) {
								  // The StringPiece is built from a raw pointer and an explicit length; the
								  // icu::StringPiece(const string&) constructor is deliberately not used.
								  const icu::StringPiece utf8_bytes(source.c_str(),
								                                    static_cast<int>(source.size()));
								  return UnicodeString::fromUTF8(utf8_bytes);
								}


								}  // namespace


								// Implementation of the abstract classes RegExpInput and RegExp using ICU

								// regular expression capabilities.


								// RegExpInput implementation backed by an ICU UnicodeString. The converted
								// input is stored once; consumption is tracked by a position index rather
								// than by modifying the stored string.
								class IcuRegExpInput : public RegExpInput {
								 public:
								  // Converts 'utf8_input' to a UnicodeString and starts at position 0.
								  explicit IcuRegExpInput(const string& utf8_input)
								      : utf8_input_(Utf8StringToUnicodeString(utf8_input)),
								        position_(0) {}

								  virtual ~IcuRegExpInput() {}

								  // Returns, as a UTF-8 string, the part of the input that starts at the
								  // current position.
								  virtual string ToString() const {
								    const UnicodeString& remaining = utf8_input_.tempSubString(position_);
								    return UnicodeStringToUtf8String(remaining);
								  }

								  // Gives access to the complete underlying UnicodeString (not only the part
								  // after the current position).
								  UnicodeString* Data() {
								    return &utf8_input_;
								  }

								  // The current start position. A freshly constructed input starts at 0; a
								  // successful RegExp::Consume() moves it to just after the match.
								  int position() const {
								    return position_;
								  }

								  // Moves the start position. The new value must lie within
								  // [0, length of the input]; this is only verified through DCHECK.
								  void set_position(int position) {
								    DCHECK(position >= 0 && position <= utf8_input_.length());
								    position_ = position;
								  }

								 private:
								  // The full input, converted from UTF-8 at construction time.
								  UnicodeString utf8_input_;
								  // Index into utf8_input_ at which the next match attempt starts.
								  int position_;

								  DISALLOW_COPY_AND_ASSIGN(IcuRegExpInput);
								};


								// ICU implementation of the RegExp abstract class.

								class IcuRegExp : public RegExp {

								 public:

								  explicit IcuRegExp(const string& utf8_regexp) {

								    UParseError parse_error;

								    UErrorCode status = U_ZERO_ERROR;

								    utf8_regexp_.reset(RegexPattern::compile(

								        Utf8StringToUnicodeString(utf8_regexp), 0, parse_error, status));

								    if (U_FAILURE(status)) {

								      // The provided regular expressions should compile correctly.

								      LOG(ERROR) << "Error compiling regular expression: " << utf8_regexp;

								      utf8_regexp_.reset(NULL);

								    }

								  }


								  virtual ~IcuRegExp() {}


								  virtual bool Consume(RegExpInput* input_string,

								                       bool anchor_at_start,

								                       string* matched_string1,

								                       string* matched_string2,

								                       string* matched_string3,

								                       string* matched_string4,

								                       string* matched_string5,

								                       string* matched_string6) const {

								    DCHECK(input_string);

								    if (!utf8_regexp_.get()) {

								      return false;

								    }

								    IcuRegExpInput* const input = static_cast<IcuRegExpInput*>(input_string);

								    UErrorCode status = U_ZERO_ERROR;

								    const scoped_ptr<RegexMatcher> matcher(

								        utf8_regexp_->matcher(*input->Data(), status));

								    bool match_succeeded = anchor_at_start

								        ? matcher->lookingAt(input->position(), status)

								        : matcher->find(input->position(), status);

								    if (!match_succeeded || U_FAILURE(status)) {

								      return false;

								    }

								    string* const matched_strings[] = {matched_string1, matched_string2,

								                                       matched_string3, matched_string4,

								                                       matched_string5, matched_string6};

								    // If less matches than expected - fail.

								    for (size_t i = 0; i < arraysize(matched_strings); ++i) {

								      if (matched_strings[i]) {

								        // Groups are counted from 1 rather than 0.

								        const int group_index = static_cast<int>(i + 1);

								        if (group_index > matcher->groupCount()) {

								          return false;

								        }

								        *matched_strings[i] =

								            UnicodeStringToUtf8String(matcher->group(group_index, status));

								      }

								    }

								    input->set_position(matcher->end(status));

								    return !U_FAILURE(status);

								  }


								  bool Match(const string& input_string,

								             bool full_match,

								             string* matched_string) const {

								    if (!utf8_regexp_.get()) {

								      return false;

								    }

								    IcuRegExpInput input(input_string);

								    UErrorCode status = U_ZERO_ERROR;

								    const scoped_ptr<RegexMatcher> matcher(

								        utf8_regexp_->matcher(*input.Data(), status));

								    bool match_succeeded = full_match

								        ? matcher->matches(input.position(), status)

								        : matcher->find(input.position(), status);

								    if (!match_succeeded || U_FAILURE(status)) {

								      return false;

								    }

								    if (matcher->groupCount() > 0 && matched_string) {

								      *matched_string = UnicodeStringToUtf8String(matcher->group(1, status));

								    }

								    return !U_FAILURE(status);

								  }


								  bool Replace(string* string_to_process,

								               bool global,

								               const string& replacement_string) const {

								    DCHECK(string_to_process);

								    if (!utf8_regexp_.get()) {

								      return false;

								    }

								    IcuRegExpInput input(*string_to_process);

								    UErrorCode status = U_ZERO_ERROR;

								    const scoped_ptr<RegexMatcher> matcher(

								        utf8_regexp_->matcher(*input.Data(), status));

								    if (U_FAILURE(status)) {

								      return false;

								    }


								    UnicodeString output;

								    // We reimplement ReplaceFirst and ReplaceAll such that their behaviour is

								    // consistent with the RE2 reg-ex matcher.

								    if (!matcher->find()) {

								      return false;

								    }

								    matcher->appendReplacement(output,

								                               Utf8StringToUnicodeString(replacement_string),

								                               status);

								    if (global) {

								      // Continue and look for more matches.

								      while (matcher->find()) {

								        matcher->appendReplacement(

								            output,

								            Utf8StringToUnicodeString(replacement_string),

								            status);

								      }

								    }


								    matcher->appendTail(output);

								    if (U_FAILURE(status)) {

								      return false;

								    }

								    const string replaced_string = UnicodeStringToUtf8String(output);

								    *string_to_process = replaced_string;

								    return true;

								  }


								 private:

								  scoped_ptr<RegexPattern> utf8_regexp_;


								  DISALLOW_COPY_AND_ASSIGN(IcuRegExp);

								};


								// Creates an ICU-backed RegExpInput for 'utf8_input'. The object is allocated
								// with new and handed back as a raw pointer; this function does not release
								// it.
								RegExpInput* ICURegExpFactory::CreateInput(const string& utf8_input) const {
								  RegExpInput* const created_input = new IcuRegExpInput(utf8_input);
								  return created_input;
								}


								// Creates an ICU-backed RegExp for the pattern 'utf8_regexp'. The object is
								// allocated with new and handed back as a raw pointer; this function does not
								// release it.
								RegExp* ICURegExpFactory::CreateRegExp(const string& utf8_regexp) const {
								  RegExp* const created_regexp = new IcuRegExp(utf8_regexp);
								  return created_regexp;
								}


								}  // namespace phonenumbers

								}  // namespace i18n