1 files changed, 514 insertions, 0 deletions
diff --git a/utf/unicodetext.cc b/utf/unicodetext.cc
new file mode 100644
index 0000000..6d4762e
--- /dev/null
+++ b/utf/unicodetext.cc
@@ -0,0 +1,514 @@
+// Copyright (C) 2006 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Author: Jim Meehan
+
+#include <iostream>
+#include <sstream>
+#include <cassert>
+
+#include "phonenumbers/utf/unicodetext.h"
+//#include "base/logging.h"
+#include "phonenumbers/utf/stringpiece.h"
+//#include "utf/stringprintf.h"
+#include "phonenumbers/utf/utf.h"
+#include "phonenumbers/utf/unilib.h"
+
+using std::stringstream;
+using std::max;
+using std::hex;
+using std::dec;
+using std::cerr;
+using std::endl;
+
+static int CodepointDistance(const char* start, const char* end) {
+  // Count lead bytes: UTF-8 trail bytes (10xxxxxx) are < -0x40 when signed.
+  const signed char* cursor = reinterpret_cast<const signed char*>(start);
+  const signed char* const stop = reinterpret_cast<const signed char*>(end);
+  int lead_bytes = 0;
+  for (; cursor < stop; ++cursor) lead_bytes += (*cursor >= -0x40) ? 1 : 0;
+  return lead_bytes;
+}
+
+static int CodepointCount(const char* utf8, int len) {
+  return CodepointDistance(utf8, utf8 + len);  // Code points in utf8[0, len).
+}
+
+UnicodeText::const_iterator::difference_type
+distance(const UnicodeText::const_iterator& first,
+         const UnicodeText::const_iterator& last) {
+  return CodepointDistance(first.it_, last.it_);  // Measured in code points.
+}
+
+// ---------- Utility ----------
+
+static int ConvertToInterchangeValid(char* start, int len) {
+  // This routine is called only when we've discovered that a UTF-8 buffer
+  // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
+  // was not interchange valid. This indicates a bug in the caller, and
+  // those callers print a warning to cerr in that case.
+  // This is similar to CoerceToInterchangeValid, but it replaces each
+  // structurally invalid byte with a space, and each non-interchange
+  // character with a space, even when that character requires more
+  // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
+  // structurally valid UTF8, but U+FDD0 is not an interchange-valid
+  // code point. The result should contain one space, not three.
+  //
+  // Since the conversion never needs to write more data than it
+  // reads, it is safe to change the buffer in place. It returns the
+  // number of bytes written.
+  char* const in = start;  // Beginning of the buffer, for the final length.
+  char* out = start;  // Next position to write.
+  char* const end = start + len;
+  while (start < end) {
+    int good = UniLib::SpanInterchangeValid(start, end - start);
+    if (good > 0) {
+      if (out != start) {
+        memmove(out, start, good);
+      }
+      out += good;
+      start += good;
+      if (start == end) {
+        break;
+      }
+    }
+    // Is the current string invalid UTF8 or just non-interchange UTF8?
+    char32 rune;
+    int n;
+    if (isvalidcharntorune(start, end - start, &rune, &n)) {
+      // structurally valid UTF8, but not interchange valid
+      start += n;  // Skip over the whole character.
+    } else {  // bad UTF8
+      start += 1;  // Skip over just one byte
+    }
+    *out++ = ' ';  // One space per rejected character or byte.
+  }
+  return out - in;
+}
+
+
+// *************** Data representation **********
+
+// Note: the copy constructor is undefined.
+
+// After reserve(), resize(), or clear(), we're an owner, not an alias.
+
+void UnicodeText::Repr::reserve(int new_capacity) {
+  // An owned buffer that is already big enough needs no work.
+  const bool big_enough = capacity_ >= new_capacity;
+  if (big_enough && ours_) return;
+  // Grow to at least the request, or to 1.5x + 20 bytes if that is larger.
+  const int grown = (3 * capacity_) / 2 + 20;
+  capacity_ = max(new_capacity, grown);
+  char* const fresh = new char[capacity_];
+
+  // Carry the current bytes over; size_ itself does not change.
+  if (data_) {
+    memcpy(fresh, data_, size_);
+    if (ours_) delete[] data_;  // An aliased buffer is not ours to free.
+  }
+  data_ = fresh;
+  ours_ = true;  // We own the new storage even if we were an alias before.
+}
+
+void UnicodeText::Repr::resize(int new_size) {
+  if (new_size == 0) return clear();  // An empty text keeps no buffer.
+
+  // Otherwise end up owning a buffer of at least new_size bytes, with any
+  // bytes beyond the old size zeroed.
+  const bool must_reallocate = !ours_ || new_size > capacity_;
+  if (must_reallocate) reserve(new_size);
+  if (new_size > size_) memset(data_ + size_, 0, new_size - size_);
+  size_ = new_size;
+  ours_ = true;
+}
+
+// Empties the text; an owned buffer is freed (size_ = 0 alone would also do).
+void UnicodeText::Repr::clear() {
+  if (ours_) delete[] data_;  // Aliased memory is never ours to free.
+  capacity_ = 0;
+  size_ = 0;
+  data_ = NULL;
+  ours_ = true;  // An empty Repr counts as an owner.
+}
+
+void UnicodeText::Repr::Copy(const char* data, int size) {  // Own a copy.
+  resize(size);  // Leaves data_ NULL when size == 0 (resize() calls clear()).
+  if (size > 0) memcpy(data_, data, size);  // memcpy on NULL is undefined.
+}
+
+void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
+  if (data_ == data) return;  // Already holding this buffer; leave all as is.
+  if (data_ != NULL && ours_) delete[] data_;  // Release the buffer we owned.
+  ours_ = true;  // The caller hands the allocation over to us.
+  capacity_ = capacity;
+  size_ = size;
+  data_ = data;
+}
+
+void UnicodeText::Repr::PointTo(const char* data, int size) {
+  if (data_ != NULL && ours_) delete[] data_;  // Release the buffer we owned.
+  ours_ = false;  // From now on we only alias the caller's bytes.
+  capacity_ = size;
+  size_ = size;
+  data_ = const_cast<char*>(data);  // Stored as char*; ours_ marks the alias.
+}
+
+void UnicodeText::Repr::append(const char* bytes, int byte_length) {
+  reserve(size_ + byte_length);  // May reallocate: bytes must not alias data_.
+  memcpy(data_ + size_, bytes, byte_length);  // Goes after the current bytes.
+  size_ += byte_length;
+}
+
+// Returns a one-line debugging description: this Repr's address, the buffer
+// address, size, capacity and whether the buffer is owned or aliased.
+string UnicodeText::Repr::DebugString() const {
+  stringstream ss;
+  // Print data_ as an address: streaming a char* would treat it as a
+  // NUL-terminated C string, but the buffer is not terminated and may be NULL.
+  ss << "{Repr " << hex << this << " data=" << static_cast<const void*>(data_)
+     << " size=" << dec << size_ << " capacity=" << capacity_ << " "
+     << (ours_ ? "Owned" : "Alias") << "}";
+  // ss.str() keeps the whole text; operator>> would stop at the first space.
+  return ss.str();
+}
+
+
+
+// *************** UnicodeText ******************
+
+// ----- Constructors -----
+
+// Default constructor: nothing to do here; repr_ is default-initialized.
+UnicodeText::UnicodeText() {
+}
+
+// Copy constructor: makes an independent, owned copy of src's UTF-8 bytes.
+UnicodeText::UnicodeText(const UnicodeText& src) {
+  Copy(src);
+}
+
+// Substring constructor: copies the UTF-8 bytes in [first, last).
+UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
+                         const UnicodeText::const_iterator& last) {
+  assert(first <= last && "Incompatible iterators");
+  repr_.append(first.it_, last.it_ - first.it_);
+}
+
+string UnicodeText::UTF8Substring(const const_iterator& first,
+                                  const const_iterator& last) {
+  assert(first <= last && "Incompatible iterators");
+  return string(first.it_, last.it_ - first.it_);  // Bytes of [first, last).
+}
+
+
+// ----- Copy -----
+
+UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
+  // Nothing to do when assigning to ourselves.
+  const bool self_assignment = (&src == this);
+  if (!self_assignment) Copy(src);
+  return *this;
+}
+
+UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
+  repr_.Copy(src.repr_.data_, src.repr_.size_);  // Owned copy of src's bytes.
+  return *this;
+}
+
+UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
+  repr_.Copy(buffer, byte_length);  // Work on our own copy of the bytes.
+  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
+    cerr << "UTF-8 buffer is not interchange-valid." << endl;
+    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+  }  // Only the copy is repaired; buffer itself is not modified.
+  return *this;
+}
+
+UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
+                                           int byte_length) {
+  repr_.Copy(buffer, byte_length);  // Copied as-is: no validity check here.
+  return *this;
+}
+
+// ----- TakeOwnershipOf  -----
+
+UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
+                                              int byte_length,
+                                              int byte_capacity) {
+  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
+  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {  // Fix in place.
+    cerr << "UTF-8 buffer is not interchange-valid." << endl;
+    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+  }
+  return *this;  // See Repr::TakeOwnershipOf for the ownership rules.
+}
+
+UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
+                                                    int byte_length,
+                                                    int byte_capacity) {
+  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);  // Unchecked.
+  return *this;
+}
+
+// ----- PointTo -----
+
+UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
+  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
+    repr_.PointTo(buffer, byte_length);  // Valid: alias buffer, copy nothing.
+  } else {  // Invalid: warn, then repair a private copy (buffer is const).
+    cerr << "UTF-8 buffer is not interchange-valid." << endl;
+    repr_.Copy(buffer, byte_length);
+    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+  }
+  return *this;
+}
+
+UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
+                                          int byte_length) {
+  repr_.PointTo(buffer, byte_length);  // Aliases buffer; no validity check.
+  return *this;
+}
+
+UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {  // Alias src.
+  if (this != &src) repr_.PointTo(src.repr_.data_, src.repr_.size_);
+  return *this;  // Self-PointTo is a no-op (it would free our own buffer).
+}
+
+UnicodeText& UnicodeText::PointTo(const const_iterator &first,
+                                  const const_iterator &last) {
+  assert(first <= last && " Incompatible iterators");
+  repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
+  return *this;  // Now an alias of the bytes in [first, last); nothing copied.
+}
+
+// ----- Append -----
+UnicodeText& UnicodeText::append(const UnicodeText& u) {
+  repr_.reserve(repr_.size_ + u.repr_.size_);  // Grow now: u may be *this,
+  repr_.append(u.repr_.data_, u.repr_.size_);  // so read data_ only afterwards.
+  return *this;
+}
+
+UnicodeText& UnicodeText::append(const const_iterator& first,
+                                 const const_iterator& last) {
+  assert(first <= last && "Incompatible iterators");
+  repr_.append(first.it_, last.it_ - first.it_);  // Copies the raw bytes.
+  return *this;  // NOTE(review): iterators into *this look unsafe if we grow.
+}
+
+UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
+  repr_.append(utf8, len);  // Appended as-is: no validity check here.
+  return *this;
+}
+
+// ----- substring searching -----
+
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
+                                              const_iterator start_pos) const {
+  assert(start_pos.utf8_data() >= utf8_data());  // start_pos must lie in
+  assert(start_pos.utf8_data() <= utf8_data() + utf8_length());  // this text.
+  return UnsafeFind(look, start_pos);
+}
+
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
+  return UnsafeFind(look, begin());  // Search the whole text.
+}
+
+UnicodeText::const_iterator UnicodeText::UnsafeFind(
+    const UnicodeText& look, const_iterator start_pos) const {
+  // Thanks to the way UTF-8 is encoded, a plain byte-wise substring search
+  // is equivalent to searching for the sequence of characters.
+  StringPiece haystack(utf8_data(), utf8_length());
+  StringPiece needle(look.utf8_data(), look.utf8_length());
+  const StringPiece::size_type hit =
+      haystack.find(needle, start_pos.utf8_data() - utf8_data());
+  return (hit == StringPiece::npos) ? end()
+                                    : const_iterator(utf8_data() + hit);
+}
+
+bool UnicodeText::HasReplacementChar() const {
+  // U+FFFD is EF BF BD in UTF-8. Searching for those bytes directly is
+  // equivalent to building a UnicodeText with push_back(0xFFFD) and checking
+  // find(that_text) != end().
+  StringPiece replacement("\xEF\xBF\xBD", 3);
+  StringPiece haystack(utf8_data(), utf8_length());
+  const bool present = haystack.find(replacement) != StringPiece::npos;
+  return present;
+}
+
+// ----- other methods -----
+
+// Empties the text by forwarding to Repr::clear().
+void UnicodeText::clear() {
+  repr_.clear();
+}
+
+// Destructor: empty body; nothing is released here explicitly.
+UnicodeText::~UnicodeText() {}
+
+
+void UnicodeText::push_back(char32 c) {  // Appends c encoded as UTF-8.
+  if (UniLib::IsValidCodepoint(c)) {
+    char buf[UTFmax];
+    int len = runetochar(buf, &c);
+    if (UniLib::IsInterchangeValid(buf, len)) {
+      repr_.append(buf, len);
+    } else {  // Encodable but not interchange-valid: store a space instead.
+      cerr << "Unicode value 0x" << hex << c << dec  // hex is sticky: undo it.
+           << " is not valid for interchange" << endl;
+      repr_.append(" ", 1);
+    }
+  } else {  // Not a code point at all: store a space instead.
+    cerr << "Illegal Unicode value: 0x" << hex << c << dec << endl;
+    repr_.append(" ", 1);
+  }
+}
+
+int UnicodeText::size() const {  // Length in code points, not bytes.
+  return CodepointCount(repr_.data_, repr_.size_);  // Scans every byte.
+}
+
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
+  // Equal when it is the same object or the UTF-8 bytes match exactly.
+  return &lhs == &rhs || (lhs.repr_.size_ == rhs.repr_.size_ &&
+      memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0);
+}
+
+// Returns a one-line debugging description of this text: its address, its
+// length in code points, and the DebugString() of the underlying Repr.
+string UnicodeText::DebugString() const {
+  stringstream ss;
+  ss << "{UnicodeText " << hex << this << dec << " chars="
+     << size() << " repr=" << repr_.DebugString() << "}";
+#if 0
+  return StringPrintf("{UnicodeText %p chars=%d repr=%s}",
+                      this,
+                      size(),
+                      repr_.DebugString().c_str());
+#endif
+  // Return the whole buffer: extracting with operator>> would stop at the
+  // first space and yield only "{UnicodeText".
+  return ss.str();
+}
+
+
+// ******************* UnicodeText::const_iterator *********************
+
+// The implementation of const_iterator would be nicer if it
+// inherited from boost::iterator_facade
+// (http://boost.org/libs/iterator/doc/iterator_facade.html).
+
+UnicodeText::const_iterator::const_iterator() : it_(0) {}  // it_ is null.
+
+UnicodeText::const_iterator::const_iterator(const const_iterator& other)
+    : it_(other.it_) {  // Same position as other.
+}
+
+UnicodeText::const_iterator&
+UnicodeText::const_iterator::operator=(const const_iterator& other) {
+  // Only the position is copied; self-assignment is left as a no-op.
+  if (this != &other) it_ = other.it_;
+  return *this;
+}
+
+UnicodeText::const_iterator UnicodeText::begin() const {
+  return const_iterator(repr_.data_);  // First byte of the text.
+}
+
+UnicodeText::const_iterator UnicodeText::end() const {
+  return const_iterator(repr_.data_ + repr_.size_);  // Past the last byte.
+}
+
+bool operator<(const UnicodeText::const_iterator& lhs,
+               const UnicodeText::const_iterator& rhs) {
+  return lhs.it_ < rhs.it_;  // Ordered by position in the underlying bytes.
+}
+
+char32 UnicodeText::const_iterator::operator*() const {
+  // Decodes by hand instead of calling chartorune: the data is taken to be
+  // valid UTF-8, so chartorune's error checking would only slow down a
+  // routine that is expected to be called very often.
+  const uint8 lead = static_cast<uint8>(it_[0]);
+  if (lead < 0x80) return lead;  // 1 byte: 0xxxxxxx
+
+  // Each trail byte carries six payload bits; it is read only once the lead
+  // byte has shown that the sequence is long enough to contain it.
+  const uint8 trail1 = static_cast<uint8>(it_[1]);
+  if (lead < 0xE0) {  // 2 bytes: 110xxxxx 10xxxxxx
+    return ((lead & 0x1F) << 6) | (trail1 & 0x3F);
+  }
+
+  const uint8 trail2 = static_cast<uint8>(it_[2]);
+  if (lead < 0xF0) {  // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    return ((lead & 0x0F) << 12)
+         | ((trail1 & 0x3F) << 6)
+         |  (trail2 & 0x3F);
+  }
+
+  // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  const uint8 trail3 = static_cast<uint8>(it_[3]);
+  return ((lead & 0x07) << 18)
+       | ((trail1 & 0x3F) << 12)
+       | ((trail2 & 0x3F) << 6)
+       |  (trail3 & 0x3F);
+}
+
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
+  it_ += UniLib::OneCharLen(it_);  // Step forward by what OneCharLen reports.
+  return *this;
+}
+
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
+  do { --it_; } while (UniLib::IsTrailByte(*it_));  // Stop on a non-trail byte.
+  return *this;
+}
+
+int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
+  // The lead byte alone tells how many bytes the character occupies; copy
+  // exactly that many so nothing past the character is read.
+  const unsigned char lead = static_cast<unsigned char>(it_[0]);
+  utf8_output[0] = it_[0];
+  if (lead < 0x80) return 1;
+
+  utf8_output[1] = it_[1];
+  if (lead < 0xE0) return 2;
+
+  utf8_output[2] = it_[2];
+  if (lead < 0xF0) return 3;
+
+  utf8_output[3] = it_[3];
+  return 4;
+}
+
+
+UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
+  // Debug-only checks: p lies inside this text and on a character boundary.
+  assert(p != NULL);
+  const char* const first = utf8_data();
+  const char* const limit = first + utf8_length();
+  assert(p >= first);
+  assert(p <= limit);
+  assert(p == limit || !UniLib::IsTrailByte(*p));
+  return const_iterator(p);
+}
+
+// Returns "{iter <address>}" for debugging; the pointed-to text is not read.
+string UnicodeText::const_iterator::DebugString() const {
+  stringstream ss;
+  // Cast to void* so the address is printed: streaming a const char* would
+  // read it as a NUL-terminated string, which it_ is not guaranteed to be.
+  ss << "{iter " << hex << static_cast<const void*>(it_) << "}";
+  // ss.str() returns everything; operator>> would stop at the first space.
+  return ss.str();
+}