native/utils/utf8/unicodetext.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251

/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_
#define LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_

#include <iterator>
#include <string>
#include <utility>
#include <vector>

#include "utils/base/integral_types.h"
#include "utils/base/logging.h"
#include "utils/strings/stringpiece.h"
#include "absl/strings/string_view.h"

namespace libtextclassifier3 {

// ***************************** UnicodeText **************************
//
// A UnicodeText object is a wrapper around a sequence of Unicode
// codepoint values that allows iteration over these values.
//
// The internal representation of the text is UTF-8. Since UTF-8 is a
// variable-width format, UnicodeText does not provide random access
// to the text, and changes to the text are permitted only at the end.
//
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (int32). The iterator is a
// read-only iterator. It becomes invalid if the text is changed.
//
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
// 0x10FFFF], but UnicodeText has the additional restriction that it
// can contain only those characters that are valid for interchange on
// the Web. This excludes all of the control codes except for carriage
// return, line feed, and horizontal tab.  It also excludes
// non-characters, but codepoints that are in the Private Use regions
// are allowed, as are codepoints that are unassigned. (See the
// Unicode reference for details.)
//
// MEMORY MANAGEMENT:
//
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
//
// The purpose of an alias is to avoid making an unnecessary copy of a
// UTF-8 buffer while still providing access to the Unicode values
// within that text through iterators. The lifetime of an alias must not
// exceed the lifetime of the buffer from which it was constructed.
//
// Aliases should be used with care. If the source from which an alias
// was created is freed, or if the contents are changed, while the
// alias is still in use, fatal errors could result. But it can be
// quite useful to have a UnicodeText "window" through which to see a
// UTF-8 buffer without having to pay the price of making a copy.

class UnicodeText {
 public:
  class const_iterator;

  UnicodeText();  // Create an empty text.
  UnicodeText(const UnicodeText& src, bool do_copy = true);
  UnicodeText& operator=(UnicodeText&& src);
  ~UnicodeText();

  class const_iterator {
    typedef const_iterator CI;

   public:
    typedef std::bidirectional_iterator_tag iterator_category;
    typedef char32 value_type;
    typedef int difference_type;
    typedef void pointer;            // (Not needed.)
    typedef const char32 reference;  // (Needed for const_reverse_iterator)

    // Iterators are default-constructible.
    const_iterator();

    // It's safe to make multiple passes over a UnicodeText.
    const_iterator& operator=(const const_iterator&) = default;

    char32 operator*() const;  // Dereference

    const_iterator& operator++();     // Advance (++iter)
    const_iterator operator++(int) {  // (iter++)
      const_iterator result(*this);
      ++*this;
      return result;
    }

    const_iterator& operator--();     // Retreat (--iter)
    const_iterator operator--(int) {  // (iter--)
      const_iterator result(*this);
      --*this;
      return result;
    }

    friend bool operator==(const CI& lhs, const CI& rhs) {
      return lhs.it_ == rhs.it_;
    }
    friend bool operator!=(const CI& lhs, const CI& rhs) {
      return !(lhs == rhs);
    }
    friend bool operator<(const CI& lhs, const CI& rhs);
    friend bool operator>(const CI& lhs, const CI& rhs) { return rhs < lhs; }
    friend bool operator<=(const CI& lhs, const CI& rhs) {
      return !(rhs < lhs);
    }
    friend bool operator>=(const CI& lhs, const CI& rhs) {
      return !(lhs < rhs);
    }

    int utf8_length() const {
      const unsigned char byte = static_cast<unsigned char>(it_[0]);
      if (byte < 0x80) {
        return 1;
      } else if (byte < 0xE0) {
        return 2;
      } else if (byte < 0xF0) {
        return 3;
      } else {
        return 4;
      }
    }
    const char* utf8_data() const { return it_; }

   private:
    friend class UnicodeText;
    explicit const_iterator(const char* it) : it_(it) {}

    const char* it_;
  };

  const_iterator begin() const;
  const_iterator end() const;

  // Gets pointer to the underlying utf8 data.
  const char* data() const;

  // Gets length (in bytes) of the underlying utf8 data.
  int size_bytes() const;

  // Computes length (in number of Unicode codepoints) of the underlying utf8
  // data.
  // NOTE: Complexity O(n).
  int size_codepoints() const;

  bool empty() const;

  // Checks whether the underlying data is valid utf8 data.
  bool is_valid() const;

  bool operator==(const UnicodeText& other) const;

  // x.PointToUTF8(buf,len) changes x so that it points to buf
  // ("becomes an alias"). It does not take ownership or copy buf.
  // This function assumes that the input is interchange valid UTF8.
  UnicodeText& Copy(const UnicodeText& src);
  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);

  // Calling this may invalidate pointers to underlying data.
  UnicodeText& AppendUTF8(const char* utf8, int len);
  UnicodeText& push_back(char32 ch);
  void clear();

  // Returns an iterator for each codepoint.
  std::vector<const_iterator> Codepoints() const;

  // Returns the list of codepoints of the UnicodeText.
  std::vector<char32> CodepointsChar32() const;

  std::string ToUTF8String() const;
  std::string UTF8Substring(int begin_codepoint, int end_codepoint) const;
  static std::string UTF8Substring(const const_iterator& it_begin,
                                   const const_iterator& it_end);
  static UnicodeText Substring(const UnicodeText& text, int begin_codepoint,
                               int end_codepoint, bool do_copy = true);
  static UnicodeText Substring(const const_iterator& it_begin,
                               const const_iterator& it_end,
                               bool do_copy = true);

 private:
  friend class const_iterator;

  class Repr {  // A byte-string.
   public:
    char* data_;
    int size_;
    int capacity_;
    bool ours_;  // Do we own data_?

    Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
    Repr& operator=(Repr&& src);
    ~Repr() {
      if (ours_) delete[] data_;
    }

    void clear();
    void reserve(int capacity);
    void resize(int size);

    void append(const char* bytes, int byte_length);
    void Copy(const char* data, int size);
    void PointTo(const char* data, int size);

   private:
    Repr& operator=(const Repr&);
    Repr(const Repr& other);
  };

  Repr repr_;
};

typedef std::pair<UnicodeText::const_iterator, UnicodeText::const_iterator>
    UnicodeTextRange;

// NOTE: The following are needed to avoid implicit conversion from char* to
// std::string, or from ::string to std::string, because if this happens it
// often results in invalid memory access to a temporary object created during
// such conversion (if do_copy == false).
// NOTE: These methods don't check if the input string is UTF8 well formed, for
// efficiency reasons. Use UnicodeText::is_valid() when explicitly needed.
UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
                              bool do_copy = true);
UnicodeText UTF8ToUnicodeText(const char* utf8_buf, bool do_copy = true);
UnicodeText UTF8ToUnicodeText(const std::string& str, bool do_copy = true);
UnicodeText UTF8ToUnicodeText(StringPiece str, bool do_copy = true);
UnicodeText UTF8ToUnicodeText(absl::string_view str, bool do_copy = true);

inline logging::LoggingStringStream& operator<<(
    logging::LoggingStringStream& stream, const UnicodeText& message) {
  stream.message.append(message.data(), message.size_bytes());
  return stream;
}

}  // namespace libtextclassifier3

#endif  // LIBTEXTCLASSIFIER_UTILS_UTF8_UNICODETEXT_H_