utf/unilib.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95

/**
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Routines to do manipulation of Unicode characters or text
//
// The StructurallyValid routines accept buffers of arbitrary bytes.
// For CoerceToStructurallyValid(), the input buffer and output buffers may
// point to exactly the same memory.
//
// In all other cases, the UTF-8 string must be structurally valid and
// have all codepoints in the range  U+0000 to U+D7FF or U+E000 to U+10FFFF.
// Debug builds take a fatal error for invalid UTF-8 input.
// The input and output buffers may not overlap at all.
//
// The char32 routines are here only for convenience; they convert to UTF-8
// internally and use the UTF-8 routines.

#ifndef UTIL_UTF8_UNILIB_H__
#define UTIL_UTF8_UNILIB_H__

#include <string>
#include "phonenumbers/base/basictypes.h"

namespace UniLib {

// Returns true unless a surrogate code point
inline bool IsValidCodepoint(char32 c) {
  // In the range [0, 0xD800) or [0xE000, 0x10FFFF]
  return (static_cast<uint32>(c) < 0xD800)
    || (c >= 0xE000 && c <= 0x10FFFF);
}

// Table of UTF-8 character lengths, based on first byte
static const unsigned char kUTF8LenTbl[256] = {
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,

  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
};

// Return length of a single UTF-8 source character
inline int OneCharLen(const char* src) {
  return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
}

// Return length of a single UTF-8 source character
inline int OneCharLen(const uint8* src) {
  return kUTF8LenTbl[*src];
}

// Return true if this byte is a trailing UTF-8 byte (10xx xxxx)
inline bool IsTrailByte(char x) {
  // return (x & 0xC0) == 0x80;
  // Since trail bytes are always in [0x80, 0xBF], we can optimize:
  return static_cast<signed char>(x) < -0x40;
}

// Returns the length in bytes of the prefix of src that is all
//  interchange valid UTF-8
int SpanInterchangeValid(const char* src, int byte_length);
inline int SpanInterchangeValid(const std::string& src) {
  return SpanInterchangeValid(src.data(), src.size());
}

// Returns true if the source is all interchange valid UTF-8
// "Interchange valid" is a stronger than structurally valid --
// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
inline bool IsInterchangeValid(const char* src, int byte_length) {
  return (byte_length == SpanInterchangeValid(src, byte_length));
}
inline bool IsInterchangeValid(const std::string& src) {
  return IsInterchangeValid(src.data(), src.size());
}

}  // namespace UniLib

#endif  // UTIL_UTF8_PUBLIC_UNILIB_H_