aboutsummaryrefslogtreecommitdiff
path: root/core/fxcrt/utf16.h
blob: f42f190980090057b79e443625f4500ac20d612c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// Copyright 2023 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CORE_FXCRT_UTF16_H_
#define CORE_FXCRT_UTF16_H_

#include "third_party/base/check.h"

namespace pdfium {

// Each UTF-16 surrogate code unit carries this many payload ("suffix") bits.
inline constexpr int kSurrogateBits = 10;

// Mask selecting the low `kSurrogateBits` payload bits of a surrogate.
inline constexpr char16_t kSurrogateMask = (1u << kSurrogateBits) - 1u;

// `U+10000`, the lowest code point outside the Basic Multilingual Plane.
inline constexpr char32_t kMinimumSupplementaryCodePoint = 0x10000;

// `U+10FFFF`, the highest supplementary code point. A surrogate pair holds two
// suffixes, i.e. `2 * kSurrogateBits` bits of offset above `U+10000`.
inline constexpr char32_t kMaximumSupplementaryCodePoint =
    kMinimumSupplementaryCodePoint +
    ((char32_t{1} << (2 * kSurrogateBits)) - 1);

// `U+D800`, the lowest UTF-16 high surrogate code unit.
inline constexpr char16_t kMinimumHighSurrogateCodeUnit = 0xd800;

// `U+DBFF`, the highest UTF-16 high surrogate code unit (all suffix bits set).
inline constexpr char16_t kMaximumHighSurrogateCodeUnit =
    kMinimumHighSurrogateCodeUnit + kSurrogateMask;

// `U+DC00`, the lowest UTF-16 low surrogate code unit. The low surrogate range
// starts immediately after the high surrogate range.
inline constexpr char16_t kMinimumLowSurrogateCodeUnit =
    kMinimumHighSurrogateCodeUnit + (1 << kSurrogateBits);

// `U+DFFF`, the highest UTF-16 low surrogate code unit (all suffix bits set).
inline constexpr char16_t kMaximumLowSurrogateCodeUnit =
    kMinimumLowSurrogateCodeUnit + kSurrogateMask;

// Reports whether `code_point` lies in the supplementary range
// [`kMinimumSupplementaryCodePoint`, `kMaximumSupplementaryCodePoint`], i.e.
// whether UTF-16 must represent it with a surrogate pair.
constexpr bool IsSupplementary(char32_t code_point) {
  if (code_point < kMinimumSupplementaryCodePoint) {
    return false;  // Below the first supplementary code point.
  }
  return code_point <= kMaximumSupplementaryCodePoint;
}

// Reports whether `code_point` falls in the UTF-16 high surrogate range
// [`kMinimumHighSurrogateCodeUnit`, `kMaximumHighSurrogateCodeUnit`].
constexpr bool IsHighSurrogate(char32_t code_point) {
  if (code_point < kMinimumHighSurrogateCodeUnit) {
    return false;  // Below the high surrogate range.
  }
  return code_point <= kMaximumHighSurrogateCodeUnit;
}

// Reports whether `code_point` falls in the UTF-16 low surrogate range
// [`kMinimumLowSurrogateCodeUnit`, `kMaximumLowSurrogateCodeUnit`].
constexpr bool IsLowSurrogate(char32_t code_point) {
  if (code_point < kMinimumLowSurrogateCodeUnit) {
    return false;  // Below the low surrogate range.
  }
  return code_point <= kMaximumLowSurrogateCodeUnit;
}

// Two UTF-16 code units, a high surrogate followed by a low surrogate, that
// together encode a single supplementary code point.
class SurrogatePair final {
 public:
  // Builds a pair directly from its code units. `DCHECK`s that `high` is a
  // high surrogate and that `low` is a low surrogate.
  constexpr SurrogatePair(char16_t high, char16_t low)
      : high_(high), low_(low) {
    DCHECK(IsHighSurrogate(high_));
    DCHECK(IsLowSurrogate(low_));
  }

  // Encodes `code_point` as a surrogate pair. `DCHECK`s that `code_point` is
  // supplementary.
  explicit constexpr SurrogatePair(char32_t code_point)
      : high_(GetHighSurrogate(code_point)), low_(GetLowSurrogate(code_point)) {
    // `high_` and `low_` are computed by helper functions in the member
    // initializer list because C++17 requires that of `constexpr`
    // constructors.
    DCHECK(IsSupplementary(code_point));
  }

  // The high (leading) surrogate code unit.
  constexpr char16_t high() const { return high_; }

  // The low (trailing) surrogate code unit.
  constexpr char16_t low() const { return low_; }

  // Reassembles the code point encoded by this pair: the high surrogate's
  // suffix supplies the upper bits, the low surrogate's suffix the lower bits,
  // and the result is offset by `kMinimumSupplementaryCodePoint`.
  constexpr char32_t ToCodePoint() const {
    const char32_t upper_bits = (high_ & kSurrogateMask) << kSurrogateBits;
    const char32_t lower_bits = low_ & kSurrogateMask;
    return kMinimumSupplementaryCodePoint + (upper_bits | lower_bits);
  }

 private:
  // Computes the high surrogate for `code_point`: the upper `kSurrogateBits`
  // bits of its offset from `kMinimumSupplementaryCodePoint`, placed in the
  // high surrogate range.
  static constexpr char16_t GetHighSurrogate(char32_t code_point) {
    const char32_t offset = code_point - kMinimumSupplementaryCodePoint;
    const char16_t suffix = (offset >> kSurrogateBits) & kSurrogateMask;
    return kMinimumHighSurrogateCodeUnit | suffix;
  }

  // Computes the low surrogate for `code_point`: the lower `kSurrogateBits`
  // bits of its offset from `kMinimumSupplementaryCodePoint`, placed in the
  // low surrogate range.
  static constexpr char16_t GetLowSurrogate(char32_t code_point) {
    const char32_t offset = code_point - kMinimumSupplementaryCodePoint;
    const char16_t suffix = offset & kSurrogateMask;
    return kMinimumLowSurrogateCodeUnit | suffix;
  }

  char16_t high_;
  char16_t low_;
};

}  // namespace pdfium

#endif  // CORE_FXCRT_UTF16_H_