summaryrefslogtreecommitdiff
path: root/jni/include/spellingtrie.h
blob: 4438757a35d91c1f80c9f720249eb13192efb56c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_SPELLINGTRIE_H__
#define PINYINIME_INCLUDE_SPELLINGTRIE_H__

#include <stdio.h>
#include <stdlib.h>
#include "./dictdef.h"

namespace ime_pinyin {

static const unsigned short kFullSplIdStart = kHalfSpellingIdNum + 1;

// Node used for the trie of spellings
struct SpellingNode {
  SpellingNode *first_son;
  // The spelling id for each node. If you need more bits to store
  // spelling id, please adjust this structure.
  uint16 spelling_idx:11;
  uint16  num_of_son:5;
  char char_this_node;
  unsigned char score;
};

class SpellingTrie {
 private:
  static const int kMaxYmNum = 64;
  static const size_t kValidSplCharNum = 26;

  static const uint16 kHalfIdShengmuMask = 0x01;
  static const uint16 kHalfIdYunmuMask = 0x02;
  static const uint16 kHalfIdSzmMask = 0x04;

  // Map from half spelling id to single char.
  // For half ids of Zh/Ch/Sh, map to z/c/s (low case) respectively.
  // For example, 1 to 'A', 2 to 'B', 3 to 'C', 4 to 'c', 5 to 'D', ...,
  // 28 to 'Z', 29 to 'z'.
  // [0] is not used to achieve better efficiency.
  static const char kHalfId2Sc_[kFullSplIdStart + 1];

  static unsigned char char_flags_[];
  static SpellingTrie* instance_;

  // The spelling table
  char *spelling_buf_;

  // The size of longest spelling string, includes '\0' and an extra char to
  // store score. For example, "zhuang" is the longgest item in Pinyin list,
  // so spelling_size_ is 8.
  // Structure: The string ended with '\0' + score char.
  // An item with a lower score has a higher probability.
  size_t spelling_size_;

  // Number of full spelling ids.
  size_t spelling_num_;

  float score_amplifier_;
  unsigned char average_score_;

  // The Yunmu id list for the spelling ids (for half ids of Shengmu,
  // the Yunmu id is 0).
  // The length of the list is spelling_num_ + kFullSplIdStart,
  // so that spl_ym_ids_[splid] is the Yunmu id of the splid.
  uint8 *spl_ym_ids_;

  // The Yunmu table.
  // Each Yunmu will be assigned with Yunmu id from 1.
  char *ym_buf_;
  size_t ym_size_;  // The size of longest Yunmu string, '\0'included.
  size_t ym_num_;

  // The spelling string just queried
  char *splstr_queried_;

  // The spelling string just queried
  char16 *splstr16_queried_;

  // The root node of the spelling tree
  SpellingNode* root_;

  // If a none qwerty key such as a fnction key like ENTER is given, this node
  // will be used to indicate that this is not a QWERTY node.
  SpellingNode* dumb_node_;

  // If a splitter key is pressed, this node will be used to indicate that this
  // is a splitter key.
  SpellingNode* splitter_node_;

  // Used to get the first level sons.
  SpellingNode* level1_sons_[kValidSplCharNum];

  // The full spl_id range for specific half id.
  // h2f means half to full.
  // A half id can be a ShouZiMu id (id to represent the first char of a full
  // spelling, including Shengmu and Yunmu), or id of zh/ch/sh.
  // [1..kFullSplIdStart-1] is the arrange of half id.
  uint16 h2f_start_[kFullSplIdStart];
  uint16 h2f_num_[kFullSplIdStart];

  // Map from full id to half id.
  uint16 *f2h_;

#ifdef ___BUILD_MODEL___
  // How many node used to build the trie.
  size_t node_num_;
#endif

  SpellingTrie();

  void free_son_trie(SpellingNode* node);

  // Construct a subtree using a subset of the spelling array (from
  // item_star to item_end).
  // Member spelliing_buf_ and spelling_size_ should be valid.
  // parent is used to update its num_of_son and score.
  SpellingNode* construct_spellings_subset(size_t item_start, size_t item_end,
                                           size_t level, SpellingNode *parent);
  bool build_f2h();

  // The caller should guarantee ch >= 'A' && ch <= 'Z'
  bool is_shengmu_char(char ch) const;

  // The caller should guarantee ch >= 'A' && ch <= 'Z'
  bool is_yunmu_char(char ch) const;

#ifdef ___BUILD_MODEL___
  // Given a spelling string, return its Yunmu string.
  // The caller guaratees spl_str is valid.
  const char* get_ym_str(const char *spl_str);

  // Build the Yunmu list, and the mapping relation between the full ids and the
  // Yunmu ids. This functin is called after the spelling trie is built.
  bool build_ym_info();
#endif

  friend class SpellingParser;
  friend class SmartSplParser;
  friend class SmartSplParser2;

 public:
  ~SpellingTrie();

  inline static bool is_valid_spl_char(char ch) {
    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
  }

  // The caller guarantees that the two chars are valid spelling chars.
  inline static bool is_same_spl_char(char ch1, char ch2) {
    return ch1 == ch2 || ch1 - ch2 == 'a' - 'A' || ch2 - ch1 == 'a' - 'A';
  }

  // Construct the tree from the input pinyin array
  // The given string list should have been sorted.
  // score_amplifier is used to convert a possibility value into score.
  // average_score is the average_score of all spellings. The dumb node is
  // assigned with this score.
  bool construct(const char* spelling_arr, size_t item_size, size_t item_num,
                 float score_amplifier, unsigned char average_score);

  // Test if the given id is a valid spelling id.
  // If function returns true, the given splid may be updated like this:
  // When 'A' is not enabled in ShouZiMu mode, the parsing result for 'A' is
  // first given as a half id 1, but because 'A' is a one-char Yunmu and
  // it is a valid id, it needs to updated to its corresponding full id.
  bool if_valid_id_update(uint16 *splid) const;

  // Test if the given id is a half id.
  bool is_half_id(uint16 splid) const;

  bool is_full_id(uint16 splid) const;

  // Test if the given id is a one-char Yunmu id (obviously, it is also a half
  // id), such as 'A', 'E' and 'O'.
  bool is_half_id_yunmu(uint16 splid) const;

  // Test if this char is a ShouZiMu char. This ShouZiMu char may be not enabled.
  // For Pinyin, only i/u/v is not a ShouZiMu char.
  // The caller should guarantee that ch >= 'A' && ch <= 'Z'
  bool is_szm_char(char ch) const;

  // Test If this char is enabled in ShouZiMu mode.
  // The caller should guarantee that ch >= 'A' && ch <= 'Z'
  bool szm_is_enabled(char ch) const;

  // Enable/disable Shengmus in ShouZiMu mode(using the first char of a spelling
  // to input).
  void szm_enable_shm(bool enable);

  // Enable/disable Yunmus in ShouZiMu mode.
  void szm_enable_ym(bool enable);

  // Test if this char is enabled in ShouZiMu mode.
  // The caller should guarantee ch >= 'A' && ch <= 'Z'
  bool is_szm_enabled(char ch) const;

  // Return the number of full ids for the given half id.
  uint16 half2full_num(uint16 half_id) const;

  // Return the number of full ids for the given half id, and fill spl_id_start
  // to return the first full id.
  uint16 half_to_full(uint16 half_id, uint16 *spl_id_start) const;

  // Return the corresponding half id for the given full id.
  // Not frequently used, low efficient.
  // Return 0 if fails.
  uint16 full_to_half(uint16 full_id) const;

  // To test whether a half id is compatible with a full id.
  // Generally, when half_id == full_to_half(full_id), return true.
  // But for "Zh, Ch, Sh", if fussy mode is on, half id for 'Z' is compatible
  // with a full id like "Zhe". (Fussy mode is not ready).
  bool half_full_compatible(uint16 half_id, uint16 full_id) const;

  static const SpellingTrie* get_cpinstance();

  static SpellingTrie& get_instance();

  // Save to the file stream
  bool save_spl_trie(FILE *fp);

  // Load from the file stream
  bool load_spl_trie(FILE *fp);

  // Get the number of spellings
  size_t get_spelling_num();

  // Return the Yunmu id for the given Yunmu string.
  // If the string is not valid, return 0;
  uint8 get_ym_id(const char* ym_str);

  // Get the readonly Pinyin string for a given spelling id
  const char* get_spelling_str(uint16 splid);

  // Get the readonly Pinyin string for a given spelling id
  const char16* get_spelling_str16(uint16 splid);

  // Get Pinyin string for a given spelling id. Return the length of the
  // string, and fill-in '\0' at the end.
  size_t get_spelling_str16(uint16 splid, char16 *splstr16,
                            size_t splstr16_len);
};
}

#endif  // PINYINIME_INCLUDE_SPELLINGTRIE_H__