summaryrefslogtreecommitdiff
path: root/jni/include/dictdef.h
blob: 3e79d98cdad1339031c79bd9270d1c088eef8582 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef PINYINIME_INCLUDE_DICTDEF_H__
#define PINYINIME_INCLUDE_DICTDEF_H__

#include <stdlib.h>
#include "./utf16char.h"

namespace ime_pinyin {

// Enable the following line when building the binary dictionary model.
// #define ___BUILD_MODEL___

typedef unsigned char      uint8;
typedef unsigned short     uint16;
typedef unsigned int       uint32;

typedef signed char        int8;
typedef short              int16;
typedef int                int32;
typedef long long          int64;
typedef unsigned long long uint64;

const bool kPrintDebug0 = false;
const bool kPrintDebug1 = false;
const bool kPrintDebug2 = false;

// The max length of a lemma.
const size_t kMaxLemmaSize = 8;

// The max length of a Pinyin (spelling).
const size_t kMaxPinyinSize = 6;

// The number of half spelling ids. For Chinese Pinyin, there 30 half ids.
// See SpellingTrie.h for details.
const size_t kHalfSpellingIdNum = 29;

// The maximum number of full spellings. For Chinese Pinyin, there are only
// about 410 spellings.
// If change this value is bigger(needs more bits), please also update
// other structures like SpellingNode, to make sure than a spelling id can be
// stored.
// -1 is because that 0 is never used.
const size_t kMaxSpellingNum = 512 - kHalfSpellingIdNum - 1;
const size_t kMaxSearchSteps = 40;

// One character predicts its following characters.
const size_t kMaxPredictSize = (kMaxLemmaSize - 1);

// LemmaIdType must always be size_t.
typedef size_t LemmaIdType;
const size_t kLemmaIdSize = 3;  // Actually, a Id occupies 3 bytes in storage.
const size_t kLemmaIdComposing = 0xffffff;

typedef uint16 LmaScoreType;
typedef uint16 KeyScoreType;

// Number of items with highest score are kept for prediction purpose.
const size_t kTopScoreLemmaNum = 10;

const size_t kMaxPredictNumByGt3 = 1;
const size_t kMaxPredictNumBy3 = 2;
const size_t kMaxPredictNumBy2 = 2;

// The last lemma id (included) for the system dictionary. The system
// dictionary's ids always start from 1.
const LemmaIdType kSysDictIdEnd = 500000;

// The first lemma id for the user dictionary.
const LemmaIdType kUserDictIdStart = 500001;

// The last lemma id (included) for the user dictionary.
const LemmaIdType kUserDictIdEnd = 600000;

typedef struct {
  uint16 half_splid:5;
  uint16 full_splid:11;
} SpellingId, *PSpellingId;


/**
 * We use different node types for different layers
 * Statistical data of the building result for a testing dictionary:
 *                              root,   level 0,   level 1,   level 2,   level 3
 * max son num of one node:     406        280         41          2          -
 * max homo num of one node:      0         90         23          2          2
 * total node num of a layer:     1        406      31766      13516        993
 * total homo num of a layer:     9       5674      44609      12667        995
 *
 * The node number for root and level 0 won't be larger than 500
 * According to the information above, two kinds of nodes can be used; one for
 * root and level 0, the other for these layers deeper than 0.
 *
 * LE = less and equal,
 * A node occupies 16 bytes. so, totallly less than 16 * 500 = 8K
 */
struct LmaNodeLE0 {
  size_t son_1st_off;
  size_t homo_idx_buf_off;
  uint16 spl_idx;
  uint16 num_of_son;
  uint16 num_of_homo;
};

/**
 * GE = great and equal
 * A node occupies 8 bytes.
 */
struct LmaNodeGE1 {
  uint16 son_1st_off_l;        // Low bits of the son_1st_off
  uint16 homo_idx_buf_off_l;   // Low bits of the homo_idx_buf_off_1
  uint16 spl_idx;
  unsigned char num_of_son;            // number of son nodes
  unsigned char num_of_homo;           // number of homo words
  unsigned char son_1st_off_h;         // high bits of the son_1st_off
  unsigned char homo_idx_buf_off_h;    // high bits of the homo_idx_buf_off
};

#ifdef ___BUILD_MODEL___
struct SingleCharItem {
  float freq;
  char16 hz;
  SpellingId splid;
};

struct LemmaEntry {
  LemmaIdType idx_by_py;
  LemmaIdType idx_by_hz;
  char16 hanzi_str[kMaxLemmaSize + 1];

  // The SingleCharItem id for each Hanzi.
  uint16 hanzi_scis_ids[kMaxLemmaSize];

  uint16 spl_idx_arr[kMaxLemmaSize + 1];
  char pinyin_str[kMaxLemmaSize][kMaxPinyinSize + 1];
  unsigned char hz_str_len;
  float freq;
};
#endif  // ___BUILD_MODEL___

}  //  namespace ime_pinyin

#endif  // PINYINIME_INCLUDE_DICTDEF_H__