summary refs log tree commit diff
path: root/PinyinIME/jni/include/dictbuilder.h
diff options
context:
space:
mode:
Diffstat (limited to 'PinyinIME/jni/include/dictbuilder.h')
-rw-r--r-- PinyinIME/jni/include/dictbuilder.h 171
1 files changed, 171 insertions, 0 deletions
diff --git a/PinyinIME/jni/include/dictbuilder.h b/PinyinIME/jni/include/dictbuilder.h
new file mode 100644
index 0000000..da0d6cd
--- /dev/null
+++ b/PinyinIME/jni/include/dictbuilder.h
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__
+#define PINYINIME_INCLUDE_DICTBUILDER_H__
+
+#include <stdlib.h>
+#include "./utf16char.h"
+#include "./dictdef.h"
+#include "./dictlist.h"
+#include "./spellingtable.h"
+#include "./spellingtrie.h"
+#include "./splparser.h"
+
+namespace ime_pinyin {
+
+#ifdef ___BUILD_MODEL___
+
+#define ___DO_STATISTICS___
+
+class DictTrie;
+
+class DictBuilder {  // Builds a dictionary trie from a raw file; see build_dict().
+ private:
+ // The raw lemma array buffer.
+ LemmaEntry *lemma_arr_;
+ size_t lemma_num_;
+
+ // Used to store all possible single char items.
+ // Two items may have the same Hanzi while their spelling ids are different.
+ SingleCharItem *scis_;
+ size_t scis_num_;
+
+ // In the tree, root's level is -1.
+ // Lemma nodes for the root and for level 0.
+ LmaNodeLE0 *lma_nodes_le0_;
+
+ // Lemma nodes for layers whose levels are deeper than 0.
+ LmaNodeGE1 *lma_nodes_ge1_;
+
+ // Number of used lemma nodes.
+ size_t lma_nds_used_num_le0_;
+ size_t lma_nds_used_num_ge1_;
+
+ // Used to store homophones' ids.
+ LemmaIdType *homo_idx_buf_;
+ // Number of homophones each of which only contains one Chinese character.
+ size_t homo_idx_num_eq1_;
+ // Number of homophones each of which contains more than one character.
+ size_t homo_idx_num_gt1_;
+
+ // The items with highest scores.
+ LemmaEntry *top_lmas_;
+ size_t top_lmas_num_;
+
+ SpellingTable *spl_table_;
+ SpellingParser *spl_parser_;
+
+#ifdef ___DO_STATISTICS___  // Statistics counters; arrays have kMaxLemmaSize entries.
+ size_t max_sonbuf_len_[kMaxLemmaSize];
+ size_t max_homobuf_len_[kMaxLemmaSize];
+
+ size_t total_son_num_[kMaxLemmaSize];
+ size_t total_node_hasson_[kMaxLemmaSize];
+ size_t total_sonbuf_num_[kMaxLemmaSize];
+ size_t total_sonbuf_allnoson_[kMaxLemmaSize];
+ size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize];
+ size_t total_homo_num_[kMaxLemmaSize];
+
+ size_t sonbufs_num1_; // Number of son buffers with only 1 son.
+ size_t sonbufs_numgt1_; // Number of son buffers with more than 1 son.
+
+ size_t total_lma_node_num_;
+
+ void stat_init();
+ void stat_print();
+#endif  // ___DO_STATISTICS___
+
+ public:
+
+ DictBuilder();
+ ~DictBuilder();  // NOTE(review): copying is not disabled; confirm it is unused.
+
+ // Build dictionary trie from the file fn_raw. File fn_validhzs provides
+ // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be
+ // included. Presumably returns true on success -- TODO confirm.
+ bool build_dict(const char* fn_raw, const char* fn_validhzs,
+ DictTrie *dict_trie);
+
+ private:
+ // Fill in the buffer with id. The caller guarantees that the parameters are
+ // valid.
+ void id_to_charbuf(unsigned char *buf, LemmaIdType id);
+
+ // Update the offset of sons for a node.
+ void set_son_offset(LmaNodeGE1 *node, size_t offset);
+
+ // Update the offset of homophones' ids for a node.
+ void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset);
+
+ // Format a spelling string.
+ void format_spelling_str(char *spl_str);
+
+ // Sort the lemma_arr by the hanzi string, and give each unique item an
+ // id. The lemma list is sorted by Hanzi string so that items starting with
+ // a given prefix string can be found when doing prediction.
+ // Note that the single char items may be in another order, for example,
+ // in spelling id order, etc.
+ // Return value is the next un-allocated idx available.
+ LemmaIdType sort_lemmas_by_hz();
+
+ // Build the SingleCharItem list, and fill the hanzi_scis_ids in the
+ // lemma buffer lemma_arr_.
+ // This function should be called after the lemma array is ready.
+ // Return the number of unique SingleCharItem elements.
+ size_t build_scis();
+
+ // Construct a subtree using a subset of the spelling array (from
+ // item_start to item_end).
+ // parent is the parent node in which the necessary information is updated.
+ // parent can point to either an LmaNodeLE0 or an LmaNodeGE1.
+ bool construct_subset(void* parent, LemmaEntry* lemma_arr,
+ size_t item_start, size_t item_end, size_t level);
+
+
+ // Read valid Chinese Hanzis from the given file.
+ // num is used to return the number of chars.
+ // The returned buffer is sorted and the caller needs to free it.
+ char16* read_valid_hanzis(const char *fn_validhzs, size_t *num);
+
+
+ // Read a raw dictionary. max_item is the maximum number of items. If there
+ // are more items in the dictionary, only the first max_item will be read.
+ // Returned value is the number of items successfully read from the file.
+ size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs,
+ size_t max_item);
+
+ // Try to find if a character is in hzs buffer.
+ bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz);
+
+ // Try to find if all characters in str are in hzs buffer.
+ bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len,
+ const char16 *str, size_t str_len);
+
+ // Get the lemmas with the highest scores.
+ void get_top_lemmas();
+
+ // Allocate resources to build the dictionary.
+ // lma_num is the number of items to be loaded.
+ bool alloc_resource(size_t lma_num);
+
+ // Free resources.
+ void free_resource();
+};
+#endif // ___BUILD_MODEL___
+}
+
+#endif // PINYINIME_INCLUDE_DICTBUILDER_H__