diff options
Diffstat (limited to 'PinyinIME/jni/include/dictbuilder.h')
-rw-r--r-- | PinyinIME/jni/include/dictbuilder.h | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/PinyinIME/jni/include/dictbuilder.h b/PinyinIME/jni/include/dictbuilder.h new file mode 100644 index 0000000..da0d6cd --- /dev/null +++ b/PinyinIME/jni/include/dictbuilder.h @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_DICTBUILDER_H__ +#define PINYINIME_INCLUDE_DICTBUILDER_H__ + +#include <stdlib.h> +#include "./utf16char.h" +#include "./dictdef.h" +#include "./dictlist.h" +#include "./spellingtable.h" +#include "./spellingtrie.h" +#include "./splparser.h" + +namespace ime_pinyin { + +#ifdef ___BUILD_MODEL___ + +#define ___DO_STATISTICS___ + +class DictTrie; + +class DictBuilder { + private: + // The raw lemma array buffer. + LemmaEntry *lemma_arr_; + size_t lemma_num_; + + // Used to store all possible single char items. + // Two items may have the same Hanzi while their spelling ids are different. + SingleCharItem *scis_; + size_t scis_num_; + + // In the tree, root's level is -1. + // Lemma nodes for root, and level 0 + LmaNodeLE0 *lma_nodes_le0_; + + // Lemma nodes for layers whose levels are deeper than 0 + LmaNodeGE1 *lma_nodes_ge1_; + + // Number of used lemma nodes + size_t lma_nds_used_num_le0_; + size_t lma_nds_used_num_ge1_; + + // Used to store homophones' ids. + LemmaIdType *homo_idx_buf_; + // Number of homophones each of which only contains one Chinese character. 
+ size_t homo_idx_num_eq1_; + // Number of homophones each of which contains more than one character. + size_t homo_idx_num_gt1_; + + // The items with highest scores. + LemmaEntry *top_lmas_; + size_t top_lmas_num_; + + SpellingTable *spl_table_; + SpellingParser *spl_parser_; + +#ifdef ___DO_STATISTICS___ + size_t max_sonbuf_len_[kMaxLemmaSize]; + size_t max_homobuf_len_[kMaxLemmaSize]; + + size_t total_son_num_[kMaxLemmaSize]; + size_t total_node_hasson_[kMaxLemmaSize]; + size_t total_sonbuf_num_[kMaxLemmaSize]; + size_t total_sonbuf_allnoson_[kMaxLemmaSize]; + size_t total_node_in_sonbuf_allnoson_[kMaxLemmaSize]; + size_t total_homo_num_[kMaxLemmaSize]; + + size_t sonbufs_num1_; // Number of son buffers with only 1 son + size_t sonbufs_numgt1_; // Number of son buffers with more than 1 son + + size_t total_lma_node_num_; + + void stat_init(); + void stat_print(); +#endif + + public: + + DictBuilder(); + ~DictBuilder(); + + // Build dictionary trie from the file fn_raw. File fn_validhzs provides + // valid chars. If fn_validhzs is NULL, only chars in GB2312 will be + // included. + bool build_dict(const char* fn_raw, const char* fn_validhzs, + DictTrie *dict_trie); + + private: + // Fill in the buffer with id. The caller guarantees that the parameters are + // valid. + void id_to_charbuf(unsigned char *buf, LemmaIdType id); + + // Update the offset of sons for a node. + void set_son_offset(LmaNodeGE1 *node, size_t offset); + + // Update the offset of homophones' ids for a node. + void set_homo_id_buf_offset(LmaNodeGE1 *node, size_t offset); + + // Format a spelling string. + void format_spelling_str(char *spl_str); + + // Sort the lemma_arr by the hanzi string, and give each unique item + // an id. The reason we sort the lemma list according to their Hanzi string + // is to find items starting with a given prefix string to do prediction. + // Actually, the single char items can be in another order, for example, + // in spelling id order, etc. 
+ // Return value is the next unallocated idx available. + LemmaIdType sort_lemmas_by_hz(); + + // Build the SingleCharItem list, and fill the hanzi_scis_ids in the + // lemma buffer lemma_arr_. + // This function should be called after the lemma array is ready. + // Return the number of unique SingleCharItem elements. + size_t build_scis(); + + // Construct a subtree using a subset of the spelling array (from + // item_start to item_end). + // parent is the parent node to update the necessary information. + // parent can be a member of LmaNodeLE0 or LmaNodeGE1. + bool construct_subset(void* parent, LemmaEntry* lemma_arr, + size_t item_start, size_t item_end, size_t level); + + + // Read valid Chinese Hanzis from the given file. + // num is used to return number of chars. + // The return buffer is sorted and caller needs to free the returned buffer. + char16* read_valid_hanzis(const char *fn_validhzs, size_t *num); + + + // Read a raw dictionary. max_item is the maximum number of items. If there + // are more items in the dictionary, only the first max_item will be read. + // Returned value is the number of items successfully read from the file. + size_t read_raw_dict(const char* fn_raw, const char *fn_validhzs, + size_t max_item); + + // Try to find if a character is in hzs buffer. + bool hz_in_hanzis_list(const char16 *hzs, size_t hzs_len, char16 hz); + + // Try to find if all characters in str are in hzs buffer. + bool str_in_hanzis_list(const char16 *hzs, size_t hzs_len, + const char16 *str, size_t str_len); + + // Get the lemmas with the highest scores. + void get_top_lemmas(); + + // Allocate resource to build dictionary. + // lma_num is the number of items to be loaded + bool alloc_resource(size_t lma_num); + + // Free resource. + void free_resource(); +}; +#endif // ___BUILD_MODEL___ +} + +#endif // PINYINIME_INCLUDE_DICTBUILDER_H__ |