diff options
Diffstat (limited to 'jni/include/dicttrie.h')
-rw-r--r-- | jni/include/dicttrie.h | 233 |
1 files changed, 233 insertions, 0 deletions
diff --git a/jni/include/dicttrie.h b/jni/include/dicttrie.h new file mode 100644 index 0000000..268624f --- /dev/null +++ b/jni/include/dicttrie.h @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef PINYINIME_INCLUDE_DICTTRIE_H__ +#define PINYINIME_INCLUDE_DICTTRIE_H__ + +#include <stdlib.h> +#include "./atomdictbase.h" +#include "./dictdef.h" +#include "./dictlist.h" +#include "./searchutility.h" + +namespace ime_pinyin { + +class DictTrie : AtomDictBase { + private: + typedef struct ParsingMark { + size_t node_offset:24; + size_t node_num:8; // Number of nodes with this spelling id given + // by spl_id. If spl_id is a Shengmu, for nodes + // in the first layer of DictTrie, it equals to + // SpellingTrie::shm2full_num(); but for those + // nodes which are not in the first layer, + // node_num < SpellingTrie::shm2full_num(). + // For a full spelling id, node_num = 1; + }; + + // Used to indicate an extended mile stone. + // An extended mile stone is used to mark a partial match in the dictionary + // trie to speed up further potential extending. + // For example, when the user inputs "w", a mile stone is created to mark the + // partial match status, so that when user inputs another char 'm', it will be + // faster to extend search space based on this mile stone. + // + // For partial match status of "wm", there can be more than one sub mile + // stone, for example, "wm" can be matched to "wanm", "wom", ..., etc, so + // there may be more one parsing mark used to mark these partial matchings. + // A mile stone records the starting position in the mark list and number of + // marks. + struct MileStone { + uint16 mark_start; + uint16 mark_num; + }; + + DictList* dict_list_; + + const SpellingTrie *spl_trie_; + + LmaNodeLE0* root_; // Nodes for root and the first layer. + LmaNodeGE1* nodes_ge1_; // Nodes for other layers. + + // An quick index from spelling id to the LmaNodeLE0 node buffer, or + // to the root_ buffer. + // Index length: + // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used + // to get the end. + // All Shengmu ids are not indexed because they will be converted into + // corresponding full ids. + // So, given an id splid, the son is: + // root_[splid_le0_index_[splid - kFullSplIdStart]] + uint16 *splid_le0_index_; + + size_t lma_node_num_le0_; + size_t lma_node_num_ge1_; + + // The first part is for homophnies, and the last top_lma_num_ items are + // lemmas with highest scores. + unsigned char *lma_idx_buf_; + size_t lma_idx_buf_len_; // The total size of lma_idx_buf_ in byte. + size_t total_lma_num_; // Total number of lemmas in this dictionary. + size_t top_lmas_num_; // Number of lemma with highest scores. + + // Parsing mark list used to mark the detailed extended statuses. + ParsingMark *parsing_marks_; + // The position for next available mark. + uint16 parsing_marks_pos_; + + // Mile stone list used to mark the extended status. + MileStone *mile_stones_; + // The position for the next available mile stone. We use positions (except 0) + // as handles. + MileStoneHandle mile_stones_pos_; + + // Get the offset of sons for a node. + inline size_t get_son_offset(const LmaNodeGE1 *node); + + // Get the offset of homonious ids for a node. + inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node); + + // Get the lemma id by the offset. + inline LemmaIdType get_lemma_id(size_t id_offset); + + void free_resource(bool free_dict_list); + + bool load_dict(FILE *fp); + + // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill + // them into the lpi_items buffer. + // This function is called by the search engine. + size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, + LmaNodeLE0 *node); + + // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill + // them into the lpi_items buffer. + // This function is called by inner functions extend_dict0(), extend_dict1() + // and extend_dict2(). + size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size, + size_t homo_buf_off, LmaNodeGE1 *node, + uint16 lma_len); + + // Extend in the trie from level 0. + MileStoneHandle extend_dict0(MileStoneHandle from_handle, + const DictExtPara *dep, LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num); + + // Extend in the trie from level 1. + MileStoneHandle extend_dict1(MileStoneHandle from_handle, + const DictExtPara *dep, LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num); + + // Extend in the trie from level 2. + MileStoneHandle extend_dict2(MileStoneHandle from_handle, + const DictExtPara *dep, LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num); + + // Try to extend the given spelling id buffer, and if the given id_lemma can + // be successfully gotten, return true; + // The given spelling ids are all valid full ids. + bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma); + +#ifdef ___BUILD_MODEL___ + bool save_dict(FILE *fp); +#endif // ___BUILD_MODEL___ + + static const int kMaxMileStone = 100; + static const int kMaxParsingMark = 600; + static const MileStoneHandle kFirstValidMileStoneHandle = 1; + + friend class DictParser; + friend class DictBuilder; + + public: + + DictTrie(); + ~DictTrie(); + +#ifdef ___BUILD_MODEL___ + // Construct the tree from the file fn_raw. + // fn_validhzs provide the valid hanzi list. If fn_validhzs is + // NULL, only chars in GB2312 will be included. + bool build_dict(const char *fn_raw, const char *fn_validhzs); + + // Save the binary dictionary + // Actually, the SpellingTrie/DictList instance will be also saved. + bool save_dict(const char *filename); +#endif // ___BUILD_MODEL___ + + void convert_to_hanzis(char16 *str, uint16 str_len); + + void convert_to_scis_ids(char16 *str, uint16 str_len); + + // Load a binary dictionary + // The SpellingTrie instance/DictList will be also loaded + bool load_dict(const char *filename, LemmaIdType start_id, + LemmaIdType end_id); + bool load_dict_fd(int sys_fd, long start_offset, long length, + LemmaIdType start_id, LemmaIdType end_id); + bool close_dict() {return true;} + size_t number_of_lemmas() {return 0;} + + void reset_milestones(uint16 from_step, MileStoneHandle from_handle); + + MileStoneHandle extend_dict(MileStoneHandle from_handle, + const DictExtPara *dep, + LmaPsbItem *lpi_items, + size_t lpi_max, size_t *lpi_num); + + size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len, + LmaPsbItem *lpi_items, size_t lpi_max); + + uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max); + + uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids, + uint16 splids_max, bool arg_valid); + + size_t predict(const char16 *last_hzs, uint16 hzs_len, + NPredictItem *npre_items, size_t npre_max, + size_t b4_used); + + LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[], + uint16 lemma_len, uint16 count) {return 0;} + + LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count, + bool selected) {return 0;} + + LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[], + uint16 lemma_len) {return 0;} + + LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;} + + LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[], + uint16 lemma_len) {return 0;} + + bool remove_lemma(LemmaIdType lemma_id) {return false;} + + size_t get_total_lemma_count() {return 0;} + void set_total_lemma_count_of_others(size_t count); + + void flush_cache() {} + + LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len); + + // Fill the lemmas with highest scores to the prediction buffer. + // his_len is the history length to fill in the prediction buffer. + size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items, + size_t npre_max, size_t b4_used); +}; +} + +#endif // PINYINIME_INCLUDE_DICTTRIE_H__ |