path: root/jni/include/dicttrie.h
Diffstat (limited to 'jni/include/dicttrie.h')
-rw-r--r--  jni/include/dicttrie.h  233
1 file changed, 233 insertions, 0 deletions
diff --git a/jni/include/dicttrie.h b/jni/include/dicttrie.h
new file mode 100644
index 0000000..268624f
--- /dev/null
+++ b/jni/include/dicttrie.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_DICTTRIE_H__
+#define PINYINIME_INCLUDE_DICTTRIE_H__
+
+#include <stdlib.h>
+#include "./atomdictbase.h"
+#include "./dictdef.h"
+#include "./dictlist.h"
+#include "./searchutility.h"
+
+namespace ime_pinyin {
+
+class DictTrie : AtomDictBase {
+ private:
+  struct ParsingMark {
+ size_t node_offset:24;
+    size_t node_num:8;    // Number of nodes with this spelling id given
+                          // by spl_id. If spl_id is a Shengmu, for nodes
+                          // in the first layer of DictTrie, it equals
+                          // SpellingTrie::shm2full_num(); but for those
+                          // nodes which are not in the first layer,
+                          // node_num < SpellingTrie::shm2full_num().
+                          // For a full spelling id, node_num = 1.
+ };
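+
+  // Illustrative sketch, not part of the original header: a ParsingMark
+  // describes one contiguous group of son nodes, so a hypothetical helper
+  // that walks the first-layer nodes covered by a mark could look roughly
+  // like this (assuming the mark refers to the root_ buffer):
+  //
+  //   void visit_le0_nodes(const ParsingMark &mark) {
+  //     for (size_t i = 0; i < mark.node_num; i++) {
+  //       LmaNodeLE0 *node = root_ + mark.node_offset + i;
+  //       // ... node matches the spelling id recorded for this mark ...
+  //     }
+  //   }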
+
+ // Used to indicate an extended mile stone.
+ // An extended mile stone is used to mark a partial match in the dictionary
+ // trie to speed up further potential extending.
+  // For example, when the user inputs "w", a mile stone is created to mark
+  // the partial match status, so that when the user inputs another char 'm',
+  // it will be faster to extend the search space based on this mile stone.
+ //
+  // For the partial match status of "wm", there can be more than one sub
+  // mile stone; for example, "wm" can be matched to "wanm", "wom", ..., etc,
+  // so there may be more than one parsing mark used to mark these partial
+  // matches.
+ // A mile stone records the starting position in the mark list and number of
+ // marks.
+ struct MileStone {
+ uint16 mark_start;
+ uint16 mark_num;
+ };
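+
+  // Illustrative sketch, not part of the original header: a MileStone only
+  // records a slice of the parsing mark list, so a hypothetical helper that
+  // enumerates the parsing marks belonging to one mile stone could look
+  // roughly like this (handle validity checks omitted):
+  //
+  //   void visit_mile_stone(MileStoneHandle handle) const {
+  //     const MileStone &ms = mile_stones_[handle];
+  //     for (uint16 i = 0; i < ms.mark_num; i++) {
+  //       const ParsingMark &mark = parsing_marks_[ms.mark_start + i];
+  //       // ... each mark covers one group of partially matched nodes ...
+  //     }
+  //   }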
+
+ DictList* dict_list_;
+
+ const SpellingTrie *spl_trie_;
+
+ LmaNodeLE0* root_; // Nodes for root and the first layer.
+ LmaNodeGE1* nodes_ge1_; // Nodes for other layers.
+
+  // A quick index from spelling id to the LmaNodeLE0 node buffer, or
+ // to the root_ buffer.
+ // Index length:
+ // SpellingTrie::get_instance().get_spelling_num() + 1. The last one is used
+ // to get the end.
+  // Shengmu ids are not indexed because they are first converted into the
+  // corresponding full ids.
+ // So, given an id splid, the son is:
+ // root_[splid_le0_index_[splid - kFullSplIdStart]]
+ uint16 *splid_le0_index_;
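+
+  // Illustrative sketch, not part of the original header: under the layout
+  // described above, the first-layer sons for a full spelling id splid form
+  // a contiguous range of root_, roughly:
+  //
+  //   uint16 begin = splid_le0_index_[splid - kFullSplIdStart];
+  //   uint16 end   = splid_le0_index_[splid - kFullSplIdStart + 1];
+  //   for (uint16 i = begin; i < end; i++) {
+  //     LmaNodeLE0 *son = root_ + i;
+  //     // ... son covers the spelling id splid ...
+  //   }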
+
+ size_t lma_node_num_le0_;
+ size_t lma_node_num_ge1_;
+
+  // The first part is for homophones, and the last top_lmas_num_ items are
+  // the lemmas with the highest scores.
+ unsigned char *lma_idx_buf_;
+  size_t lma_idx_buf_len_;  // The total size of lma_idx_buf_ in bytes.
+  size_t total_lma_num_;    // Total number of lemmas in this dictionary.
+  size_t top_lmas_num_;     // Number of lemmas with the highest scores.
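+
+  // Illustrative sketch, not part of the original header: lma_idx_buf_ is a
+  // plain byte buffer, so get_lemma_id(id_offset) is assumed to decode a
+  // fixed-width little-endian id, with kLemmaIdSize assumed to be the id
+  // width in bytes from dictdef.h, roughly:
+  //
+  //   LemmaIdType id = 0;
+  //   for (size_t i = kLemmaIdSize; i > 0; i--)
+  //     id = (id << 8) + lma_idx_buf_[id_offset * kLemmaIdSize + i - 1];
+  //   return id;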
+
+ // Parsing mark list used to mark the detailed extended statuses.
+ ParsingMark *parsing_marks_;
+ // The position for next available mark.
+ uint16 parsing_marks_pos_;
+
+ // Mile stone list used to mark the extended status.
+ MileStone *mile_stones_;
+ // The position for the next available mile stone. We use positions (except 0)
+ // as handles.
+ MileStoneHandle mile_stones_pos_;
+
+ // Get the offset of sons for a node.
+ inline size_t get_son_offset(const LmaNodeGE1 *node);
+
+  // Get the offset of homophone ids for a node.
+ inline size_t get_homo_idx_buf_offset(const LmaNodeGE1 *node);
+
+ // Get the lemma id by the offset.
+ inline LemmaIdType get_lemma_id(size_t id_offset);
+
+ void free_resource(bool free_dict_list);
+
+ bool load_dict(FILE *fp);
+
+ // Given a LmaNodeLE0 node, extract the lemmas specified by it, and fill
+ // them into the lpi_items buffer.
+ // This function is called by the search engine.
+ size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
+ LmaNodeLE0 *node);
+
+ // Given a LmaNodeGE1 node, extract the lemmas specified by it, and fill
+ // them into the lpi_items buffer.
+ // This function is called by inner functions extend_dict0(), extend_dict1()
+ // and extend_dict2().
+ size_t fill_lpi_buffer(LmaPsbItem lpi_items[], size_t max_size,
+ size_t homo_buf_off, LmaNodeGE1 *node,
+ uint16 lma_len);
+
+ // Extend in the trie from level 0.
+ MileStoneHandle extend_dict0(MileStoneHandle from_handle,
+ const DictExtPara *dep, LmaPsbItem *lpi_items,
+ size_t lpi_max, size_t *lpi_num);
+
+ // Extend in the trie from level 1.
+ MileStoneHandle extend_dict1(MileStoneHandle from_handle,
+ const DictExtPara *dep, LmaPsbItem *lpi_items,
+ size_t lpi_max, size_t *lpi_num);
+
+ // Extend in the trie from level 2.
+ MileStoneHandle extend_dict2(MileStoneHandle from_handle,
+ const DictExtPara *dep, LmaPsbItem *lpi_items,
+ size_t lpi_max, size_t *lpi_num);
+
+  // Try to extend the given spelling id buffer; return true if the given
+  // id_lemma can be successfully reached.
+  // The given spelling ids are all valid full ids.
+ bool try_extend(const uint16 *splids, uint16 splid_num, LemmaIdType id_lemma);
+
+#ifdef ___BUILD_MODEL___
+ bool save_dict(FILE *fp);
+#endif // ___BUILD_MODEL___
+
+ static const int kMaxMileStone = 100;
+ static const int kMaxParsingMark = 600;
+ static const MileStoneHandle kFirstValidMileStoneHandle = 1;
+
+ friend class DictParser;
+ friend class DictBuilder;
+
+ public:
+
+ DictTrie();
+ ~DictTrie();
+
+#ifdef ___BUILD_MODEL___
+ // Construct the tree from the file fn_raw.
+  // fn_validhzs provides the valid Hanzi list. If fn_validhzs is
+  // NULL, only characters in GB2312 will be included.
+ bool build_dict(const char *fn_raw, const char *fn_validhzs);
+
+  // Save the binary dictionary.
+  // The SpellingTrie and DictList instances will also be saved.
+ bool save_dict(const char *filename);
+#endif // ___BUILD_MODEL___
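+
+  // Illustrative sketch, not part of the original header: at model-building
+  // time the two functions above are expected to be used together; the file
+  // names below are only placeholders:
+  //
+  //   DictTrie trie;
+  //   if (trie.build_dict("rawdict.txt", NULL))
+  //     trie.save_dict("dict_pinyin.dat");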
+
+ void convert_to_hanzis(char16 *str, uint16 str_len);
+
+ void convert_to_scis_ids(char16 *str, uint16 str_len);
+
+  // Load a binary dictionary.
+  // The SpellingTrie and DictList instances will also be loaded.
+ bool load_dict(const char *filename, LemmaIdType start_id,
+ LemmaIdType end_id);
+ bool load_dict_fd(int sys_fd, long start_offset, long length,
+ LemmaIdType start_id, LemmaIdType end_id);
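+
+  // Illustrative sketch, not part of the original header: loading at runtime
+  // is assumed to look roughly like this; the file name and the id range are
+  // only placeholders:
+  //
+  //   DictTrie trie;
+  //   if (!trie.load_dict("dict_pinyin.dat", start_id, end_id)) {
+  //     // handle the load failure
+  //   }
+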
+ bool close_dict() {return true;}
+ size_t number_of_lemmas() {return 0;}
+
+ void reset_milestones(uint16 from_step, MileStoneHandle from_handle);
+
+ MileStoneHandle extend_dict(MileStoneHandle from_handle,
+ const DictExtPara *dep,
+ LmaPsbItem *lpi_items,
+ size_t lpi_max, size_t *lpi_num);
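+
+  // Illustrative sketch, not part of the original header: a search engine is
+  // assumed to drive the trie step by step, reusing the returned handle for
+  // the next extension (dep, lpi_items and the sizes are placeholders):
+  //
+  //   size_t lpi_num = 0;
+  //   MileStoneHandle next = trie.extend_dict(from_handle, &dep, lpi_items,
+  //                                           lpi_max, &lpi_num);
+  //   if (next != 0) {
+  //     // next marks the new partial match; lpi_items[0 .. lpi_num) holds
+  //     // the lemmas that are already fully matched at this step.
+  //   }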
+
+ size_t get_lpis(const uint16 *splid_str, uint16 splid_str_len,
+ LmaPsbItem *lpi_items, size_t lpi_max);
+
+ uint16 get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, uint16 str_max);
+
+ uint16 get_lemma_splids(LemmaIdType id_lemma, uint16 *splids,
+ uint16 splids_max, bool arg_valid);
+
+ size_t predict(const char16 *last_hzs, uint16 hzs_len,
+ NPredictItem *npre_items, size_t npre_max,
+ size_t b4_used);
+
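+  // Note, not part of the original header: the update-related operations
+  // below are no-op stubs here; DictTrie is assumed to be a read-only system
+  // dictionary, and a user dictionary implementation of AtomDictBase would
+  // provide real behavior for them.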
+ LemmaIdType put_lemma(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len, uint16 count) {return 0;}
+
+ LemmaIdType update_lemma(LemmaIdType lemma_id, int16 delta_count,
+ bool selected) {return 0;}
+
+ LemmaIdType get_lemma_id(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len) {return 0;}
+
+ LmaScoreType get_lemma_score(LemmaIdType lemma_id) {return 0;}
+
+ LmaScoreType get_lemma_score(char16 lemma_str[], uint16 splids[],
+ uint16 lemma_len) {return 0;}
+
+ bool remove_lemma(LemmaIdType lemma_id) {return false;}
+
+ size_t get_total_lemma_count() {return 0;}
+ void set_total_lemma_count_of_others(size_t count);
+
+ void flush_cache() {}
+
+ LemmaIdType get_lemma_id(const char16 lemma_str[], uint16 lemma_len);
+
+  // Fill the prediction buffer with the lemmas that have the highest scores.
+  // his_len is the history length to fill in the prediction buffer.
+ size_t predict_top_lmas(size_t his_len, NPredictItem *npre_items,
+ size_t npre_max, size_t b4_used);
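+
+  // Illustrative sketch, not part of the original header: prediction is
+  // assumed to first collect history-based candidates with predict() and
+  // then pad the remaining space with the globally top-scored lemmas
+  // (buffer names are placeholders):
+  //
+  //   size_t used = trie.predict(last_hzs, hzs_len, npre_items,
+  //                              npre_max, 0);
+  //   used += trie.predict_top_lmas(hzs_len, npre_items + used,
+  //                                 npre_max - used, used);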
+};
+}  // namespace ime_pinyin
+
+#endif // PINYINIME_INCLUDE_DICTTRIE_H__