summary | refs | log | tree | commit | diff
path: root/jni/include/ngram.h
diff options
context:
space:
mode:
Diffstat (limited to 'jni/include/ngram.h')
-rw-r--r-- jni/include/ngram.h 96
1 files changed, 96 insertions, 0 deletions
diff --git a/jni/include/ngram.h b/jni/include/ngram.h
new file mode 100644
index 0000000..ad6c304
--- /dev/null
+++ b/jni/include/ngram.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef PINYINIME_INCLUDE_NGRAM_H__
+#define PINYINIME_INCLUDE_NGRAM_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "./dictdef.h"
+
+namespace ime_pinyin {
+
+typedef unsigned char CODEBOOK_TYPE;  // Element type of NGram::lma_freq_idx_.
+
+static const size_t kCodeBookSize = 256;  // NOTE(review): presumably the entry count of the score code book -- confirm.
+
+class NGram {  // NOTE(review): appears to hold the unigram score model for lemmas -- confirm in ngram.cpp.
+ public:
+ // The maximum score of a lemma item.
+ static const LmaScoreType kMaxScore = 0x3fff;
+
+ // In order to reduce the storage size, the original log value is amplified by
+ // kLogValueAmplifier, and the result is stored as LmaScoreType.
+ // After this process, an item with a lower score has a higher frequency.
+ static const int kLogValueAmplifier = -800;
+
+ // System words' total frequency. It is not the real total frequency; instead,
+ // it is only used to adjust system lemmas' scores when the user dictionary's
+ // total frequency changes.
+ // In this version, frequencies of system lemmas are fixed. We are considering
+ // making them changeable in the next version.
+ static const size_t kSysDictTotalFreq = 100000000;
+
+ private:
+
+ static NGram* instance_;  // NOTE(review): presumably the object returned by get_instance() -- confirm.
+
+ bool initialized_;
+ size_t idx_num_;  // NOTE(review): presumably the number of entries in lma_freq_idx_ -- confirm.
+
+ size_t total_freq_none_sys_;  // See set_total_freq_none_sys().
+
+ // Score compensation for system dictionary lemmas.
+ // After the user adds some user lemmas, the total frequency changes, so this
+ // value is used to normalize the score.
+ float sys_score_compensation_;
+
+#ifdef ___BUILD_MODEL___
+ double *freq_codes_df_;  // Declared only when ___BUILD_MODEL___ is defined.
+#endif
+ LmaScoreType *freq_codes_;  // NOTE(review): presumably the score code book (kCodeBookSize entries) -- confirm.
+ CODEBOOK_TYPE *lma_freq_idx_;  // NOTE(review): presumably a per-lemma index into freq_codes_ -- confirm.
+
+ public:
+ NGram();
+ ~NGram();
+
+ static NGram& get_instance();
+
+ bool save_ngram(FILE *fp);  // NOTE(review): the bool presumably reports success -- confirm.
+ bool load_ngram(FILE *fp);  // NOTE(review): the bool presumably reports success -- confirm.
+
+ // Set the total frequency of all non-system dictionaries.
+ void set_total_freq_none_sys(size_t freq_none_sys);
+
+ float get_uni_psb(LemmaIdType lma_id);  // NOTE(review): presumably the unigram value for lma_id; units (probability vs. score) unconfirmed.
+
+ // Convert a probability to a score. The score will be limited to kMaxScore,
+ // but at runtime we also need a float representation to get an accurate
+ // value of the score.
+ // After the conversion, a lower score indicates a higher probability of the
+ // item.
+ static float convert_psb_to_score(double psb);
+
+#ifdef ___BUILD_MODEL___
+ // For constructing the unigram model.
+ bool build_unigram(LemmaEntry *lemma_arr, size_t num,
+ LemmaIdType next_idx_unused);
+#endif
+};
+}
+
+#endif // PINYINIME_INCLUDE_NGRAM_H__