summaryrefslogtreecommitdiff
path: root/jni/share/spellingtable.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'jni/share/spellingtable.cpp')
-rw-r--r--jni/share/spellingtable.cpp313
1 files changed, 313 insertions, 0 deletions
diff --git a/jni/share/spellingtable.cpp b/jni/share/spellingtable.cpp
new file mode 100644
index 0000000..6005e20
--- /dev/null
+++ b/jni/share/spellingtable.cpp
@@ -0,0 +1,313 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <new>
+#include "../include/spellingtable.h"
+
+namespace ime_pinyin {
+
+#ifdef ___BUILD_MODEL___
+
+const char SpellingTable::  // Spellings that put_spelling() refuses to store.
+ kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"};
+
+// qsort() comparator for RawSpelling items. Items are ordered by strcmp() on
+// their spelling strings, except that "" is treated as the biggest value so
+// that every empty item is moved to the end of the array.
+// The "_eb" suffix means "empty is biggest".
+int compare_raw_spl_eb(const void* p1, const void* p2) {
+  const RawSpelling* lhs = static_cast<const RawSpelling*>(p1);
+  const RawSpelling* rhs = static_cast<const RawSpelling*>(p2);
+
+  // The left operand is tested first, so two empty items compare as 1.
+  if ('\0' == lhs->str[0])
+    return 1;
+  if ('\0' == rhs->str[0])
+    return -1;
+
+  return strcmp(lhs->str, rhs->str);
+}
+
+// Returns the smallest prime that is >= value, found by trial division.
+// Despite the name, the result is a prime rather than merely an odd number;
+// 0 and 1 have no candidate divisors and are returned unchanged.
+//
+// The divisor bound is tested as divisor <= candidate / divisor, which is the
+// integer form of divisor * divisor <= candidate. It needs no floating-point
+// sqrt() and cannot overflow.
+size_t get_odd_next(size_t value) {
+  for (size_t candidate = value; ; candidate++) {
+    bool has_divisor = false;
+    for (size_t divisor = 2; divisor <= candidate / divisor; divisor++) {
+      if (candidate % divisor == 0) {
+        has_divisor = true;
+        break;
+      }
+    }
+
+    if (!has_divisor)
+      return candidate;
+  }
+}
+
+// Creates a table that owns no storage. frozen_ starts as true, so
+// put_spelling() and contain() return false until init_table() succeeds.
+SpellingTable::SpellingTable() {
+  need_score_ = false;
+  raw_spellings_ = NULL;
+  spelling_buf_ = NULL;
+  spelling_num_ = 0;
+  total_freq_ = 0;
+  frozen_ = true;
+
+  // Give the remaining members defined values as well, so that
+  // get_score_amplifier() and get_average_score() never read indeterminate
+  // data when they are called before arrange().
+  spelling_size_ = 0;
+  spelling_max_num_ = 0;
+  score_amplifier_ = 0;
+  average_score_ = 0;
+}
+
+// Releases the raw item array and the packed spelling buffer.
+SpellingTable::~SpellingTable() {
+  this->free_resource();
+}
+
+// Computes the home slot of a spelling: the sum of its characters, taken over
+// at most spelling_size_ characters and stopping at the terminator, modulo
+// spelling_max_num_.
+size_t SpellingTable::get_hash_pos(const char* spelling_str) {
+  size_t char_sum = 0;
+  size_t idx = 0;
+  while (idx < spelling_size_ && '\0' != spelling_str[idx]) {
+    char_sum += static_cast<size_t>(spelling_str[idx]);
+    idx++;
+  }
+
+  return char_sum % spelling_max_num_;
+}
+
+// Returns the slot probed after hash_pos: a fixed step of 123 slots forward,
+// wrapped around the table size spelling_max_num_.
+size_t SpellingTable::hash_pos_next(size_t hash_pos) {
+  const size_t stepped = hash_pos + 123;
+  return stepped % spelling_max_num_;
+}
+
+// Frees both heap arrays and resets their pointers to NULL, so the function
+// may be called any number of times.
+void SpellingTable::free_resource() {
+  // delete [] on a NULL pointer does nothing, so no guard is required.
+  delete [] raw_spellings_;
+  delete [] spelling_buf_;
+
+  raw_spellings_ = NULL;
+  spelling_buf_ = NULL;
+}
+
+// (Re)initializes the table so that spellings can be added with
+// put_spelling().
+//
+// pure_spl_size: maximum number of characters of one spelling, without the
+//     terminator; must not be 0.
+// spl_max_num: requested number of slots; must not be 0. The real slot count
+//     is get_odd_next(spl_max_num).
+// need_score: when true, each packed item gets one extra trailing byte that
+//     arrange() fills with the spelling's score.
+//
+// Returns false, leaving the table unchanged, when an argument is 0. Returns
+// false with all storage released and the table frozen when an allocation
+// fails. Returns true otherwise.
+bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num,
+                               bool need_score) {
+  if (pure_spl_size == 0 || spl_max_num == 0)
+    return false;
+
+  need_score_ = need_score;
+
+  // Drop the previous table and stay frozen until the new one is complete, so
+  // that a failed re-initialization cannot leave put_spelling() working on
+  // NULL buffers.
+  free_resource();
+  frozen_ = true;
+
+  // One byte for the string terminator, plus one for the score if requested.
+  spelling_size_ = pure_spl_size + 1;
+  if (need_score)
+    spelling_size_ += 1;
+  spelling_max_num_ = get_odd_next(spl_max_num);
+  spelling_num_ = 0;
+  total_freq_ = 0;
+
+  // A plain new[] reports failure by throwing and never yields NULL; the
+  // nothrow form is what makes the check below meaningful.
+  raw_spellings_ = new (std::nothrow) RawSpelling[spelling_max_num_];
+  spelling_buf_ = new (std::nothrow) char[spelling_max_num_ * spelling_size_];
+  if (NULL == raw_spellings_ || NULL == spelling_buf_) {
+    free_resource();
+    return false;
+  }
+
+  // An all-zero item has an empty string, which marks the slot as unused.
+  memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling));
+  memset(spelling_buf_, 0, spelling_max_num_ * spelling_size_);
+  frozen_ = false;
+  return true;
+}
+
+// Adds freq to the frequency of spelling_str, inserting the spelling first if
+// the table does not hold it yet. Slots are probed from get_hash_pos() in
+// hash_pos_next() steps. At most spelling_size_ - 1 characters of the
+// spelling are compared and stored.
+//
+// Returns false when the table is frozen, when spelling_str is NULL or empty,
+// when the spelling is listed in kNotSupportList, or when the probe sequence
+// returns to its first slot without finding the spelling or an unused slot.
+// total_freq_ only grows when the call succeeds, so it always equals the sum
+// of the stored frequencies that arrange() divides by it.
+bool SpellingTable::put_spelling(const char* spelling_str, double freq) {
+  if (frozen_ || NULL == spelling_str)
+    return false;
+
+  // An empty string is the marker of an unused slot and cannot be stored; it
+  // would only add its frequency to whichever spelling takes the slot later.
+  if ('\0' == spelling_str[0])
+    return false;
+
+  for (size_t pos = 0; pos < kNotSupportNum; pos++) {
+    if (strcmp(spelling_str, kNotSupportList[pos]) == 0)
+      return false;
+  }
+
+  const size_t cmp_len = spelling_size_ - 1;
+  const size_t hash_pos_ori = get_hash_pos(spelling_str);
+  size_t hash_pos = hash_pos_ori;
+
+  do {
+    RawSpelling& slot = raw_spellings_[hash_pos];
+
+    // Already present: only accumulate the frequency.
+    if (strncmp(slot.str, spelling_str, cmp_len) == 0) {
+      slot.freq += freq;
+      total_freq_ += freq;
+      return true;
+    }
+
+    // Unused slot: store the spelling here.
+    if ('\0' == slot.str[0]) {
+      strncpy(slot.str, spelling_str, cmp_len);
+      slot.str[cmp_len] = '\0';
+      slot.freq += freq;
+      total_freq_ += freq;
+      spelling_num_++;
+      return true;
+    }
+
+    hash_pos = hash_pos_next(hash_pos);
+  } while (hash_pos != hash_pos_ori);
+
+  // Every slot of the probe sequence holds a different spelling.
+  return false;
+}
+
+// Tells whether spelling_str has been stored with put_spelling(). At most
+// spelling_size_ - 1 characters are compared. The answer is always false when
+// spelling_str is NULL, when the table has no buffer, or when it is frozen.
+bool SpellingTable::contain(const char* spelling_str) {
+  if (frozen_ || NULL == spelling_buf_ || NULL == spelling_str)
+    return false;
+
+  const size_t cmp_len = spelling_size_ - 1;
+  const size_t first_slot = get_hash_pos(spelling_str);
+  size_t slot = first_slot;
+
+  // Follow the probe sequence until an unused slot or a match is found, or
+  // until the sequence comes back to where it started.
+  do {
+    const char* stored = raw_spellings_[slot].str;
+    if ('\0' == stored[0])
+      return false;
+    if (0 == strncmp(stored, spelling_str, cmp_len))
+      return true;
+
+    slot = hash_pos_next(slot);
+  } while (slot != first_slot);
+
+  return false;
+}
+
+// Freezes the table and packs its spellings into one sorted, flat buffer.
+//
+// The raw items are sorted with compare_raw_spl_eb() (strcmp() order, unused
+// slots last) and the first spelling_num_ strings are copied into
+// spelling_buf_, one item every spelling_size_ bytes. When scores were
+// requested in init_table(), every frequency is divided by total_freq_ and
+// log(probability) * score_amplifier_, capped at 255, is stored in the last
+// byte of the item; score_amplifier_ is 255 / log(smallest probability).
+//
+// item_size: receives the size of one item in bytes; must not be NULL.
+// spl_num: receives the number of items; must not be NULL.
+//
+// Returns the packed buffer, which stays owned by this object, or NULL when
+// the table has no storage or an output pointer is NULL. Calling arrange()
+// again on a table that is already arranged returns the same result.
+const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) {
+  if (NULL == raw_spellings_ || NULL == spelling_buf_ ||
+      NULL == item_size || NULL == spl_num)
+    return NULL;
+
+  // In this file the buffers exist while frozen_ is set only after a completed
+  // arrange(). Running the steps below again would divide the already
+  // normalized frequencies by total_freq_ a second time, so hand back the
+  // existing result instead.
+  if (frozen_) {
+    *item_size = spelling_size_;
+    *spl_num = spelling_num_;
+    return spelling_buf_;
+  }
+
+  qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling),
+        compare_raw_spl_eb);
+
+  // After sorting, only the first spelling_num_ items are valid.
+  // Copy them to the destination buffer.
+  for (size_t pos = 0; pos < spelling_num_; pos++) {
+    strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str,
+            spelling_size_);
+  }
+
+  if (need_score_ && 0 == spelling_num_) {
+    // Nothing to score. Skipping the computation avoids log(0) and a 0 / 0
+    // average.
+    score_amplifier_ = 0;
+    average_score_ = 0;
+  } else if (need_score_) {
+    if (kPrintDebug0)
+      printf("------------Spelling Possiblities--------------\n");
+
+    // Turn the frequencies into probabilities and find their extremes.
+    // NOTE(review): assumes total_freq_ > 0, i.e. that the frequencies passed
+    // to put_spelling() were positive -- confirm against the callers.
+    raw_spellings_[0].freq /= total_freq_;
+    double max_score = raw_spellings_[0].freq;
+    double min_score = max_score;
+    for (size_t pos = 1; pos < spelling_num_; pos++) {
+      raw_spellings_[pos].freq /= total_freq_;
+      if (raw_spellings_[pos].freq > max_score)
+        max_score = raw_spellings_[pos].freq;
+      if (raw_spellings_[pos].freq < min_score)
+        min_score = raw_spellings_[pos].freq;
+    }
+
+    if (kPrintDebug0)
+      printf("-----max psb: %f, min psb: %f\n", max_score, min_score);
+
+    max_score = log(max_score);
+    min_score = log(min_score);
+
+    if (kPrintDebug0)
+      printf("-----max log value: %f, min log value: %f\n",
+             max_score, min_score);
+
+    // The absolute value of min_score is bigger than that of max_score because
+    // both of them are negative after log function.
+    score_amplifier_ = 1.0 * 255 / min_score;
+
+    double average_score = 0;
+    for (size_t pos = 0; pos < spelling_num_; pos++) {
+      const double log_psb = log(raw_spellings_[pos].freq);
+
+      // A probability of exactly 1 has a log of 0. If that is also the
+      // smallest probability, score_amplifier_ is infinite and the product
+      // would be NaN, so the score is set to 0 directly.
+      double score = (0 == log_psb) ? 0 : log_psb * score_amplifier_;
+      assert(score >= 0);
+
+      average_score += score;
+
+      // Because of calculation precision issue, score might be a little bigger
+      // than 255 after being amplified.
+      if (score > 255)
+        score = 255;
+      char *this_spl_buf = spelling_buf_ + pos * spelling_size_;
+      this_spl_buf[spelling_size_ - 1] =
+          static_cast<char>(static_cast<unsigned char>(score));
+
+      if (kPrintDebug0) {
+        // pos is a size_t; convert it to match the %d conversion.
+        printf("---pos:%d, %s, psb:%d\n", static_cast<int>(pos), this_spl_buf,
+               static_cast<unsigned char>(this_spl_buf[spelling_size_ - 1]));
+      }
+    }
+    average_score /= spelling_num_;
+    assert(average_score <= 255);
+    average_score_ = static_cast<uint8>(average_score);
+
+    if (kPrintDebug0)
+      printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_,
+             average_score_);
+  }
+
+  *item_size = spelling_size_;
+  *spl_num = spelling_num_;
+  frozen_ = true;
+  return spelling_buf_;
+}
+
+// Returns score_amplifier_, the factor arrange() multiplies log-probabilities
+// by to obtain scores, converted to float.
+float SpellingTable::get_score_amplifier() {
+  const float amplifier = static_cast<float>(score_amplifier_);
+  return amplifier;
+}
+
+// Returns average_score_, the mean of the spelling scores that arrange()
+// computed.
+unsigned char SpellingTable::get_average_score() {
+  const unsigned char average = average_score_;
+  return average;
+}
+
+#endif // ___BUILD_MODEL___
+} // namespace ime_pinyin