diff options
Diffstat (limited to 'jni/share/spellingtable.cpp')
-rw-r--r-- | jni/share/spellingtable.cpp | 313 |
1 files changed, 313 insertions, 0 deletions
diff --git a/jni/share/spellingtable.cpp b/jni/share/spellingtable.cpp new file mode 100644 index 0000000..6005e20 --- /dev/null +++ b/jni/share/spellingtable.cpp @@ -0,0 +1,313 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <assert.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include "../include/spellingtable.h" + +namespace ime_pinyin { + +#ifdef ___BUILD_MODEL___ + +const char SpellingTable:: + kNotSupportList[kNotSupportNum][kMaxSpellingSize + 1] = {"HM", "HNG", "NG"}; + +// "" is the biggest, so that all empty strings will be moved to the end +// _eb mean empty is biggest +int compare_raw_spl_eb(const void* p1, const void* p2) { + if ('\0' == (static_cast<const RawSpelling*>(p1))->str[0]) + return 1; + + if ('\0' == (static_cast<const RawSpelling*>(p2))->str[0]) + return -1; + + return strcmp((static_cast<const RawSpelling*>(p1))->str, + (static_cast<const RawSpelling*>(p2))->str); +} + +size_t get_odd_next(size_t value) { + size_t v_next = value; + while (true) { + size_t v_next_sqrt = (size_t)sqrt(v_next); + + bool is_odd = true; + for (size_t v_dv = 2; v_dv < v_next_sqrt + 1; v_dv++) { + if (v_next % v_dv == 0) { + is_odd = false; + break; + } + } + + if (is_odd) + return v_next; + + v_next++; + } + + // never reach here + return 0; +} + +SpellingTable::SpellingTable() { + need_score_ = false; + raw_spellings_ = NULL; + spelling_buf_ = NULL; + spelling_num_ = 0; + total_freq_ = 0; + frozen_ = true; +} + +SpellingTable::~SpellingTable() { + free_resource(); +} + +size_t SpellingTable::get_hash_pos(const char* spelling_str) { + size_t hash_pos = 0; + for (size_t pos = 0; pos < spelling_size_; pos++) { + if ('\0' == spelling_str[pos]) + break; + hash_pos += (size_t)spelling_str[pos]; + } + + hash_pos = hash_pos % spelling_max_num_; + return hash_pos; +} + +size_t SpellingTable::hash_pos_next(size_t hash_pos) { + hash_pos += 123; + hash_pos = hash_pos % spelling_max_num_; + return hash_pos; +} + +void SpellingTable::free_resource() { + if (NULL != raw_spellings_) + delete [] raw_spellings_; + raw_spellings_ = NULL; + + if (NULL != spelling_buf_) + delete [] spelling_buf_; + spelling_buf_ = NULL; +} + +bool SpellingTable::init_table(size_t pure_spl_size, size_t spl_max_num, + bool need_score) { + if (pure_spl_size == 0 || spl_max_num ==0) + return false; + + need_score_ = need_score; + + free_resource(); + + spelling_size_ = pure_spl_size + 1; + if (need_score) + spelling_size_ += 1; + spelling_max_num_ = get_odd_next(spl_max_num); + spelling_num_ = 0; + + raw_spellings_ = new RawSpelling[spelling_max_num_]; + spelling_buf_ = new char[spelling_max_num_ * (spelling_size_)]; + if (NULL == raw_spellings_ || NULL == spelling_buf_) { + free_resource(); + return false; + } + + memset(raw_spellings_, 0, spelling_max_num_ * sizeof(RawSpelling)); + memset(spelling_buf_, 0, spelling_max_num_ * (spelling_size_)); + frozen_ = false; + total_freq_ = 0; + return true; +} + +bool SpellingTable::put_spelling(const char* spelling_str, double freq) { + if (frozen_ || NULL == spelling_str) + return false; + + for (size_t pos = 0; pos < kNotSupportNum; pos++) { + if (strcmp(spelling_str, kNotSupportList[pos]) == 0) { + return false; + } + } + + total_freq_ += freq; + + size_t hash_pos = get_hash_pos(spelling_str); + + raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; + + if (strncmp(raw_spellings_[hash_pos].str, spelling_str, + spelling_size_ - 1) == 0) { + raw_spellings_[hash_pos].freq += freq; + return true; + } + + size_t hash_pos_ori = hash_pos; + + while (true) { + if (strncmp(raw_spellings_[hash_pos].str, + spelling_str, spelling_size_ - 1) == 0) { + raw_spellings_[hash_pos].freq += freq; + return true; + } + + if ('\0' == raw_spellings_[hash_pos].str[0]) { + raw_spellings_[hash_pos].freq += freq; + strncpy(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1); + raw_spellings_[hash_pos].str[spelling_size_ - 1] = '\0'; + spelling_num_++; + return true; + } + + hash_pos = hash_pos_next(hash_pos); + if (hash_pos_ori == hash_pos) + return false; + } + + // never reach here + return false; +} + +bool SpellingTable::contain(const char* spelling_str) { + if (NULL == spelling_str || NULL == spelling_buf_ || frozen_) + return false; + + size_t hash_pos = get_hash_pos(spelling_str); + + if ('\0' == raw_spellings_[hash_pos].str[0]) + return false; + + if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) + == 0) + return true; + + size_t hash_pos_ori = hash_pos; + + while (true) { + hash_pos = hash_pos_next(hash_pos); + if (hash_pos_ori == hash_pos) + return false; + + if ('\0' == raw_spellings_[hash_pos].str[0]) + return false; + + if (strncmp(raw_spellings_[hash_pos].str, spelling_str, spelling_size_ - 1) + == 0) + return true; + } + + // never reach here + return false; +} + +const char* SpellingTable::arrange(size_t *item_size, size_t *spl_num) { + if (NULL == raw_spellings_ || NULL == spelling_buf_ || + NULL == item_size || NULL == spl_num) + return NULL; + + qsort(raw_spellings_, spelling_max_num_, sizeof(RawSpelling), + compare_raw_spl_eb); + + // After sorting, only the first spelling_num_ items are valid. + // Copy them to the destination buffer. + for (size_t pos = 0; pos < spelling_num_; pos++) { + strncpy(spelling_buf_ + pos * spelling_size_, raw_spellings_[pos].str, + spelling_size_); + } + + if (need_score_) { + if (kPrintDebug0) + printf("------------Spelling Possiblities--------------\n"); + + double max_score = 0; + double min_score = 0; + + // After sorting, only the first spelling_num_ items are valid. + for (size_t pos = 0; pos < spelling_num_; pos++) { + raw_spellings_[pos].freq /= total_freq_; + if (need_score_) { + if (0 == pos) { + max_score = raw_spellings_[0].freq; + min_score = max_score; + } else { + if (raw_spellings_[pos].freq > max_score) + max_score = raw_spellings_[pos].freq; + if (raw_spellings_[pos].freq < min_score) + min_score = raw_spellings_[pos].freq; + } + } + } + + if (kPrintDebug0) + printf("-----max psb: %f, min psb: %f\n", max_score, min_score); + + max_score = log(max_score); + min_score = log(min_score); + + if (kPrintDebug0) + printf("-----max log value: %f, min log value: %f\n", + max_score, min_score); + + // The absolute value of min_score is bigger than that of max_score because + // both of them are negative after log function. + score_amplifier_ = 1.0 * 255 / min_score; + + double average_score = 0; + for (size_t pos = 0; pos < spelling_num_; pos++) { + double score = log(raw_spellings_[pos].freq) * score_amplifier_; + assert(score >= 0); + + average_score += score; + + // Because of calculation precision issue, score might be a little bigger + // than 255 after being amplified. + if (score > 255) + score = 255; + char *this_spl_buf = spelling_buf_ + pos * spelling_size_; + this_spl_buf[spelling_size_ - 1] = + static_cast<char>((unsigned char)score); + + if (kPrintDebug0) { + printf("---pos:%d, %s, psb:%d\n", pos, this_spl_buf, + (unsigned char)this_spl_buf[spelling_size_ -1]); + } + } + average_score /= spelling_num_; + assert(average_score <= 255); + average_score_ = static_cast<uint8>(average_score); + + if (kPrintDebug0) + printf("\n----Score Amplifier: %f, Average Score: %d\n", score_amplifier_, + average_score_); + } + + *item_size = spelling_size_; + *spl_num = spelling_num_; + frozen_ = true; + return spelling_buf_; +} + +float SpellingTable::get_score_amplifier() { + return static_cast<float>(score_amplifier_); +} + +unsigned char SpellingTable::get_average_score() { + return average_score_; +} + +#endif // ___BUILD_MODEL___ +} // namespace ime_pinyin |