diff options
Diffstat (limited to 'jni/share/ngram.cpp')
-rw-r--r-- | jni/share/ngram.cpp | 342 |
1 files changed, 342 insertions, 0 deletions
diff --git a/jni/share/ngram.cpp b/jni/share/ngram.cpp new file mode 100644 index 0000000..d95477a --- /dev/null +++ b/jni/share/ngram.cpp @@ -0,0 +1,342 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <assert.h> +#include <math.h> +#include <stdio.h> +#include <string.h> +#include <time.h> +#include "../include/mystdlib.h" +#include "../include/ngram.h" + +namespace ime_pinyin { + +#define ADD_COUNT 0.3 + +int comp_double(const void *p1, const void *p2) { + if (*static_cast<const double*>(p1) < *static_cast<const double*>(p2)) + return -1; + if (*static_cast<const double*>(p1) > *static_cast<const double*>(p2)) + return 1; + return 0; +} + +inline double distance(double freq, double code) { + // return fabs(freq - code); + return freq * fabs(log(freq) - log(code)); +} + +// Find the index of the code value which is nearest to the given freq +int qsearch_nearest(double code_book[], double freq, int start, int end) { + if (start == end) + return start; + + if (start + 1 == end) { + if (distance(freq, code_book[end]) > distance(freq, code_book[start])) + return start; + return end; + } + + int mid = (start + end) / 2; + + if (code_book[mid] > freq) + return qsearch_nearest(code_book, freq, start, mid); + else + return qsearch_nearest(code_book, freq, mid, end); +} + +size_t update_code_idx(double freqs[], size_t num, double code_book[], + CODEBOOK_TYPE *code_idx) { + size_t changed = 0; + for (size_t pos = 0; pos < num; pos++) { + CODEBOOK_TYPE idx; + idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1); + if (idx != code_idx[pos]) + changed++; + code_idx[pos] = idx; + } + return changed; +} + +double recalculate_kernel(double freqs[], size_t num, double code_book[], + CODEBOOK_TYPE *code_idx) { + double ret = 0; + + size_t *item_num = new size_t[kCodeBookSize]; + assert(item_num); + memset(item_num, 0, sizeof(size_t) * kCodeBookSize); + + double *cb_new = new double[kCodeBookSize]; + assert(cb_new); + memset(cb_new, 0, sizeof(double) * kCodeBookSize); + + for (size_t pos = 0; pos < num; pos++) { + ret += distance(freqs[pos], code_book[code_idx[pos]]); + + cb_new[code_idx[pos]] += freqs[pos]; + item_num[code_idx[pos]] += 1; + } + + for (size_t code = 0; code < kCodeBookSize; code++) { + assert(item_num[code] > 0); + code_book[code] = cb_new[code] / item_num[code]; + } + + delete [] item_num; + delete [] cb_new; + + return ret; +} + +void iterate_codes(double freqs[], size_t num, double code_book[], + CODEBOOK_TYPE *code_idx) { + size_t iter_num = 0; + double delta_last = 0; + do { + size_t changed = update_code_idx(freqs, num, code_book, code_idx); + + double delta = recalculate_kernel(freqs, num, code_book, code_idx); + + if (kPrintDebug0) { + printf("---Unigram codebook iteration: %d : %d, %.9f\n", + iter_num, changed, delta); + } + iter_num++; + + if (iter_num > 1 && + (delta == 0 || fabs(delta_last - delta)/fabs(delta) < 0.000000001)) + break; + delta_last = delta; + } while (true); +} + + +NGram* NGram::instance_ = NULL; + +NGram::NGram() { + initialized_ = false; + idx_num_ = 0; + lma_freq_idx_ = NULL; + sys_score_compensation_ = 0; + +#ifdef ___BUILD_MODEL___ + freq_codes_df_ = NULL; +#endif + freq_codes_ = NULL; +} + +NGram::~NGram() { + if (NULL != lma_freq_idx_) + free(lma_freq_idx_); + +#ifdef ___BUILD_MODEL___ + if (NULL != freq_codes_df_) + free(freq_codes_df_); +#endif + + if (NULL != freq_codes_) + free(freq_codes_); +} + +NGram& NGram::get_instance() { + if (NULL == instance_) + instance_ = new NGram(); + return *instance_; +} + +bool NGram::save_ngram(FILE *fp) { + if (!initialized_ || NULL == fp) + return false; + + if (0 == idx_num_ || NULL == freq_codes_ || NULL == lma_freq_idx_) + return false; + + if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1) + return false; + + if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != + kCodeBookSize) + return false; + + if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) + return false; + + return true; +} + +bool NGram::load_ngram(FILE *fp) { + if (NULL == fp) + return false; + + initialized_ = false; + + if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 ) + return false; + + if (NULL != lma_freq_idx_) + free(lma_freq_idx_); + + if (NULL != freq_codes_) + free(freq_codes_); + + lma_freq_idx_ = static_cast<CODEBOOK_TYPE*> + (malloc(idx_num_ * sizeof(CODEBOOK_TYPE))); + freq_codes_ = static_cast<LmaScoreType*> + (malloc(kCodeBookSize * sizeof(LmaScoreType))); + + if (NULL == lma_freq_idx_ || NULL == freq_codes_) + return false; + + if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) != + kCodeBookSize) + return false; + + if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) + return false; + + initialized_ = true; + + total_freq_none_sys_ = 0; + return true; +} + +void NGram::set_total_freq_none_sys(size_t freq_none_sys) { + total_freq_none_sys_ = freq_none_sys; + if (0 == total_freq_none_sys_) { + sys_score_compensation_ = 0; + } else { + double factor = static_cast<double>(total_freq_none_sys_) / ( + kSysDictTotalFreq + total_freq_none_sys_); + sys_score_compensation_ = static_cast<float>( + log(factor) * kLogValueAmplifier); + } +} + +// The caller makes sure this oject is initialized. +float NGram::get_uni_psb(LemmaIdType lma_id) { + return static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) + + sys_score_compensation_; +} + +float NGram::convert_psb_to_score(double psb) { + float score = static_cast<float>( + log(psb) * static_cast<double>(kLogValueAmplifier)); + if (score > static_cast<float>(kMaxScore)) { + score = static_cast<float>(kMaxScore); + } + return score; +} + +#ifdef ___BUILD_MODEL___ +bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num, + LemmaIdType next_idx_unused) { + if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) + return false; + + double total_freq = 0; + double *freqs = new double[next_idx_unused]; + if (NULL == freqs) + return false; + + freqs[0] = ADD_COUNT; + total_freq += freqs[0]; + LemmaIdType idx_now = 0; + for (size_t pos = 0; pos < lemma_num; pos++) { + if (lemma_arr[pos].idx_by_hz == idx_now) + continue; + idx_now++; + + assert(lemma_arr[pos].idx_by_hz == idx_now); + + freqs[idx_now] = lemma_arr[pos].freq; + if (freqs[idx_now] <= 0) + freqs[idx_now] = 0.3; + + total_freq += freqs[idx_now]; + } + + double max_freq = 0; + idx_num_ = idx_now + 1; + assert(idx_now + 1 == next_idx_unused); + + for (size_t pos = 0; pos < idx_num_; pos++) { + freqs[pos] = freqs[pos] / total_freq; + assert(freqs[pos] > 0); + if (freqs[pos] > max_freq) + max_freq = freqs[pos]; + } + + // calculate the code book + if (NULL == freq_codes_df_) + freq_codes_df_ = new double[kCodeBookSize]; + assert(freq_codes_df_); + memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize); + + if (NULL == freq_codes_) + freq_codes_ = new LmaScoreType[kCodeBookSize]; + assert(freq_codes_); + memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize); + + size_t freq_pos = 0; + for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { + bool found = true; + + while (found) { + found = false; + double cand = freqs[freq_pos]; + for (size_t i = 0; i < code_pos; i++) + if (freq_codes_df_[i] == cand) { + found = true; + break; + } + if (found) + freq_pos++; + } + + freq_codes_df_[code_pos] = freqs[freq_pos]; + freq_pos++; + } + + myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double); + + if (NULL == lma_freq_idx_) + lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_]; + assert(lma_freq_idx_); + + iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_); + + delete [] freqs; + + if (kPrintDebug0) { + printf("\n------Language Model Unigram Codebook------\n"); + } + + for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) { + double log_score = log(freq_codes_df_[code_pos]); + float final_score = convert_psb_to_score(freq_codes_df_[code_pos]); + if (kPrintDebug0) { + printf("code:%d, probability:%.9f, log score:%.3f, final score: %.3f\n", + code_pos, freq_codes_df_[code_pos], log_score, final_score); + } + freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score); + } + + initialized_ = true; + return true; +} +#endif + +} // namespace ime_pinyin |