diff options
Diffstat (limited to 'jni/share/dictlist.cpp')
-rw-r--r-- | jni/share/dictlist.cpp | 446 |
1 files changed, 446 insertions, 0 deletions
diff --git a/jni/share/dictlist.cpp b/jni/share/dictlist.cpp new file mode 100644 index 0000000..aa7905c --- /dev/null +++ b/jni/share/dictlist.cpp @@ -0,0 +1,446 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include "../include/dictlist.h" +#include "../include/mystdlib.h" +#include "../include/ngram.h" +#include "../include/searchutility.h" + +namespace ime_pinyin { + +DictList::DictList() { + initialized_ = false; + scis_num_ = 0; + scis_hz_ = NULL; + scis_splid_ = NULL; + buf_ = NULL; + spl_trie_ = SpellingTrie::get_cpinstance(); + + assert(kMaxLemmaSize == 8); + cmp_func_[0] = cmp_hanzis_1; + cmp_func_[1] = cmp_hanzis_2; + cmp_func_[2] = cmp_hanzis_3; + cmp_func_[3] = cmp_hanzis_4; + cmp_func_[4] = cmp_hanzis_5; + cmp_func_[5] = cmp_hanzis_6; + cmp_func_[6] = cmp_hanzis_7; + cmp_func_[7] = cmp_hanzis_8; +} + +DictList::~DictList() { + free_resource(); +} + +bool DictList::alloc_resource(size_t buf_size, size_t scis_num) { + // Allocate memory + buf_ = static_cast<char16*>(malloc(buf_size * sizeof(char16))); + if (NULL == buf_) + return false; + + scis_num_ = scis_num; + + scis_hz_ = static_cast<char16*>(malloc(scis_num_ * sizeof(char16))); + if (NULL == scis_hz_) + return false; + + scis_splid_ = static_cast<SpellingId*> + (malloc(scis_num_ * sizeof(SpellingId))); + + if (NULL == scis_splid_) + return false; + + return true; +} + +void DictList::free_resource() { + if (NULL != buf_) + free(buf_); + buf_ = NULL; + + if (NULL != scis_hz_) + free(scis_hz_); + scis_hz_ = NULL; + + if (NULL != scis_splid_) + free(scis_splid_); + scis_splid_ = NULL; +} + +#ifdef ___BUILD_MODEL___ +bool DictList::init_list(const SingleCharItem *scis, size_t scis_num, + const LemmaEntry *lemma_arr, size_t lemma_num) { + if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num) + return false; + + initialized_ = false; + + if (NULL != buf_) + free(buf_); + + // calculate the size + size_t buf_size = calculate_size(lemma_arr, lemma_num); + if (0 == buf_size) + return false; + + if (!alloc_resource(buf_size, scis_num)) + return false; + + fill_scis(scis, scis_num); + + // Copy the related content from the array to inner buffer + fill_list(lemma_arr, lemma_num); + + initialized_ = true; + return true; +} + +size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) { + size_t last_hz_len = 0; + size_t list_size = 0; + size_t id_num = 0; + + for (size_t i = 0; i < lemma_num; i++) { + if (0 == i) { + last_hz_len = lemma_arr[i].hz_str_len; + + assert(last_hz_len > 0); + assert(lemma_arr[0].idx_by_hz == 1); + + id_num++; + start_pos_[0] = 0; + start_id_[0] = id_num; + + last_hz_len = 1; + list_size += last_hz_len; + } else { + size_t current_hz_len = lemma_arr[i].hz_str_len; + + assert(current_hz_len >= last_hz_len); + + if (current_hz_len == last_hz_len) { + list_size += current_hz_len; + id_num++; + } else { + for (size_t len = last_hz_len; len < current_hz_len - 1; len++) { + start_pos_[len] = start_pos_[len - 1]; + start_id_[len] = start_id_[len - 1]; + } + + start_pos_[current_hz_len - 1] = list_size; + + id_num++; + start_id_[current_hz_len - 1] = id_num; + + last_hz_len = current_hz_len; + list_size += current_hz_len; + } + } + } + + for (size_t i = last_hz_len; i <= kMaxLemmaSize; i++) { + if (0 == i) { + start_pos_[0] = 0; + start_id_[0] = 1; + } else { + start_pos_[i] = list_size; + start_id_[i] = id_num; + } + } + + return start_pos_[kMaxLemmaSize]; +} + +void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) { + assert(scis_num_ == scis_num); + + for (size_t pos = 0; pos < scis_num_; pos++) { + scis_hz_[pos] = scis[pos].hz; + scis_splid_[pos] = scis[pos].splid; + } +} + +void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) { + size_t current_pos = 0; + + utf16_strncpy(buf_, lemma_arr[0].hanzi_str, + lemma_arr[0].hz_str_len); + + current_pos = lemma_arr[0].hz_str_len; + + size_t id_num = 1; + + for (size_t i = 1; i < lemma_num; i++) { + utf16_strncpy(buf_ + current_pos, lemma_arr[i].hanzi_str, + lemma_arr[i].hz_str_len); + + id_num++; + current_pos += lemma_arr[i].hz_str_len; + } + + assert(current_pos == start_pos_[kMaxLemmaSize]); + assert(id_num == start_id_[kMaxLemmaSize]); +} + +char16* DictList::find_pos2_startedbyhz(char16 hz_char) { + char16 *found_2w = static_cast<char16*> + (mybsearch(&hz_char, buf_ + start_pos_[1], + (start_pos_[2] - start_pos_[1]) / 2, + sizeof(char16) * 2, cmp_hanzis_1)); + if (NULL == found_2w) + return NULL; + + while (found_2w > buf_ + start_pos_[1] && *found_2w == *(found_2w - 1)) + found_2w -= 2; + + return found_2w; +} +#endif // ___BUILD_MODEL___ + +char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[], + size_t word_len, int (*cmp_func)(const void *, const void *)) { + char16 *found_w = static_cast<char16*> + (mybsearch(last_hzs, buf_ + start_pos_[word_len - 1], + (start_pos_[word_len] - start_pos_[word_len - 1]) + / word_len, + sizeof(char16) * word_len, cmp_func)); + + if (NULL == found_w) + return NULL; + + while (found_w > buf_ + start_pos_[word_len -1] && + cmp_func(found_w, found_w - word_len) == 0) + found_w -= word_len; + + return found_w; +} + +size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len, + NPredictItem *npre_items, size_t npre_max, + size_t b4_used) { + assert(hzs_len <= kMaxPredictSize && hzs_len > 0); + + // 1. Prepare work + int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1]; + + NGram& ngram = NGram::get_instance(); + + size_t item_num = 0; + + // 2. Do prediction + for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len; + pre_len++) { + uint16 word_len = hzs_len + pre_len; + char16 *w_buf = find_pos_startedbyhzs(last_hzs, word_len, cmp_func); + if (NULL == w_buf) + continue; + while (w_buf < buf_ + start_pos_[word_len] && + cmp_func(w_buf, last_hzs) == 0 && + item_num < npre_max) { + memset(npre_items + item_num, 0, sizeof(NPredictItem)); + utf16_strncpy(npre_items[item_num].pre_hzs, w_buf + hzs_len, pre_len); + npre_items[item_num].psb = + ngram.get_uni_psb((size_t)(w_buf - buf_ - start_pos_[word_len - 1]) + / word_len + start_id_[word_len - 1]); + npre_items[item_num].his_len = hzs_len; + item_num++; + w_buf += word_len; + } + } + + size_t new_num = 0; + for (size_t i = 0; i < item_num; i++) { + // Try to find it in the existing items + size_t e_pos; + for (e_pos = 1; e_pos <= b4_used; e_pos++) { + if (utf16_strncmp((*(npre_items - e_pos)).pre_hzs, npre_items[i].pre_hzs, + kMaxPredictSize) == 0) + break; + } + if (e_pos <= b4_used) + continue; + + // If not found, append it to the buffer + npre_items[new_num] = npre_items[i]; + new_num++; + } + + return new_num; +} + +uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf, + uint16 str_max) { + if (!initialized_ || id_lemma >= start_id_[kMaxLemmaSize] || NULL == str_buf + || str_max <= 1) + return 0; + + // Find the range + for (uint16 i = 0; i < kMaxLemmaSize; i++) { + if (i + 1 > str_max - 1) + return 0; + if (start_id_[i] <= id_lemma && start_id_[i + 1] > id_lemma) { + size_t id_span = id_lemma - start_id_[i]; + + uint16 *buf = buf_ + start_pos_[i] + id_span * (i + 1); + for (uint16 len = 0; len <= i; len++) { + str_buf[len] = buf[len]; + } + str_buf[i+1] = (char16)'\0'; + return i + 1; + } + } + return 0; +} + +uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid, + uint16 *splids, uint16 max_splids) { + char16 *hz_found = static_cast<char16*> + (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1)); + assert(NULL != hz_found && hanzi == *hz_found); + + // Move to the first one. + while (hz_found > scis_hz_ && hanzi == *(hz_found - 1)) + hz_found--; + + // First try to found if strict comparison result is not zero. + char16 *hz_f = hz_found; + bool strict = false; + while (hz_f < scis_hz_ + scis_num_ && hanzi == *hz_f) { + uint16 pos = hz_f - scis_hz_; + if (0 == half_splid || scis_splid_[pos].half_splid == half_splid) { + strict = true; + } + hz_f++; + } + + uint16 found_num = 0; + while (hz_found < scis_hz_ + scis_num_ && hanzi == *hz_found) { + uint16 pos = hz_found - scis_hz_; + if (0 == half_splid || + (strict && scis_splid_[pos].half_splid == half_splid) || + (!strict && spl_trie_->half_full_compatible(half_splid, + scis_splid_[pos].full_splid))) { + assert(found_num + 1 < max_splids); + splids[found_num] = scis_splid_[pos].full_splid; + found_num++; + } + hz_found++; + } + + return found_num; +} + +LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) { + if (NULL == str || str_len > kMaxLemmaSize) + return 0; + + char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]); + if (NULL == found) + return 0; + + assert(found > buf_); + assert(static_cast<size_t>(found - buf_) >= start_pos_[str_len - 1]); + return static_cast<LemmaIdType> + (start_id_[str_len - 1] + + (found - buf_ - start_pos_[str_len - 1]) / str_len); +} + +void DictList::convert_to_hanzis(char16 *str, uint16 str_len) { + assert(NULL != str); + + for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { + str[str_pos] = scis_hz_[str[str_pos]]; + } +} + +void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) { + assert(NULL != str); + + for (uint16 str_pos = 0; str_pos < str_len; str_pos++) { + str[str_pos] = 0x100; + } +} + +bool DictList::save_list(FILE *fp) { + if (!initialized_ || NULL == fp) + return false; + + if (NULL == buf_ || 0 == start_pos_[kMaxLemmaSize] || + NULL == scis_hz_ || NULL == scis_splid_ || 0 == scis_num_) + return false; + + if (fwrite(&scis_num_, sizeof(size_t), 1, fp) != 1) + return false; + + if (fwrite(start_pos_, sizeof(size_t), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + if (fwrite(start_id_, sizeof(size_t), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) + return false; + + if (fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) + return false; + + if (fwrite(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != + start_pos_[kMaxLemmaSize]) + return false; + + return true; +} + +bool DictList::load_list(FILE *fp) { + if (NULL == fp) + return false; + + initialized_ = false; + + if (fread(&scis_num_, sizeof(size_t), 1, fp) != 1) + return false; + + if (fread(start_pos_, sizeof(size_t), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + if (fread(start_id_, sizeof(size_t), kMaxLemmaSize + 1, fp) != + kMaxLemmaSize + 1) + return false; + + free_resource(); + + if (!alloc_resource(start_pos_[kMaxLemmaSize], scis_num_)) + return false; + + if (fread(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_) + return false; + + if (fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_) + return false; + + if (fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) != + start_pos_[kMaxLemmaSize]) + return false; + + initialized_ = true; + return true; +} +} // namespace ime_pinyin |