1 files changed, 446 insertions, 0 deletions
diff --git a/jni/share/dictlist.cpp b/jni/share/dictlist.cpp
new file mode 100644
index 0000000..aa7905c
--- /dev/null
+++ b/jni/share/dictlist.cpp
@@ -0,0 +1,446 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../include/dictlist.h"
+#include "../include/mystdlib.h"
+#include "../include/ngram.h"
+#include "../include/searchutility.h"
+
+namespace ime_pinyin {
+
+// Creates an empty list in the "not initialized" state: no buffer is owned
+// yet, and cmp_func_[n - 1] is set to the cmp_hanzis_<n> comparator.
+DictList::DictList() {
+  buf_ = NULL;
+  scis_splid_ = NULL;
+  scis_hz_ = NULL;
+  scis_num_ = 0;
+  initialized_ = false;
+  spl_trie_ = SpellingTrie::get_cpinstance();
+
+  // The table below provides exactly one comparator per lemma length.
+  assert(kMaxLemmaSize == 8);
+  int (* const by_len[])(const void *, const void *) = {
+    cmp_hanzis_1, cmp_hanzis_2, cmp_hanzis_3, cmp_hanzis_4,
+    cmp_hanzis_5, cmp_hanzis_6, cmp_hanzis_7, cmp_hanzis_8 };
+  for (size_t idx = 0; idx < sizeof(by_len) / sizeof(by_len[0]); idx++)
+    cmp_func_[idx] = by_len[idx];
+}
+
+// Frees whatever buffers are still held; free_resource() copes with members
+// that were never allocated.
+DictList::~DictList() { free_resource(); }
+
+// Allocates buf_ (buf_size char16s) and scis_hz_/scis_splid_ (scis_num items).
+// Returns false, with all three NULL and scis_num_ == 0, if a byte count would
+// wrap or malloc fails. Frees old scis arrays, but not buf_ (callers do that).
+bool DictList::alloc_resource(size_t buf_size, size_t scis_num) {
+  const size_t kMaxBytes = static_cast<size_t>(-1);  // sizes may be file data
+  const bool fits = buf_size <= kMaxBytes / sizeof(char16) &&
+      scis_num <= kMaxBytes / (sizeof(char16) + sizeof(SpellingId));
+  free(scis_hz_);
+  free(scis_splid_);
+  buf_ = static_cast<char16*>(fits ? malloc(buf_size * sizeof(char16)) : NULL);
+  scis_hz_ = static_cast<char16*>(
+      fits ? malloc(scis_num * sizeof(char16)) : NULL);
+  scis_splid_ = static_cast<SpellingId*>(
+      fits ? malloc(scis_num * sizeof(SpellingId)) : NULL);
+  const bool ok = NULL != buf_ && NULL != scis_hz_ && NULL != scis_splid_;
+  if (!ok)
+    free_resource();
+  scis_num_ = ok ? scis_num : 0;
+  return ok;
+}
+
+// Releases the lemma buffer and both single-char arrays, and resets the
+// pointers to NULL so that a repeated call is harmless. scis_num_ and
+// initialized_ are not touched.
+void DictList::free_resource() {
+  // free() accepts NULL, so the pointers need no individual checks.
+  free(buf_);
+  free(scis_hz_);
+  free(scis_splid_);
+
+  buf_ = NULL;
+  scis_hz_ = NULL;
+  scis_splid_ = NULL;
+}
+
+#ifdef ___BUILD_MODEL___
+// Builds the list from scratch out of the single-char items and the lemma
+// array. Returns false, leaving the list uninitialized, when an argument is
+// NULL or zero, the computed buffer size is 0, or allocation fails.
+bool DictList::init_list(const SingleCharItem *scis, size_t scis_num,
+                         const LemmaEntry *lemma_arr, size_t lemma_num) {
+  if (NULL == scis || 0 == scis_num || NULL == lemma_arr || 0 == lemma_num)
+    return false;
+
+  initialized_ = false;
+
+  // free_resource() also resets the pointers, so an early return below never
+  // leaves buf_ dangling, and old single-char arrays are not leaked.
+  free_resource();
+
+  // calculate_size() fills start_pos_ and start_id_ as a side effect.
+  size_t buf_size = calculate_size(lemma_arr, lemma_num);
+  if (0 == buf_size || !alloc_resource(buf_size, scis_num))
+    return false;
+
+  // Copy the single-char items, then the lemma strings, into own buffers.
+  fill_scis(scis, scis_num);
+  fill_list(lemma_arr, lemma_num);
+
+  initialized_ = true;
+  return true;
+}
+
+// Fills start_pos_ / start_id_ from the lemma array, which must be ordered by
+// non-decreasing hanzi length: entry [len - 1] gets the buffer offset and id
+// of the first lemma of that length. Returns start_pos_[kMaxLemmaSize], the
+// buffer size (in char16 units) that init_list() hands to alloc_resource().
+size_t DictList::calculate_size(const LemmaEntry* lemma_arr, size_t lemma_num) {
+  size_t prev_len = 0;   // hanzi length of the group being counted
+  size_t total_len = 0;  // char16 units counted so far
+  size_t cur_id = 0;     // id of the lemma counted most recently
+
+  if (lemma_num > 0) {
+    prev_len = lemma_arr[0].hz_str_len;
+    assert(prev_len > 0);
+    assert(lemma_arr[0].idx_by_hz == 1);
+
+    // The first lemma is always counted as a single hanzi.
+    cur_id = 1;
+    start_pos_[0] = 0;
+    start_id_[0] = cur_id;
+    prev_len = 1;
+    total_len = prev_len;
+  }
+
+  for (size_t idx = 1; idx < lemma_num; idx++) {
+    const size_t this_len = lemma_arr[idx].hz_str_len;
+    assert(this_len >= prev_len);
+
+    cur_id++;
+    if (this_len != prev_len) {
+      // Lengths skipped between the two groups repeat the preceding entry.
+      for (size_t len = prev_len; len < this_len - 1; len++) {
+        start_pos_[len] = start_pos_[len - 1];
+        start_id_[len] = start_id_[len - 1];
+      }
+
+      // A new length group begins with this lemma.
+      start_pos_[this_len - 1] = total_len;
+      start_id_[this_len - 1] = cur_id;
+      prev_len = this_len;
+    }
+    total_len += this_len;
+  }
+
+  // Close the table behind the last group (or build an empty one).
+  for (size_t slot = prev_len; slot <= kMaxLemmaSize; slot++) {
+    if (0 == slot) {
+      start_pos_[0] = 0;
+      start_id_[0] = 1;
+    } else {
+      start_pos_[slot] = total_len;
+      start_id_[slot] = cur_id;
+    }
+  }
+
+  return start_pos_[kMaxLemmaSize];
+}
+
+// Splits the single-char items into the parallel scis_hz_/scis_splid_ arrays.
+void DictList::fill_scis(const SingleCharItem *scis, size_t scis_num) {
+  assert(scis_num_ == scis_num);
+  for (size_t idx = 0; idx != scis_num_; ++idx, ++scis) {
+    scis_splid_[idx] = scis->splid;
+    scis_hz_[idx] = scis->hz;
+  }
+}
+
+// Copies the hanzi strings of all lemmas back to back into buf_, in array
+// order, then checks the totals against the tables that calculate_size()
+// produced. Entry 0 is copied unconditionally, so the caller must pass
+// lemma_num >= 1.
+void DictList::fill_list(const LemmaEntry* lemma_arr, size_t lemma_num) {
+  size_t write_pos = 0;  // next free char16 slot in buf_
+  size_t copied = 0;     // lemmas copied so far
+
+  // do/while: the first lemma is handled before lemma_num is consulted.
+  do {
+    const LemmaEntry &lemma = lemma_arr[copied];
+    utf16_strncpy(buf_ + write_pos, lemma.hanzi_str, lemma.hz_str_len);
+    write_pos += lemma.hz_str_len;
+    copied++;
+  } while (copied < lemma_num);
+
+  // The last slot of each table must agree with the running totals: the
+  // number of char16 units written and the number of lemmas copied.
+  assert(write_pos == start_pos_[kMaxLemmaSize]);
+  assert(copied == start_id_[kMaxLemmaSize]);
+}
+
+// Returns the first 2-hanzi lemma in buf_ that starts with hz_char, or NULL.
+char16* DictList::find_pos2_startedbyhz(char16 hz_char) {
+  char16 *found_2w = static_cast<char16*>
+                     (mybsearch(&hz_char, buf_ + start_pos_[1],
+                                (start_pos_[2] - start_pos_[1]) / 2,
+                                sizeof(char16) * 2, cmp_hanzis_1));
+  // mybsearch (assumed bsearch-like) may hit any match; step back to the
+  // first. Entries are 2 char16 wide: the previous leading hanzi is at [-2].
+  while (NULL != found_2w && found_2w > buf_ + start_pos_[1] &&
+         *found_2w == *(found_2w - 2))
+    found_2w -= 2;
+  return found_2w;
+}
+#endif  // ___BUILD_MODEL___
+
+// Searches the group of word_len-hanzi lemmas for entries matching last_hzs
+// under cmp_func and returns the first of them, or NULL if none matches.
+char16* DictList::find_pos_startedbyhzs(const char16 last_hzs[],
+    size_t word_len, int (*cmp_func)(const void *, const void *)) {
+  char16 *const group = buf_ + start_pos_[word_len - 1];
+  const size_t group_units = start_pos_[word_len] - start_pos_[word_len - 1];
+
+  char16 *hit = static_cast<char16*>(
+      mybsearch(last_hzs, group, group_units / word_len,
+                sizeof(char16) * word_len, cmp_func));
+
+  // Walk back while the preceding entry still compares equal.
+  while (NULL != hit && hit > group &&
+         cmp_func(hit, hit - word_len) == 0)
+    hit -= word_len;
+  return hit;
+}
+
+// Collects the dictionary lemmas that begin with the hzs_len hanzis of
+// last_hzs and stores what follows that prefix as candidates in npre_items
+// (at most npre_max). Candidates whose pre_hzs equals one of the b4_used
+// items located directly before npre_items are then dropped. Returns the
+// number of candidates kept at the front of npre_items.
+size_t DictList::predict(const char16 last_hzs[], uint16 hzs_len,
+                         NPredictItem *npre_items, size_t npre_max,
+                         size_t b4_used) {
+  assert(hzs_len <= kMaxPredictSize && hzs_len > 0);
+
+  int (*cmp_func)(const void *, const void *) = cmp_func_[hzs_len - 1];
+  NGram& ngram = NGram::get_instance();
+
+  // Pass 1: gather candidates, shortest completed word first.
+  size_t gathered = 0;
+  for (uint16 pre_len = 1; pre_len <= kMaxPredictSize + 1 - hzs_len;
+       pre_len++) {
+    const uint16 word_len = hzs_len + pre_len;
+    char16 *word = find_pos_startedbyhzs(last_hzs, word_len, cmp_func);
+    if (NULL == word)
+      continue;
+    char16 *const group_end = buf_ + start_pos_[word_len];
+    for (; word < group_end && cmp_func(word, last_hzs) == 0 &&
+           gathered < npre_max; word += word_len) {
+      NPredictItem *slot = npre_items + gathered;
+      memset(slot, 0, sizeof(NPredictItem));
+      utf16_strncpy(slot->pre_hzs, word + hzs_len, pre_len);
+      // Lemma id = first id of the group + index of the word within it.
+      slot->psb =
+        ngram.get_uni_psb((size_t)(word - buf_ - start_pos_[word_len - 1])
+                          / word_len + start_id_[word_len - 1]);
+      slot->his_len = hzs_len;
+      gathered++;
+    }
+  }
+
+  // Pass 2: compact the array, skipping entries already offered earlier.
+  size_t kept = 0;
+  for (size_t cand = 0; cand < gathered; cand++) {
+    bool duplicate = false;
+    for (size_t back = 1; back <= b4_used && !duplicate; back++) {
+      duplicate = utf16_strncmp((npre_items - back)->pre_hzs,
+                                npre_items[cand].pre_hzs,
+                                kMaxPredictSize) == 0;
+    }
+    if (duplicate)
+      continue;
+    npre_items[kept] = npre_items[cand];
+    kept++;
+  }
+  return kept;
+}
+
+// Copies the hanzis of lemma id_lemma into str_buf, appends a '\0' and
+// returns the hanzi count. Returns 0 if the list is uninitialized, the id
+// is not in the table, str_buf is NULL, or str_max is too small.
+uint16 DictList::get_lemma_str(LemmaIdType id_lemma, char16 *str_buf,
+                               uint16 str_max) {
+  if (!initialized_ || NULL == str_buf || str_max <= 1 ||
+      id_lemma >= start_id_[kMaxLemmaSize])
+    return 0;
+  // Slot hz_len - 1 of the tables describes the lemmas hz_len hanzis long.
+  for (uint16 hz_len = 1; hz_len <= kMaxLemmaSize; hz_len++) {
+    if (hz_len > str_max - 1)
+      return 0;
+    if (id_lemma < start_id_[hz_len - 1] || id_lemma >= start_id_[hz_len])
+      continue;
+    const size_t id_span = id_lemma - start_id_[hz_len - 1];
+    const char16 *src = buf_ + start_pos_[hz_len - 1] + id_span * hz_len;
+    for (uint16 pos = 0; pos < hz_len; pos++)
+      str_buf[pos] = src[pos];
+    str_buf[hz_len] = (char16)'\0';
+    return hz_len;
+  }
+  return 0;
+}
+
+// Stores in splids the full spelling ids known for hanzi and returns their
+// count (0 if hanzi is unknown; never more than max_splids). A non-zero
+// half_splid keeps only spellings with that half id or, if there are none,
+// the spellings half_full_compatible() accepts for it.
+uint16 DictList::get_splids_for_hanzi(char16 hanzi, uint16 half_splid,
+                                      uint16 *splids, uint16 max_splids) {
+  char16 *hz_found = static_cast<char16*>
+      (mybsearch(&hanzi, scis_hz_, scis_num_, sizeof(char16), cmp_hanzis_1));
+  assert(NULL != hz_found && hanzi == *hz_found);
+  if (NULL == hz_found || hanzi != *hz_found)
+    return 0;  // asserts vanish under NDEBUG: do not dereference a miss
+  // Move to the first one.
+  while (hz_found > scis_hz_ && hanzi == *(hz_found - 1))
+    hz_found--;
+
+  // Strict mode: some spelling of this hanzi has exactly the given half id.
+  char16 *const hz_end = scis_hz_ + scis_num_;
+  bool strict = (0 == half_splid);
+  for (char16 *hz_f = hz_found; !strict && hz_f < hz_end && hanzi == *hz_f;
+       hz_f++)
+    strict = (scis_splid_[hz_f - scis_hz_].half_splid == half_splid);
+  uint16 found_num = 0;
+  for (; hz_found < hz_end && hanzi == *hz_found; hz_found++) {
+    // size_t: a uint16 index would wrap once the table exceeds 65535 items.
+    const size_t pos = hz_found - scis_hz_;
+    if (0 == half_splid ||
+        (strict && scis_splid_[pos].half_splid == half_splid) ||
+        (!strict && spl_trie_->half_full_compatible(half_splid,
+        scis_splid_[pos].full_splid))) {
+      assert(found_num + 1 < max_splids);
+      if (found_num >= max_splids)
+        break;  // release builds: stop instead of overrunning splids
+      splids[found_num++] = scis_splid_[pos].full_splid;
+    }
+  }
+  return found_num;
+}
+
+// Returns the id of the lemma equal to the str_len hanzis in str, or 0 when
+// str is NULL, str_len is 0 or above kMaxLemmaSize, or nothing matches.
+LemmaIdType DictList::get_lemma_id(const char16 *str, uint16 str_len) {
+  // A zero str_len would index cmp_func_ and start_pos_ at [-1].
+  if (NULL == str || 0 == str_len || str_len > kMaxLemmaSize)
+    return 0;
+  char16 *found = find_pos_startedbyhzs(str, str_len, cmp_func_[str_len - 1]);
+  if (NULL == found)
+    return 0;
+  // found == buf_ is legitimate: the 1-hanzi group starts at offset 0.
+  assert(static_cast<size_t>(found - buf_) >= start_pos_[str_len - 1]);
+  return static_cast<LemmaIdType>(start_id_[str_len - 1] +
+      (found - buf_ - start_pos_[str_len - 1]) / str_len);
+}
+
+// Replaces, in place, each element of str by scis_hz_[element].
+void DictList::convert_to_hanzis(char16 *str, uint16 str_len) {
+  assert(NULL != str);
+
+  for (char16 *cur = str; cur != str + str_len; ++cur)
+    *cur = scis_hz_[*cur];
+}
+
+// Overwrites each of the str_len elements of str with the constant 0x100.
+// NOTE(review): no lookup of real single-char ids happens here -- confirm.
+void DictList::convert_to_scis_ids(char16 *str, uint16 str_len) {
+  assert(NULL != str);
+  for (char16 *cur = str; cur != str + str_len; ++cur)
+    *cur = 0x100;
+}
+
+// Writes scis_num_, the start_pos_ and start_id_ tables (all as raw size_t
+// values), scis_hz_, scis_splid_ and the lemma buffer to fp, in that order.
+// Returns false if the list is uninitialized or holds no data, fp is NULL,
+// or a write comes up short.
+bool DictList::save_list(FILE *fp) {
+  if (!initialized_ || NULL == fp)
+    return false;
+
+  // Refuse to write a list without content.
+  const bool has_data = NULL != buf_ && 0 != start_pos_[kMaxLemmaSize] &&
+      NULL != scis_hz_ && NULL != scis_splid_ && 0 != scis_num_;
+  if (!has_data)
+    return false;
+
+  const size_t table_len = kMaxLemmaSize + 1;       // entries per table
+  const size_t buf_len = start_pos_[kMaxLemmaSize];  // char16s in buf_
+
+  // Header: item count followed by the two lookup tables.
+  if (fwrite(&scis_num_, sizeof(size_t), 1, fp) != 1 ||
+      fwrite(start_pos_, sizeof(size_t), table_len, fp) != table_len ||
+      fwrite(start_id_, sizeof(size_t), table_len, fp) != table_len)
+    return false;
+
+  // Payload. || short-circuits, so nothing is written after a failure.
+  if (fwrite(scis_hz_, sizeof(char16), scis_num_, fp) != scis_num_ ||
+      fwrite(scis_splid_, sizeof(SpellingId), scis_num_, fp) != scis_num_ ||
+      fwrite(buf_, sizeof(char16), buf_len, fp) != buf_len)
+    return false;
+
+  return true;
+}
+
+// Replaces the contents of the list with data written by save_list().
+// Returns false, leaving the list empty and uninitialized, if fp is NULL,
+// the data is truncated or describes an empty list, or allocation fails.
+bool DictList::load_list(FILE *fp) {
+  if (NULL == fp)
+    return false;
+
+  // Release the old contents before any member is overwritten, so a failed
+  // load can never pair stale buffers with freshly read sizes.
+  initialized_ = false;
+  free_resource();
+
+  const size_t table_len = kMaxLemmaSize + 1;
+  bool ok = fread(&scis_num_, sizeof(size_t), 1, fp) == 1 &&
+      fread(start_pos_, sizeof(size_t), table_len, fp) == table_len &&
+      fread(start_id_, sizeof(size_t), table_len, fp) == table_len;
+
+  // save_list() refuses to write an empty list, so zero sizes mean the
+  // header is corrupt.
+  ok = ok && 0 != scis_num_ && 0 != start_pos_[kMaxLemmaSize] &&
+      alloc_resource(start_pos_[kMaxLemmaSize], scis_num_);
+
+  ok = ok &&
+      fread(scis_hz_, sizeof(char16), scis_num_, fp) == scis_num_ &&
+      fread(scis_splid_, sizeof(SpellingId), scis_num_, fp) == scis_num_ &&
+      fread(buf_, sizeof(char16), start_pos_[kMaxLemmaSize], fp) ==
+          start_pos_[kMaxLemmaSize];
+
+  if (!ok) {
+    free_resource();
+    scis_num_ = 0;
+  }
+  initialized_ = ok;
+  return ok;
+}
+}  // namespace ime_pinyin