summary | refs | log | tree | commit | diff
path: root/jni/share/splparser.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'jni/share/splparser.cpp')
-rw-r--r-- jni/share/splparser.cpp 341
1 files changed, 341 insertions, 0 deletions
diff --git a/jni/share/splparser.cpp b/jni/share/splparser.cpp
new file mode 100644
index 0000000..d75aec6
--- /dev/null
+++ b/jni/share/splparser.cpp
@@ -0,0 +1,341 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <assert.h>
+#include "../include/splparser.h"
+
+namespace ime_pinyin {
+
+// Binds the parser to the trie instance handed out by
+// SpellingTrie::get_cpinstance().
+SpellingParser::SpellingParser()
+    : spl_trie_(SpellingTrie::get_cpinstance()) {
+}
+
+// Reports whether ch is accepted as a spelling character. The decision is
+// delegated entirely to SpellingTrie::is_valid_spl_char().
+bool SpellingParser::is_valid_to_parse(char ch) {
+  const bool acceptable = SpellingTrie::is_valid_spl_char(ch);
+  return acceptable;
+}
+
+// Splits a spelling string into spelling ids by walking the spelling trie.
+//
+//   splstr       Characters to parse; only the first str_len are read.
+//   str_len      Number of characters in splstr.
+//   spl_idx      Output: receives one id per recognized spelling.
+//   start_pos    Optional output (may be NULL). start_pos[0] is set to 0;
+//                each time a spelling is completed, the following entry
+//                receives the offset where the next one starts (or where
+//                parsing stopped). It therefore needs room for
+//                max_size + 1 entries.
+//   max_size     Maximum number of ids written to spl_idx.
+//   last_is_pre  Output: true only when the whole string was consumed and
+//                the last character handled was not a splitter; false on
+//                every other path, including rejected arguments.
+//
+// Returns the number of ids written to spl_idx. Parsing stops early, and the
+// ids found so far are returned, once max_size ids have been produced or the
+// input cannot be continued from the current trie node. Returns 0 when
+// splstr or spl_idx is NULL, when str_len or max_size is 0, or when the
+// first character is not a valid spelling character.
+uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len,
+                                      uint16 spl_idx[], uint16 start_pos[],
+                                      uint16 max_size, bool &last_is_pre) {
+  // Give the output flag a defined value on every path, including the
+  // argument-rejection returns just below.
+  last_is_pre = false;
+
+  // spl_idx is written unconditionally further down, so it is validated
+  // together with the other mandatory arguments.
+  if (NULL == splstr || NULL == spl_idx || 0 == max_size || 0 == str_len)
+    return 0;
+
+  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
+    return 0;
+
+  const SpellingNode *node_this = spl_trie_->root_;
+
+  uint16 str_pos = 0;
+  uint16 idx_num = 0;
+  if (NULL != start_pos)
+    start_pos[0] = 0;
+  bool last_is_splitter = false;
+
+  while (str_pos < str_len) {
+    char char_this = splstr[str_pos];
+    // Every character rejected by is_valid_spl_char() acts as a splitter.
+    if (!SpellingTrie::is_valid_spl_char(char_this)) {
+      // A splitter closes the current spelling when if_valid_id_update()
+      // accepts the id stored in the current node.
+      uint16 id_this = node_this->spelling_idx;
+      if (spl_trie_->if_valid_id_update(&id_this)) {
+        spl_idx[idx_num] = id_this;
+
+        idx_num++;
+        str_pos++;
+        if (NULL != start_pos)
+          start_pos[idx_num] = str_pos;
+        if (idx_num >= max_size)
+          return idx_num;
+
+        node_this = spl_trie_->root_;
+        last_is_splitter = true;
+        continue;
+      } else {
+        if (last_is_splitter) {
+          // Run of consecutive splitters: skip this one and move the start
+          // of the next spelling past it.
+          str_pos++;
+          if (NULL != start_pos)
+            start_pos[idx_num] = str_pos;
+          continue;
+        } else {
+          // Splitter after an incomplete spelling: give up here.
+          return idx_num;
+        }
+      }
+    }
+
+    last_is_splitter = false;
+
+    SpellingNode *found_son = NULL;
+
+    if (0 == str_pos) {
+      // The very first character indexes the level-1 table directly;
+      // characters below 'a' are offset from 'A' instead.
+      if (char_this >= 'a')
+        found_son = spl_trie_->level1_sons_[char_this - 'a'];
+      else
+        found_son = spl_trie_->level1_sons_[char_this - 'A'];
+    } else {
+      SpellingNode *first_son = node_this->first_son;
+      // Scan the sons front to back and take the first one whose character
+      // matches according to is_same_spl_char().
+      for (int i = 0; i < node_this->num_of_son; i++) {
+        SpellingNode *this_son = first_son + i;
+        if (SpellingTrie::is_same_spl_char(
+            this_son->char_this_node, char_this)) {
+          found_son = this_son;
+          break;
+        }
+      }
+    }
+
+    if (NULL != found_son) {
+      // Found: descend to the matching son.
+      node_this = found_son;
+    } else {
+      // Not found: the spelling must end at the current node for parsing
+      // to go on.
+      uint16 id_this = node_this->spelling_idx;
+      if (spl_trie_->if_valid_id_update(&id_this)) {
+        // Remember the id and retry the same character from the root;
+        // str_pos is deliberately not advanced.
+        spl_idx[idx_num] = id_this;
+
+        idx_num++;
+        if (NULL != start_pos)
+          start_pos[idx_num] = str_pos;
+        if (idx_num >= max_size)
+          return idx_num;
+        node_this = spl_trie_->root_;
+        continue;
+      } else {
+        return idx_num;
+      }
+    }
+
+    str_pos++;
+  }
+
+  // End of input: record the spelling that is still open, if its id is
+  // accepted.
+  uint16 id_this = node_this->spelling_idx;
+  if (spl_trie_->if_valid_id_update(&id_this)) {
+    spl_idx[idx_num] = id_this;
+
+    idx_num++;
+    if (NULL != start_pos)
+      start_pos[idx_num] = str_pos;
+  }
+
+  last_is_pre = !last_is_splitter;
+
+  return idx_num;
+}
+
+// Same as splstr_to_idxs(), except that every returned id for which
+// is_half_id_yunmu() holds is rewritten in place through half_to_full().
+// When the final id is rewritten this way, last_is_pre is cleared.
+// Returns the number of ids reported by splstr_to_idxs().
+uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len,
+                                        uint16 spl_idx[], uint16 start_pos[],
+                                        uint16 max_size, bool &last_is_pre) {
+  const uint16 parsed = splstr_to_idxs(splstr, str_len, spl_idx, start_pos,
+                                       max_size, last_is_pre);
+
+  for (uint16 i = 0; i < parsed; i++) {
+    uint16 *slot = spl_idx + i;
+    if (!spl_trie_->is_half_id_yunmu(*slot))
+      continue;
+
+    spl_trie_->half_to_full(*slot, slot);
+    if (i + 1 == parsed)
+      last_is_pre = false;
+  }
+
+  return parsed;
+}
+
+// char16 counterpart of splstr_to_idxs(): splits a spelling string into
+// spelling ids by walking the spelling trie.
+//
+//   splstr       Characters to parse; only the first str_len are read.
+//   str_len      Number of characters in splstr.
+//   spl_idx      Output: receives one id per recognized spelling.
+//   start_pos    Optional output (may be NULL). start_pos[0] is set to 0;
+//                each time a spelling is completed, the following entry
+//                receives the offset where the next one starts (or where
+//                parsing stopped). It therefore needs room for
+//                max_size + 1 entries.
+//   max_size     Maximum number of ids written to spl_idx.
+//   last_is_pre  Output: true only when the whole string was consumed and
+//                the last character handled was not a splitter; false on
+//                every other path, including rejected arguments.
+//
+// Returns the number of ids written to spl_idx. Parsing stops early, and the
+// ids found so far are returned, once max_size ids have been produced or the
+// input cannot be continued from the current trie node. Returns 0 when
+// splstr or spl_idx is NULL, when str_len or max_size is 0, or when the
+// first character is not a valid spelling character.
+uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len,
+                                        uint16 spl_idx[], uint16 start_pos[],
+                                        uint16 max_size, bool &last_is_pre) {
+  // Give the output flag a defined value on every path, including the
+  // argument-rejection returns just below.
+  last_is_pre = false;
+
+  // spl_idx is written unconditionally further down, so it is validated
+  // together with the other mandatory arguments.
+  if (NULL == splstr || NULL == spl_idx || 0 == max_size || 0 == str_len)
+    return 0;
+
+  if (!SpellingTrie::is_valid_spl_char(splstr[0]))
+    return 0;
+
+  const SpellingNode *node_this = spl_trie_->root_;
+
+  uint16 str_pos = 0;
+  uint16 idx_num = 0;
+  if (NULL != start_pos)
+    start_pos[0] = 0;
+  bool last_is_splitter = false;
+
+  while (str_pos < str_len) {
+    char16 char_this = splstr[str_pos];
+    // Every character rejected by is_valid_spl_char() acts as a splitter.
+    if (!SpellingTrie::is_valid_spl_char(char_this)) {
+      // A splitter closes the current spelling when if_valid_id_update()
+      // accepts the id stored in the current node.
+      uint16 id_this = node_this->spelling_idx;
+      if (spl_trie_->if_valid_id_update(&id_this)) {
+        spl_idx[idx_num] = id_this;
+
+        idx_num++;
+        str_pos++;
+        if (NULL != start_pos)
+          start_pos[idx_num] = str_pos;
+        if (idx_num >= max_size)
+          return idx_num;
+
+        node_this = spl_trie_->root_;
+        last_is_splitter = true;
+        continue;
+      } else {
+        if (last_is_splitter) {
+          // Run of consecutive splitters: skip this one and move the start
+          // of the next spelling past it.
+          str_pos++;
+          if (NULL != start_pos)
+            start_pos[idx_num] = str_pos;
+          continue;
+        } else {
+          // Splitter after an incomplete spelling: give up here.
+          return idx_num;
+        }
+      }
+    }
+
+    last_is_splitter = false;
+
+    SpellingNode *found_son = NULL;
+
+    if (0 == str_pos) {
+      // The very first character indexes the level-1 table directly;
+      // characters below 'a' are offset from 'A' instead.
+      if (char_this >= 'a')
+        found_son = spl_trie_->level1_sons_[char_this - 'a'];
+      else
+        found_son = spl_trie_->level1_sons_[char_this - 'A'];
+    } else {
+      SpellingNode *first_son = node_this->first_son;
+      // Scan the sons front to back and take the first one whose character
+      // matches according to is_same_spl_char().
+      for (int i = 0; i < node_this->num_of_son; i++) {
+        SpellingNode *this_son = first_son + i;
+        if (SpellingTrie::is_same_spl_char(
+            this_son->char_this_node, char_this)) {
+          found_son = this_son;
+          break;
+        }
+      }
+    }
+
+    if (NULL != found_son) {
+      // Found: descend to the matching son.
+      node_this = found_son;
+    } else {
+      // Not found: the spelling must end at the current node for parsing
+      // to go on.
+      uint16 id_this = node_this->spelling_idx;
+      if (spl_trie_->if_valid_id_update(&id_this)) {
+        // Remember the id and retry the same character from the root;
+        // str_pos is deliberately not advanced.
+        spl_idx[idx_num] = id_this;
+
+        idx_num++;
+        if (NULL != start_pos)
+          start_pos[idx_num] = str_pos;
+        if (idx_num >= max_size)
+          return idx_num;
+        node_this = spl_trie_->root_;
+        continue;
+      } else {
+        return idx_num;
+      }
+    }
+
+    str_pos++;
+  }
+
+  // End of input: record the spelling that is still open, if its id is
+  // accepted.
+  uint16 id_this = node_this->spelling_idx;
+  if (spl_trie_->if_valid_id_update(&id_this)) {
+    spl_idx[idx_num] = id_this;
+
+    idx_num++;
+    if (NULL != start_pos)
+      start_pos[idx_num] = str_pos;
+  }
+
+  last_is_pre = !last_is_splitter;
+
+  return idx_num;
+}
+
+// Same as splstr16_to_idxs(), except that every returned id for which
+// is_half_id_yunmu() holds is rewritten in place through half_to_full().
+// When the final id is rewritten this way, last_is_pre is cleared.
+// Returns the number of ids reported by splstr16_to_idxs().
+uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len,
+                                          uint16 spl_idx[], uint16 start_pos[],
+                                          uint16 max_size, bool &last_is_pre) {
+  const uint16 parsed = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos,
+                                         max_size, last_is_pre);
+
+  for (uint16 i = 0; i < parsed; i++) {
+    uint16 *slot = spl_idx + i;
+    if (!spl_trie_->is_half_id_yunmu(*slot))
+      continue;
+
+    spl_trie_->half_to_full(*slot, slot);
+    if (i + 1 == parsed)
+      last_is_pre = false;
+  }
+
+  return parsed;
+}
+
+// Returns the spelling id of splstr when the whole string parses as exactly
+// one spelling, and 0 otherwise (also when is_pre is NULL). *is_pre is
+// passed to splstr_to_idxs() as its last_is_pre output.
+uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len,
+                                        bool *is_pre) {
+  if (NULL == is_pre)
+    return 0;
+
+  // Room for two ids, so that a string holding more than one spelling can be
+  // told apart from a single one; offsets needs one extra trailing entry.
+  uint16 ids[2];
+  uint16 offsets[3];
+  const uint16 found =
+      splstr_to_idxs(splstr, str_len, ids, offsets, 2, *is_pre);
+
+  // Exactly one spelling, and it must end exactly at the end of the input.
+  // offsets[1] is only read once found is known to be 1.
+  const bool single_full_match = (1 == found) && (offsets[1] == str_len);
+  return single_full_match ? ids[0] : static_cast<uint16>(0);
+}
+
+// Like get_splid_by_str(), but when is_half_id_yunmu() holds for the id that
+// was found, the id is converted with half_to_full() and *is_pre is cleared.
+// Returns 0 when is_pre is NULL or when the string does not parse as exactly
+// one spelling covering all str_len characters.
+uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len,
+                                          bool *is_pre) {
+  if (NULL == is_pre)
+    return 0;
+
+  uint16 ids[2];
+  uint16 offsets[3];
+  const uint16 found =
+      splstr_to_idxs(splstr, str_len, ids, offsets, 2, *is_pre);
+
+  // Reject anything but a single spelling that ends at the end of the
+  // input. offsets[1] is only read once found is known to be 1.
+  if (1 != found || offsets[1] != str_len)
+    return 0;
+
+  if (spl_trie_->is_half_id_yunmu(ids[0])) {
+    spl_trie_->half_to_full(ids[0], ids);
+    *is_pre = false;
+  }
+
+  return ids[0];
+}
+
+// Looks up the single spelling id for splstr and stores it in splidx[0].
+//
+//   splstr       Characters to parse; only the first str_len are used.
+//   str_len      Number of characters in splstr.
+//   splidx       Output: splidx[0] receives the id on success.
+//   max_size     Capacity of splidx; must be at least 1.
+//   full_id_num  Output: 1 when the id found is >= kFullSplIdStart,
+//                otherwise 0 (also 0 on every failure path).
+//   is_pre       Output: handed to get_splid_by_str(), which fills it in.
+//
+// Returns 1 when an id was found and 0 otherwise. Returns 0 without reading
+// splstr when splstr or splidx is NULL or when str_len or max_size is 0.
+uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len,
+                                           uint16 splidx[], uint16 max_size,
+                                           uint16 &full_id_num, bool &is_pre) {
+  // Defined result for callers even when the arguments are rejected.
+  full_id_num = 0;
+
+  // Validate before splstr[0] is read and before splidx[0] is written.
+  if (NULL == splstr || NULL == splidx || 0 == str_len || 0 == max_size)
+    return 0;
+
+  if (!is_valid_to_parse(splstr[0]))
+    return 0;
+
+  splidx[0] = get_splid_by_str(splstr, str_len, &is_pre);
+  if (0 == splidx[0])
+    return 0;
+
+  if (splidx[0] >= kFullSplIdStart)
+    full_id_num = 1;
+  return 1;
+}
+
+} // namespace ime_pinyin