diff options
Diffstat (limited to 'jni/share/splparser.cpp')
-rw-r--r-- | jni/share/splparser.cpp | 341 |
1 files changed, 341 insertions, 0 deletions
diff --git a/jni/share/splparser.cpp b/jni/share/splparser.cpp new file mode 100644 index 0000000..d75aec6 --- /dev/null +++ b/jni/share/splparser.cpp @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2009 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <assert.h> +#include "../include/splparser.h" + +namespace ime_pinyin { + +SpellingParser::SpellingParser() { + spl_trie_ = SpellingTrie::get_cpinstance(); +} + +bool SpellingParser::is_valid_to_parse(char ch) { + return SpellingTrie::is_valid_spl_char(ch); +} + +uint16 SpellingParser::splstr_to_idxs(const char *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + if (NULL == splstr || 0 == max_size || 0 == str_len) + return 0; + + if (!SpellingTrie::is_valid_spl_char(splstr[0])) + return 0; + + last_is_pre = false; + + const SpellingNode *node_this = spl_trie_->root_; + + uint16 str_pos = 0; + uint16 idx_num = 0; + if (NULL != start_pos) + start_pos[0] = 0; + bool last_is_splitter = false; + + while (str_pos < str_len) { + char char_this = splstr[str_pos]; + // all characters outside of [a, z] are considered as splitters + if (!SpellingTrie::is_valid_spl_char(char_this)) { + // test if the current node is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + spl_idx[idx_num] = id_this; + + idx_num++; + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + + node_this = spl_trie_->root_; + last_is_splitter = true; + continue; + } else { + if (last_is_splitter) { + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + continue; + } else { + return idx_num; + } + } + } + + last_is_splitter = false; + + SpellingNode *found_son = NULL; + + if (0 == str_pos) { + if (char_this >= 'a') + found_son = spl_trie_->level1_sons_[char_this - 'a']; + else + found_son = spl_trie_->level1_sons_[char_this - 'A']; + } else { + SpellingNode *first_son = node_this->first_son; + // Because for Zh/Ch/Sh nodes, they are the last in the buffer and + // frequently used, so we scan from the end. + for (int i = 0; i < node_this->num_of_son; i++) { + SpellingNode *this_son = first_son + i; + if (SpellingTrie::is_same_spl_char( + this_son->char_this_node, char_this)) { + found_son = this_son; + break; + } + } + } + + // found, just move the current node pointer to the the son + if (NULL != found_son) { + node_this = found_son; + } else { + // not found, test if it is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + node_this = spl_trie_->root_; + continue; + } else { + return idx_num; + } + } + + str_pos++; + } + + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + } + + last_is_pre = !last_is_splitter; + + return idx_num; +} + +uint16 SpellingParser::splstr_to_idxs_f(const char *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + uint16 idx_num = splstr_to_idxs(splstr, str_len, spl_idx, start_pos, + max_size, last_is_pre); + for (uint16 pos = 0; pos < idx_num; pos++) { + if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { + spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); + if (pos == idx_num - 1) { + last_is_pre = false; + } + } + } + return idx_num; +} + +uint16 SpellingParser::splstr16_to_idxs(const char16 *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + if (NULL == splstr || 0 == max_size || 0 == str_len) + return 0; + + if (!SpellingTrie::is_valid_spl_char(splstr[0])) + return 0; + + last_is_pre = false; + + const SpellingNode *node_this = spl_trie_->root_; + + uint16 str_pos = 0; + uint16 idx_num = 0; + if (NULL != start_pos) + start_pos[0] = 0; + bool last_is_splitter = false; + + while (str_pos < str_len) { + char16 char_this = splstr[str_pos]; + // all characters outside of [a, z] are considered as splitters + if (!SpellingTrie::is_valid_spl_char(char_this)) { + // test if the current node is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + spl_idx[idx_num] = id_this; + + idx_num++; + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + + node_this = spl_trie_->root_; + last_is_splitter = true; + continue; + } else { + if (last_is_splitter) { + str_pos++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + continue; + } else { + return idx_num; + } + } + } + + last_is_splitter = false; + + SpellingNode *found_son = NULL; + + if (0 == str_pos) { + if (char_this >= 'a') + found_son = spl_trie_->level1_sons_[char_this - 'a']; + else + found_son = spl_trie_->level1_sons_[char_this - 'A']; + } else { + SpellingNode *first_son = node_this->first_son; + // Because for Zh/Ch/Sh nodes, they are the last in the buffer and + // frequently used, so we scan from the end. + for (int i = 0; i < node_this->num_of_son; i++) { + SpellingNode *this_son = first_son + i; + if (SpellingTrie::is_same_spl_char( + this_son->char_this_node, char_this)) { + found_son = this_son; + break; + } + } + } + + // found, just move the current node pointer to the the son + if (NULL != found_son) { + node_this = found_son; + } else { + // not found, test if it is endable + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + if (idx_num >= max_size) + return idx_num; + node_this = spl_trie_->root_; + continue; + } else { + return idx_num; + } + } + + str_pos++; + } + + uint16 id_this = node_this->spelling_idx; + if (spl_trie_->if_valid_id_update(&id_this)) { + // endable, remember the index + spl_idx[idx_num] = id_this; + + idx_num++; + if (NULL != start_pos) + start_pos[idx_num] = str_pos; + } + + last_is_pre = !last_is_splitter; + + return idx_num; +} + +uint16 SpellingParser::splstr16_to_idxs_f(const char16 *splstr, uint16 str_len, + uint16 spl_idx[], uint16 start_pos[], + uint16 max_size, bool &last_is_pre) { + uint16 idx_num = splstr16_to_idxs(splstr, str_len, spl_idx, start_pos, + max_size, last_is_pre); + for (uint16 pos = 0; pos < idx_num; pos++) { + if (spl_trie_->is_half_id_yunmu(spl_idx[pos])) { + spl_trie_->half_to_full(spl_idx[pos], spl_idx + pos); + if (pos == idx_num - 1) { + last_is_pre = false; + } + } + } + return idx_num; +} + +uint16 SpellingParser::get_splid_by_str(const char *splstr, uint16 str_len, + bool *is_pre) { + if (NULL == is_pre) + return 0; + + uint16 spl_idx[2]; + uint16 start_pos[3]; + + if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) + return 0; + + if (start_pos[1] != str_len) + return 0; + return spl_idx[0]; +} + +uint16 SpellingParser::get_splid_by_str_f(const char *splstr, uint16 str_len, + bool *is_pre) { + if (NULL == is_pre) + return 0; + + uint16 spl_idx[2]; + uint16 start_pos[3]; + + if (splstr_to_idxs(splstr, str_len, spl_idx, start_pos, 2, *is_pre) != 1) + return 0; + + if (start_pos[1] != str_len) + return 0; + if (spl_trie_->is_half_id_yunmu(spl_idx[0])) { + spl_trie_->half_to_full(spl_idx[0], spl_idx); + *is_pre = false; + } + + return spl_idx[0]; +} + +uint16 SpellingParser::get_splids_parallel(const char *splstr, uint16 str_len, + uint16 splidx[], uint16 max_size, + uint16 &full_id_num, bool &is_pre) { + if (max_size <= 0 || !is_valid_to_parse(splstr[0])) + return 0; + + splidx[0] = get_splid_by_str(splstr, str_len, &is_pre); + full_id_num = 0; + if (0 != splidx[0]) { + if (splidx[0] >= kFullSplIdStart) + full_id_num = 1; + return 1; + } + return 0; +} + +} // namespace ime_pinyin |