diff options
Diffstat (limited to 'lib/marisa/grimoire/trie/tail.cc')
-rw-r--r-- | lib/marisa/grimoire/trie/tail.cc | 218 |
1 files changed, 218 insertions, 0 deletions
diff --git a/lib/marisa/grimoire/trie/tail.cc b/lib/marisa/grimoire/trie/tail.cc new file mode 100644 index 0000000..bd9bd01 --- /dev/null +++ b/lib/marisa/grimoire/trie/tail.cc @@ -0,0 +1,218 @@ +#include "marisa/grimoire/algorithm.h" +#include "marisa/grimoire/trie/state.h" +#include "marisa/grimoire/trie/tail.h" + +namespace marisa { +namespace grimoire { +namespace trie { + +Tail::Tail() : buf_(), end_flags_() {} + +void Tail::build(Vector<Entry> &entries, Vector<UInt32> *offsets, + TailMode mode) { + MARISA_THROW_IF(offsets == NULL, MARISA_NULL_ERROR); + + switch (mode) { + case MARISA_TEXT_TAIL: { + for (std::size_t i = 0; i < entries.size(); ++i) { + const char * const ptr = entries[i].ptr(); + const std::size_t length = entries[i].length(); + for (std::size_t j = 0; j < length; ++j) { + if (ptr[j] == '\0') { + mode = MARISA_BINARY_TAIL; + break; + } + } + if (mode == MARISA_BINARY_TAIL) { + break; + } + } + break; + } + case MARISA_BINARY_TAIL: { + break; + } + default: { + MARISA_THROW(MARISA_CODE_ERROR, "undefined tail mode"); + } + } + + Tail temp; + temp.build_(entries, offsets, mode); + swap(temp); +} + +void Tail::map(Mapper &mapper) { + Tail temp; + temp.map_(mapper); + swap(temp); +} + +void Tail::read(Reader &reader) { + Tail temp; + temp.read_(reader); + swap(temp); +} + +void Tail::write(Writer &writer) const { + write_(writer); +} + +void Tail::restore(Agent &agent, std::size_t offset) const { + MARISA_DEBUG_IF(buf_.empty(), MARISA_STATE_ERROR); + + State &state = agent.state(); + if (end_flags_.empty()) { + for (const char *ptr = &buf_[offset]; *ptr != '\0'; ++ptr) { + state.key_buf().push_back(*ptr); + } + } else { + do { + state.key_buf().push_back(buf_[offset]); + } while (!end_flags_[offset++]); + } +} + +bool Tail::match(Agent &agent, std::size_t offset) const { + MARISA_DEBUG_IF(buf_.empty(), MARISA_STATE_ERROR); + MARISA_DEBUG_IF(agent.state().query_pos() >= agent.query().length(), + MARISA_BOUND_ERROR); + + State &state = agent.state(); + if (end_flags_.empty()) { + const char * const ptr = &buf_[offset] - state.query_pos(); + do { + if (ptr[state.query_pos()] != agent.query()[state.query_pos()]) { + return false; + } + state.set_query_pos(state.query_pos() + 1); + if (ptr[state.query_pos()] == '\0') { + return true; + } + } while (state.query_pos() < agent.query().length()); + return false; + } else { + do { + if (buf_[offset] != agent.query()[state.query_pos()]) { + return false; + } + state.set_query_pos(state.query_pos() + 1); + if (end_flags_[offset++]) { + return true; + } + } while (state.query_pos() < agent.query().length()); + return false; + } +} + +bool Tail::prefix_match(Agent &agent, std::size_t offset) const { + MARISA_DEBUG_IF(buf_.empty(), MARISA_STATE_ERROR); + + State &state = agent.state(); + if (end_flags_.empty()) { + const char *ptr = &buf_[offset] - state.query_pos(); + do { + if (ptr[state.query_pos()] != agent.query()[state.query_pos()]) { + return false; + } + state.key_buf().push_back(ptr[state.query_pos()]); + state.set_query_pos(state.query_pos() + 1); + if (ptr[state.query_pos()] == '\0') { + return true; + } + } while (state.query_pos() < agent.query().length()); + ptr += state.query_pos(); + do { + state.key_buf().push_back(*ptr); + } while (*++ptr != '\0'); + return true; + } else { + do { + if (buf_[offset] != agent.query()[state.query_pos()]) { + return false; + } + state.key_buf().push_back(buf_[offset]); + state.set_query_pos(state.query_pos() + 1); + if (end_flags_[offset++]) { + return true; + } + } while (state.query_pos() < agent.query().length()); + do { + state.key_buf().push_back(buf_[offset]); + } while (!end_flags_[offset++]); + return true; + } +} + +void Tail::clear() { + Tail().swap(*this); +} + +void Tail::swap(Tail &rhs) { + buf_.swap(rhs.buf_); + end_flags_.swap(rhs.end_flags_); +} + +void Tail::build_(Vector<Entry> &entries, Vector<UInt32> *offsets, + TailMode mode) { + for (std::size_t i = 0; i < entries.size(); ++i) { + entries[i].set_id(i); + } + Algorithm().sort(entries.begin(), entries.end()); + + Vector<UInt32> temp_offsets; + temp_offsets.resize(entries.size(), 0); + + const Entry dummy; + const Entry *last = &dummy; + for (std::size_t i = entries.size(); i > 0; --i) { + const Entry ¤t = entries[i - 1]; + MARISA_THROW_IF(current.length() == 0, MARISA_RANGE_ERROR); + std::size_t match = 0; + while ((match < current.length()) && (match < last->length()) && + ((*last)[match] == current[match])) { + ++match; + } + if ((match == current.length()) && (last->length() != 0)) { + temp_offsets[current.id()] = (UInt32)( + temp_offsets[last->id()] + (last->length() - match)); + } else { + temp_offsets[current.id()] = (UInt32)buf_.size(); + for (std::size_t j = 1; j <= current.length(); ++j) { + buf_.push_back(current[current.length() - j]); + } + if (mode == MARISA_TEXT_TAIL) { + buf_.push_back('\0'); + } else { + for (std::size_t j = 1; j < current.length(); ++j) { + end_flags_.push_back(false); + } + end_flags_.push_back(true); + } + MARISA_THROW_IF(buf_.size() > MARISA_UINT32_MAX, MARISA_SIZE_ERROR); + } + last = ¤t; + } + buf_.shrink(); + + offsets->swap(temp_offsets); +} + +void Tail::map_(Mapper &mapper) { + buf_.map(mapper); + end_flags_.map(mapper); +} + +void Tail::read_(Reader &reader) { + buf_.read(reader); + end_flags_.read(reader); +} + +void Tail::write_(Writer &writer) const { + buf_.write(writer); + end_flags_.write(writer); +} + +} // namespace trie +} // namespace grimoire +} // namespace marisa |