diff options
Diffstat (limited to 'src/extensions/ngram/bitmap-index.cc')
-rw-r--r-- | src/extensions/ngram/bitmap-index.cc | 187 |
1 files changed, 187 insertions, 0 deletions
diff --git a/src/extensions/ngram/bitmap-index.cc b/src/extensions/ngram/bitmap-index.cc new file mode 100644 index 0000000..c6aaa14 --- /dev/null +++ b/src/extensions/ngram/bitmap-index.cc @@ -0,0 +1,187 @@ + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright 2005-2010 Google, Inc. +// Author: Jeffrey Soresnen (sorenj@google.com) + +#include <fst/extensions/ngram/bitmap-index.h> + +#include <algorithm> +#include <iterator> + +#include <fst/extensions/ngram/nthbit.h> + +namespace fst { + +// These two internal classes implemented inverted views of the +// primary and secondary indexes. That is, they provide iterators +// that have operator*'s that return the number zeros rather than +// the number of ones. + +class primary_index_inverted : public vector<uint32>::const_iterator { + public: + primary_index_inverted() {} + primary_index_inverted(vector<uint32>::const_iterator loc, + vector<uint32>::const_iterator begin) : + vector<uint32>::const_iterator(loc), begin_(begin) {} + uint32 operator*() { + return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize * + (1 + std::distance<vector<uint32>::const_iterator>(begin_, *this)) - + vector<uint32>::const_iterator::operator*(); + } + private: + vector<uint32>::const_iterator begin_; +}; + +class secondary_index_inverted : public vector<uint16>::const_iterator { + public: + secondary_index_inverted() : vector<uint16>::const_iterator() {} + secondary_index_inverted(vector<uint16>::const_iterator loc, + vector<uint16>::const_iterator block_begin) : + vector<uint16>::const_iterator(loc), block_begin_(block_begin) {} + uint16 operator*() { + return ((1 + std::distance<vector<uint16>::const_iterator>( + block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) - + vector<uint16>::const_iterator::operator*(); + } + private: + vector<uint16>::const_iterator block_begin_; +}; + +size_t BitmapIndex::Rank1(size_t end) const { + if (end == 0) return 0; + CHECK_LE(end, Bits()); + const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize; + const uint32 sum = get_index_ones_count(end_word); + const uint64 zero = 0; + const uint64 ones = ~zero; + return sum + __builtin_popcountll(bits_[end_word] & + (ones >> (kStorageBitSize - (end & kStorageBlockMask)))); +} + +size_t BitmapIndex::Select1(size_t bit_index) const { + if (bit_index >= GetOnesCount()) return Bits(); + // search primary index for the relevant block + uint32 rembits = bit_index + 1; + const uint32 block = find_primary_block(bit_index + 1); + uint32 offset = 0; + if (block > 0) { + rembits -= primary_index_[block - 1]; + offset += block * kSecondaryBlockSize; + } + // search the secondary index + uint32 word = find_secondary_block(offset, rembits); + if (word > 0) { + rembits -= secondary_index_[offset + word - 1]; + offset += word; + } + int nth = nth_bit(bits_[offset], rembits); + return (offset << BitmapIndex::kStorageLogBitSize) + nth; +} + +size_t BitmapIndex::Select0(size_t bit_index) const { + if (bit_index >= Bits() - GetOnesCount()) return Bits(); + // search inverted primary index for relevant block + uint32 remzeros = bit_index + 1; + uint32 offset = 0; + const uint32 block = find_inverted_primary_block(bit_index + 1); + if (block > 0) { + remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1, + primary_index_.begin()); + offset += block * kSecondaryBlockSize; + } + // search the inverted secondary index + uint32 word = find_inverted_secondary_block(offset, remzeros); + if (word > 0) { + vector<uint16>::const_iterator block_begin = + secondary_index_.begin() + offset; + remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin); + offset += word; + } + int nth = nth_bit(~bits_[offset], remzeros); + return (offset << BitmapIndex::kStorageLogBitSize) + nth; +} + +size_t BitmapIndex::get_index_ones_count(size_t array_index) const { + uint32 sum = 0; + if (array_index > 0) { + sum += secondary_index_[array_index-1]; + uint32 end_block = (array_index - 1) / kSecondaryBlockSize; + if (end_block > 0) sum += primary_index_[end_block-1]; + } + return sum; +} + +void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) { + bits_ = bits; + size_ = size; + secondary_index_.clear(); + secondary_index_.reserve(ArraySize()); + primary_index_.clear(); + primary_index_.reserve(primary_index_size()); + const uint64 zero = 0; + const uint64 ones = ~zero; + uint32 popcount = 0; + for (uint32 block_begin = 0; block_begin < ArraySize(); + block_begin += kSecondaryBlockSize) { + uint32 block_popcount = 0; + uint32 block_end = block_begin + kSecondaryBlockSize; + if (block_end > ArraySize()) block_end = ArraySize(); + for (uint32 j = block_begin; j < block_end; ++j) { + uint64 mask = ones; + if (j == ArraySize() - 1) { + mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask); + } + block_popcount += __builtin_popcountll(bits_[j] & mask); + secondary_index_.push_back(block_popcount); + } + popcount += block_popcount; + primary_index_.push_back(popcount); + } +} + +size_t BitmapIndex::find_secondary_block( + size_t block_begin, size_t rem_bit_index) const { + size_t block_end = block_begin + kSecondaryBlockSize; + if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); + return std::distance(secondary_index_.begin() + block_begin, + std::lower_bound(secondary_index_.begin() + block_begin, + secondary_index_.begin() + block_end, + rem_bit_index)); +} + +size_t BitmapIndex::find_inverted_secondary_block( + size_t block_begin, size_t rem_bit_index) const { + size_t block_end = block_begin + kSecondaryBlockSize; + if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); + secondary_index_inverted start(secondary_index_.begin() + block_begin, + secondary_index_.begin() + block_begin); + secondary_index_inverted end(secondary_index_.begin() + block_end, + secondary_index_.begin() + block_begin); + return std::distance(start, + std::lower_bound(start, end, rem_bit_index)); +} + +inline size_t BitmapIndex::find_primary_block(size_t bit_index) const { + return std::distance(primary_index_.begin(), + std::lower_bound(primary_index_.begin(), + primary_index_.end(), bit_index)); +} + +size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const { + primary_index_inverted start(primary_index_.begin(), primary_index_.begin()); + primary_index_inverted end(primary_index_.end(), primary_index_.begin()); + return std::distance(start, std::lower_bound(start, end, bit_index)); +} + +} // end namespace fst |