aboutsummaryrefslogtreecommitdiff
path: root/src/extensions/ngram/bitmap-index.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/extensions/ngram/bitmap-index.cc')
-rw-r--r--src/extensions/ngram/bitmap-index.cc187
1 files changed, 187 insertions, 0 deletions
diff --git a/src/extensions/ngram/bitmap-index.cc b/src/extensions/ngram/bitmap-index.cc
new file mode 100644
index 0000000..c6aaa14
--- /dev/null
+++ b/src/extensions/ngram/bitmap-index.cc
@@ -0,0 +1,187 @@
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Copyright 2005-2010 Google, Inc.
+// Author: Jeffrey Soresnen (sorenj@google.com)
+
+#include <fst/extensions/ngram/bitmap-index.h>
+
+#include <algorithm>
+#include <iterator>
+
+#include <fst/extensions/ngram/nthbit.h>
+
+namespace fst {
+
+// These two internal classes implemented inverted views of the
+// primary and secondary indexes. That is, they provide iterators
+// that have operator*'s that return the number zeros rather than
+// the number of ones.
+
+class primary_index_inverted : public vector<uint32>::const_iterator {
+ public:
+ primary_index_inverted() {}
+ primary_index_inverted(vector<uint32>::const_iterator loc,
+ vector<uint32>::const_iterator begin) :
+ vector<uint32>::const_iterator(loc), begin_(begin) {}
+ uint32 operator*() {
+ return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize *
+ (1 + std::distance<vector<uint32>::const_iterator>(begin_, *this)) -
+ vector<uint32>::const_iterator::operator*();
+ }
+ private:
+ vector<uint32>::const_iterator begin_;
+};
+
+class secondary_index_inverted : public vector<uint16>::const_iterator {
+ public:
+ secondary_index_inverted() : vector<uint16>::const_iterator() {}
+ secondary_index_inverted(vector<uint16>::const_iterator loc,
+ vector<uint16>::const_iterator block_begin) :
+ vector<uint16>::const_iterator(loc), block_begin_(block_begin) {}
+ uint16 operator*() {
+ return ((1 + std::distance<vector<uint16>::const_iterator>(
+ block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) -
+ vector<uint16>::const_iterator::operator*();
+ }
+ private:
+ vector<uint16>::const_iterator block_begin_;
+};
+
+size_t BitmapIndex::Rank1(size_t end) const {
+ if (end == 0) return 0;
+ CHECK_LE(end, Bits());
+ const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize;
+ const uint32 sum = get_index_ones_count(end_word);
+ const uint64 zero = 0;
+ const uint64 ones = ~zero;
+ return sum + __builtin_popcountll(bits_[end_word] &
+ (ones >> (kStorageBitSize - (end & kStorageBlockMask))));
+}
+
+size_t BitmapIndex::Select1(size_t bit_index) const {
+ if (bit_index >= GetOnesCount()) return Bits();
+ // search primary index for the relevant block
+ uint32 rembits = bit_index + 1;
+ const uint32 block = find_primary_block(bit_index + 1);
+ uint32 offset = 0;
+ if (block > 0) {
+ rembits -= primary_index_[block - 1];
+ offset += block * kSecondaryBlockSize;
+ }
+ // search the secondary index
+ uint32 word = find_secondary_block(offset, rembits);
+ if (word > 0) {
+ rembits -= secondary_index_[offset + word - 1];
+ offset += word;
+ }
+ int nth = nth_bit(bits_[offset], rembits);
+ return (offset << BitmapIndex::kStorageLogBitSize) + nth;
+}
+
+size_t BitmapIndex::Select0(size_t bit_index) const {
+ if (bit_index >= Bits() - GetOnesCount()) return Bits();
+ // search inverted primary index for relevant block
+ uint32 remzeros = bit_index + 1;
+ uint32 offset = 0;
+ const uint32 block = find_inverted_primary_block(bit_index + 1);
+ if (block > 0) {
+ remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1,
+ primary_index_.begin());
+ offset += block * kSecondaryBlockSize;
+ }
+ // search the inverted secondary index
+ uint32 word = find_inverted_secondary_block(offset, remzeros);
+ if (word > 0) {
+ vector<uint16>::const_iterator block_begin =
+ secondary_index_.begin() + offset;
+ remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin);
+ offset += word;
+ }
+ int nth = nth_bit(~bits_[offset], remzeros);
+ return (offset << BitmapIndex::kStorageLogBitSize) + nth;
+}
+
+size_t BitmapIndex::get_index_ones_count(size_t array_index) const {
+ uint32 sum = 0;
+ if (array_index > 0) {
+ sum += secondary_index_[array_index-1];
+ uint32 end_block = (array_index - 1) / kSecondaryBlockSize;
+ if (end_block > 0) sum += primary_index_[end_block-1];
+ }
+ return sum;
+}
+
+void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) {
+ bits_ = bits;
+ size_ = size;
+ secondary_index_.clear();
+ secondary_index_.reserve(ArraySize());
+ primary_index_.clear();
+ primary_index_.reserve(primary_index_size());
+ const uint64 zero = 0;
+ const uint64 ones = ~zero;
+ uint32 popcount = 0;
+ for (uint32 block_begin = 0; block_begin < ArraySize();
+ block_begin += kSecondaryBlockSize) {
+ uint32 block_popcount = 0;
+ uint32 block_end = block_begin + kSecondaryBlockSize;
+ if (block_end > ArraySize()) block_end = ArraySize();
+ for (uint32 j = block_begin; j < block_end; ++j) {
+ uint64 mask = ones;
+ if (j == ArraySize() - 1) {
+ mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask);
+ }
+ block_popcount += __builtin_popcountll(bits_[j] & mask);
+ secondary_index_.push_back(block_popcount);
+ }
+ popcount += block_popcount;
+ primary_index_.push_back(popcount);
+ }
+}
+
+size_t BitmapIndex::find_secondary_block(
+ size_t block_begin, size_t rem_bit_index) const {
+ size_t block_end = block_begin + kSecondaryBlockSize;
+ if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
+ return std::distance(secondary_index_.begin() + block_begin,
+ std::lower_bound(secondary_index_.begin() + block_begin,
+ secondary_index_.begin() + block_end,
+ rem_bit_index));
+}
+
+size_t BitmapIndex::find_inverted_secondary_block(
+ size_t block_begin, size_t rem_bit_index) const {
+ size_t block_end = block_begin + kSecondaryBlockSize;
+ if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
+ secondary_index_inverted start(secondary_index_.begin() + block_begin,
+ secondary_index_.begin() + block_begin);
+ secondary_index_inverted end(secondary_index_.begin() + block_end,
+ secondary_index_.begin() + block_begin);
+ return std::distance(start,
+ std::lower_bound(start, end, rem_bit_index));
+}
+
+inline size_t BitmapIndex::find_primary_block(size_t bit_index) const {
+ return std::distance(primary_index_.begin(),
+ std::lower_bound(primary_index_.begin(),
+ primary_index_.end(), bit_index));
+}
+
+size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const {
+ primary_index_inverted start(primary_index_.begin(), primary_index_.begin());
+ primary_index_inverted end(primary_index_.end(), primary_index_.begin());
+ return std::distance(start, std::lower_bound(start, end, bit_index));
+}
+
+} // end namespace fst