aboutsummaryrefslogtreecommitdiff
path: root/src/extensions/ngram/bitmap-index.cc
blob: c6aaa1429120e3017439774b6ba68bbade05cc7c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: Jeffrey Soresnen (sorenj@google.com)

#include <fst/extensions/ngram/bitmap-index.h>

#include <algorithm>
#include <iterator>

#include <fst/extensions/ngram/nthbit.h>

namespace fst {

// These two internal classes implemented inverted views of the
// primary and secondary indexes.  That is, they provide iterators
// that have operator*'s that return the number zeros rather than
// the number of ones.

class primary_index_inverted : public vector<uint32>::const_iterator {
 public:
  primary_index_inverted() {}
  primary_index_inverted(vector<uint32>::const_iterator loc,
                         vector<uint32>::const_iterator begin) :
    vector<uint32>::const_iterator(loc), begin_(begin) {}
  uint32 operator*() {
    return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize *
           (1 + std::distance<vector<uint32>::const_iterator>(begin_, *this)) -
           vector<uint32>::const_iterator::operator*();
  }
 private:
  vector<uint32>::const_iterator begin_;
};

class secondary_index_inverted : public vector<uint16>::const_iterator {
 public:
  secondary_index_inverted() : vector<uint16>::const_iterator() {}
  secondary_index_inverted(vector<uint16>::const_iterator loc,
                           vector<uint16>::const_iterator block_begin) :
    vector<uint16>::const_iterator(loc), block_begin_(block_begin) {}
  uint16 operator*() {
    return ((1 + std::distance<vector<uint16>::const_iterator>(
        block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) -
        vector<uint16>::const_iterator::operator*();
  }
 private:
  vector<uint16>::const_iterator block_begin_;
};

size_t BitmapIndex::Rank1(size_t end) const {
  if (end == 0) return 0;
  CHECK_LE(end, Bits());
  const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize;
  const uint32 sum = get_index_ones_count(end_word);
  const uint64 zero = 0;
  const uint64 ones = ~zero;
  return sum + __builtin_popcountll(bits_[end_word] &
      (ones >> (kStorageBitSize - (end & kStorageBlockMask))));
}

size_t BitmapIndex::Select1(size_t bit_index) const {
  if (bit_index >= GetOnesCount()) return Bits();
  // search primary index for the relevant block
  uint32 rembits = bit_index + 1;
  const uint32 block = find_primary_block(bit_index + 1);
  uint32 offset = 0;
  if (block > 0) {
    rembits -= primary_index_[block - 1];
    offset += block * kSecondaryBlockSize;
  }
  // search the secondary index
  uint32 word = find_secondary_block(offset, rembits);
  if (word > 0) {
    rembits -= secondary_index_[offset + word - 1];
    offset += word;
  }
  int nth = nth_bit(bits_[offset], rembits);
  return (offset << BitmapIndex::kStorageLogBitSize) + nth;
}

size_t BitmapIndex::Select0(size_t bit_index) const {
  if (bit_index >= Bits() - GetOnesCount()) return Bits();
  // search inverted primary index for relevant block
  uint32 remzeros = bit_index + 1;
  uint32 offset = 0;
  const uint32 block = find_inverted_primary_block(bit_index + 1);
  if (block > 0) {
    remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1,
                                        primary_index_.begin());
    offset += block * kSecondaryBlockSize;
  }
  // search the inverted secondary index
  uint32 word = find_inverted_secondary_block(offset, remzeros);
  if (word > 0) {
    vector<uint16>::const_iterator block_begin =
        secondary_index_.begin() + offset;
    remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin);
    offset += word;
  }
  int nth = nth_bit(~bits_[offset], remzeros);
  return (offset << BitmapIndex::kStorageLogBitSize) + nth;
}

size_t BitmapIndex::get_index_ones_count(size_t array_index) const {
  uint32 sum = 0;
  if (array_index > 0) {
    sum += secondary_index_[array_index-1];
    uint32 end_block = (array_index - 1) / kSecondaryBlockSize;
    if (end_block > 0) sum += primary_index_[end_block-1];
  }
  return sum;
}

void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) {
  bits_ = bits;
  size_ = size;
  secondary_index_.clear();
  secondary_index_.reserve(ArraySize());
  primary_index_.clear();
  primary_index_.reserve(primary_index_size());
  const uint64 zero = 0;
  const uint64 ones = ~zero;
  uint32 popcount = 0;
  for (uint32 block_begin = 0; block_begin < ArraySize();
       block_begin += kSecondaryBlockSize) {
    uint32 block_popcount = 0;
    uint32 block_end = block_begin + kSecondaryBlockSize;
    if (block_end > ArraySize()) block_end = ArraySize();
    for (uint32 j = block_begin; j < block_end; ++j) {
      uint64 mask = ones;
      if (j == ArraySize() - 1) {
        mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask);
      }
      block_popcount += __builtin_popcountll(bits_[j] & mask);
      secondary_index_.push_back(block_popcount);
    }
    popcount += block_popcount;
    primary_index_.push_back(popcount);
  }
}

size_t BitmapIndex::find_secondary_block(
    size_t block_begin, size_t rem_bit_index) const {
  size_t block_end = block_begin + kSecondaryBlockSize;
  if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
  return std::distance(secondary_index_.begin() + block_begin,
                       std::lower_bound(secondary_index_.begin() + block_begin,
                                        secondary_index_.begin() + block_end,
                                        rem_bit_index));
}

size_t BitmapIndex::find_inverted_secondary_block(
    size_t block_begin, size_t rem_bit_index) const {
  size_t block_end = block_begin + kSecondaryBlockSize;
  if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
  secondary_index_inverted start(secondary_index_.begin() + block_begin,
                                 secondary_index_.begin() + block_begin);
  secondary_index_inverted end(secondary_index_.begin() + block_end,
                               secondary_index_.begin() + block_begin);
  return std::distance(start,
                       std::lower_bound(start, end, rem_bit_index));
}

inline size_t BitmapIndex::find_primary_block(size_t bit_index) const {
  return std::distance(primary_index_.begin(),
                       std::lower_bound(primary_index_.begin(),
                                        primary_index_.end(), bit_index));
}

size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const {
  primary_index_inverted start(primary_index_.begin(), primary_index_.begin());
  primary_index_inverted end(primary_index_.end(), primary_index_.begin());
  return std::distance(start, std::lower_bound(start, end, bit_index));
}

}  // end namespace fst