aboutsummaryrefslogtreecommitdiff
path: root/java/com/google/setfilters/cuckoofilter/CuckooFilter.java
blob: b37840b3b7998c43b862cdad762944774b54ae75 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.setfilters.cuckoofilter;

import com.google.common.hash.Funnel;
import com.google.common.hash.HashCode;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Random;

/**
 * A space efficient, probabilistic multiset data structure that supports membership check,
 * insertion, and deletion of the elements.
 *
 * <p>Cuckoo filter enables tradeoffs between its space efficiency and the false positive
 * probability of the membership check.
 *
 * <p>See the original paper https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf for more
 * details.
 *
 * <p>This class is not thread-safe.
 */
public final class CuckooFilter<T> {
  private final CuckooFilterConfig config;
  private final CuckooFilterTable table;
  private final Funnel<? super T> funnel;
  private final Random random;

  /** Counts the total number of elements in the cuckoo filter. */
  private long count;

  /** Instantiates a new cuckoo filter. */
  public static <T> CuckooFilter<T> createNew(CuckooFilterConfig config, Funnel<? super T> funnel) {
    Random random = new Random();
    CuckooFilterTable table =
        CuckooFilterTable.create(config.size(), config.useSpaceOptimization(), random);
    return new CuckooFilter<T>(config, table, funnel, random);
  }

  /**
   * Instantiates a cuckoo filter from serialized cuckoo filter table.
   *
   * <p>Note that {@link SerializedCuckooFilterTable} does not contain any data on {@link
   * CuckooFilterConfig.HashFunction}, {@link CuckooFilterConfig.Strategy}, or {@link Funnel} used,
   * so it is up to the user to supply appropriate hash function, strategy, and funnel that were
   * used to generate the {@link SerializedCuckooFilterTable}.
   */
  public static <T> CuckooFilter<T> createFromSerializedTable(
      SerializedCuckooFilterTable serializedTable,
      CuckooFilterConfig.HashFunction hashFunction,
      CuckooFilterConfig.Strategy strategy,
      Funnel<? super T> funnel) {
    Random random = new Random();
    CuckooFilterTable table = CuckooFilterTable.createFromSerialization(serializedTable, random);
    return new CuckooFilter<T>(
        CuckooFilterConfig.newBuilder()
            .setSize(table.size())
            .setHashFunction(hashFunction)
            .setStrategy(strategy)
            .build(),
        table,
        funnel,
        random);
  }

  private CuckooFilter(
      CuckooFilterConfig config, CuckooFilterTable table, Funnel<? super T> funnel, Random random) {
    this.config = config;
    this.table = table;
    this.funnel = funnel;
    this.random = random;
    count = 0;
  }

  /**
   * Returns true if {@code element} is in the cuckoo filter.
   *
   * <p>By the probabilistic nature of the cuckoo filter data structure, this method may return a
   * false positive result. In other words, this method may incorrectly return true for an element
   * that was actually never inserted. This probability can depend on various factors, including the
   * size of the cuckoo filter and the hash function used.
   *
   * <p>However, it is guaranteed that this method never returns a false negative result, as long as
   * {@code delete} method is called on an element that exists in the filter. Please see {@code
   * delete} method for more details.
   */
  public boolean contains(T element) {
    HashCode hash = config.hashFunction().hash(element, funnel);
    long fingerprint =
        config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
    int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
    int otherBucketIndex =
        config
            .strategy()
            .computeOtherBucketIndex(
                fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
    return table.contains(bucketIndex, fingerprint)
        || table.contains(otherBucketIndex, fingerprint);
  }

  /**
   * Inserts {@code element} to the cuckoo filter, returning true if the element was inserted
   * successfully.
   *
   * <p>Insertion of {@code element} will fail if there is no room for {@code element}. Note that
   * even when the insertion of {@code element} fails, it is possible for another element to be
   * inserted successfully. Even then, the insertion failure should be a good indicator that the
   * filter is getting close to its maximum capacity.
   */
  public boolean insert(T element) {
    HashCode hash = config.hashFunction().hash(element, funnel);
    long fingerprint =
        config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
    int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
    int otherBucketIndex =
        config
            .strategy()
            .computeOtherBucketIndex(
                fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());

    // First attempt to insert the fingerprint to one of the two assigned buckets.
    if (attemptInsertion(fingerprint, bucketIndex, otherBucketIndex)) {
      count++;
      return true;
    }

    // If both buckets are full, execute insertion with repeated replacements algorithm.
    int startBucketIndex = (random.nextInt(2) == 0) ? bucketIndex : otherBucketIndex;
    boolean inserted = insertWithRepeatedReplacements(fingerprint, startBucketIndex);
    if (inserted) {
      count++;
    }
    return inserted;
  }

  /**
   * Deletes {@code element} from the cuckoo filter, returning true if the element was deleted
   * successfully.
   *
   * <p>It is critical for {@code delete} to be called on an already existing element. Otherwise,
   * the filter may incorrectly delete a wrong element. When this happens, it is possible for {@code
   * contains} method to return a false negative result.
   */
  public boolean delete(T element) {
    HashCode hash = config.hashFunction().hash(element, funnel);
    long fingerprint =
        config.strategy().computeFingerprint(hash, config.size().fingerprintLength());
    int bucketIndex = config.strategy().computeBucketIndex(hash, config.size().bucketCount());
    int otherBucketIndex =
        config
            .strategy()
            .computeOtherBucketIndex(
                fingerprint, bucketIndex, config.size().bucketCount(), config.hashFunction());
    boolean deleted =
        table.delete(bucketIndex, fingerprint) || table.delete(otherBucketIndex, fingerprint);
    if (deleted) {
      count--;
    }
    return deleted;
  }

  /** Returns the size of the cuckoo filter. */
  public CuckooFilterConfig.Size size() {
    return config.size();
  }

  /** Returns the count of the elements in the cuckoo filter. */
  public long count() {
    return count;
  }

  /**
   * Returns the ratio of the total number of elements in the cuckoo filter and the theoretical max
   * capacity.
   *
   * <p>The returned value is in range [0, 1].
   */
  public double load() {
    return count / ((double) config.size().bucketCount() * config.size().bucketCapacity());
  }

  /**
   * Serializes the state of the cuckoo filter table.
   *
   * <p>Note that this method does not serialize hash function, strategy, and funnel. When
   * instantiating a cuckoo filter from the returned {@link SerializedCuckooFilterTable}, it is up
   * to the user to supply appropriate hash function, strategy, and funnel that were used.
   */
  public SerializedCuckooFilterTable serializeTable() {
    return table.serialize();
  }

  /**
   * Attempts to insert {@code fingerprint} to one of the buckets with indices {@code bucketIndex}
   * and {@code otherBucketIndex}, returning true when successful. Returns false if both buckets are
   * full and the insertion failed.
   */
  private boolean attemptInsertion(long fingerprint, int bucketIndex, int otherBucketIndex) {
    if (!table.isFull(bucketIndex)) {
      table.insertWithReplacement(bucketIndex, fingerprint);
      return true;
    }
    if (!table.isFull(otherBucketIndex)) {
      table.insertWithReplacement(otherBucketIndex, fingerprint);
      return true;
    }
    return false;
  }

  /**
   * Randomly traverses the cuckoo graph to find an available bucket for insertion.
   *
   * <p>At a high level, this algorithm starts at vertex {@code bucketIndex} and performs a random
   * walk of length at most {@link CuckooFilterConfig.Strategy#maxReplacementCount}. If an available
   * bucket is found, the algorithm "pushes" all the fingerprints (edges) that are visited (note
   * that in the cuckoo graph, the edges are the fingerprints) to their alternate buckets, and make
   * room for {@code fingerprint} to be inserted.
   *
   * <p>If during the random walk an available bucket is not found, the insertion fails and the
   * method returns false.
   *
   * <p>Note that it is possible to deterministically find an available bucket by performing breadth
   * first search in the cuckoo graph, but this is usually slower and the extra chance of successful
   * insertion is negligibly small in practice.
   */
  private boolean insertWithRepeatedReplacements(long fingerprint, int bucketIndex) {
    List<Integer> visitedBucketIndices = new ArrayList<>();
    List<Long> replacedFingerprints = new ArrayList<>();

    long currFingerprint = fingerprint;
    int currBucketIndex = bucketIndex;
    visitedBucketIndices.add(-1); // Just for index alignment purpose.
    replacedFingerprints.add(currFingerprint);
    for (int i = 0; i < config.strategy().maxReplacementCount(); i++) {
      Optional<Long> replacedFingerprint =
          table.insertWithReplacement(currBucketIndex, currFingerprint);
      // Found an available bucket, and the insertion is successful.
      if (replacedFingerprint.isEmpty()) {
        return true;
      }

      visitedBucketIndices.add(currBucketIndex);
      replacedFingerprints.add(replacedFingerprint.get());

      currFingerprint = replacedFingerprint.get();
      currBucketIndex =
          config
              .strategy()
              .computeOtherBucketIndex(
                  currFingerprint,
                  currBucketIndex,
                  config.size().bucketCount(),
                  config.hashFunction());
    }

    // Failed to find a bucket to insert. Reverse the replacements and declare that the insertion
    // failed.
    for (int i = visitedBucketIndices.size() - 1; i > 0; i--) {
      int previousBucketIndex = visitedBucketIndices.get(i);
      table.delete(previousBucketIndex, replacedFingerprints.get(i - 1));
      table.insertWithReplacement(previousBucketIndex, replacedFingerprints.get(i));
    }
    return false;
  }
}