aboutsummaryrefslogtreecommitdiff
path: root/client/java/com/google/rappor/Encoder.java
diff options
context:
space:
mode:
Diffstat (limited to 'client/java/com/google/rappor/Encoder.java')
-rw-r--r--client/java/com/google/rappor/Encoder.java613
1 files changed, 613 insertions, 0 deletions
diff --git a/client/java/com/google/rappor/Encoder.java b/client/java/com/google/rappor/Encoder.java
new file mode 100644
index 0000000..a8fb57c
--- /dev/null
+++ b/client/java/com/google/rappor/Encoder.java
@@ -0,0 +1,613 @@
+package com.google.android.rappor;
+
+// BEGIN android-changed: Removed guava dependency
+// import static com.google.common.base.Preconditions.checkArgument;
+//
+// import com.google.common.base.Verify;
+// END android-changed
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.security.SecureRandom;
+import java.util.BitSet;
+// BEGIN android-changed
+import java.util.Random;
+// END android-changed
+
+// BEGIN android-changed: Remove javax
+// import javax.annotation.Nullable;
+// import javax.annotation.concurrent.GuardedBy;
+// END android-changed
+
+/**
+ * Encodes reports using the RAPPOR differentially-private encoding algorithm.
+ *
+ * The RAPPOR algorithm is described at go/rappor and presented in detail at go/rappor-writeup.
+ *
+ * @author bonawitz@google.com Keith Bonawitz
+ */
+// TODO(bonawitz): Make encoder and interface and make this a final class implementing it.
+// We can't just make this final now because there exist tests that need to Mock it.
+public class Encoder {
+ /**
+ * A non-decreasing version number.
+ *
+ * <p>The version number should increase any time the Encoder has a user-visible functional change
+ * to any of encoding algorithms or the interpretation of the input parameters.
+ */
+ public static final long VERSION = 3;
+
+ /**
+ * Minimum length required for the user secret, in bytes.
+ */
+ public static final int MIN_USER_SECRET_BYTES = HmacDrbg.ENTROPY_INPUT_SIZE_BYTES;
+
+ /**
+ * Maximum number of bits in the RAPPOR-encoded report.
+ *
+ * Must be less than HmacDrbg.MAX_BYTES_TOTAL;
+ */
+ public static final int MAX_BITS = 4096;
+
+ /**
+ * Maximum number of Bloom filter hashes used for RAPPOR-encoded strings.
+ *
+ * <p>This is constrained in the current implementation by requiring 2 bytes from an MD5 value
+ * (which is 16 bytes long) for each Bloom filter hash.
+ */
+ public static final int MAX_BLOOM_HASHES = 8;
+
+ /**
+ * Maximum number of cohorts supported.
+ */
+ public static final int MAX_COHORTS = 128;
+
+ /**
+ * First (and only) byte of HMAC_DRBG personalization strings used to generate the cohort number.
+ */
+ private static final byte HMAC_DRBG_TYPE_COHORT = 0x00;
+
+ /**
+ * First byte of HMAC_DRBG personalization strings used to generate the PRR response.
+ */
+ private static final byte HMAC_DRBG_TYPE_PRR = 0x01;
+
+ /**
+ * A unique identifier for this Encoder, represented in UTF-8.
+ *
+ * <p>The (userSecret, encoderId) pair identify a the logical memoization table used for RAPPOR's
+ * Permanent Randomized Response stage. Therefore, for any userSecret, each Encoder must have a
+ * distinct identifier for Permanent Randomized Response to be effective.
+ *
+ * <p>In practice, "memoization" is achieved by generating deterministic pseudo-random bits using
+ * HMAC_DRBG. encoderIdBytes is used as part of the personalization string.
+ */
+ private final byte[] encoderIdBytes;
+
+ /**
+ * The RAPPOR "f" probability, on the range [0.0, 1.0].
+ *
+ * <p>This it the probability of replacing a bit from the input with a uniform random bit when
+ * generating the permanent randomized response.
+ *
+ * <p>Setting probabilityF to 0 disables the PRR phase of RAPPOR.
+ */
+ private final double probabilityF;
+
+ /**
+ * The RAPPOR "p" probability, on the range [0.0, 1.0].
+ *
+ * <p>This is the probability of emitting a '1' bit in the instantaneous randomized response,
+ * given that the corresponding bit in the permanent randomized response is '0'.
+ *
+ * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR.
+ */
+ private final double probabilityP;
+
+ /**
+ * The RAPPOR "1" probability, on the range [0.0, 1.0].
+ *
+ * <p>This is the probability of emitting a 1 bit in the instantaneous randomized response, given
+ * that the corresponding bit in the permanent randomized response is 1.
+ *
+ * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR.
+ */
+ private final double probabilityQ;
+
+ /**
+ * The number of bits in the RAPPOR-encoded report.
+ *
+ * <p>Must satisfy 1 &lt;= numBits &lt;= MAX_BITS.
+ *
+ * <ul>
+ * <li>For encodeOrdinal, requires 0 &lt;= ordinal &lt; numBits.
+ * <li>For encodeString, uses a numBits-wide Bloom filter.
+ * <li>For encodeBits, only the low-order numBits may be non-zero.
+ * </ul>
+ */
+ private final int numBits;
+
+ /**
+ * The number of hash functions used forming the Bloom filter encoding of a string.
+ *
+ * <p>Must satisfy 1 &lt;= numBloomHashes &lt;= MAX_BLOOM_HASHES.
+ */
+ private final int numBloomHashes;
+
+ /**
+ * The number of cohorts used for cohort assignment.
+ */
+ private final int numCohorts;
+
+ /**
+ * The cohort this user is assigned to for Bloom filter string encoding.
+ *
+ * <p>This value is stable for a given (userSecret, numCohorts) pair. Furthermore, if two
+ * encoders use the same userSecret but have different numCohorts values, the cohort assignment
+ * for the encoder with the smaller numCohorts value is a bitwise suffix of the cohort assignment
+ * for the encoder with the larger numCohorts value. It follows that, if you know maximum cohort
+ * assignment across a set of encoders, and you know the numCohorts setting for each encoder, then
+ * you can deduce the cohort assignment for each encoder by taking the bitwise-and of the max
+ * cohort value and (numCohorts-1), noting that numCohorts is required to be a positive power of
+ * 2.
+ */
+ private final int cohort;
+
+ /**
+ * A bitmask with 1 bits in all the positions where a RAPPOR-encoded report could have a 1 bit.
+ *
+ * <p>The bitmask has the lowest order numBits set to 1 and the rest 0.
+ */
+ private final BitSet inputMask;
+
+ /**
+ * SHA-256 utility class instance.
+ *
+ * <p>This object is stateful; access must be synchronized. The reset method must be
+ * called before each use.
+ */
+// BEGIN android-changed: Remove javax
+// @GuardedBy("this")
+// END android-changed
+ private final MessageDigest sha256;
+
+ /**
+ * MD5 utility class instance.
+ *
+ * <p>This object is stateful; access must be synchronized. The reset method must be
+ * called before each use.
+ */
+// BEGIN android-changed: Remove javax
+// @GuardedBy("this")
+// END android-changed
+ private final MessageDigest md5;
+
+ /**
+ * A SecureRandom instance, initialized with a cryptographically secure random seed.
+ */
+// BEGIN android-changed
+// private final SecureRandom random;
+ private final Random random;
+// BEGIN android-changed
+
+ /**
+ * Entropy input for constructing HmacDrbg objects.
+ */
+ private final byte[] userSecret;
+
+ /**
+ * Constructs a new RAPPOR message encoder.
+ *
+ * @param userSecret Stable secret randomly selected for this user. UserSecret must be at least
+ * MIN_USER_SECRET_BYTES bytes of high-quality entropy. Changing the user secret clears the
+ * memoized cohort assignments and permanent randomized responses. Be aware that resetting
+ * these memoizations has significant privacy risks -- consult documentation at go/rappor for
+ * more details.
+ * @param encoderId Uniquely identifies this encoder. Used to differentiate momoized
+ * cohort assignments and permanent randomized responses.
+ * @param numBits The number of bits in the RAPPOR-encoded report.
+ * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0]. This will be
+ * quantized to the nearest 1/128.
+ * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0].
+ * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0].
+ * @param numCohorts Number of cohorts into which the user pool is randomly segmented.
+ * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a
+ * string.
+ */
+ public Encoder(byte[] userSecret, String encoderId, int numBits,
+ double probabilityF, double probabilityP, double probabilityQ,
+ int numCohorts, int numBloomHashes) {
+ this(
+ null, // random
+ null, // md5,
+ null, // sha256,
+ userSecret,
+ encoderId,
+ numBits,
+ probabilityF,
+ probabilityP,
+ probabilityQ,
+ numCohorts,
+ numBloomHashes);
+ }
+
+ /**
+ * Constructs a new RAPPOR message encoder, using constructor-style dependency injection.
+ *
+ * @param random A cryptographically secure random number generator, or null (in which case a
+ * new SecureRandom will be constructed).
+ * @param md5 A configured MD5 hash algorithm, or null (in which case a new MessageDigest will be
+ * constructed). Note: MessageDigest objects are stateful, and that state must not be
+ * modified while calls to the Encoder are active.
+ * @param sha256 A configured SHA-256 hash algorithm, or null (in which case a new MessageDigest
+ * will be constructed). Note: MessageDigest objects are stateful, and that state must not
+ * be modified while calls to the Encoder are active.
+ * @param userSecret Stable secret randomly selected for this user. UserSecret must be at least
+ * 32 bytes of high-quality entropy. Changing the user secret clears the memoized cohort
+ * assignments and permanent randomized responses. Be aware that resetting these memoizations
+ * has significant privacy risks -- consult documentation at go/rappor for more details.
+ * @param encoderId Uniquely identifies this encoder. Used to differentiate momoized
+ * cohort assignments and permanent randomized responses.
+ * @param numBits The number of bits in the RAPPOR-encoded report.
+ * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0]. This will be
+ * quantized to the nearest 1/128.
+ * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0].
+ * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0].
+ * @param numCohorts Number of cohorts into which the user pool is randomly segmented.
+ * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a
+ * string.
+ */
+ public Encoder(
+// BEGIN android-changed: Remove javax
+// @Nullable SecureRandom random,
+// @Nullable MessageDigest md5,
+// @Nullable MessageDigest sha256,
+// SecureRandom random,
+ Random random,
+ MessageDigest md5,
+ MessageDigest sha256,
+// END android-changed
+ byte[] userSecret,
+ String encoderId,
+ int numBits,
+ double probabilityF,
+ double probabilityP,
+ double probabilityQ,
+ int numCohorts,
+ int numBloomHashes) {
+ if (md5 != null) {
+ this.md5 = md5;
+ } else {
+ try {
+ this.md5 = MessageDigest.getInstance("MD5");
+ } catch (NoSuchAlgorithmException impossible) {
+ // This should never happen. Every implementation of the Java platform
+ // is required to support MD5.
+ throw new AssertionError(impossible);
+ }
+ }
+ this.md5.reset();
+
+ if (sha256 != null) {
+ this.sha256 = sha256;
+ } else {
+ try {
+ this.sha256 = MessageDigest.getInstance("SHA-256");
+ } catch (NoSuchAlgorithmException impossible) {
+ // This should never happen. Every implementation of the Java platform
+ // is required to support 256.
+ throw new AssertionError(impossible);
+ }
+ }
+ this.sha256.reset();
+
+ this.encoderIdBytes = encoderId.getBytes(StandardCharsets.UTF_8);
+
+ if (random != null) {
+ this.random = random;
+ } else {
+ this.random = new SecureRandom();
+ }
+
+ // android-changed: Removed guava dependency
+ // checkArgument(
+ // userSecret.length >= MIN_USER_SECRET_BYTES,
+ // "userSecret must be at least %s bytes of high-quality entropy.",
+ // MIN_USER_SECRET_BYTES);
+ checkArgument(
+ userSecret.length >= MIN_USER_SECRET_BYTES,
+ "userSecret must be at least " + MIN_USER_SECRET_BYTES
+ + " bytes of high-quality entropy.");
+ this.userSecret = userSecret;
+
+ checkArgument(
+ probabilityF >= 0 && probabilityF <= 1, "probabilityF must be on range [0.0, 1.0]");
+ this.probabilityF = Math.round(probabilityF * 128) / 128.0;
+
+ checkArgument(
+ probabilityP >= 0 && probabilityP <= 1, "probabilityP must be on range [0.0, 1.0]");
+ this.probabilityP = probabilityP;
+
+ checkArgument(
+ probabilityQ >= 0 && probabilityQ <= 1, "probabilityQ must be on range [0.0, 1.0]");
+ this.probabilityQ = probabilityQ;
+
+ checkArgument(
+ numBits >= 1 && numBits <= MAX_BITS, "numBits must be on range [1, " + MAX_BITS + "].");
+ this.numBits = numBits;
+ // Make a bitmask with the lowest-order numBits set to 1.
+ this.inputMask = new BitSet(numBits);
+ this.inputMask.set(0, numBits, true);
+
+ checkArgument(
+ numBloomHashes >= 1 && numBloomHashes <= numBits,
+ "numBloomHashes must be on range [1, numBits).");
+ this.numBloomHashes = numBloomHashes;
+
+ checkArgument(
+ numCohorts >= 1 && numCohorts <= MAX_COHORTS,
+ "numCohorts must be on range [1, " + MAX_COHORTS + "].");
+
+ // Assuming numCohorts >= 1:
+ //
+ // If numCohorts is a power of 2, then numCohorts has one bit set and numCohorts - 1 has only
+ // the bits to the right of numCohorts's bit set. The bitwise-and of these is 0.
+ //
+ // If numCohorts is not a power of 2, then numCohorts has at least two bits set. It follows
+ // subtracting one from numCohorts keeps the most significant bit set to 1, and thus the
+ // bitwise-and has at least this non-zero bit.
+ boolean numCohortsIsPowerOfTwo = (numCohorts & (numCohorts - 1)) == 0;
+ checkArgument(numCohortsIsPowerOfTwo, "numCohorts must be a power of 2.");
+ this.numCohorts = numCohorts;
+
+ // cohortMasterAssignment depends only on the userSecret.
+ HmacDrbg cohortDrbg = new HmacDrbg(userSecret, new byte[] {HMAC_DRBG_TYPE_COHORT});
+ ByteBuffer cohortDrbgBytes = ByteBuffer.wrap(cohortDrbg.nextBytes(4));
+ int cohortMasterAssignment = Math.abs(cohortDrbgBytes.getInt()) % MAX_COHORTS;
+ this.cohort = cohortMasterAssignment & (numCohorts - 1);
+ }
+
+ public double getProbabilityF() {
+ return probabilityF;
+ }
+
+ public double getProbabilityP() {
+ return probabilityP;
+ }
+
+ public double getProbabilityQ() {
+ return probabilityQ;
+ }
+
+ public int getNumBits() {
+ return numBits;
+ }
+
+ public int getNumBloomHashes() {
+ return numBloomHashes;
+ }
+
+ public int getNumCohorts() {
+ return numCohorts;
+ }
+
+ public int getCohort() {
+ return cohort;
+ }
+
+ /**
+ * Returns the Encoder ID as a UTF-8 formatted string.
+ */
+ public String getEncoderId() {
+ return new String(encoderIdBytes, StandardCharsets.UTF_8);
+ }
+
+ /**
+ * Encodes a boolean into a RAPPOR report.
+ *
+ * <p>The boolean is 0 or 1, then encoded using permanent and instantaneous randomized response.
+ *
+ * <p>In most cases, numBits should be 1 when using this method.
+ */
+ public byte[] encodeBoolean(boolean bool) {
+ BitSet input = new BitSet(numBits);
+ input.set(0, bool);
+ return encodeBits(input);
+ }
+
+ /**
+ * Encodes an ordinal into a RAPPOR report.
+ *
+ * <p>The ordinal is represented using a 1-hot binary representation, then encoded using permanent
+ * and instantaneous randomized response.
+ *
+ * @param ordinal A value on the range [0, numBits).
+ */
+ public byte[] encodeOrdinal(int ordinal) {
+ checkArgument(
+ ordinal >= 0 && ordinal < numBits, "Ordinal value must be in range [0, numBits).");
+ BitSet input = new BitSet(numBits);
+ input.set(ordinal, true);
+ return encodeBits(input);
+ }
+
+ /**
+ * Encodes a string into a RAPPOR report.
+ *
+ * <p>The string is represented using a Bloom filter with numBloomHashes hash functions, then
+ * encoded using permanent and instantaneous randomized response.
+ *
+ * @param string An arbitrary string.
+ */
+ public byte[] encodeString(String string) {
+ // Implements a Bloom filter by slicing a single MD5 hash into numBloomHashes bit indices.
+ byte[] stringInUtf8 = string.getBytes(StandardCharsets.UTF_8);
+ byte[] message =
+ ByteBuffer.allocate(4 + stringInUtf8.length)
+ .putInt(cohort)
+ .put(stringInUtf8)
+ .array();
+
+ byte[] digest;
+ synchronized (this) {
+ md5.reset();
+ digest = md5.digest(message);
+ }
+ // android-changed: Removed guava dependency
+ // Verify.verify(digest.length == 16);
+ // Verify.verify(numBloomHashes <= digest.length / 2);
+ verify(digest.length == 16);
+ verify(numBloomHashes <= digest.length / 2);
+
+ BitSet input = new BitSet(numBits);
+ for (int i = 0; i < numBloomHashes; i++) {
+ // Convert byte pairs to ints on [0, 65535].
+ // Anding with 0xFF converts signed byte to unsigned int.
+ int digestWord = (digest[i * 2] & 0xFF) * 256 + (digest[i * 2 + 1] & 0xFF);
+ int chosenBit = digestWord % numBits;
+ input.set(chosenBit, true);
+ }
+
+ return encodeBits(input);
+ }
+
+ /**
+ * Encodes an arbitrary bitstring into a RAPPOR report.
+ *
+ * @param bits A bitstring in which only the least significant numBits bits may be 1.
+ */
+ public byte[] encodeBits(byte[] bits) {
+ return encodeBits(BitSet.valueOf(bits));
+ }
+
+ /**
+ * Encodes an arbitrary bitstring into a RAPPOR report.
+ *
+ * @param bits A bitstring in which only the least significant numBits bits may be 1.
+ */
+ private byte[] encodeBits(BitSet bits) {
+ BitSet permanentRandomizedResponse = computePermanentRandomizedResponse(bits);
+ BitSet encodedBitSet = computeInstantaneousRandomizedResponse(permanentRandomizedResponse);
+
+ // BitSet.toByteArray only returns enough bytes to capture the most significant bit
+ // that is set. For example, a BitSet with no bits set could return a length-0 array.
+ // In contrast, we guarantee that our output is sized according to numBits.
+ byte[] encodedBytes = encodedBitSet.toByteArray();
+ byte[] output = new byte[(numBits + 7) / 8];
+ // android-changed: Removed guava dependency
+ // Verify.verify(encodedBytes.length <= output.length);
+ verify(encodedBytes.length <= output.length);
+ System.arraycopy(
+ encodedBytes, // src
+ 0, // srcPos
+ output, // dest
+ 0, // destPos
+ encodedBytes.length); // length
+ return output;
+ }
+
+ /**
+ * Returns the permanent randomized response for the given bits.
+ *
+ * <p>The response for a particular bits input is guaranteed to always be the same for any encoder
+ * constructed with the same parameters (including the encoderId and the userSecret).
+ */
+ private BitSet computePermanentRandomizedResponse(BitSet bits) {
+ // Ensures that the input only has bits set in the lowest
+ BitSet masked = new BitSet();
+ masked.or(bits);
+ masked.andNot(inputMask);
+ checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit.");
+
+ if (probabilityF == 0.0) {
+ return bits;
+ }
+
+ // Builds a personalization string that contains both the encoderId and input value (bits),
+ // and is no longer than HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES. The first byte
+ // of the personalization string is always HMAC_DRBG_TYPE_PRR, to avoid collisions with the
+ // cohort-generation personalization string.
+ byte[] personalizationString;
+ synchronized (this) {
+ int personalizationStringLength =
+ Math.min(HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES, 1 + sha256.getDigestLength());
+ personalizationString = new byte[personalizationStringLength];
+ personalizationString[0] = HMAC_DRBG_TYPE_PRR;
+ sha256.reset();
+ sha256.update(encoderIdBytes);
+ sha256.update(new byte[] {0});
+ sha256.update(bits.toByteArray());
+ byte[] digest = sha256.digest(personalizationString);
+ System.arraycopy(digest, 0, personalizationString, 1, personalizationString.length - 1);
+ }
+
+ HmacDrbg drbg = new HmacDrbg(userSecret, personalizationString);
+ byte[] pseudorandomStream = drbg.nextBytes(numBits);
+ // android-changed: Removed guava dependency
+ // Verify.verify(numBits <= pseudorandomStream.length);
+ verify(numBits <= pseudorandomStream.length);
+
+ int probabilityFTimes128 = (int) Math.round(probabilityF * 128);
+ BitSet result = new BitSet(numBits);
+ for (int i = 0; i < numBits; i++) {
+ // Grabs a single byte from the pseudorandom stream.
+ // Anding with 0xFF converts a signed byte to an unsigned integer.
+ int pseudorandomByte = pseudorandomStream[i] & 0xFF;
+
+ // Uses bits 1-7 to get a random number between 0 and 127.
+ int uniform0to127 = pseudorandomByte >> 1;
+ boolean shouldUseNoise = uniform0to127 < probabilityFTimes128;
+
+ if (shouldUseNoise) {
+ // Uses bit 0 as a flip of a fair coin.
+ result.set(i, (pseudorandomByte & 0x01) > 0);
+ } else {
+ result.set(i, bits.get(i));
+ }
+ }
+ return result;
+ }
+
+ /**
+ * Returns the instantaneous randomized response for the given bits.
+ *
+ * <p>The instantaneous response is NOT memoized -- it is sampled randomly on
+ * every invocation.
+ */
+ private BitSet computeInstantaneousRandomizedResponse(BitSet bits) {
+ // Ensures that the input only has bits set in the lowest
+ BitSet masked = new BitSet();
+ masked.or(bits);
+ masked.andNot(inputMask);
+ checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit.");
+
+ if (probabilityP == 0.0 && probabilityQ == 1.0) {
+ return bits;
+ }
+
+ BitSet response = new BitSet(numBits);
+ for (int i = 0; i < numBits; i++) {
+ boolean bit = bits.get(i);
+ double probability = bit ? probabilityQ : probabilityP;
+ boolean responseBit = random.nextFloat() < probability;
+ response.set(i, responseBit);
+ }
+ return response;
+ }
+
+ // BEGIN android-changed: Added guava methods
+ private static void checkArgument(boolean expression, Object errorMessage) {
+ if (!expression) {
+ throw new IllegalArgumentException(String.valueOf(errorMessage));
+ }
+ }
+
+ private static void verify(boolean expression) {
+ if (!expression) {
+ throw new java.lang.IllegalStateException();
+ }
+ }
+ // END android-changed
+}