diff options
Diffstat (limited to 'client/java/com/google/rappor/Encoder.java')
-rw-r--r-- | client/java/com/google/rappor/Encoder.java | 613 |
1 files changed, 613 insertions, 0 deletions
diff --git a/client/java/com/google/rappor/Encoder.java b/client/java/com/google/rappor/Encoder.java new file mode 100644 index 0000000..a8fb57c --- /dev/null +++ b/client/java/com/google/rappor/Encoder.java @@ -0,0 +1,613 @@ +package com.google.android.rappor; + +// BEGIN android-changed: Removed guava dependency +// import static com.google.common.base.Preconditions.checkArgument; +// +// import com.google.common.base.Verify; +// END android-changed + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.security.SecureRandom; +import java.util.BitSet; +// BEGIN android-changed +import java.util.Random; +// END android-changed + +// BEGIN android-changed: Remove javax +// import javax.annotation.Nullable; +// import javax.annotation.concurrent.GuardedBy; +// END android-changed + +/** + * Encodes reports using the RAPPOR differentially-private encoding algorithm. + * + * The RAPPOR algorithm is described at go/rappor and presented in detail at go/rappor-writeup. + * + * @author bonawitz@google.com Keith Bonawitz + */ +// TODO(bonawitz): Make encoder and interface and make this a final class implementing it. +// We can't just make this final now because there exist tests that need to Mock it. +public class Encoder { + /** + * A non-decreasing version number. + * + * <p>The version number should increase any time the Encoder has a user-visible functional change + * to any of encoding algorithms or the interpretation of the input parameters. + */ + public static final long VERSION = 3; + + /** + * Minimum length required for the user secret, in bytes. + */ + public static final int MIN_USER_SECRET_BYTES = HmacDrbg.ENTROPY_INPUT_SIZE_BYTES; + + /** + * Maximum number of bits in the RAPPOR-encoded report. + * + * Must be less than HmacDrbg.MAX_BYTES_TOTAL; + */ + public static final int MAX_BITS = 4096; + + /** + * Maximum number of Bloom filter hashes used for RAPPOR-encoded strings. + * + * <p>This is constrained in the current implementation by requiring 2 bytes from an MD5 value + * (which is 16 bytes long) for each Bloom filter hash. + */ + public static final int MAX_BLOOM_HASHES = 8; + + /** + * Maximum number of cohorts supported. + */ + public static final int MAX_COHORTS = 128; + + /** + * First (and only) byte of HMAC_DRBG personalization strings used to generate the cohort number. + */ + private static final byte HMAC_DRBG_TYPE_COHORT = 0x00; + + /** + * First byte of HMAC_DRBG personalization strings used to generate the PRR response. + */ + private static final byte HMAC_DRBG_TYPE_PRR = 0x01; + + /** + * A unique identifier for this Encoder, represented in UTF-8. + * + * <p>The (userSecret, encoderId) pair identify a the logical memoization table used for RAPPOR's + * Permanent Randomized Response stage. Therefore, for any userSecret, each Encoder must have a + * distinct identifier for Permanent Randomized Response to be effective. + * + * <p>In practice, "memoization" is achieved by generating deterministic pseudo-random bits using + * HMAC_DRBG. encoderIdBytes is used as part of the personalization string. + */ + private final byte[] encoderIdBytes; + + /** + * The RAPPOR "f" probability, on the range [0.0, 1.0]. + * + * <p>This it the probability of replacing a bit from the input with a uniform random bit when + * generating the permanent randomized response. + * + * <p>Setting probabilityF to 0 disables the PRR phase of RAPPOR. + */ + private final double probabilityF; + + /** + * The RAPPOR "p" probability, on the range [0.0, 1.0]. + * + * <p>This is the probability of emitting a '1' bit in the instantaneous randomized response, + * given that the corresponding bit in the permanent randomized response is '0'. + * + * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR. + */ + private final double probabilityP; + + /** + * The RAPPOR "1" probability, on the range [0.0, 1.0]. + * + * <p>This is the probability of emitting a 1 bit in the instantaneous randomized response, given + * that the corresponding bit in the permanent randomized response is 1. + * + * <p>Setting probabilityP to 0.0 and probabilityQ to 1.0 disables the IRR phase of RAPPOR. + */ + private final double probabilityQ; + + /** + * The number of bits in the RAPPOR-encoded report. + * + * <p>Must satisfy 1 <= numBits <= MAX_BITS. + * + * <ul> + * <li>For encodeOrdinal, requires 0 <= ordinal < numBits. + * <li>For encodeString, uses a numBits-wide Bloom filter. + * <li>For encodeBits, only the low-order numBits may be non-zero. + * </ul> + */ + private final int numBits; + + /** + * The number of hash functions used forming the Bloom filter encoding of a string. + * + * <p>Must satisfy 1 <= numBloomHashes <= MAX_BLOOM_HASHES. + */ + private final int numBloomHashes; + + /** + * The number of cohorts used for cohort assignment. + */ + private final int numCohorts; + + /** + * The cohort this user is assigned to for Bloom filter string encoding. + * + * <p>This value is stable for a given (userSecret, numCohorts) pair. Furthermore, if two + * encoders use the same userSecret but have different numCohorts values, the cohort assignment + * for the encoder with the smaller numCohorts value is a bitwise suffix of the cohort assignment + * for the encoder with the larger numCohorts value. It follows that, if you know maximum cohort + * assignment across a set of encoders, and you know the numCohorts setting for each encoder, then + * you can deduce the cohort assignment for each encoder by taking the bitwise-and of the max + * cohort value and (numCohorts-1), noting that numCohorts is required to be a positive power of + * 2. + */ + private final int cohort; + + /** + * A bitmask with 1 bits in all the positions where a RAPPOR-encoded report could have a 1 bit. + * + * <p>The bitmask has the lowest order numBits set to 1 and the rest 0. + */ + private final BitSet inputMask; + + /** + * SHA-256 utility class instance. + * + * <p>This object is stateful; access must be synchronized. The reset method must be + * called before each use. + */ +// BEGIN android-changed: Remove javax +// @GuardedBy("this") +// END android-changed + private final MessageDigest sha256; + + /** + * MD5 utility class instance. + * + * <p>This object is stateful; access must be synchronized. The reset method must be + * called before each use. + */ +// BEGIN android-changed: Remove javax +// @GuardedBy("this") +// END android-changed + private final MessageDigest md5; + + /** + * A SecureRandom instance, initialized with a cryptographically secure random seed. + */ +// BEGIN android-changed +// private final SecureRandom random; + private final Random random; +// BEGIN android-changed + + /** + * Entropy input for constructing HmacDrbg objects. + */ + private final byte[] userSecret; + + /** + * Constructs a new RAPPOR message encoder. + * + * @param userSecret Stable secret randomly selected for this user. UserSecret must be at least + * MIN_USER_SECRET_BYTES bytes of high-quality entropy. Changing the user secret clears the + * memoized cohort assignments and permanent randomized responses. Be aware that resetting + * these memoizations has significant privacy risks -- consult documentation at go/rappor for + * more details. + * @param encoderId Uniquely identifies this encoder. Used to differentiate momoized + * cohort assignments and permanent randomized responses. + * @param numBits The number of bits in the RAPPOR-encoded report. + * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0]. This will be + * quantized to the nearest 1/128. + * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0]. + * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0]. + * @param numCohorts Number of cohorts into which the user pool is randomly segmented. + * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a + * string. + */ + public Encoder(byte[] userSecret, String encoderId, int numBits, + double probabilityF, double probabilityP, double probabilityQ, + int numCohorts, int numBloomHashes) { + this( + null, // random + null, // md5, + null, // sha256, + userSecret, + encoderId, + numBits, + probabilityF, + probabilityP, + probabilityQ, + numCohorts, + numBloomHashes); + } + + /** + * Constructs a new RAPPOR message encoder, using constructor-style dependency injection. + * + * @param random A cryptographically secure random number generator, or null (in which case a + * new SecureRandom will be constructed). + * @param md5 A configured MD5 hash algorithm, or null (in which case a new MessageDigest will be + * constructed). Note: MessageDigest objects are stateful, and that state must not be + * modified while calls to the Encoder are active. + * @param sha256 A configured SHA-256 hash algorithm, or null (in which case a new MessageDigest + * will be constructed). Note: MessageDigest objects are stateful, and that state must not + * be modified while calls to the Encoder are active. + * @param userSecret Stable secret randomly selected for this user. UserSecret must be at least + * 32 bytes of high-quality entropy. Changing the user secret clears the memoized cohort + * assignments and permanent randomized responses. Be aware that resetting these memoizations + * has significant privacy risks -- consult documentation at go/rappor for more details. + * @param encoderId Uniquely identifies this encoder. Used to differentiate momoized + * cohort assignments and permanent randomized responses. + * @param numBits The number of bits in the RAPPOR-encoded report. + * @param probabilityF The RAPPOR "f" probability, on the range [0.0, 1.0]. This will be + * quantized to the nearest 1/128. + * @param probabilityP The RAPPOR "p" probability, on the range [0.0, 1.0]. + * @param probabilityQ The RAPPOR "1" probability, on the range [0.0, 1.0]. + * @param numCohorts Number of cohorts into which the user pool is randomly segmented. + * @param numBloomHashes The number of hash functions used forming the Bloom filter encoding of a + * string. + */ + public Encoder( +// BEGIN android-changed: Remove javax +// @Nullable SecureRandom random, +// @Nullable MessageDigest md5, +// @Nullable MessageDigest sha256, +// SecureRandom random, + Random random, + MessageDigest md5, + MessageDigest sha256, +// END android-changed + byte[] userSecret, + String encoderId, + int numBits, + double probabilityF, + double probabilityP, + double probabilityQ, + int numCohorts, + int numBloomHashes) { + if (md5 != null) { + this.md5 = md5; + } else { + try { + this.md5 = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException impossible) { + // This should never happen. Every implementation of the Java platform + // is required to support MD5. + throw new AssertionError(impossible); + } + } + this.md5.reset(); + + if (sha256 != null) { + this.sha256 = sha256; + } else { + try { + this.sha256 = MessageDigest.getInstance("SHA-256"); + } catch (NoSuchAlgorithmException impossible) { + // This should never happen. Every implementation of the Java platform + // is required to support 256. + throw new AssertionError(impossible); + } + } + this.sha256.reset(); + + this.encoderIdBytes = encoderId.getBytes(StandardCharsets.UTF_8); + + if (random != null) { + this.random = random; + } else { + this.random = new SecureRandom(); + } + + // android-changed: Removed guava dependency + // checkArgument( + // userSecret.length >= MIN_USER_SECRET_BYTES, + // "userSecret must be at least %s bytes of high-quality entropy.", + // MIN_USER_SECRET_BYTES); + checkArgument( + userSecret.length >= MIN_USER_SECRET_BYTES, + "userSecret must be at least " + MIN_USER_SECRET_BYTES + + " bytes of high-quality entropy."); + this.userSecret = userSecret; + + checkArgument( + probabilityF >= 0 && probabilityF <= 1, "probabilityF must be on range [0.0, 1.0]"); + this.probabilityF = Math.round(probabilityF * 128) / 128.0; + + checkArgument( + probabilityP >= 0 && probabilityP <= 1, "probabilityP must be on range [0.0, 1.0]"); + this.probabilityP = probabilityP; + + checkArgument( + probabilityQ >= 0 && probabilityQ <= 1, "probabilityQ must be on range [0.0, 1.0]"); + this.probabilityQ = probabilityQ; + + checkArgument( + numBits >= 1 && numBits <= MAX_BITS, "numBits must be on range [1, " + MAX_BITS + "]."); + this.numBits = numBits; + // Make a bitmask with the lowest-order numBits set to 1. + this.inputMask = new BitSet(numBits); + this.inputMask.set(0, numBits, true); + + checkArgument( + numBloomHashes >= 1 && numBloomHashes <= numBits, + "numBloomHashes must be on range [1, numBits)."); + this.numBloomHashes = numBloomHashes; + + checkArgument( + numCohorts >= 1 && numCohorts <= MAX_COHORTS, + "numCohorts must be on range [1, " + MAX_COHORTS + "]."); + + // Assuming numCohorts >= 1: + // + // If numCohorts is a power of 2, then numCohorts has one bit set and numCohorts - 1 has only + // the bits to the right of numCohorts's bit set. The bitwise-and of these is 0. + // + // If numCohorts is not a power of 2, then numCohorts has at least two bits set. It follows + // subtracting one from numCohorts keeps the most significant bit set to 1, and thus the + // bitwise-and has at least this non-zero bit. + boolean numCohortsIsPowerOfTwo = (numCohorts & (numCohorts - 1)) == 0; + checkArgument(numCohortsIsPowerOfTwo, "numCohorts must be a power of 2."); + this.numCohorts = numCohorts; + + // cohortMasterAssignment depends only on the userSecret. + HmacDrbg cohortDrbg = new HmacDrbg(userSecret, new byte[] {HMAC_DRBG_TYPE_COHORT}); + ByteBuffer cohortDrbgBytes = ByteBuffer.wrap(cohortDrbg.nextBytes(4)); + int cohortMasterAssignment = Math.abs(cohortDrbgBytes.getInt()) % MAX_COHORTS; + this.cohort = cohortMasterAssignment & (numCohorts - 1); + } + + public double getProbabilityF() { + return probabilityF; + } + + public double getProbabilityP() { + return probabilityP; + } + + public double getProbabilityQ() { + return probabilityQ; + } + + public int getNumBits() { + return numBits; + } + + public int getNumBloomHashes() { + return numBloomHashes; + } + + public int getNumCohorts() { + return numCohorts; + } + + public int getCohort() { + return cohort; + } + + /** + * Returns the Encoder ID as a UTF-8 formatted string. + */ + public String getEncoderId() { + return new String(encoderIdBytes, StandardCharsets.UTF_8); + } + + /** + * Encodes a boolean into a RAPPOR report. + * + * <p>The boolean is 0 or 1, then encoded using permanent and instantaneous randomized response. + * + * <p>In most cases, numBits should be 1 when using this method. + */ + public byte[] encodeBoolean(boolean bool) { + BitSet input = new BitSet(numBits); + input.set(0, bool); + return encodeBits(input); + } + + /** + * Encodes an ordinal into a RAPPOR report. + * + * <p>The ordinal is represented using a 1-hot binary representation, then encoded using permanent + * and instantaneous randomized response. + * + * @param ordinal A value on the range [0, numBits). + */ + public byte[] encodeOrdinal(int ordinal) { + checkArgument( + ordinal >= 0 && ordinal < numBits, "Ordinal value must be in range [0, numBits)."); + BitSet input = new BitSet(numBits); + input.set(ordinal, true); + return encodeBits(input); + } + + /** + * Encodes a string into a RAPPOR report. + * + * <p>The string is represented using a Bloom filter with numBloomHashes hash functions, then + * encoded using permanent and instantaneous randomized response. + * + * @param string An arbitrary string. + */ + public byte[] encodeString(String string) { + // Implements a Bloom filter by slicing a single MD5 hash into numBloomHashes bit indices. + byte[] stringInUtf8 = string.getBytes(StandardCharsets.UTF_8); + byte[] message = + ByteBuffer.allocate(4 + stringInUtf8.length) + .putInt(cohort) + .put(stringInUtf8) + .array(); + + byte[] digest; + synchronized (this) { + md5.reset(); + digest = md5.digest(message); + } + // android-changed: Removed guava dependency + // Verify.verify(digest.length == 16); + // Verify.verify(numBloomHashes <= digest.length / 2); + verify(digest.length == 16); + verify(numBloomHashes <= digest.length / 2); + + BitSet input = new BitSet(numBits); + for (int i = 0; i < numBloomHashes; i++) { + // Convert byte pairs to ints on [0, 65535]. + // Anding with 0xFF converts signed byte to unsigned int. + int digestWord = (digest[i * 2] & 0xFF) * 256 + (digest[i * 2 + 1] & 0xFF); + int chosenBit = digestWord % numBits; + input.set(chosenBit, true); + } + + return encodeBits(input); + } + + /** + * Encodes an arbitrary bitstring into a RAPPOR report. + * + * @param bits A bitstring in which only the least significant numBits bits may be 1. + */ + public byte[] encodeBits(byte[] bits) { + return encodeBits(BitSet.valueOf(bits)); + } + + /** + * Encodes an arbitrary bitstring into a RAPPOR report. + * + * @param bits A bitstring in which only the least significant numBits bits may be 1. + */ + private byte[] encodeBits(BitSet bits) { + BitSet permanentRandomizedResponse = computePermanentRandomizedResponse(bits); + BitSet encodedBitSet = computeInstantaneousRandomizedResponse(permanentRandomizedResponse); + + // BitSet.toByteArray only returns enough bytes to capture the most significant bit + // that is set. For example, a BitSet with no bits set could return a length-0 array. + // In contrast, we guarantee that our output is sized according to numBits. + byte[] encodedBytes = encodedBitSet.toByteArray(); + byte[] output = new byte[(numBits + 7) / 8]; + // android-changed: Removed guava dependency + // Verify.verify(encodedBytes.length <= output.length); + verify(encodedBytes.length <= output.length); + System.arraycopy( + encodedBytes, // src + 0, // srcPos + output, // dest + 0, // destPos + encodedBytes.length); // length + return output; + } + + /** + * Returns the permanent randomized response for the given bits. + * + * <p>The response for a particular bits input is guaranteed to always be the same for any encoder + * constructed with the same parameters (including the encoderId and the userSecret). + */ + private BitSet computePermanentRandomizedResponse(BitSet bits) { + // Ensures that the input only has bits set in the lowest + BitSet masked = new BitSet(); + masked.or(bits); + masked.andNot(inputMask); + checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit."); + + if (probabilityF == 0.0) { + return bits; + } + + // Builds a personalization string that contains both the encoderId and input value (bits), + // and is no longer than HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES. The first byte + // of the personalization string is always HMAC_DRBG_TYPE_PRR, to avoid collisions with the + // cohort-generation personalization string. + byte[] personalizationString; + synchronized (this) { + int personalizationStringLength = + Math.min(HmacDrbg.MAX_PERSONALIZATION_STRING_LENGTH_BYTES, 1 + sha256.getDigestLength()); + personalizationString = new byte[personalizationStringLength]; + personalizationString[0] = HMAC_DRBG_TYPE_PRR; + sha256.reset(); + sha256.update(encoderIdBytes); + sha256.update(new byte[] {0}); + sha256.update(bits.toByteArray()); + byte[] digest = sha256.digest(personalizationString); + System.arraycopy(digest, 0, personalizationString, 1, personalizationString.length - 1); + } + + HmacDrbg drbg = new HmacDrbg(userSecret, personalizationString); + byte[] pseudorandomStream = drbg.nextBytes(numBits); + // android-changed: Removed guava dependency + // Verify.verify(numBits <= pseudorandomStream.length); + verify(numBits <= pseudorandomStream.length); + + int probabilityFTimes128 = (int) Math.round(probabilityF * 128); + BitSet result = new BitSet(numBits); + for (int i = 0; i < numBits; i++) { + // Grabs a single byte from the pseudorandom stream. + // Anding with 0xFF converts a signed byte to an unsigned integer. + int pseudorandomByte = pseudorandomStream[i] & 0xFF; + + // Uses bits 1-7 to get a random number between 0 and 127. + int uniform0to127 = pseudorandomByte >> 1; + boolean shouldUseNoise = uniform0to127 < probabilityFTimes128; + + if (shouldUseNoise) { + // Uses bit 0 as a flip of a fair coin. + result.set(i, (pseudorandomByte & 0x01) > 0); + } else { + result.set(i, bits.get(i)); + } + } + return result; + } + + /** + * Returns the instantaneous randomized response for the given bits. + * + * <p>The instantaneous response is NOT memoized -- it is sampled randomly on + * every invocation. + */ + private BitSet computeInstantaneousRandomizedResponse(BitSet bits) { + // Ensures that the input only has bits set in the lowest + BitSet masked = new BitSet(); + masked.or(bits); + masked.andNot(inputMask); + checkArgument(masked.isEmpty(), "Input bits had bits set past Encoder's numBits limit."); + + if (probabilityP == 0.0 && probabilityQ == 1.0) { + return bits; + } + + BitSet response = new BitSet(numBits); + for (int i = 0; i < numBits; i++) { + boolean bit = bits.get(i); + double probability = bit ? probabilityQ : probabilityP; + boolean responseBit = random.nextFloat() < probability; + response.set(i, responseBit); + } + return response; + } + + // BEGIN android-changed: Added guava methods + private static void checkArgument(boolean expression, Object errorMessage) { + if (!expression) { + throw new IllegalArgumentException(String.valueOf(errorMessage)); + } + } + + private static void verify(boolean expression) { + if (!expression) { + throw new java.lang.IllegalStateException(); + } + } + // END android-changed +} |