diff options
Diffstat (limited to 'client/python/rappor.py')
-rwxr-xr-x | client/python/rappor.py | 334 |
1 files changed, 334 insertions, 0 deletions
diff --git a/client/python/rappor.py b/client/python/rappor.py new file mode 100755 index 0000000..721dc3b --- /dev/null +++ b/client/python/rappor.py @@ -0,0 +1,334 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RAPPOR client library. + +Privacy is ensured without a third party by only sending RAPPOR'd data over the +network (as opposed to raw client data). + +Note that we use MD5 for the Bloom filter hash function. (security not required) +""" +import csv +import hashlib +import hmac +import json +import struct +import sys + +from random import SystemRandom + +class Error(Exception): + pass + + +def log(msg, *args): + if args: + msg = msg % args + print >>sys.stderr, msg + + +class Params(object): + """RAPPOR encoding parameters. + + These affect privacy/anonymity. See the paper for details. + """ + def __init__(self): + self.num_bloombits = 16 # Number of bloom filter bits (k) + self.num_hashes = 2 # Number of bloom filter hashes (h) + self.num_cohorts = 64 # Number of cohorts (m) + self.prob_p = 0.50 # Probability p + self.prob_q = 0.75 # Probability q + self.prob_f = 0.50 # Probability f + + # For testing + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return repr(self.__dict__) + + def to_json(self): + """Convert this instance to JSON. + + The names are be compatible with the apps/api server. + """ + return json.dumps({ + 'numBits': self.num_bloombits, + 'numHashes': self.num_hashes, + 'numCohorts': self.num_cohorts, + 'probPrr': self.prob_f, + 'probIrr0': self.prob_p, + 'probIrr1': self.prob_q, + }) + + # NOTE: + # - from_csv is currently used in sum_bits.py + # - to_csv is in rappor_sim.print_params + @staticmethod + def from_csv(f): + """Read the RAPPOR parameters from a CSV file. + + Args: + f: file handle + + Returns: + Params instance. + + Raises: + rappor.Error: when the file is malformed. + """ + c = csv.reader(f) + ok = False + p = Params() + for i, row in enumerate(c): + + if i == 0: + if row != ['k', 'h', 'm', 'p', 'q', 'f']: + raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row) + + elif i == 1: + try: + # NOTE: May raise exceptions + p.num_bloombits = int(row[0]) + p.num_hashes = int(row[1]) + p.num_cohorts = int(row[2]) + p.prob_p = float(row[3]) + p.prob_q = float(row[4]) + p.prob_f = float(row[5]) + except (ValueError, IndexError) as e: + raise Error('Row is malformed: %s' % e) + ok = True + + else: + raise Error('Params file should only have two rows') + + if not ok: + raise Error("Expected second row with params") + + return p + + +class _SecureRandom(object): + """Returns an integer where each bit has probability p of being 1.""" + + def __init__(self, prob_one, num_bits): + self.prob_one = prob_one + self.num_bits = num_bits + + def __call__(self): + p = self.prob_one + rand = SystemRandom() + r = 0 + + for i in xrange(self.num_bits): + bit = rand.random() < p + r |= (bit << i) # using bool as int + return r + + +class SecureIrrRand(object): + """Python's os.random()""" + + def __init__(self, params): + """ + Args: + params: rappor.Params + """ + num_bits = params.num_bloombits + # IRR probabilities + + self.p_gen = _SecureRandom(params.prob_p, num_bits) + self.q_gen = _SecureRandom(params.prob_q, num_bits) + + +def to_big_endian(i): + """Convert an integer to a 4 byte big endian string. Used for hashing.""" + # https://docs.python.org/2/library/struct.html + # - Big Endian (>) for consistent network byte order. + # - L means 4 bytes when using > + return struct.pack('>L', i) + + +def get_bloom_bits(word, cohort, num_hashes, num_bloombits): + """Return an array of bits to set in the bloom filter. + + In the real report, we bitwise-OR them together. In hash candidates, we put + them in separate entries in the "map" matrix. + """ + value = to_big_endian(cohort) + word # Cohort is 4 byte prefix. + md5 = hashlib.md5(value) + + digest = md5.digest() + + # Each has is a byte, which means we could have up to 256 bit Bloom filters. + # There are 16 bytes in an MD5, in which case we can have up to 16 hash + # functions per Bloom filter. + if num_hashes > len(digest): + raise RuntimeError("Can't have more than %d hashes" % md5) + + #log('hash_input %r', value) + #log('Cohort %d', cohort) + #log('MD5 %s', md5.hexdigest()) + + return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)] + + +def get_prr_masks(secret, word, prob_f, num_bits): + h = hmac.new(secret, word, digestmod=hashlib.sha256) + #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest()) + + # Now go through each byte + digest_bytes = h.digest() + assert len(digest_bytes) == 32 + + # Use 32 bits. If we want 64 bits, it may be fine to generate another 32 + # bytes by repeated HMAC. For arbitrary numbers of bytes it's probably + # better to use the HMAC-DRBG algorithm. + if num_bits > len(digest_bytes): + raise RuntimeError('%d bits is more than the max of %d', num_bits, len(d)) + + threshold128 = prob_f * 128 + + uniform = 0 + f_mask = 0 + + for i in xrange(num_bits): + ch = digest_bytes[i] + byte = ord(ch) + + u_bit = byte & 0x01 # 1 bit of entropy + uniform |= (u_bit << i) # maybe set bit in mask + + rand128 = byte >> 1 # 7 bits of entropy + noise_bit = (rand128 < threshold128) + f_mask |= (noise_bit << i) # maybe set bit in mask + + return uniform, f_mask + + +def bit_string(irr, num_bloombits): + """Like bin(), but uses leading zeroes, and no '0b'.""" + s = '' + bits = [] + for bit_num in xrange(num_bloombits): + if irr & (1 << bit_num): + bits.append('1') + else: + bits.append('0') + return ''.join(reversed(bits)) + + +class Encoder(object): + """Obfuscates values for a given user using the RAPPOR privacy algorithm.""" + + def __init__(self, params, cohort, secret, irr_rand): + """ + Args: + params: RAPPOR Params() controlling privacy + cohort: integer cohort, for Bloom hashing. + secret: secret string, for the PRR to be a deterministic function of the + reported value. + irr_rand: IRR randomness interface. + """ + # RAPPOR params. NOTE: num_cohorts isn't used. p and q are used by + # irr_rand. + self.params = params + self.cohort = cohort # associated: MD5 + self.secret = secret # associated: HMAC-SHA256 + self.irr_rand = irr_rand # p and q used + + def _internal_encode_bits(self, bits): + """Helper function for simulation / testing. + + Returns: + The PRR and IRR. The PRR should never be sent over the network. + """ + # Compute Permanent Randomized Response (PRR). + uniform, f_mask = get_prr_masks( + self.secret, to_big_endian(bits), self.params.prob_f, + self.params.num_bloombits) + + # Suppose bit i of the Bloom filter is B_i. Then bit i of the PRR is + # defined as: + # + # 1 with prob f/2 + # 0 with prob f/2 + # B_i with prob 1-f + + # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with + # probability f. So in the expression below: + # + # - Bits in (uniform & f_mask) are 1 with probability f/2. + # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability + # f, so we get B_i with probability 1-f. + # - The remaining bits are 0, with remaining probability f/2. + + prr = (bits & ~f_mask) | (uniform & f_mask) + + #log('U %s / F %s', bit_string(uniform, num_bits), + # bit_string(f_mask, num_bits)) + + #log('B %s / PRR %s', bit_string(bloom_bits, num_bits), + # bit_string(prr, num_bits)) + + # Compute Instantaneous Randomized Response (IRR). + # If PRR bit is 0, IRR bit is 1 with probability p. + # If PRR bit is 1, IRR bit is 1 with probability q. + p_bits = self.irr_rand.p_gen() + q_bits = self.irr_rand.q_gen() + + irr = (p_bits & ~prr) | (q_bits & prr) + + return prr, irr # IRR is the rappor + + def _internal_encode(self, word): + """Helper function for simulation / testing. + + Returns: + The Bloom filter bits, PRR, and IRR. The first two values should never + be sent over the network. + """ + bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes, + self.params.num_bloombits) + + bloom = 0 + for bit_to_set in bloom_bits: + bloom |= (1 << bit_to_set) + + prr, irr = self._internal_encode_bits(bloom) + return bloom, prr, irr + + def encode_bits(self, bits): + """Encode a string with RAPPOR. + + Args: + bits: An integer representing bits to encode. + + Returns: + An integer that is the IRR (Instantaneous Randomized Response). + """ + _, irr = self._internal_encode_bits(bits) + return irr + + def encode(self, word): + """Encode a string with RAPPOR. + + Args: + word: the string that should be privately transmitted. + + Returns: + An integer that is the IRR (Instantaneous Randomized Response). + """ + _, _, irr = self._internal_encode(word) + return irr |