aboutsummaryrefslogtreecommitdiff
path: root/client/python/rappor.py
diff options
context:
space:
mode:
Diffstat (limited to 'client/python/rappor.py')
-rwxr-xr-xclient/python/rappor.py334
1 files changed, 334 insertions, 0 deletions
diff --git a/client/python/rappor.py b/client/python/rappor.py
new file mode 100755
index 0000000..721dc3b
--- /dev/null
+++ b/client/python/rappor.py
@@ -0,0 +1,334 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""RAPPOR client library.
+
+Privacy is ensured without a third party by only sending RAPPOR'd data over the
+network (as opposed to raw client data).
+
+Note that we use MD5 for the Bloom filter hash function. (security not required)
+"""
+import csv
+import hashlib
+import hmac
+import json
+import struct
+import sys
+
+from random import SystemRandom
+
+class Error(Exception):
+ pass
+
+
+def log(msg, *args):
+ if args:
+ msg = msg % args
+ print >>sys.stderr, msg
+
+
+class Params(object):
+ """RAPPOR encoding parameters.
+
+ These affect privacy/anonymity. See the paper for details.
+ """
+ def __init__(self):
+ self.num_bloombits = 16 # Number of bloom filter bits (k)
+ self.num_hashes = 2 # Number of bloom filter hashes (h)
+ self.num_cohorts = 64 # Number of cohorts (m)
+ self.prob_p = 0.50 # Probability p
+ self.prob_q = 0.75 # Probability q
+ self.prob_f = 0.50 # Probability f
+
+ # For testing
+ def __eq__(self, other):
+ return self.__dict__ == other.__dict__
+
+ def __repr__(self):
+ return repr(self.__dict__)
+
+ def to_json(self):
+ """Convert this instance to JSON.
+
+ The names are be compatible with the apps/api server.
+ """
+ return json.dumps({
+ 'numBits': self.num_bloombits,
+ 'numHashes': self.num_hashes,
+ 'numCohorts': self.num_cohorts,
+ 'probPrr': self.prob_f,
+ 'probIrr0': self.prob_p,
+ 'probIrr1': self.prob_q,
+ })
+
+ # NOTE:
+ # - from_csv is currently used in sum_bits.py
+ # - to_csv is in rappor_sim.print_params
+ @staticmethod
+ def from_csv(f):
+ """Read the RAPPOR parameters from a CSV file.
+
+ Args:
+ f: file handle
+
+ Returns:
+ Params instance.
+
+ Raises:
+ rappor.Error: when the file is malformed.
+ """
+ c = csv.reader(f)
+ ok = False
+ p = Params()
+ for i, row in enumerate(c):
+
+ if i == 0:
+ if row != ['k', 'h', 'm', 'p', 'q', 'f']:
+ raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row)
+
+ elif i == 1:
+ try:
+ # NOTE: May raise exceptions
+ p.num_bloombits = int(row[0])
+ p.num_hashes = int(row[1])
+ p.num_cohorts = int(row[2])
+ p.prob_p = float(row[3])
+ p.prob_q = float(row[4])
+ p.prob_f = float(row[5])
+ except (ValueError, IndexError) as e:
+ raise Error('Row is malformed: %s' % e)
+ ok = True
+
+ else:
+ raise Error('Params file should only have two rows')
+
+ if not ok:
+ raise Error("Expected second row with params")
+
+ return p
+
+
+class _SecureRandom(object):
+ """Returns an integer where each bit has probability p of being 1."""
+
+ def __init__(self, prob_one, num_bits):
+ self.prob_one = prob_one
+ self.num_bits = num_bits
+
+ def __call__(self):
+ p = self.prob_one
+ rand = SystemRandom()
+ r = 0
+
+ for i in xrange(self.num_bits):
+ bit = rand.random() < p
+ r |= (bit << i) # using bool as int
+ return r
+
+
+class SecureIrrRand(object):
+ """Python's os.random()"""
+
+ def __init__(self, params):
+ """
+ Args:
+ params: rappor.Params
+ """
+ num_bits = params.num_bloombits
+ # IRR probabilities
+
+ self.p_gen = _SecureRandom(params.prob_p, num_bits)
+ self.q_gen = _SecureRandom(params.prob_q, num_bits)
+
+
+def to_big_endian(i):
+ """Convert an integer to a 4 byte big endian string. Used for hashing."""
+ # https://docs.python.org/2/library/struct.html
+ # - Big Endian (>) for consistent network byte order.
+ # - L means 4 bytes when using >
+ return struct.pack('>L', i)
+
+
+def get_bloom_bits(word, cohort, num_hashes, num_bloombits):
+ """Return an array of bits to set in the bloom filter.
+
+ In the real report, we bitwise-OR them together. In hash candidates, we put
+ them in separate entries in the "map" matrix.
+ """
+ value = to_big_endian(cohort) + word # Cohort is 4 byte prefix.
+ md5 = hashlib.md5(value)
+
+ digest = md5.digest()
+
+ # Each has is a byte, which means we could have up to 256 bit Bloom filters.
+ # There are 16 bytes in an MD5, in which case we can have up to 16 hash
+ # functions per Bloom filter.
+ if num_hashes > len(digest):
+ raise RuntimeError("Can't have more than %d hashes" % md5)
+
+ #log('hash_input %r', value)
+ #log('Cohort %d', cohort)
+ #log('MD5 %s', md5.hexdigest())
+
+ return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)]
+
+
+def get_prr_masks(secret, word, prob_f, num_bits):
+ h = hmac.new(secret, word, digestmod=hashlib.sha256)
+ #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest())
+
+ # Now go through each byte
+ digest_bytes = h.digest()
+ assert len(digest_bytes) == 32
+
+ # Use 32 bits. If we want 64 bits, it may be fine to generate another 32
+ # bytes by repeated HMAC. For arbitrary numbers of bytes it's probably
+ # better to use the HMAC-DRBG algorithm.
+ if num_bits > len(digest_bytes):
+ raise RuntimeError('%d bits is more than the max of %d', num_bits, len(d))
+
+ threshold128 = prob_f * 128
+
+ uniform = 0
+ f_mask = 0
+
+ for i in xrange(num_bits):
+ ch = digest_bytes[i]
+ byte = ord(ch)
+
+ u_bit = byte & 0x01 # 1 bit of entropy
+ uniform |= (u_bit << i) # maybe set bit in mask
+
+ rand128 = byte >> 1 # 7 bits of entropy
+ noise_bit = (rand128 < threshold128)
+ f_mask |= (noise_bit << i) # maybe set bit in mask
+
+ return uniform, f_mask
+
+
+def bit_string(irr, num_bloombits):
+ """Like bin(), but uses leading zeroes, and no '0b'."""
+ s = ''
+ bits = []
+ for bit_num in xrange(num_bloombits):
+ if irr & (1 << bit_num):
+ bits.append('1')
+ else:
+ bits.append('0')
+ return ''.join(reversed(bits))
+
+
+class Encoder(object):
+ """Obfuscates values for a given user using the RAPPOR privacy algorithm."""
+
+ def __init__(self, params, cohort, secret, irr_rand):
+ """
+ Args:
+ params: RAPPOR Params() controlling privacy
+ cohort: integer cohort, for Bloom hashing.
+ secret: secret string, for the PRR to be a deterministic function of the
+ reported value.
+ irr_rand: IRR randomness interface.
+ """
+ # RAPPOR params. NOTE: num_cohorts isn't used. p and q are used by
+ # irr_rand.
+ self.params = params
+ self.cohort = cohort # associated: MD5
+ self.secret = secret # associated: HMAC-SHA256
+ self.irr_rand = irr_rand # p and q used
+
+ def _internal_encode_bits(self, bits):
+ """Helper function for simulation / testing.
+
+ Returns:
+ The PRR and IRR. The PRR should never be sent over the network.
+ """
+ # Compute Permanent Randomized Response (PRR).
+ uniform, f_mask = get_prr_masks(
+ self.secret, to_big_endian(bits), self.params.prob_f,
+ self.params.num_bloombits)
+
+ # Suppose bit i of the Bloom filter is B_i. Then bit i of the PRR is
+ # defined as:
+ #
+ # 1 with prob f/2
+ # 0 with prob f/2
+ # B_i with prob 1-f
+
+ # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with
+ # probability f. So in the expression below:
+ #
+ # - Bits in (uniform & f_mask) are 1 with probability f/2.
+ # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability
+ # f, so we get B_i with probability 1-f.
+ # - The remaining bits are 0, with remaining probability f/2.
+
+ prr = (bits & ~f_mask) | (uniform & f_mask)
+
+ #log('U %s / F %s', bit_string(uniform, num_bits),
+ # bit_string(f_mask, num_bits))
+
+ #log('B %s / PRR %s', bit_string(bloom_bits, num_bits),
+ # bit_string(prr, num_bits))
+
+ # Compute Instantaneous Randomized Response (IRR).
+ # If PRR bit is 0, IRR bit is 1 with probability p.
+ # If PRR bit is 1, IRR bit is 1 with probability q.
+ p_bits = self.irr_rand.p_gen()
+ q_bits = self.irr_rand.q_gen()
+
+ irr = (p_bits & ~prr) | (q_bits & prr)
+
+ return prr, irr # IRR is the rappor
+
+ def _internal_encode(self, word):
+ """Helper function for simulation / testing.
+
+ Returns:
+ The Bloom filter bits, PRR, and IRR. The first two values should never
+ be sent over the network.
+ """
+ bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes,
+ self.params.num_bloombits)
+
+ bloom = 0
+ for bit_to_set in bloom_bits:
+ bloom |= (1 << bit_to_set)
+
+ prr, irr = self._internal_encode_bits(bloom)
+ return bloom, prr, irr
+
+ def encode_bits(self, bits):
+ """Encode a string with RAPPOR.
+
+ Args:
+ bits: An integer representing bits to encode.
+
+ Returns:
+ An integer that is the IRR (Instantaneous Randomized Response).
+ """
+ _, irr = self._internal_encode_bits(bits)
+ return irr
+
+ def encode(self, word):
+ """Encode a string with RAPPOR.
+
+ Args:
+ word: the string that should be privately transmitted.
+
+ Returns:
+ An integer that is the IRR (Instantaneous Randomized Response).
+ """
+ _, _, irr = self._internal_encode(word)
+ return irr