author    Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-07-07 05:11:43 +0000
committer Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2023-07-07 05:11:43 +0000
commit    d538f39b5cfad89fa85155768a2f71baafb287c1 (patch)
tree      5d2b98a63d4f942bd3babd266cb81763bb59e1dc
parent    2bf7e5cc344c315958537816be0656816583acf1 (diff)
parent    0a1cff34c489372d3f515bf81efef2f9f5c02561 (diff)
download  rappor-android14-mainline-sdkext-release.tar.gz
Snap for 10453563 from 0a1cff34c489372d3f515bf81efef2f9f5c02561 to mainline-sdkext-release (tags: aml_sdk_341510000, aml_sdk_341410000, aml_sdk_341110080, aml_sdk_341110000, aml_sdk_341010000, aml_sdk_340912010; branch: android14-mainline-sdkext-release)
Change-Id: Ifcd55220d03faac3321768ce36fffaa89d8523db
-rw-r--r--  .gitignore  5
-rw-r--r--  METADATA  12
-rw-r--r--  README.md  118
-rw-r--r--  README.version  4
-rwxr-xr-x  analysis/R/alternative.R  83
-rwxr-xr-x  analysis/R/association.R  491
-rwxr-xr-x  analysis/R/association_test.R  311
-rwxr-xr-x  analysis/R/decode.R  513
-rwxr-xr-x  analysis/R/decode_ngrams.R  377
-rwxr-xr-x  analysis/R/decode_test.R  354
-rwxr-xr-x  analysis/R/encode.R  128
-rwxr-xr-x  analysis/R/fast_em.R  137
-rwxr-xr-x  analysis/R/ngrams_simulation.R  271
-rwxr-xr-x  analysis/R/read_input.R  154
-rwxr-xr-x  analysis/R/run_tests.R  48
-rwxr-xr-x  analysis/R/simulation.R  268
-rwxr-xr-x  analysis/R/unknowns_test.R  139
-rwxr-xr-x  analysis/R/util.R  18
-rw-r--r--  analysis/cpp/README.md  12
-rw-r--r--  analysis/cpp/fast_em.cc  309
-rw-r--r--  analysis/cpp/find_cliques.cc  546
-rwxr-xr-x  analysis/cpp/run.sh  77
-rw-r--r--  analysis/cpp/testdata/graph1.txt  23
-rw-r--r--  analysis/cpp/testdata/graph3.txt  7
-rw-r--r--  analysis/tensorflow/README.md  10
-rwxr-xr-x  analysis/tensorflow/fast_em.py  180
-rwxr-xr-x  analysis/tensorflow/fast_em.sh  22
-rw-r--r--  apps/README.md  58
-rw-r--r--  apps/rappor-analysis/counts.csv  8
-rw-r--r--  apps/rappor-analysis/map.csv  300
-rw-r--r--  apps/rappor-analysis/params.csv  2
-rwxr-xr-x  apps/rappor-analysis/run_app.sh  12
-rwxr-xr-x  apps/rappor-analysis/server.R  206
-rw-r--r--  apps/rappor-analysis/test.csv  20
-rwxr-xr-x  apps/rappor-analysis/ui.R  96
-rwxr-xr-x  apps/rappor-sim/run_app.sh  15
-rwxr-xr-x  apps/rappor-sim/server.R  156
-rwxr-xr-x  apps/rappor-sim/ui.R  92
-rw-r--r--  bin/README.md  51
-rw-r--r--  bin/decode-assoc  19
-rw-r--r--  bin/decode-dist  19
-rwxr-xr-x  bin/decode_assoc.R  429
-rwxr-xr-x  bin/decode_dist.R  144
-rw-r--r--  bin/hash-candidates  7
-rwxr-xr-x  bin/hash_candidates.py  64
-rwxr-xr-x  bin/hash_candidates_test.py  59
-rw-r--r--  bin/sum-bits  7
-rwxr-xr-x  bin/sum_bits.py  86
-rwxr-xr-x  bin/sum_bits_test.py  70
-rwxr-xr-x  bin/test.sh  261
-rwxr-xr-x  build.sh  120
-rw-r--r--  client/cpp/Makefile  158
-rw-r--r--  client/cpp/README.md  129
-rwxr-xr-x  client/cpp/dotd.sh  35
-rw-r--r--  client/cpp/encoder.cc  416
-rw-r--r--  client/cpp/encoder.h  130
-rw-r--r--  client/cpp/encoder_demo.cc  56
-rw-r--r--  client/cpp/encoder_unittest.cc  289
-rw-r--r--  client/cpp/libc_rand_impl.cc  44
-rw-r--r--  client/cpp/libc_rand_impl.h  36
-rw-r--r--  client/cpp/openssl_hash_impl.cc  119
-rw-r--r--  client/cpp/openssl_hash_impl.h  33
-rw-r--r--  client/cpp/openssl_hash_impl_unittest.cc  145
-rw-r--r--  client/cpp/rappor_deps.h  75
-rw-r--r--  client/cpp/rappor_sim.cc  229
-rwxr-xr-x  client/cpp/run.sh  99
-rw-r--r--  client/cpp/unix_kernel_rand_impl.cc  40
-rw-r--r--  client/cpp/unix_kernel_rand_impl.h  43
-rw-r--r--  client/java/com/google/rappor/Encoder.java (renamed from client/java/com/google/android/rappor/Encoder.java)  0
-rw-r--r--  client/java/com/google/rappor/HmacDrbg.java (renamed from client/java/com/google/android/rappor/HmacDrbg.java)  0
-rw-r--r--  client/javatest/com/google/rappor/EncoderTest.java (renamed from client/javatest/com/google/android/rappor/EncoderTest.java)  0
-rw-r--r--  client/javatest/com/google/rappor/HmacDrbgTest.java (renamed from client/javatest/com/google/android/rappor/HmacDrbgTest.java)  0
-rwxr-xr-x  client/python/rappor.py  334
-rwxr-xr-x  client/python/rappor_test.py  124
-rwxr-xr-x  demo.sh  92
-rw-r--r--  doc/data-flow.dot  83
-rw-r--r--  doc/data-flow.md  239
-rw-r--r--  doc/randomness.md  38
-rwxr-xr-x  docs.sh  28
-rw-r--r--  gh-pages/doc/data-flow.html  252
-rw-r--r--  gh-pages/doc/data-flow.png  bin 0 -> 73365 bytes
-rw-r--r--  gh-pages/doc/randomness.html  49
-rw-r--r--  gh-pages/examples/exp_report/dist.png  bin 0 -> 16957 bytes
-rw-r--r--  gh-pages/examples/gauss_report/dist.png  bin 0 -> 16709 bytes
-rw-r--r--  gh-pages/examples/report.html  75
-rw-r--r--  gh-pages/examples/unif_report/dist.png  bin 0 -> 15481 bytes
-rw-r--r--  gh-pages/index.html  14
-rw-r--r--  pipeline/README.md  52
-rwxr-xr-x  pipeline/alarm-lib.sh  124
-rwxr-xr-x  pipeline/assoc.sh  152
-rwxr-xr-x  pipeline/combine_results.py  138
-rwxr-xr-x  pipeline/combine_results_test.py  38
-rwxr-xr-x  pipeline/combine_status.py  298
-rwxr-xr-x  pipeline/combine_status_test.py  38
-rwxr-xr-x  pipeline/cook.sh  147
-rwxr-xr-x  pipeline/csv-to-html-test.sh  63
-rwxr-xr-x  pipeline/csv_to_html.py  218
-rwxr-xr-x  pipeline/csv_to_html_test.py  24
-rwxr-xr-x  pipeline/dist.sh  135
-rwxr-xr-x  pipeline/metric_status.R  343
-rwxr-xr-x  pipeline/regtest.sh  161
-rwxr-xr-x  pipeline/task_spec.py  364
-rwxr-xr-x  pipeline/task_spec_test.py  61
-rwxr-xr-x  pipeline/tools-lib.sh  64
-rwxr-xr-x  pipeline/ui.sh  322
-rwxr-xr-x  pipeline/util.py  9
-rwxr-xr-x  regtest.sh  440
-rwxr-xr-x  setup.sh  90
-rwxr-xr-x  test.sh  175
-rw-r--r--  tests/_fastrand.c  101
-rwxr-xr-x  tests/analyze_assoc.R  126
-rwxr-xr-x  tests/assoc_sim.R  172
-rwxr-xr-x  tests/compare_dist.R  264
-rwxr-xr-x  tests/compare_dist_test.R  43
-rwxr-xr-x  tests/fastrand.py  35
-rwxr-xr-x  tests/fastrand_test.py  65
-rwxr-xr-x  tests/gen_counts.R  213
-rwxr-xr-x  tests/gen_counts_test.R  109
-rwxr-xr-x  tests/gen_true_values.R  82
-rwxr-xr-x  tests/gen_true_values_test.R  50
-rwxr-xr-x  tests/make_summary.py  401
-rw-r--r--  tests/params.csv  2
-rwxr-xr-x  tests/rappor_sim.py  242
-rwxr-xr-x  tests/rappor_sim_test.py  33
-rw-r--r--  tests/regtest.html  118
-rwxr-xr-x  tests/regtest_spec.py  113
-rwxr-xr-x  tests/setup.py  26
-rwxr-xr-x  tests/user_spec.py  116
-rw-r--r--  tests/uvals.csv  2
-rw-r--r--  third_party/dygraph-combined.js  6
-rw-r--r--  ui/README.md  6
-rw-r--r--  ui/assoc-day.html  44
-rw-r--r--  ui/assoc-metric.html  45
-rw-r--r--  ui/assoc-overview.html  43
-rw-r--r--  ui/assoc-pair.html  47
-rw-r--r--  ui/day.html  49
-rw-r--r--  ui/histograms.html  48
-rw-r--r--  ui/home.html  16
-rw-r--r--  ui/metric.html  83
-rw-r--r--  ui/overview.html  59
-rw-r--r--  ui/table-lib.js  482
-rw-r--r--  ui/table-sort.css  39
-rw-r--r--  ui/ui.css  53
-rw-r--r--  ui/ui.js  363
-rwxr-xr-x  util.sh  19
145 files changed, 17489 insertions, 4 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9187b3e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.pyc
+*.swp
+_tmp
+tests/_fastrand.so
+tests/build/
diff --git a/METADATA b/METADATA
index d97975c..7b2c091 100644
--- a/METADATA
+++ b/METADATA
@@ -1,3 +1,15 @@
+name: "rappor"
+description: "RAPPOR is a novel privacy technology that allows inferring statistics about populations while preserving the privacy of individual users."
third_party {
+ url {
+ type: ARCHIVE
+ value: "https://github.com/google/rappor/archive/a13fa964edb7c576366c83f40ff58d7c8c1db759.zip"
+ }
+ version: "a13fa964edb7c576366c83f40ff58d7c8c1db759"
license_type: NOTICE
+ last_upgrade_date {
+ year: 2022
+ month: 9
+ day: 19
+ }
}
diff --git a/README.md b/README.md
index 8527287..ccef029 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,124 @@ For a detailed description of the algorithms, see the
Feel free to send feedback to
[rappor-discuss@googlegroups.com][group].
+Running the Demo
+----------------
+
+Although the Python and R libraries should be portable to any platform, our
+end-to-end demo has only been tested on Linux.
+
+If you don't have a Linux box handy, you can [view the generated
+output](http://google.github.io/rappor/examples/report.html).
+
+To set up your environment, install the required packages and R dependencies
+with the setup script:
+
+    $ ./setup.sh
+
+Then build the native components:
+
+    $ ./build.sh
+
+This compiles and tests the `fastrand` C extension module for Python, which
+speeds up the simulation.
+
+Finally, run the demo:
+
+    $ ./demo.sh
+
+The demo strings together the Python and R code. It:
+
+1. Generates simulated input data with different distributions
+2. Runs it through the RAPPOR privacy-preserving reporting mechanisms
+3. Analyzes and plots the aggregated reports against the true input
+
+The output is written to `_tmp/regtest/results.html`, and can be opened with a
+browser.
+
+Dependencies
+------------
+
+[R](http://r-project.org) analysis (`analysis/R`):
+
+- [glmnet](http://cran.r-project.org/web/packages/glmnet/index.html)
+- [limSolve](https://cran.r-project.org/web/packages/limSolve/index.html)
+
+Demo dependencies (`demo.sh`):
+
+These are necessary if you want to test changes to the code.
+
+- R libraries
+ - [ggplot2](http://cran.r-project.org/web/packages/ggplot2/index.html)
+ - [optparse](http://cran.r-project.org/web/packages/optparse/index.html)
+- bash shell / coreutils: to run tests
+
+Python client (`client/python`):
+
+- None. You should be able to just import the `rappor.py` file.
+
+Platform:
+
+- R: tested on R 3.0.
+- Python: tested on Python 2.7.
+- OS: the shell script tests have been tested on Linux, but may work on
+ Mac/Cygwin. The R and Python code should work on any OS.
+
+Development
+-----------
+
+To run tests:
+
+ $ ./test.sh
+
+This currently runs Python unit tests, lints Python source files, and runs R
+unit tests.
+
+API
+---
+
+`rappor.py` is a tiny standalone Python file, and you can easily copy it into a
+Python program.
+
+NOTE: Its interface is subject to change. We are in the demo stage now, but if
+there's demand, we will document and publish the interface.
+
+The R interface is also subject to change.
+
+<!-- TODO: Add links to interface docs when available. -->
+
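+Since the R interface is in flux, the following is only a rough sketch of the
+analysis flow. The function names come from `analysis/R/`; treat the exact
+signatures as illustrative, not stable:
+
+    source('analysis/R/encode.R')
+    source('analysis/R/decode.R')
+    source('analysis/R/simulation.R')
+
+    params <- list(k = 16, h = 2, m = 4, p = 0.25, q = 0.75, f = 0.5)
+    map <- CreateMap(1:4, params, FALSE)  # 4 candidate values, as in the tests
+    cohorts <- sample(1:params$m, 1000, replace = TRUE)
+    values <- sample(1:4, 1000, replace = TRUE)
+    reports <- EncodeAll(values, cohorts, map$map_by_cohort, params)
+    counts <- ComputeCounts(reports, cohorts, params)
+    fit <- Decode(counts, map$all_cohorts_map, params)$fit
+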
+The `fastrand` C module is optional. It's likely only useful for simulation of
+thousands of clients. It doesn't use cryptographically strong randomness, and
+thus should **not** be used in production.
+
+Directory Structure
+-------------------
+
+ analysis/
+ R/ # R code for analysis
+ cpp/ # Fast reimplementations of certain analysis
+ # algorithms
+ apps/ # Web apps to help you use RAPPOR (using Shiny)
+ bin/ # Command line tools for analysis.
+ client/ # Client libraries
+ python/ # Python client library
+ rappor.py
+ ...
+ cpp/ # C++ client library
+ encoder.cc
+ ...
+ doc/ # Documentation
+ tests/ # Tools for regression tests
+ compare_dist.R # Test helper for single variable analysis
+ gen_true_values.R # Generate test input
+ make_summary.py # Generate an HTML report for the regtest
+ rappor_sim.py # RAPPOR client simulation
+ regtest_spec.py # Specification of test cases
+ ...
+ build.sh # Build scripts (docs, C extension, etc.)
+ demo.sh # Quick demonstration
+ docs.sh # Generate docs from the markdown in doc/
+ gh-pages/ # Where generated docs go. (A subtree of the branch gh-pages)
+ pipeline/ # Analysis pipeline code.
+ regtest.sh # End-to-end regression tests, including client
+ # libraries and analysis
+ setup.sh # Install dependencies (for Linux)
+ test.sh # Test runner
+
+Documentation
-------------
- [RAPPOR Data Flow](http://google.github.io/rappor/doc/data-flow.html)
diff --git a/README.version b/README.version
deleted file mode 100644
index 19aa180..0000000
--- a/README.version
+++ /dev/null
@@ -1,4 +0,0 @@
-URL: https://github.com/google/rappor
-Version: a13fa964edb7c576366c83f40ff58d7c8c1db759
-BugComponent: 315013
-Owners: rickywai, pvisontay, simonjw
diff --git a/analysis/R/alternative.R b/analysis/R/alternative.R
new file mode 100755
index 0000000..3f0e66d
--- /dev/null
+++ b/analysis/R/alternative.R
@@ -0,0 +1,83 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library(limSolve)
+library(Matrix)
+
+# The next two functions create a matrix (G) and a vector (H) encoding
+# linear inequality constraints that a solution vector (x) must satisfy:
+# G * x >= H
+
+# They currently represent three sets of constraints on the solution vector:
+# - all solution coefficients are nonnegative
+# - the sum total of all solution coefficients is no more than 1
+# - in each of the coordinates of the target vector (estimated Bloom filter)
+# we don't overshoot by more than three standard deviations.
+MakeG <- function(n, X) {
+ d <- Diagonal(n)
+ last <- rep(-1, n)
+ rbind2(rbind2(d, last), -X)
+}
+
+MakeH <- function(n, Y, stds) {
+ # set the floor at 0.01 to avoid degenerate cases
+ YY <- apply(Y + 3 * stds, # in each bin don't overshoot by more than 3 stds
+ 1:2,
+ function(x) min(1, max(0.01, x))) # clamp the bound to [0.01,1]
+
+ c(rep(0, n), # non-negativity condition
+ -1, # coefficients sum up to no more than 1
+ -as.vector(t(YY)) # t is important!
+ )
+}
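+
+# Illustrative sketch (not part of the original code): with n = 2 candidates
+# and a single cohort of k = 2 bits, where each candidate sets one bit,
+#   X <- Matrix(c(1, 0, 0, 1), nrow = 2)
+#   G <- MakeG(2, X)  # 5 x 2: I_2 rows (x >= 0), a row of -1s (sum <= 1), -X
+#   H <- MakeH(2, matrix(c(0.6, 0.5), nrow = 1), matrix(c(0.1, 0.1), nrow = 1))
+# G %*% x >= H then says x1, x2 >= 0, x1 + x2 <= 1, and X %*% x stays within
+# three standard deviations above the estimated Bloom filter bits.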
+
+MakeLseiModel <- function(X, Y, stds) {
+ m <- dim(X)[1]
+ n <- dim(X)[2]
+
+# no slack variables for now
+# slack <- Matrix(FALSE, nrow = m, ncol = m, sparse = TRUE)
+# colnames(slack) <- 1:m
+# diag(slack) <- TRUE
+#
+# G <- MakeG(n + m)
+# H <- MakeH(n + m)
+#
+# G[n+m+1,n:(n+m)] <- -0.1
+# A = cbind2(X, slack)
+
+ w <- as.vector(t(1 / stds))
+ w_median <- median(w[!is.infinite(w)])
+ if(is.na(w_median)) # all w are infinite
+ w_median <- 1
+ w[w > w_median * 2] <- w_median * 2
+ w <- w / mean(w)
+
+ list(# coerce sparse Boolean matrix X to sparse numeric matrix
+ A = Diagonal(x = w) %*% (X + 0),
+ B = as.vector(t(Y)) * w, # transform to vector in the row-first order
+ G = MakeG(n, X),
+ H = MakeH(n, Y, stds),
+ type = 2) # Since there are no equality constraints, lsei defaults to
+ # solve.QP anyway, but outputs a warning unless type == 2.
+}
+
+# CustomLM(X, Y)
+ConstrainedLinModel <- function(X,Y) {
+ model <- MakeLseiModel(X, Y$estimates, Y$stds)
+ coefs <- do.call(lsei, model)$X
+ names(coefs) <- colnames(X)
+
+ coefs
+}
\ No newline at end of file
diff --git a/analysis/R/association.R b/analysis/R/association.R
new file mode 100755
index 0000000..d1c7b5e
--- /dev/null
+++ b/analysis/R/association.R
@@ -0,0 +1,491 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library(parallel) # mclapply
+
+source.rappor <- function(rel_path) {
+ abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+ source(abs_path)
+}
+
+source.rappor("analysis/R/util.R") # for Log
+source.rappor("analysis/R/decode.R") # for ComputeCounts
+
+#
+# Tools used to estimate variable distributions of up to three variables
+# in RAPPOR. This contains the functions relevant to estimating joint
+# distributions.
+
+GetOtherProbs <- function(counts, map_by_cohort, marginal, params, pstar,
+ qstar) {
+ # Computes the marginal for the "other" category.
+ #
+ # Args:
+ # counts: m x (k+1) matrix with counts of each bit for each
+ # cohort (m=#cohorts total, k=# bits in bloom filter), first column
+ # stores the total counts
+  #   map_by_cohort: list of matrices encoding locations of hashes for each
+  #       candidate string (the "other" category is not included)
+  #   marginal: object containing the estimated frequencies of known strings
+  #       as well as the strings themselves, variance, etc.
+  #   params: RAPPOR encoding parameters
+  #   pstar, qstar: probabilities that a true 0 (resp. 1) bit is reported
+  #       as 1, derived from the RAPPOR parameters
+ # Returns:
+ # List of vectors of probabilities that each bit was set by the "other"
+ # category. The list is indexed by cohort.
+
+ N <- sum(counts[, 1])
+
+ # Counts of known strings to remove from each cohort.
+ known_counts <- ceiling(marginal$proportion * N / params$m)
+ sum_known <- sum(known_counts)
+
+ # Select only the strings we care about from each cohort.
+ # NOTE: drop = FALSE necessary if there is one candidate
+ candidate_map <- lapply(map_by_cohort, function(map_for_cohort) {
+ map_for_cohort[, marginal$string, drop = FALSE]
+ })
+
+ # If no strings were found, all nonzero counts were set by "other"
+ if (length(marginal) == 0) {
+ probs_other <- apply(counts, 1, function(cohort_row) {
+ cohort_row[-1] / cohort_row[1]
+ })
+ return(as.list(as.data.frame(probs_other)))
+ }
+
+ # Counts set by known strings without noise considerations.
+ known_counts_by_cohort <- sapply(candidate_map, function(map_for_cohort) {
+ as.vector(as.matrix(map_for_cohort) %*% known_counts)
+ })
+
+ # Protect against R's matrix/vector confusion. This ensures
+ # known_counts_by_cohort is a matrix in the k=1 case.
+ dim(known_counts_by_cohort) <- c(params$m, params$k)
+
+  # Counts set by known values: zero bits are reported as 1 with probability
+  # pstar, true (set) bits with probability qstar.
+ known_counts_by_cohort <- (sum_known - known_counts_by_cohort) * pstar +
+ known_counts_by_cohort * qstar
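+  # Worked example (sketch): if sum_known = 100 known-string reports land in
+  # a cohort and 40 of them truly set a given bit, with pstar = 0.375 and
+  # qstar = 0.625 the expected observed count is 60 * 0.375 + 40 * 0.625 = 47.5.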
+
+ # Add the left hand sums to make it a m x (k+1) "counts" matrix
+ known_counts_by_cohort <- cbind(sum_known, known_counts_by_cohort)
+
+ # Counts set by the "other" category.
+ reduced_counts <- counts - known_counts_by_cohort
+ reduced_counts[reduced_counts < 0] <- 0
+ probs_other <- apply(reduced_counts, 1, function(cohort_row) {
+ cohort_row[-1] / cohort_row[1]
+ })
+
+ # Protect against R's matrix/vector confusion.
+ dim(probs_other) <- c(params$k, params$m)
+
+ probs_other[probs_other > 1] <- 1
+ probs_other[is.nan(probs_other)] <- 0
+ probs_other[is.infinite(probs_other)] <- 0
+
+ # Convert it from a k x m matrix to a list indexed by m cohorts.
+ # as.data.frame makes each cohort a column, which can be indexed by
+ # probs_other[[cohort]].
+ result <- as.list(as.data.frame(probs_other))
+
+ result
+}
+
+GetCondProbBooleanReports <- function(reports, pstar, qstar, num_cores) {
+ # Compute conditional probabilities given a set of Boolean reports.
+ #
+ # Args:
+ # reports: RAPPOR reports as a list of bit arrays (of length 1, because
+ # this is a boolean report)
+  #   pstar, qstar: standard params computed from the RAPPOR parameters
+ # num_cores: number of cores to pass to mclapply to parallelize apply
+ #
+ # Returns:
+ # Conditional probability of all boolean reports corresponding to
+ # candidates (TRUE, FALSE)
+
+ # The values below are p(report=1|value=TRUE), p(report=1|value=FALSE)
+ cond_probs_for_1 <- c(qstar, pstar)
+ # The values below are p(report=0|value=TRUE), p(report=0|value=FALSE)
+ cond_probs_for_0 <- c(1 - qstar, 1 - pstar)
+
+ cond_report_dist <- mclapply(reports, function(report) {
+ if (report[[1]] == 1) {
+ cond_probs_for_1
+ } else {
+ cond_probs_for_0
+ }
+ }, mc.cores = num_cores)
+ cond_report_dist
+}
+
+GetCondProbStringReports <- function(reports, cohorts, map, m, pstar, qstar,
+ marginal, prob_other = NULL, num_cores) {
+ # Wrapper around GetCondProb. Given a set of reports, cohorts, map and
+ # parameters m, p*, and q*, it first computes bit indices by cohort, and
+ # then applies GetCondProb individually to each report.
+ #
+ # Args:
+ # reports: RAPPOR reports as a list of bit arrays
+ # cohorts: cohorts corresponding to these reports as a list
+ # map: map file
+  #   m, pstar, qstar: standard params computed from the RAPPOR parameters
+ # marginal: list containing marginal estimates (output of Decode)
+ # prob_other: vector of length k, indicating how often each bit in the
+ # Bloom filter was set by a string in the "other" category.
+ #
+ # Returns:
+ # Conditional probability of all reports given each of the strings in
+ # marginal$string
+
+ # Get bit indices that are set per candidate per cohort
+ bit_indices_by_cohort <- lapply(1:m, function(cohort) {
+ map_for_cohort <- map$map_by_cohort[[cohort]]
+ # Find the bits set by the candidate strings
+ bit_indices <- lapply(marginal$string, function(x) {
+ which(map_for_cohort[, x])
+ })
+ bit_indices
+ })
+
+ # Apply GetCondProb over all reports
+ cond_report_dist <- mclapply(seq(length(reports)), function(i) {
+ cohort <- cohorts[i]
+ #Log('Report %d, cohort %d', i, cohort)
+ bit_indices <- bit_indices_by_cohort[[cohort]]
+ GetCondProb(reports[[i]], pstar, qstar, bit_indices,
+ prob_other = prob_other[[cohort]])
+ }, mc.cores = num_cores)
+ cond_report_dist
+}
+
+
+GetCondProb <- function(report, pstar, qstar, bit_indices, prob_other = NULL) {
+ # Given the observed bit array, estimate P(report | true value).
+ # Probabilities are estimated for all truth values.
+ #
+ # Args:
+ # report: A single observed RAPPOR report (binary vector of length k).
+  #   pstar, qstar: noise probabilities derived from the RAPPOR parameters.
+ # bit_indices: list with one entry for each candidate. Each entry is an
+ # integer vector of length h, specifying which bits are set for the
+ # candidate in the report's cohort.
+ # prob_other: vector of length k, indicating how often each bit in the
+ # Bloom filter was set by a string in the "other" category.
+ #
+ # Returns:
+  #   Conditional probability of the report given each candidate string (one
+  #   entry per element of bit_indices, plus one for "other" when prob_other
+  #   is supplied)
+ ones <- sum(report)
+ zeros <- length(report) - ones
+ probs <- ifelse(report == 1, pstar, 1 - pstar)
+
+ # Find the likelihood of report given each candidate string
+ prob_obs_vals <- sapply(bit_indices, function(x) {
+ prod(c(probs[-x], ifelse(report[x] == 1, qstar, 1 - qstar)))
+ })
+
+ # Account for the "other" category
+ if (!is.null(prob_other)) {
+ prob_other <- prod(c(prob_other[which(report == 1)],
+ (1 - prob_other)[which(report == 0)]))
+ c(prob_obs_vals, prob_other)
+ } else {
+ prob_obs_vals
+ }
+}
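+
+# Worked example (sketch): k = 4, report = c(1, 0, 0, 1), pstar = 0.375,
+# qstar = 0.625, and a candidate hashing to bit_indices = c(1, 4):
+#   probs = c(0.375, 0.625, 0.625, 0.375); bits 2 and 3 contribute
+#   0.625 * 0.625, and the candidate's own bits contribute qstar^2, so
+#   P(report | candidate) = 0.625^2 * 0.625^2, about 0.153.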
+
+UpdatePij <- function(pij, cond_prob) {
+ # Update the probability matrix based on the EM algorithm.
+ #
+ # Args:
+ # pij: conditional distribution of x (vector)
+ # cond_prob: conditional distribution computed previously
+ #
+ # Returns:
+ # Updated pijs from em algorithm (maximization)
+
+ # NOTE: Not using mclapply here because we have a faster C++ implementation.
+ # mclapply spawns multiple processes, and each process can take up 3 GB+ or 5
+ # GB+ of memory.
+ wcp <- lapply(cond_prob, function(x) {
+ z <- x * pij
+ z <- z / sum(z)
+ z[is.nan(z)] <- 0
+ z
+ })
+ Reduce("+", wcp) / length(wcp)
+}
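+
+# Sketch of one EM update: with prior pij = c(0.5, 0.5) and a report whose
+# conditional likelihoods are c(0.8, 0.2), its posterior is
+# c(0.4, 0.1) / 0.5 = c(0.8, 0.2); the new pij averages these posteriors
+# over all reports.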
+
+ComputeVar <- function(cond_prob, est) {
+ # Computes the variance of the estimated pij's.
+ #
+ # Args:
+ # cond_prob: conditional distribution computed previously
+ # est: estimated pij's
+ #
+ # Returns:
+ # Variance of the estimated pij's
+
+ inform <- Reduce("+", lapply(cond_prob, function(x) {
+ (outer(as.vector(x), as.vector(x))) / (sum(x * est))^2
+ }))
+ var_cov <- solve(inform)
+ sd <- matrix(sqrt(diag(var_cov)), dim(cond_prob[[1]]))
+ list(var_cov = var_cov, sd = sd, inform = inform)
+}
+
+EM <- function(cond_prob, starting_pij = NULL, estimate_var = FALSE,
+ max_em_iters = 1000, epsilon = 10^-6, verbose = FALSE) {
+ # Performs estimation.
+ #
+ # Args:
+ # cond_prob: conditional distribution computed previously
+ # starting_pij: estimated pij's
+ # estimate_var: flags whether we should estimate the variance
+ # of our computed distribution
+ # max_em_iters: maximum number of EM iterations
+ # epsilon: convergence parameter
+ # verbose: flags whether to display error data
+ #
+ # Returns:
+ # Estimated pij's, variance, error params
+
+ pij <- list()
+ state_space <- dim(cond_prob[[1]])
+ if (is.null(starting_pij)) {
+ pij[[1]] <- array(1 / prod(state_space), state_space)
+ } else {
+ pij[[1]] <- starting_pij
+ }
+
+ i <- 0 # visible outside loop
+ if (nrow(pij[[1]]) > 0) {
+ # Run EM
+ for (i in 1:max_em_iters) {
+ pij[[i + 1]] <- UpdatePij(pij[[i]], cond_prob)
+ dif <- max(abs(pij[[i + 1]] - pij[[i]]))
+ if (dif < epsilon) {
+ break
+ }
+ Log('EM iteration %d, dif = %e', i, dif)
+ }
+ }
+ # Compute the variance of the estimate.
+ est <- pij[[length(pij)]]
+ if (estimate_var) {
+ var_cov <- ComputeVar(cond_prob, est)
+ sd <- var_cov$sd
+ inform <- var_cov$inform
+ var_cov <- var_cov$var_cov
+ } else {
+ var_cov <- NULL
+ inform <- NULL
+ sd <- NULL
+ }
+ list(est = est, sd = sd, var_cov = var_cov, hist = pij, num_em_iters = i)
+}
+
+TestIndependence <- function(est, inform) {
+ # Tests the degree of independence between variables.
+ #
+ # Args:
+  #   est: estimated pij values
+ # inform: information matrix
+ #
+ # Returns:
+ # Chi-squared statistic for whether two variables are independent
+
+ expec <- outer(apply(est, 1, sum), apply(est, 2, sum))
+ diffs <- matrix(est - expec, ncol = 1)
+ stat <- t(diffs) %*% inform %*% diffs
+ df <- (nrow(est) - 1) * (ncol(est) - 1)
+ list(stat = stat, pval = pchisq(stat, df, lower = FALSE))
+}
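+
+# e.g. for a 2 x 2 joint estimate, df = (2 - 1) * (2 - 1) = 1, so `stat` is
+# referred to a chi-squared distribution with one degree of freedom.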
+
+UpdateJointConditional <- function(cond_report_dist, joint_conditional = NULL) {
+ # Updates the joint conditional distribution of d variables, where
+ # num_variables is chosen by the client. Since variables are conditionally
+ # independent of one another, this is basically an outer product.
+ #
+ # Args:
+ # joint_conditional: The current state of the joint conditional
+ # distribution. This is a list with as many elements as there
+ # are reports.
+ # cond_report_dist: The conditional distribution of variable x, which will
+ # be outer-producted with the current joint conditional.
+ #
+ # Returns:
+ # A list of same length as joint_conditional containing the joint
+ # conditional distribution of all variables. If I want
+ # P(X'=x',Y=y'|X=x,Y=y), I will look at
+ # joint_conditional[x,x',y,y'].
+
+ if (is.null(joint_conditional)) {
+ lapply(cond_report_dist, function(x) array(x))
+ } else {
+ mapply("outer", joint_conditional, cond_report_dist,
+ SIMPLIFY = FALSE)
+ }
+}
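+
+# Sketch: for a single report with per-variable conditionals c(0.8, 0.2) and
+# c(0.3, 0.7), the joint conditional is outer(c(0.8, 0.2), c(0.3, 0.7)),
+# a 2 x 2 matrix whose [i, j] entry multiplies the two per-variable values.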
+
+ComputeDistributionEM <- function(reports, report_cohorts, maps,
+ ignore_other = FALSE,
+ params = NULL,
+ params_list = NULL,
+ marginals = NULL,
+ estimate_var = FALSE,
+ num_cores = 10,
+ em_iter_func = EM,
+ max_em_iters = 1000) {
+ # Computes the distribution of num_variables variables, where
+ # num_variables is chosen by the client, using the EM algorithm.
+ #
+ # Args:
+ # reports: A list of num_variables elements, each a 2-dimensional array
+ # containing the counts of each bin for each report
+ # report_cohorts: A num_variables-element list; the ith element is an array
+ # containing the cohort of jth report for ith variable.
+ # maps: A num_variables-element list containing the map for each variable
+ # ignore_other: A boolean describing whether to compute the "other" category
+ # params: RAPPOR encoding parameters. If set, all variables are assumed to
+ # be encoded with these parameters.
+ # params_list: A list of num_variables elements, each of which is the
+ # RAPPOR encoding parameters for a variable (a list itself). If set,
+ # it must be the same length as 'reports'.
+ # marginals: List of estimated marginals for each variable
+ # estimate_var: A flag telling whether to estimate the variance.
+ # em_iter_func: Function that implements the iterative EM algorithm.
+
+ # Handle the case that the client wants to find the joint distribution of too
+ # many variables.
+ num_variables <- length(reports)
+
+ if (is.null(params) && is.null(params_list)) {
+ stop("Either params or params_list must be passed")
+ }
+
+ Log('Computing joint conditional')
+
+ # Compute the counts for each variable and then do conditionals.
+ joint_conditional = NULL
+ found_strings <- list()
+
+ for (j in (1:num_variables)) {
+ Log('Processing var %d', j)
+
+ var_report <- reports[[j]]
+ var_cohort <- report_cohorts[[j]]
+ var_map <- maps[[j]]
+ if (!is.null(params)) {
+ var_params <- params
+ } else {
+ var_params <- params_list[[j]]
+ }
+
+ var_counts <- NULL
+ if (is.null(marginals)) {
+      Log('\tSumming bits to get observed counts')
+ var_counts <- ComputeCounts(var_report, var_cohort, var_params)
+
+ Log('\tDecoding marginal')
+ marginal <- Decode(var_counts, var_map$all_cohorts_map, var_params,
+ quiet = TRUE)$fit
+ Log('\tMarginal for var %d has %d values:', j, nrow(marginal))
+ print(marginal[, c('estimate', 'proportion')]) # rownames are the string
+ cat('\n')
+
+ if (nrow(marginal) == 0) {
+ Log('ERROR: Nothing decoded for variable %d', j)
+ return (NULL)
+ }
+ } else {
+ marginal <- marginals[[j]]
+ }
+ found_strings[[j]] <- marginal$string
+
+ p <- var_params$p
+ q <- var_params$q
+ f <- var_params$f
+ # pstar and qstar needed to compute other probabilities as well as for
+ # inputs to GetCondProb{Boolean, String}Reports subsequently
+ pstar <- (1 - f / 2) * p + (f / 2) * q
+ qstar <- (1 - f / 2) * q + (f / 2) * p
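+    # e.g. with p = 0.25, q = 0.75, f = 0.5:
+    #   pstar = 0.75 * 0.25 + 0.25 * 0.75 = 0.375
+    #   qstar = 0.75 * 0.75 + 0.25 * 0.25 = 0.625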
+ k <- var_params$k
+
+ # Ignore other probability if either ignore_other is set or k == 1
+ # (Boolean RAPPOR)
+ if (ignore_other || (k == 1)) {
+ prob_other <- vector(mode = "list", length = var_params$m)
+ } else {
+ # Compute the probability of the "other" category
+ if (is.null(var_counts)) {
+ var_counts <- ComputeCounts(var_report, var_cohort, var_params)
+ }
+ prob_other <- GetOtherProbs(var_counts, var_map$map_by_cohort, marginal,
+ var_params, pstar, qstar)
+ found_strings[[j]] <- c(found_strings[[j]], "Other")
+ }
+
+ # Get the joint conditional distribution
+ Log('\tGetCondProb for each report (%d cores)', num_cores)
+
+ # TODO(pseudorandom): check RAPPOR type more systematically instead of by
+ # checking if k == 1
+ if (k == 1) {
+ cond_report_dist <- GetCondProbBooleanReports(var_report, pstar, qstar,
+ num_cores)
+ } else {
+ cond_report_dist <- GetCondProbStringReports(var_report,
+ var_cohort, var_map, var_params$m, pstar, qstar,
+ marginal, prob_other, num_cores)
+ }
+
+ Log('\tUpdateJointConditional')
+
+ # Update the joint conditional distribution of all variables
+ joint_conditional <- UpdateJointConditional(cond_report_dist,
+ joint_conditional)
+ }
+
+ N <- length(joint_conditional)
+ dimensions <- dim(joint_conditional[[1]])
+ # e.g. 2 x 3
+ dimensions_str <- paste(dimensions, collapse = ' x ')
+ total_entries <- prod(c(N, dimensions))
+
+ Log('Starting EM with N = %d matrices of size %s (%d entries)',
+ N, dimensions_str, total_entries)
+
+ start_time <- proc.time()[['elapsed']]
+
+ # Run expectation maximization to find joint distribution
+ em <- em_iter_func(joint_conditional, max_em_iters=max_em_iters,
+ epsilon = 10 ^ -6, verbose = FALSE,
+ estimate_var = estimate_var)
+
+ em_elapsed_time <- proc.time()[['elapsed']] - start_time
+
+ dimnames(em$est) <- found_strings
+ # Return results in a usable format
+ list(fit = em$est,
+ sd = em$sd,
+ em_elapsed_time = em_elapsed_time,
+ num_em_iters = em$num_em_iters,
+ # This last field is implementation-specific; it can be used for
+ # interactive debugging.
+ em = em)
+}
diff --git a/analysis/R/association_test.R b/analysis/R/association_test.R
new file mode 100755
index 0000000..0cd24ce
--- /dev/null
+++ b/analysis/R/association_test.R
@@ -0,0 +1,311 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Authors: vpihur@google.com (Vasyl Pihur), fanti@google.com (Giulia Fanti)
+
+library(RUnit)
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/association.R")
+source("analysis/R/fast_em.R")
+source("analysis/R/util.R")
+
+SamplePopulations <- function(N, num_variables = 1, params,
+ variable_opts) {
+ # Samples a number of variables. User specifies the number of variables
+ # and some desired properties of those variables.
+ #
+ # Args:
+ # N: Number of reports to generate.
+ # params: RAPPOR parameters, like Bloom filter size, number of
+ # hash bits, etc.
+ # variable_opts: List of options for generating the ground truth:
+ # independent = whether distinct variables should be independently drawn
+ # deterministic = whether the variables should be drawn from a
+ # Poisson distribution or uniformly assigned across the range
+ # of 1:num_strings
+ # num_strings: Only does something if deterministic == TRUE, and
+ # specifies how many strings to use in the uniform assignment
+ # of ground truth strings.
+ #
+ # Returns:
+ # RAPPOR simulated ground truth for each piece of data.
+
+ m <- params$m
+ num_strings <- variable_opts$num_strings
+
+ if (variable_opts$deterministic) {
+ # If a deterministic assignment is desired, evenly distribute
+ # strings across all cohorts.
+
+ reps <- ceiling(N / num_strings)
+ variables <- lapply(1:num_variables,
+ function(i)
+ as.vector(sapply(1:num_strings, function(x)
+ rep(x, reps)))[1:N])
+ cohorts <- lapply(1:num_variables,
+ function(i) rep(1:m, ceiling(N / m))[1:N])
+ } else {
+ # Otherwise, draw from a Poisson random variable
+ variables <- lapply(1:num_variables, function(i) rpois(N, 1) + 1)
+
+ # Randomly assign cohorts in each dimension
+ cohorts <- lapply(1:num_variables,
+ function(i) sample(1:params$m, N, replace = TRUE))
+
+ if (!variable_opts$independent) {
+      # If the user wants dependent RVs, subsequent variables are closely
+      # correlated with the first variable in the following manner:
+      #   variable_i ~ variable_1 + (i-1) Bernoulli(0.5)
+
+ bernoulli_corr <- function(x) {
+ variables[[1]] + (x - 1) * sample(c(0, 1), N, replace = TRUE)}
+
+ variables[2:num_variables] <- lapply(2:num_variables,
+ function(x) bernoulli_corr(x))
+ }
+ }
+ list(variables = variables, cohorts = cohorts)
+}
+
+Simulate <- function(N, num_variables, params, variable_opts = NULL,
+ truth = NULL, basic = FALSE) {
+ if (is.null(truth)) {
+ truth <- SamplePopulations(N, num_variables, params,
+ variable_opts)
+ }
+ strs <- lapply(truth$variables, function(x) sort(seq(max(x))))
+ # strs <- lapply(truth$variables, function(x) sort(unique(x)))
+ # strs <- lapply(truth$variables, function(x) 1:length(unique(x)))
+
+ # Construct lists of maps and reports
+ if (variable_opts$deterministic) {
+ # Build the maps
+ map <- CreateMap(strs[[1]], params, FALSE, basic = basic)
+ maps <- lapply(1:num_variables, function(x) map)
+ # Build the reports
+ report <- EncodeAll(truth$variables[[1]], truth$cohorts[[1]],
+ map$map_by_cohort, params)
+ reports <- lapply(1:num_variables, function(x) report)
+ } else {
+ # Build the maps
+ maps <- lapply(1:num_variables, function(x)
+ CreateMap(strs[[x]], params, FALSE,
+ basic = basic))
+ # Build the reports
+ reports <- lapply(1:num_variables, function(x)
+ EncodeAll(truth$variables[[x]], truth$cohorts[[x]],
+ maps[[x]]$map_by_cohort, params))
+ }
+
+ list(reports = reports, cohorts = truth$cohorts,
+ truth = truth$variables, maps = maps, strs = strs)
+
+}
+
+# ----------------Actual testing starts here--------------- #
+TestComputeDistributionEM <- function() {
+ # Test various aspects of ComputeDistributionEM in association.R.
+ # Tests include:
+ # Test 1: Compute a joint distribution of uniformly distributed,
+ # perfectly correlated strings
+ # Test 2: Compute a marginal distribution of uniformly distributed strings
+ # Test 3: Check the "other" category estimation works by removing
+ # a string from the known map.
+ # Test 4: Test that the variance from EM algorithm is 1/N when there
+ # is no noise in the system.
+ # Test 5: Check that the right answer is still obtained when f = 0.2.
+
+ num_variables <- 3
+ N <- 100
+
+ # Initialize the parameters
+ params <- list(k = 12, h = 2, m = 4, p = 0, q = 1, f = 0)
+ variable_opts <- list(deterministic = TRUE, num_strings = 2,
+ independent = FALSE)
+ sim <- Simulate(N, num_variables, params, variable_opts)
+
+ # Test 1: Delta function pmf
+ joint_dist <- ComputeDistributionEM(sim$reports,
+ sim$cohorts, sim$maps,
+ ignore_other = TRUE,
+ params = params,
+ marginals = NULL,
+ estimate_var = FALSE)
+ # The recovered distribution should be close to the delta function.
+ checkTrue(abs(joint_dist$fit["1", "1", "1"] - 0.5) < 0.01)
+ checkTrue(abs(joint_dist$fit["2", "2", "2"] - 0.5) < 0.01)
+
+ # Test 2: Now compute a marginal using EM
+ dist <- ComputeDistributionEM(list(sim$reports[[1]]),
+ list(sim$cohorts[[1]]),
+ list(sim$maps[[1]]),
+ ignore_other = TRUE,
+ params = params,
+ marginals = NULL,
+ estimate_var = FALSE)
+ checkTrue(abs(dist$fit["1"] - 0.5) < 0.01)
+
+ # Test 3: Check that the "other" category is correctly computed
+  # Build a modified map with no column 2 (i.e., we only know that string
+  # "1" is a valid string)
+ map <- sim$maps[[1]]
+ small_map <- map
+
+ for (i in 1:params$m) {
+ locs <- which(map$map_by_cohort[[i]][, 1])
+ small_map$map_by_cohort[[i]] <- sparseMatrix(locs, rep(1, length(locs)),
+ dims = c(params$k, 1))
+ locs <- which(map$all_cohorts_map[, 1])
+ colnames(small_map$map_by_cohort[[i]]) <- sim$strs[1]
+ }
+ small_map$all_cohorts_map <- do.call("rBind", small_map$map_by_cohort)
+
+ dist <- ComputeDistributionEM(list(sim$reports[[1]]),
+ list(sim$cohorts[[1]]),
+ list(small_map),
+ ignore_other = FALSE,
+ params = params,
+ marginals = NULL,
+ estimate_var = FALSE)
+
+ # The recovered distribution should be uniform over 2 strings.
+ checkTrue(abs(dist$fit[1] - 0.5) < 0.1)
+
+ # Test 4: Test the variance is 1/N
+ variable_opts <- list(deterministic = TRUE, num_strings = 1)
+ sim <- Simulate(N, num_variables = 1, params, variable_opts)
+ dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
+ sim$maps, ignore_other = TRUE,
+ params = params, marginals = NULL,
+ estimate_var = TRUE)
+
+ checkEqualsNumeric(dist$em$var_cov[1, 1], 1 / N)
+
+ # Test 5: Check that when f=0.2, we still get a good estimate
+ params <- list(k = 12, h = 2, m = 2, p = 0, q = 1, f = 0.2)
+ variable_opts <- list(deterministic = TRUE, num_strings = 2)
+ sim <- Simulate(N, num_variables = 2, params, variable_opts)
+ dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
+ sim$maps, ignore_other = TRUE,
+ params = params, marginals = NULL,
+ estimate_var = FALSE)
+
+ checkTrue(abs(dist$fit["1", "1"] - 0.5) < 0.15)
+ checkTrue(abs(dist$fit["2", "2"] - 0.5) < 0.15)
+
+ # Test 6: Check the computed joint distribution with randomized
+ # correlated inputs from the Poisson distribution
+ # Expect to have correlation between strings n and n + 1
+ N <- 1000
+ params <- list(k = 16, h = 2, m = 4, p = 0.1, q = 0.9, f = 0.1)
+ variable_opts <- list(deterministic = FALSE, independent = FALSE)
+ sim <- Simulate(N, num_variables = 2, params, variable_opts)
+ dist <- ComputeDistributionEM(sim$reports, sim$cohorts,
+ sim$maps, ignore_other = TRUE,
+ params = params, marginals = NULL,
+ estimate_var = FALSE)
+
+ print_dist <- TRUE # to print joint distribution, set to TRUE
+
+ if (print_dist) {
+ # dist$fit[dist$fit<1e-4] <- 0
+ # Sort by row names and column names to visually see correlation
+ print(dist$fit[sort(rownames(dist$fit)), sort(colnames(dist$fit))])
+ }
+
+ # Check for correlations (constants chosen heuristically to get good
+ # test confidence with small # of samples)
+ # Should have mass roughly 1/2e and 1/2e each
+ checkTrue(abs(dist$fit["1", "1"] - dist$fit["1", "2"]) < 0.1)
+ checkTrue(abs(dist$fit["2", "2"] - dist$fit["2", "3"]) < 0.1)
+
+ # Should have mass roughly 1/4e and 1/4e each
+ checkTrue(abs(dist$fit["3", "3"] - dist$fit["3", "4"]) < 0.06)
+
+ # Check for lack of probability mass
+ checkTrue(dist$fit["1", "3"] < 0.02)
+ checkTrue(dist$fit["1", "4"] < 0.02)
+ checkTrue(dist$fit["2", "1"] < 0.02)
+ checkTrue(dist$fit["2", "4"] < 0.02)
+ checkTrue(dist$fit["3", "1"] < 0.02)
+ checkTrue(dist$fit["3", "2"] < 0.02)
+}
+
+MakeCondProb <- function() {
+ d = matrix(c(1,1,2,2,3,3), nrow=3, ncol=2)
+ d = d / sum(d)
+
+ e = matrix(c(3,3,2,2,1,1), nrow=3, ncol=2)
+ e = e / sum(e)
+
+ list(d, e, d) # 3 reports
+}
+
+# Test the slow version in R.
+RunEmFunction <- function(cond_prob, max_em_iters) {
+
+ # Mechanical test of 4 iterations. em$hist has 5 elements.
+ result <- EM(cond_prob, max_em_iters=max_em_iters)
+ result$est
+}
+
+# Run a test of the EM executable
+RunEmExecutable <- function(em_executable, cond_prob, max_em_iters) {
+ print(cond_prob)
+
+ if (!file.exists(em_executable)) {
+ stop(sprintf("EM executable %s doesn't exist (build it?)", em_executable))
+ }
+ em_iter_func <- ConstructFastEM(em_executable, "/tmp")
+
+ result <- em_iter_func(cond_prob, max_em_iters=max_em_iters)
+ result$est
+}
+
+TestCppImplementation <- function() {
+ cond_prob <- MakeCondProb()
+ max_em_iters <- 10
+ fit1 <- RunEmFunction(cond_prob, max_em_iters)
+
+ # Assume we're in the repo root
+ em_cpp <- file.path(getwd(), "analysis/cpp/_tmp/fast_em")
+ fit2 <- RunEmExecutable(em_cpp, cond_prob, max_em_iters)
+
+ cpp_diff <- abs(fit1 - fit2)
+ print(cpp_diff)
+ Log("C++ implementation difference after %d iterations: %e", max_em_iters,
+ sum(cpp_diff))
+
+ # After 10 iterations they should be almost indistinguishable.
+ checkTrue(sum(cpp_diff) < 1e-10)
+}
+
+TestTensorFlowImplementation <- function() {
+ cond_prob <- MakeCondProb()
+ max_em_iters <- 10
+ fit1 <- RunEmFunction(cond_prob, max_em_iters)
+
+ em_tf <- file.path(getwd(), "analysis/tensorflow/fast_em.sh")
+ fit2 <- RunEmExecutable(em_tf, cond_prob, max_em_iters)
+
+ tf_diff <- abs(fit1 - fit2)
+ print(tf_diff)
+ Log("TensorFlow implementation difference after %d iterations: %e",
+ max_em_iters, sum(tf_diff))
+
+ checkTrue(sum(tf_diff) < 1e-10)
+}
diff --git a/analysis/R/decode.R b/analysis/R/decode.R
new file mode 100755
index 0000000..7d83f2b
--- /dev/null
+++ b/analysis/R/decode.R
@@ -0,0 +1,513 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This library implements the RAPPOR marginal decoding algorithms using LASSO.
+
+library(glmnet)
+
+# So we don't have to change pwd
+source.rappor <- function(rel_path) {
+ abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+ source(abs_path)
+}
+
+source.rappor('analysis/R/alternative.R')
+
+EstimateBloomCounts <- function(params, obs_counts) {
+ # Estimates the number of times each bit in each cohort was set in original
+ # Bloom filters.
+ #
+ # Input:
+ # params: a list of RAPPOR parameters:
+ # k - size of a Bloom filter
+ # h - number of hash functions
+ # m - number of cohorts
+ # p - P(IRR = 1 | PRR = 0)
+ # q - P(IRR = 1 | PRR = 1)
+ # f - Proportion of bits in the Bloom filter that are set randomly
+ # to 0 or 1 regardless of the underlying true bit value
+ # obs_counts: a matrix of size m by (k + 1). Column one contains sample
+ # sizes for each cohort. Other counts indicated how many times
+ # each bit was set in each cohort.
+ #
+ # Output:
+ # ests: a matrix of size m by k with estimated counts for the probability
+ # of each bit set to 1 in the true Bloom filter.
+ # stds: standard deviation of the estimates.
+
+ p <- params$p
+ q <- params$q
+ f <- params$f
+ m <- params$m
+ k <- params$k
+
+ stopifnot(m == nrow(obs_counts), k + 1 == ncol(obs_counts))
+
+ p11 <- q * (1 - f/2) + p * f / 2 # probability of a true 1 reported as 1
+ p01 <- p * (1 - f/2) + q * f / 2 # probability of a true 0 reported as 1
+
+ p2 <- p11 - p01 # == (1 - f) * (q - p)
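+  # e.g. with p = 0.25, q = 0.75, f = 0.5: p11 = 0.625, p01 = 0.375, and
+  # p2 = 0.25, matching (1 - 0.5) * (0.75 - 0.25).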
+
+ # When m = 1, obs_counts does not have the right dimensions. Fixing this.
+ dim(obs_counts) <- c(m, k + 1)
+
+ ests <- apply(obs_counts, 1, function(cohort_row) {
+ N <- cohort_row[1] # sample size for the cohort -- first column is total
+ v <- cohort_row[-1] # counts for individual bits
+ (v - p01 * N) / p2 # unbiased estimator for individual bits'
+ # true counts. It can be negative or
+ # exceed the total.
+ })
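+  # e.g. (sketch): a cohort with N = 100 reports and v = 50 observed 1s on a
+  # bit, with p01 = 0.375 and p2 = 0.25, yields (50 - 37.5) / 0.25 = 50
+  # estimated true 1s.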
+
+ # NOTE: When k == 1, rows of obs_counts have 2 entries. Then cohort_row[-1]
+ # is a singleton vector, and apply() returns a *vector*. When rows have 3
+ # entries, cohort_row[-1] is a vector of length 2 and apply() returns a
+ # *matrix*.
+ #
+ # Fix this by explicitly setting dimensions. NOTE: It's k x m, not m x k.
+ dim(ests) <- c(k, m)
+
+ total <- sum(obs_counts[,1])
+
+ variances <- apply(obs_counts, 1, function(cohort_row) {
+ N <- cohort_row[1]
+ v <- cohort_row[-1]
+ p_hats <- (v - p01 * N) / (N * p2) # expectation of a true 1
+ p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1]
+ r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1
+ N * r * (1 - r) / p2^2 # variance of the binomial
+ })
+
+ dim(variances) <- c(k, m)
+
+ # Transform counts from absolute values to fractional, removing bias due to
+ # variability of reporting between cohorts.
+ ests <- apply(ests, 1, function(x) x / obs_counts[,1])
+ stds <- apply(variances^.5, 1, function(x) x / obs_counts[,1])
+
+ # Some estimates may be set to infinity, e.g. if f=1. We want to account for
+ # this possibility, and set the corresponding counts to 0.
+ ests[abs(ests) == Inf] <- 0
+
+ list(estimates = ests, stds = stds)
+}
+
+FitLasso <- function(X, Y, intercept = TRUE) {
+ # Fits a Lasso model to select a subset of columns of X.
+ #
+ # Input:
+ # X: a design matrix of size km by M (the number of candidate strings).
+ # Y: a vector of size km with estimated counts from EstimateBloomCounts().
+ # intercept: whether to fit with intercept or not.
+ #
+ # Output:
+ # a vector of size ncol(X) of coefficients.
+
+ # TODO(mironov): Test cv.glmnet instead of glmnet
+ mod <- try(glmnet(X, Y, standardize = FALSE, intercept = intercept,
+ lower.limits = 0, # outputs are non-negative
+ # Cap the number of non-zero coefficients to 500 or
+ # 80% of the length of Y, whichever is less. The 500 cap
+ # is for performance reasons, 80% is to avoid overfitting.
+ pmax = min(500, length(Y) * .8)),
+ silent = TRUE)
+
+  # If fitting fails, return a vector of zero coefficients.
+ if (class(mod)[1] == "try-error") {
+ coefs <- setNames(rep(0, ncol(X)), colnames(X))
+ } else {
+ coefs <- coef(mod)
+ coefs <- coefs[-1, ncol(coefs), drop = FALSE] # coefs[1] is the intercept
+ }
+ coefs
+}
+
+PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
+ m <- params$m
+ p <- params$p
+ q <- params$q
+ f <- params$f
+ h <- params$h
+
+ q2 <- .5 * f * (p + q) + (1 - f) * q
+ p2 <- .5 * f * (p + q) + (1 - f) * p
+ resid_var <- p2 * (1 - p2) * (N / m) / (q2 - p2)^2
+
+ # Total Sum of Squares (SS).
+ TSS <- sum((Y - mean(Y))^2)
+ # Error Sum of Squares (ESS).
+ ESS <- resid_var * nrow(X)
+
+ betas <- matrix(mod$coefs, ncol = 1)
+
+# mod_var <- summary(mod$fit)$sigma^2
+# betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas))
+#
+# z_values <- betas / betas_sd
+#
+# # 1-sided t-test.
+# p_values <- pnorm(z_values, lower = FALSE)
+
+ fit <- data.frame(string = colnames(X), Estimate = betas,
+ SD = mod$stds, # z_stat = z_values, pvalue = p_values,
+ stringsAsFactors = FALSE)
+
+# if (correction == "FDR") {
+# fit <- fit[order(fit$pvalue, decreasing = FALSE), ]
+# ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit))
+# if (length(ind) > 0) {
+# fit <- fit[1:max(ind), ]
+# } else {
+# fit <- fit[numeric(0), ]
+# }
+# } else {
+# fit <- fit[fit$p < alpha, ]
+# }
+
+ fit <- fit[order(fit$Estimate, decreasing = TRUE), ]
+
+ if (nrow(fit) > 0) {
+ str_names <- fit$string
+ str_names <- str_names[!is.na(str_names)]
+ if (length(str_names) > 0 && length(str_names) < nrow(X)) {
+ this_data <- as.data.frame(as.matrix(X[, str_names]))
+ Y_hat <- predict(lm(Y ~ ., data = this_data))
+ RSS <- sum((Y_hat - mean(Y))^2)
+ } else {
+ RSS <- NA
+ }
+ } else {
+ RSS <- 0
+ }
+
+ USS <- TSS - ESS - RSS
+ SS <- c(RSS, USS, ESS) / TSS
+
+ list(fit = fit, SS = SS, resid_sigma = sqrt(resid_var))
+}
+
+ComputePrivacyGuarantees <- function(params, alpha, N) {
+ # Compute privacy parameters and guarantees.
+ p <- params$p
+ q <- params$q
+ f <- params$f
+ h <- params$h
+
+ q2 <- .5 * f * (p + q) + (1 - f) * q
+ p2 <- .5 * f * (p + q) + (1 - f) * p
+
+ exp_e_one <- ((q2 * (1 - p2)) / (p2 * (1 - q2)))^h
+ if (exp_e_one < 1) {
+ exp_e_one <- 1 / exp_e_one
+ }
+ e_one <- log(exp_e_one)
+
+ exp_e_inf <- ((1 - .5 * f) / (.5 * f))^(2 * h)
+ e_inf <- log(exp_e_inf)
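+  # e.g. with f = 0.5 and h = 2: exp_e_inf = 3^4 = 81, so e_inf = log(81),
+  # about 4.39.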
+
+ std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
+ detection_freq <- qnorm(1 - alpha) * std_dev_counts / N
+
+ privacy_names <- c("Effective p", "Effective q", "exp(e_1)",
+ "e_1", "exp(e_inf)", "e_inf", "Detection frequency")
+ privacy_vals <- c(p2, q2, exp_e_one, e_one, exp_e_inf, e_inf, detection_freq)
+
+ privacy <- data.frame(parameters = privacy_names,
+ values = privacy_vals)
+ privacy
+}
+
+FitDistribution <- function(estimates_stds, map, quiet = FALSE) {
+ # Find a distribution over rows of map that approximates estimates_stds best
+ #
+ # Input:
+ # estimates_stds: a list of two m x k matrices, one for estimates, another
+ # for standard errors
+ # map : an (m * k) x S boolean matrix
+ #
+ # Output:
+ # a float vector of length S, so that a distribution over map's rows sampled
+ # according to this vector approximates estimates
+
+ S <- ncol(map) # total number of candidates
+
+ support_coefs <- 1:S
+
+ if (S > length(estimates_stds$estimates) * .8) {
+ # the system is close to being underdetermined
+ lasso <- FitLasso(map, as.vector(t(estimates_stds$estimates)))
+
+ # Select non-zero coefficients.
+ support_coefs <- which(lasso > 0)
+
+ if(!quiet)
+ cat("LASSO selected ", length(support_coefs), " non-zero coefficients.\n")
+ }
+
+ coefs <- setNames(rep(0, S), colnames(map))
+
+ if(length(support_coefs) > 0) { # LASSO may return an empty list
+ constrained_coefs <- ConstrainedLinModel(map[, support_coefs, drop = FALSE],
+ estimates_stds)
+
+ coefs[support_coefs] <- constrained_coefs
+ }
+
+ coefs
+}
+
+Resample <- function(e) {
+ # Simulate resampling of the Bloom filter estimates by adding Gaussian noise
+ # with estimated standard deviation.
+ estimates <- matrix(mapply(function(x, y) x + rnorm(1, 0, y),
+ e$estimates, e$stds),
+ nrow = nrow(e$estimates), ncol = ncol(e$estimates))
+ stds <- e$stds * 2^.5
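+  # The 2^0.5 factor: adding independent noise with the same variance doubles
+  # the variance, multiplying the standard deviation by sqrt(2).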
+
+ list(estimates = estimates, stds = stds)
+}
+
+# Private function
+# Decode for Boolean RAPPOR inputs
+# Returns a list with attribute fit only. (Inference and other aspects
+# currently not incorporated because they're unnecessary for association.)
+.DecodeBoolean <- function(counts, params, num_reports) {
+ # Boolean variables are reported without cohorts and to estimate counts,
+ # first sum up counts across all cohorts and then run EstimateBloomCounts
+ # with the number of cohorts set to 1.
+ params$m <- 1 # set number of cohorts to 1
+ summed_counts <- colSums(counts) # sum counts across cohorts
+ es <- EstimateBloomCounts(params, summed_counts) # estimate boolean counts
+
+ ests <- es$estimates[[1]]
+ std <- es$stds[[1]]
+
+ fit <- data.frame(
+ string = c("TRUE", "FALSE"),
+ estimate = c(ests * num_reports,
+ num_reports - ests * num_reports),
+ std_error = c(std * num_reports, std * num_reports),
+ proportion = c(ests, 1 - ests),
+ prop_std_error = c(std, std))
+
+ low_95 <- fit$proportion - 1.96 * fit$prop_std_error
+ high_95 <- fit$proportion + 1.96 * fit$prop_std_error
+
+ fit$prop_low_95 <- pmax(low_95, 0.0)
+ fit$prop_high_95 <- pmin(high_95, 1.0)
+ rownames(fit) <- fit$string
+
+ return(list(fit = fit))
+}
+
+CheckDecodeInputs <- function(counts, map, params) {
+ # Returns an error message, or NULL if there is no error.
+
+ if (nrow(map) != (params$m * params$k)) {
+ return(sprintf(
+ "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
+ params$m * params$k, nrow(map)))
+ }
+
+ if ((ncol(counts) - 1) != params$k) {
+ return(sprintf(paste0(
+ "Dimensions of counts file do not match: m = %d, k = %d, ",
+ "nrow(counts) = %d, ncol(counts) = %d"), params$m, params$k,
+ nrow(counts), ncol(counts)))
+
+ }
+
+ # numerically correct comparison
+ if (isTRUE(all.equal((1 - params$f) * (params$p - params$q), 0))) {
+ return("Information is lost. Cannot decode.")
+ }
+
+ return(NULL) # no error
+}
+
+Decode <- function(counts, map, params, alpha = 0.05,
+ correction = c("Bonferroni"), quiet = FALSE, ...) {
+
+ error_msg <- CheckDecodeInputs(counts, map, params)
+ if (!is.null(error_msg)) {
+ stop(error_msg)
+ }
+
+ k <- params$k
+ p <- params$p
+ q <- params$q
+ f <- params$f
+ h <- params$h
+ m <- params$m
+
+ S <- ncol(map) # total number of candidates
+
+ N <- sum(counts[, 1])
+ if (k == 1) {
+ return(.DecodeBoolean(counts, params, N))
+ }
+
+ filter_cohorts <- which(counts[, 1] != 0) # exclude cohorts with zero reports
+
+ # stretch cohorts to bits
+ filter_bits <- as.vector(matrix(1:nrow(map), ncol = m)[,filter_cohorts, drop = FALSE])
+
+ map_filtered <- map[filter_bits, , drop = FALSE]
+
+ es <- EstimateBloomCounts(params, counts)
+
+ estimates_stds_filtered <-
+ list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
+ stds = es$stds[filter_cohorts, , drop = FALSE])
+
+ coefs_all <- vector()
+
+ # Run the fitting procedure several times (5 seems to be sufficient and not
+ # too many) to estimate standard deviation of the output.
+ for(r in 1:5) {
+ if(r > 1)
+ e <- Resample(estimates_stds_filtered)
+ else
+ e <- estimates_stds_filtered
+
+ coefs_all <- rbind(coefs_all,
+ FitDistribution(e, map_filtered, quiet))
+ }
+
+ coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations
+ coefs_ave <- N * apply(coefs_all, 2, mean)
+
+ # Only select coefficients more than two standard deviations from 0. May
+ # inflate empirical SD of the estimates.
+ reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
+
+ mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
+
+ coefs_ave_zeroed <- coefs_ave
+ coefs_ave_zeroed[-reported] <- 0
+
+ residual <- as.vector(t(estimates_stds_filtered$estimates)) -
+ map_filtered %*% coefs_ave_zeroed / N
+
+ if (correction == "Bonferroni") {
+ alpha <- alpha / S
+ }
+
+ inf <- PerformInference(map_filtered[, reported, drop = FALSE],
+ as.vector(t(estimates_stds_filtered$estimates)),
+ N, mod, params, alpha,
+ correction)
+ fit <- inf$fit
+ # If this is a basic RAPPOR instance, just use the counts for the estimate
+ # (Check if the map is diagonal to tell if this is basic RAPPOR.)
+ if (sum(map) == sum(diag(map))) {
+ fit$Estimate <- colSums(counts)[-1]
+ }
+
+  # Estimates from the model are per instance so must be multiplied by h.
+ # Standard errors are also adjusted.
+ fit$estimate <- floor(fit$Estimate)
+ fit$proportion <- fit$estimate / N
+
+ fit$std_error <- floor(fit$SD)
+ fit$prop_std_error <- fit$std_error / N
+
+ # 1.96 standard deviations gives 95% confidence interval.
+ low_95 <- fit$proportion - 1.96 * fit$prop_std_error
+ high_95 <- fit$proportion + 1.96 * fit$prop_std_error
+ # Clamp estimated proportion. pmin/max: vectorized min and max
+ fit$prop_low_95 <- pmax(low_95, 0.0)
+ fit$prop_high_95 <- pmin(high_95, 1.0)
+
+ fit <- fit[, c("string", "estimate", "std_error", "proportion",
+ "prop_std_error", "prop_low_95", "prop_high_95")]
+
+ allocated_mass <- sum(fit$proportion)
+ num_detected <- nrow(fit)
+
+ ss <- round(inf$SS, digits = 3)
+ explained_var <- ss[[1]]
+ missing_var <- ss[[2]]
+ noise_var <- ss[[3]]
+
+ noise_std_dev <- round(inf$resid_sigma, digits = 3)
+
+ # Compute summary of the fit.
+ parameters <-
+ c("Candidate strings", "Detected strings",
+ "Sample size (N)", "Discovered Prop (out of N)",
+ "Explained Variance", "Missing Variance", "Noise Variance",
+ "Theoretical Noise Std. Dev.")
+ values <- c(S, num_detected, N, allocated_mass,
+ explained_var, missing_var, noise_var, noise_std_dev)
+
+ res_summary <- data.frame(parameters = parameters, values = values)
+
+ privacy <- ComputePrivacyGuarantees(params, alpha, N)
+ params <- data.frame(parameters =
+ c("k", "h", "m", "p", "q", "f", "N", "alpha"),
+ values = c(k, h, m, p, q, f, N, alpha))
+
+ # This is a list of decode stats in a better format than 'summary'.
+ metrics <- list(sample_size = N,
+ allocated_mass = allocated_mass,
+ num_detected = num_detected,
+ explained_var = explained_var,
+ missing_var = missing_var)
+
+ list(fit = fit, summary = res_summary, privacy = privacy, params = params,
+ lasso = NULL, residual = as.vector(residual),
+ counts = counts[, -1], resid = NULL, metrics = metrics,
+ ests = es$estimates # ests needed by Shiny rappor-sim app
+ )
+}
+
+ComputeCounts <- function(reports, cohorts, params) {
+ # Counts the number of times each bit in the Bloom filters was set for
+ # each cohort.
+ #
+ # Args:
+ # reports: A list of N elements, each containing the
+ # report for a given report
+ # cohorts: A list of N elements, each containing the
+ # cohort number for a given report
+ # params: A list of parameters for the problem
+ #
+ # Returns:
+ # An mx(k+1) array containing the number of times each bit was set
+ # in each cohort.
+
+ # Check that the cohorts are evenly assigned. We assume that if there
+ # are m cohorts, each cohort should have approximately N/m reports.
+ # The constraint we impose here simply says that cohort bins should
+ # each have within N/m reports of one another. Since the most popular
+ # cohort is expected to have about O(logN/loglogN) reports (which we )
+ # approximate as O(logN) bins for practical values of N, a discrepancy of
+ # O(N) bins seems significant enough to alter expected behavior. This
+ # threshold can be changed to be more sensitive if desired.
+ N <- length(reports)
+ cohort_freqs <- table(factor(cohorts, levels = 1:params$m))
+ imbalance_threshold <- N / params$m
+ if ((max(cohort_freqs) - min(cohort_freqs)) > imbalance_threshold) {
+ cat("\nNote: You are using unbalanced cohort assignments, which can",
+ "significantly degrade estimation quality!\n\n")
+ }
+
+ # Count the times each bit was set, and add cohort counts to first column
+ counts <- lapply(1:params$m, function(i)
+ Reduce("+", reports[which(cohorts == i)]))
+ counts[which(cohort_freqs == 0)] <- data.frame(rep(0, params$k))
+ cbind(cohort_freqs, do.call("rbind", counts))
+}
diff --git a/analysis/R/decode_ngrams.R b/analysis/R/decode_ngrams.R
new file mode 100755
index 0000000..e2585cb
--- /dev/null
+++ b/analysis/R/decode_ngrams.R
@@ -0,0 +1,377 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# This file has functions that aid in the estimation of a distribution when the
+# dictionary is unknown. There are functions for estimating pairwise joint
+# ngram distributions, pruning out false positives, and combining the two
+# steps.
+
+FindPairwiseCandidates <- function(report_data, N, ngram_params, params) {
+ # Finds the pairwise most likely ngrams.
+ #
+ # Args:
+ # report_data: Object containing data relevant to reports:
+ # $inds: The indices of reports collected using various pairs
+ # $cohorts: The cohort of each report
+ # $map: The map used for all the ngrams
+ # $reports: The reports used for each ngram and full string
+ # N: Number of reports collected
+ # ngram_params: Parameters related to ngram size
+ # params: Parameter list.
+ #
+ # Returns:
+  #    List with candidate_strs (a list of matrices of co-occurring ngram
+  #    pairs) and dists (a list of pairwise distributions).
+
+ inds <- report_data$inds
+ cohorts <- report_data$cohorts
+ num_ngrams_collected <- ngram_params$num_ngrams_collected
+ map <- report_data$map
+ reports <- report_data$reports
+
+ # Cycle over all the unique pairs of ngrams being collected
+ found_candidates <- list()
+
+ # Generate the map list to be used for all ngrams
+ maps <- lapply(1:num_ngrams_collected, function(x) map)
+ num_candidate_ngrams <- length(inds)
+
+ .ComputeDist <- function(i, inds, cohorts, reports, maps, params,
+ num_ngrams_collected) {
+ library(glmnet)
+ ind <- inds[[i]]
+ cohort_subset <- lapply(1:num_ngrams_collected, function(x)
+ cohorts[ind])
+ report_subset <- reports[[i]]
+ new_dist <- ComputeDistributionEM(report_subset,
+ cohort_subset,
+ maps, ignore_other = FALSE,
+ params = params, estimate_var = FALSE)
+ new_dist
+ }
+
+ # Compute the pairwise distributions (could be parallelized)
+ dists <- lapply(seq(num_candidate_ngrams), function(i)
+ .ComputeDist(i, inds, cohorts, reports, maps,
+ params, num_ngrams_collected))
+
+ dists_null <- sapply(dists, function(x) is.null(x))
+ if (any(dists_null)) {
+ return (list(found_candidates = list(), dists = dists))
+ }
+ cat("Found the pairwise ngram distributions.\n")
+
+ # Find the threshold for choosing "significant" ngram pairs
+ f <- params$f; q <- params$q; p <- params$p
+ q2 <- .5 * f * (p + q) + (1 - f) * q
+ p2 <- .5 * f * (p + q) + (1 - f) * p
+ std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
+  threshold <- std_dev_counts / N
+  # The data-derived threshold above is overridden by a fixed empirical value.
+  threshold <- 0.04
+
+ # Filter joints to remove infrequently co-occurring ngrams.
+ candidate_strs <- lapply(1:num_candidate_ngrams, function(i) {
+ fit <- dists[[i]]$fit
+    edges <- which(fit > threshold, arr.ind = TRUE, useNames = FALSE)
+
+ # Recover the list of strings that seem significant
+ found_candidates <- sapply(1:ncol(edges), function(x) {
+ chunks <- sapply(edges[, x],
+ function(j) dimnames(fit)[[x]][j])
+ chunks
+ })
+ # sapply returns either "character" vector (for n=1) or a matrix. Convert
+ # it to a matrix. This can be seen as follows:
+ #
+ # > class(sapply(1:5, function(x) "a"))
+ # [1] "character"
+ # > class(sapply(1:5, function(x) c("a", "b")))
+ # [1] "matrix"
+ found_candidates <- rbind(found_candidates)
+
+ # Remove the "others"
+ others <- which(found_candidates == "Other")
+ if (length(others) > 0) {
+ other <- which(found_candidates == "Other", arr.ind = TRUE)[, 1]
+ # drop = FALSE necessary to keep it a matrix
+ found_candidates <- found_candidates[-other, , drop = FALSE]
+ }
+
+ found_candidates
+ })
+  if (any(sapply(candidate_strs, function(x) length(x)) == 0)) {
+    return (NULL)
+  }
+
+ list(candidate_strs = candidate_strs, dists = dists)
+}
+
+FindFeasibleStrings <- function(found_candidates, pairings, num_ngrams,
+ ngram_size) {
+ # Uses the list of strings found by the pairwise comparisons to build
+ # a list of full feasible strings. This relies on the iterative,
+ # graph-based approach.
+ #
+ # Args:
+ # found_candidates: list of candidates found by each pairwise decoding
+ # pairings: Matrix of size 2x(num_ngrams choose 2) listing all the
+ # ngram position pairings.
+ # num_ngrams: The total number of ngrams per word.
+ # ngram_size: Number of characters per ngram
+ #
+ # Returns:
+ # List of full string candidates.
+
+ # Which ngram pairs are adjacent, i.e. of the form (i,i+1)
+ adjacent <- sapply(seq(num_ngrams - 1), function(x) {
+ c(1 + (x - 1) * ngram_size, x * ngram_size + 1)
+ })
+
+ adjacent_pairs <- apply(adjacent, 2, function(x) {
+ which(apply(pairings, 1, function(y) identical(y, x)))
+ })
+
+ # The first set of candidates are ngrams found in positions 1 and 2
+ active_cands <- found_candidates[[adjacent_pairs[1]]]
+ if (class(active_cands) == "list") {
+ return (list())
+ } else {
+ active_cands <- as.data.frame(active_cands)
+ }
+
+ # Now check successive ngrams to find consistent combinations
+ # i.e. after ngrams 1-2, check 2-3, 3-4, 4-5, etc.
+ for (i in 2:length(adjacent_pairs)) {
+ if (nrow(active_cands) == 0) {
+ return (list())
+ }
+ new_cands <- found_candidates[[adjacent_pairs[i]]]
+ new_cands <- as.data.frame(new_cands)
+ # Builds the set of possible candidates based only on ascending
+ # candidate pairs
+ active_cands <- BuildCandidates(active_cands, new_cands)
+ }
+
+ if (nrow(active_cands) == 0) {
+ return (list())
+ }
+ # Now refine these candidates using non-adjacent bigrams
+ remaining <- (1:(num_ngrams * (num_ngrams - 1) / 2))[-c(1, adjacent_pairs)]
+ # For each non-adjacent pair, make sure that all the candidates are
+ # consistent (in this phase, candidates can ONLY be eliminated)
+
+ for (i in remaining) {
+ new_cands <- found_candidates[[i]]
+ new_cands <- as.data.frame(new_cands)
+ # Prune out all candidates that do not agree with new_cands
+ active_cands <- PruneCandidates(active_cands, pairings[i, ],
+ ngram_size,
+ new_cands = new_cands)
+ }
+ # Consolidate the string ngrams into a full string representation
+ if (length(active_cands) > 0) {
+ active_cands <- sort(apply(active_cands, 1,
+ function(x) paste0(x, collapse = "")))
+ }
+ unname(active_cands)
+}
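+
+# Illustrative sketch (hypothetical inputs, not from the original source).
+# With num_ngrams = 3 and ngram_size = 2, pairings holds the position pairs
+# (1,3), (1,5) and (3,5), of which (1,3) and (3,5) are adjacent. If each
+# entry of found_candidates is a one-row matrix of ngram pairs,
+#
+# > found_candidates[[1]]   # positions (1,3)
+#      [,1] [,2]
+# [1,] "fo" "ob"
+# > found_candidates[[3]]   # positions (3,5)
+#      [,1] [,2]
+# [1,] "ob" "ar"
+# > found_candidates[[2]]   # positions (1,5)
+#      [,1] [,2]
+# [1,] "fo" "ar"
+#
+# then FindFeasibleStrings(found_candidates, pairings, 3, 2) chains the
+# adjacent pairs into "fo"-"ob"-"ar", confirms consistency on the
+# non-adjacent pair (1,5), and returns "foobar".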
+
+BuildCandidates <- function(active_cands, new_cands) {
+ # Takes in a data frame where each row is a valid sequence of ngrams
+ # checks which of the new_cands ngram pairs are consistent with
+ # the original active_cands ngram sequence.
+ #
+ # Args:
+ # active_cands: data frame of ngram sequence candidates (1 candidate
+ # sequence per row)
+ # new_cands: An rx2 data frame with a new list of candidate ngram
+ # pairs that might fit in with the previous list of candidates
+ #
+ # Returns:
+ # Updated active_cands, with another column if valid extensions are
+ # found.
+
+ # Get the trailing ngrams from the current candidates
+ to_check <- as.vector(tail(t(active_cands), n = 1))
+ # Check which of the elements in to_check are leading ngrams among the
+ # new candidates
+ present <- sapply(to_check, function(x) any(x == new_cands))
+ # Remove the strings that are not represented among the new candidates
+ to_check <- to_check[present]
+ # Now insert the new candidates where they belong
+ active_cands <- active_cands[present, , drop = FALSE]
+ active_cands <- cbind(active_cands, col = NA)
+  num_cands <- nrow(active_cands)
+  # new_num_cands tracks nrow(active_cands) as rows are appended below; it
+  # must be initialized before the first iteration uses it.
+  new_num_cands <- num_cands
+  hit_list <- c()
+ for (j in 1:num_cands) {
+ inds <- which(new_cands[, 1] == to_check[j])
+ if (length(inds) == 0) {
+ hit_list <- c(hit_list, j)
+ next
+ }
+ # If there are multiple candidates fitting with an ngram, include
+ # each /full/ string as a candidate
+ extra <- length(inds) - 1
+ if (extra > 0) {
+ rep_inds <- c(j, (new_num_cands + 1):(new_num_cands + extra))
+ to_paste <- active_cands[j, ]
+ # Add the new candidates to the bottom
+ for (p in 1:extra) {
+ active_cands <- rbind(active_cands, to_paste)
+ }
+ } else {
+ rep_inds <- c(j)
+ }
+ active_cands[rep_inds, ncol(active_cands)] <-
+ as.vector(new_cands[inds, 2])
+ new_num_cands <- nrow(active_cands)
+ }
+ # If there were some false candidates in the original set, remove them
+ if (length(hit_list) > 0) {
+ active_cands <- active_cands[-hit_list, , drop = FALSE]
+ }
+ active_cands
+}
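+
+# For instance (hypothetical values), if active_cands has the single row
+# ("fo", "ob") and new_cands has the rows ("ob", "ar") and ("ob", "az"),
+# the trailing ngram "ob" matches both new pairs, so the row is duplicated
+# and extended, producing ("fo", "ob", "ar") and ("fo", "ob", "az").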
+
+PruneCandidates <- function(active_cands, pairing, ngram_size, new_cands) {
+ # Takes in a data frame where each row is a valid sequence of ngrams
+ # checks which of the new_cands ngram pairs are consistent with
+ # the original active_cands ngram sequence. This can ONLY remove
+ # candidates presented in active_cands.
+ #
+ # Args:
+ # active_cands: data frame of ngram sequence candidates (1 candidate
+ # sequence per row)
+ # pairing: A length-2 list storing which two ngrams are measured
+ # ngram_size: Number of characters per ngram
+ # new_cands: An rx2 data frame with a new list of candidate ngram
+ # pairs that might fit in with the previous list of candidates
+ #
+ # Returns:
+ # Updated active_cands, with a reduced number of rows.
+
+ # Convert the pairing to an ngram index
+ cols <- sapply(pairing, function(x) (x - 1) / ngram_size + 1)
+
+ cands_to_check <- active_cands[, cols, drop = FALSE]
+ # Find the candidates that are inconsistent with the new data
+ hit_list <- sapply(1:nrow(cands_to_check), function(j) {
+ to_kill <- FALSE
+ if (nrow(new_cands) == 0) {
+ return (TRUE)
+ }
+ if (!any(apply(new_cands, 1, function(x)
+ all(cands_to_check[j, , drop = FALSE] == x)))) {
+ to_kill <- TRUE
+ }
+ to_kill
+ })
+
+ # Determine which rows are false positives
+ hit_indices <- which(hit_list)
+ # Remove the false positives
+ if (length(hit_indices) > 0) {
+    # drop = FALSE necessary to keep it a matrix
+    active_cands <- active_cands[-hit_indices, , drop = FALSE]
+ }
+ active_cands
+}
+
+EstimateDictionary <- function(report_data, N, ngram_params, params) {
+ # Takes in a list of report data and returns a list of string
+ # estimates of the dictionary.
+ #
+ # Args:
+ # report_data: Object containing data relevant to reports:
+ # $inds: The indices of reports collected using various pairs
+ # $cohorts: The cohort of each report
+  #     $map: The map used for all the ngrams
+ # $reports: The reports used for each ngram and full string
+ # N: the number of individuals sending reports
+ # ngram_params: Parameters related to ngram length, etc
+ # params: Parameter vector with RAPPOR noise levels, cohorts, etc
+ #
+ # Returns:
+ # List: list of found candidates, list of pairwise candidates
+
+ pairwise_candidates <- FindPairwiseCandidates(report_data, N,
+ ngram_params,
+ params)$candidate_strs
+ cat("Found the pairwise candidates. \n")
+ if (is.null(pairwise_candidates)) {
+ return (list())
+ }
+ found_candidates <- FindFeasibleStrings(pairwise_candidates,
+ report_data$pairings,
+ ngram_params$num_ngrams,
+ ngram_params$ngram_size)
+ cat("Found all the candidates. \n")
+ list(found_candidates = found_candidates,
+ pairwise_candidates = pairwise_candidates)
+}
+
+WriteKPartiteGraph <- function(conn, pairwise_candidates, pairings, num_ngrams,
+ ngram_size) {
+ # Args:
+ # conn: R connection to write to. Should be opened with mode w+.
+ # pairwise_candidates: list of matrices. Each matrix represents a subgraph;
+ # it contains the edges between partitions i and j, so there are (k choose
+ # 2) matrices. Each matrix has dimension 2 x E, where E is the number of
+ # edges.
+ # pairings: 2 x (k choose 2) matrix of character positions. Each row
+ # corresponds to a subgraph; it has 1-based character index of partitions
+ # i and j.
+  #   num_ngrams: length of pairwise_candidates, or the number of partitions in
+  #     the k-partite graph
+  #   ngram_size: the number of characters per ngram
+
+  # File Format:
+  #
+  # num_partitions 3
+  # ngram_size 2
+  # edge 0.ab 1.cd
+  # edge 0.ab 2.ef
+  #
+  # The first two lines are metadata: the number of partitions (k) and the
+  # ngram size. The remaining lines are edges, where each node is
+  # <partition>.<ngram>.
+  #
+  # Partitions are numbered from 0. The partition of the left node will be
+  # less than the partition of the right node.
+
+ # First two lines are metadata
+ cat(sprintf('num_partitions %d\n', num_ngrams), file = conn)
+ cat(sprintf('ngram_size %d\n', ngram_size), file = conn)
+
+ for (i in 1:length(pairwise_candidates)) {
+ # The two pairwise_candidates for this subgraph.
+ # Turn 1-based character positions into 0-based partition numbers,
+ # e.g. (3, 5) -> (1, 2)
+
+ pos1 <- pairings[[i, 1]]
+ pos2 <- pairings[[i, 2]]
+ part1 <- (pos1 - 1) / ngram_size
+ part2 <- (pos2 - 1) / ngram_size
+ cat(sprintf("Writing partition (%d, %d)\n", part1, part2))
+
+ p <- pairwise_candidates[[i]]
+ # each row is an edge
+ for (j in 1:nrow(p)) {
+ n1 <- p[[j, 1]]
+ n2 <- p[[j, 2]]
+ line <- sprintf('edge %d.%s %d.%s\n', part1, n1, part2, n2)
+      # NOTE: It would be faster to preallocate 'lines', but we would have to
+      # make two passes through pairwise_candidates.
+ cat(line, file = conn)
+ }
+ }
+}
+
diff --git a/analysis/R/decode_test.R b/analysis/R/decode_test.R
new file mode 100755
index 0000000..74c46ce
--- /dev/null
+++ b/analysis/R/decode_test.R
@@ -0,0 +1,354 @@
+#!/usr/bin/Rscript
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library(RUnit)
+library(abind)
+
+source('analysis/R/decode.R')
+source('tests/gen_counts.R')
+
+L1Distance <- function(X, Y) {
+ # Computes the L1 distance between two named vectors
+ common <- intersect(names(X), names(Y))
+
+ L1_intersect <- sum(abs(X[common] - Y[common]))
+ L1_X_minus_Y <- sum(X[!names(X) %in% common])
+ L1_Y_minus_X <- sum(Y[!names(Y) %in% common])
+
+ (L1_intersect + L1_X_minus_Y + L1_Y_minus_X) / 2
+}
+
+LInfDistance <- function(X, Y) {
+ # Computes the L_infinity distance between two named vectors
+ common <- intersect(names(X), names(Y))
+
+ max(abs(X[common] - Y[common]),
+ abs(X[!names(X) %in% common]),
+ abs(Y[!names(Y) %in% common]))
+}
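+
+# Worked example for both distance helpers (toy vectors, added for
+# illustration):
+#
+# > X <- c(a = 3, b = 1)
+# > Y <- c(b = 2, c = 4)
+# > L1Distance(X, Y)    # (|1 - 2| + 3 + 4) / 2
+# [1] 4
+# > LInfDistance(X, Y)  # max(|1 - 2|, 3, 4)
+# [1] 4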
+
+MatrixVectorMerge <- function(mat, vec) {
+ # Attaches a vector to a matrix, matching corresponding column names
+
+ mat_only <- setdiff(colnames(mat), names(vec))
+ vec_only <- setdiff(names(vec), colnames(mat))
+
+ # extend the vector with missing columns
+ vec_long <- c(vec, setNames(rep(NA, length(mat_only)), mat_only))
+
+ # extend the matrix with missing columns
+ newcols <- matrix(NA, nrow = nrow(mat), ncol = length(vec_only))
+ colnames(newcols) <- vec_only
+ mat_long <- cbind(mat, newcols)
+
+ # Now vec and mat have the same columns, but in the wrong order. Sort the
+ # columns lexicographically.
+ if(length(vec_long) > 0) {
+ mat_long <- mat_long[, order(colnames(mat_long)), drop = FALSE]
+ vec_long <- vec_long[order(names(vec_long))]
+ }
+
+ rbind(mat_long, vec_long)
+}
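+
+# Illustration (hypothetical values): merging a 1 x 2 matrix having columns
+# a, b with a vector named (b, c) yields a 2 x 3 matrix with columns a, b, c,
+# where cells missing on either side become NA:
+#
+# > mat <- matrix(1:2, nrow = 1, dimnames = list(NULL, c("a", "b")))
+# > MatrixVectorMerge(mat, c(b = 5, c = 7))
+# row 1:  1  2 NA
+# row 2: NA  5  7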
+
+RunMultipleTests <- function(title, fun, repetitions, ...) {
+ # Run a function with an annotated progress indicator. The function's outputs
+ # are concatenated and returned as a list of length repetitions.
+ cat(title, ": ")
+
+ if(repetitions == 1) {
+ # only run once
+ results <- list(fun(...))
+
+ cat(" Done.\n")
+ } else { # run multiple times
+ pb <- txtProgressBar(min = 0, max = repetitions,
+ width = getOption("width") - 20 - nchar(title))
+
+ results <- vector(mode = "list", repetitions)
+ for(i in 1:repetitions) {
+ setTxtProgressBar(pb, i)
+ results[[i]] <- fun(...)
+ }
+ cat(" Done.")
+ close(pb)
+ }
+
+ results
+}
+
+CheckEstimatesAndStdsHelper <- function(params, map, pdf, total) {
+ # Helper function for TestEstimateBloomCounts.
+ partition <- RandomPartition(total, pdf)
+ counts <- GenerateCounts(params, map, partition, 1)
+
+ EstimateBloomCounts(params, counts)
+}
+
+CheckEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) {
+ # Checks that the expectations returned by EstimateBloomCounts on simulated
+ # inputs match the ground truth and the empirical standard deviation matches
+ # EstimateBloomCounts outputs.
+ #
+ # Input:
+  #   repetitions: the number of runs of EstimateBloomCounts
+ # title: label
+ # params: params vector
+ # map: the map table
+ # pdf: probability density function of the distribution from which simulated
+ # clients are sampled
+ # total: number of reports
+
+ results <- RunMultipleTests(title, CheckEstimatesAndStdsHelper, repetitions,
+ params, map, pdf, total)
+
+ estimates <- abind(lapply(results, function(r) r$estimates), along = 3)
+ stds <- abind(lapply(results, function(r) r$stds), along = 3)
+
+ ave_e <- apply(estimates, 1:2, mean)
+ observed_stds <- apply(estimates, 1:2, sd)
+ ave_stds <- apply(stds, 1:2, mean)
+
+ ground_truth <- matrix(map %*% pdf, nrow = params$m, byrow = TRUE)
+
+ checkTrue(!any(abs(ave_e - ground_truth) > 1E-9 + # tolerance level
+ (ave_stds / repetitions^.5) * 5),
+ "Averages deviate too much from expectations.")
+
+  checkTrue(!any(observed_stds > ave_stds * (1 + 5 / repetitions^.5)),
+            "Estimated standard deviations are too low")
+
+  checkTrue(!any(observed_stds < ave_stds * (1 - 5 / repetitions^.5)),
+            "Estimated standard deviations are too high")
+}
+
+TestEstimateBloomCounts <- function() {
+ # Unit tests for the EstimateBloomCounts function.
+
+ report4x2 <- list(k = 4, m = 2) # 2 cohorts, 4 bits each
+ map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values
+ map0[1,] <- c(1, 0, 0)
+ map0[2,] <- c(0, 1, 0)
+ map0[3,] <- c(0, 0, 1)
+ map0[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all
+ map0[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3
+
+ colnames(map0) <- c('v1', 'v2', 'v3')
+
+ pdf0 <- c(1/2, 1/3, 1/6)
+ names(pdf0) <- colnames(map0)
+
+ noise0 <- list(p = 0, q = 1, f = 0) # no noise at all
+
+ CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)",
+ c(report4x2, noise0), map0, pdf0, 100)
+
+ noise1 <- list(p = 0.4, q = .6, f = 0.5)
+ CheckEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)",
+ c(report4x2, noise1), map0, pdf0, 100)
+
+  # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^9 reports
+ values <- 100
+
+ report8x32 <- list(k = 8, m = 32) # 32 cohorts, 8 bits each
+
+ map1 <- matrix(rbinom(32 * 8 * values, 1, .25), nrow = 32 * 8, ncol = values)
+
+ colnames(map1) <- sprintf("v%d", 1:values)
+
+ pdf1 <- ComputePdf("zipf1", values)
+
+ CheckEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)",
+ c(report8x32, noise1), map1, pdf1, 10^9)
+}
+
+CheckDecodeHelper <- function(params, map, pdf, num_clients,
+ tolerance_l1, tolerance_linf) {
+ # Helper function for TestDecode. Simulates a RAPPOR run and checks results of
+ # Decode's output against the ground truth. Output is returned as a list.
+
+ partition <- RandomPartition(num_clients, pdf)
+ counts <- GenerateCounts(params, map, partition, 1)
+ total <- sum(partition)
+
+ decoded <- Decode(counts, map, params, quiet = TRUE)
+
+ decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$string)
+
+ checkTrue(L1Distance(decoded_partition, partition) < total^.5 * tolerance_l1,
+ sprintf("L1 distance is too large: \
+ L1Distance = %f, total^0.5 * tolerance_l1 = %f",
+ L1Distance(decoded_partition, partition),
+ total^0.5 * tolerance_l1))
+
+  checkTrue(LInfDistance(decoded_partition, partition) <
+            max(partition)^.5 * tolerance_linf,
+            sprintf("L_inf distance is too large: \
+                    LInfDistance = %f, max(partition)^0.5 * tolerance_linf = %f",
+                    LInfDistance(decoded_partition, partition),
+                    max(partition)^0.5 * tolerance_linf))
+
+ list(estimates = decoded_partition,
+ stds = setNames(decoded$fit$std_error, decoded$fit$string))
+}
+
+CheckDecodeAveAndStds <- function(...) {
+  # Runs Decode multiple times (specified by the repetitions argument), checks
+  # individual runs against the ground truth, and checks the estimates of the
+  # standard error against empirical observations.
+
+ results <- RunMultipleTests(...)
+
+  # Accumulate the per-run estimates and stds into matrices, one row per run.
+  # (A bare lapply would discard the merged result.)
+  estimates <- matrix(nrow = 0, ncol = 0)
+  for (r in results) {
+    estimates <- MatrixVectorMerge(estimates, r$estimates)
+  }
+
+  stds <- matrix(nrow = 0, ncol = 0)
+  for (r in results) {
+    stds <- MatrixVectorMerge(stds, r$stds)
+  }
+
+ empirical_stds <- apply(estimates, 2, sd, na.rm = TRUE)
+ estimated_stds <- apply(stds, 2, mean, na.rm = TRUE)
+
+  if (dim(estimates)[1] > 1) {
+    checkTrue(all(estimated_stds > empirical_stds / 2, na.rm = TRUE),
+              "Our estimate for the standard deviation is too low")
+
+    checkTrue(all(estimated_stds < empirical_stds * 3, na.rm = TRUE),
+              "Our estimate for the standard deviation is too high")
+ }
+}
+
+TestDecode <- function() {
+ # Unit tests for the Decode function.
+
+ # TOY TESTS: three values, 2 cohorts, 4 bits each
+
+ params_4x2 <- list(k = 4, m = 2, h = 2) # 2 cohorts, 4 bits each
+ map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values
+ map0[1,] <- c(1, 0, 0)
+ map0[2,] <- c(0, 1, 0)
+ map0[3,] <- c(0, 0, 1)
+ map0[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all
+ map0[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3
+
+ colnames(map0) <- c('v1', 'v2', 'v3')
+ distribution0 <- setNames(c(1/2, 1/3, 1/6), colnames(map0))
+
+ # Even in the absence of noise, the inferred counts won't necessarily
+ # match the ground truth. Must be close enough though.
+ noise0 <- list(p = 0, q = 1, f = 0) # no noise whatsoever
+
+ # Args are: message str, test function, # repetitions,
+ # params, map, true pdf, # clients,
+ # tolerances
+ CheckDecodeAveAndStds("Testing Decode (1/5)", CheckDecodeHelper, 100,
+ c(params_4x2, noise0), map0, distribution0, 100,
+ tolerance_l1 = 5,
+ tolerance_linf = 3)
+
+ noise1 <- list(p = .4, q = .6, f = .5) # substantial noise, very few reports
+ CheckDecodeAveAndStds("Testing Decode (2/5)", CheckDecodeHelper, 100,
+ c(params_4x2, noise1), map0, distribution0, 100,
+ tolerance_l1 = 20,
+ tolerance_linf = 20)
+
+ # substantial noise, many reports
+ CheckDecodeAveAndStds("Testing Decode (3/5)", CheckDecodeHelper, 100,
+ c(params_4x2, noise1), map0, distribution0, 100000,
+ tolerance_l1 = 50,
+ tolerance_linf = 40)
+
+ # MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports
+ num_values <- 100
+
+ params_8x32 <- list(k = 8, m = 32, h = 2) # 32 cohorts, 8 bits each
+
+  map1 <- matrix(rbinom(32 * 8 * num_values, 1, .25),
+                 nrow = 32 * 8, ncol = num_values)
+
+ colnames(map1) <- sprintf("v%d", 1:num_values)
+
+ distribution1 <- ComputePdf("zipf1", num_values)
+ names(distribution1) <- colnames(map1)
+ CheckDecodeAveAndStds("Testing Decode (4/5)", CheckDecodeHelper, 100,
+ c(params_8x32, noise1), map1, distribution1, 10^6,
+ tolerance_l1 = num_values * 3,
+ tolerance_linf = 100)
+
+ # Testing LASSO: 500 values, 32 cohorts, 8 bits each, 10^6 reports
+ num_values <- 500
+
+ params_8x32 <- list(k = 8, m = 32, h = 2) # 32 cohorts, 8 bits each
+
+  map2 <- matrix(rbinom(32 * 8 * num_values, 1, .25),
+                 nrow = 32 * 8, ncol = num_values)
+
+ colnames(map2) <- sprintf("v%d", 1:num_values)
+
+ distribution2 <- ComputePdf("zipf1.5", num_values)
+ names(distribution2) <- colnames(map2)
+
+ CheckDecodeAveAndStds("Testing Decode (5/5)", CheckDecodeHelper, 1,
+ c(params_8x32, noise1), map2, distribution2, 10^6,
+ tolerance_l1 = num_values * 3,
+ tolerance_linf = 80)
+
+}
+
+TestDecodeBool <- function() {
+ # Testing Boolean Decode
+ num_values <- 2
+ # 1 bit; rest of the params don't matter
+ params_bool <- list(k = 1, m = 128, h = 2)
+ # setting up map_bool to be consistent with the Decode API and for
+ # GenerateCounts()
+ map_bool <- matrix(c(0, 1), nrow = 128 * 1, ncol = num_values, byrow = TRUE)
+
+ colnames(map_bool) <- c("FALSE", "TRUE")
+ distribution_bool <- ComputePdf("zipf1.5", num_values)
+ names(distribution_bool) <- colnames(map_bool)
+ noise2 <- list(p = 0.25, q = 0.75, f = 0.5)
+
+ # tolerance_l1 set to four standard deviations to avoid any flakiness in
+ # tests
+ CheckDecodeAveAndStds("Testing .DecodeBoolean (1/3)", CheckDecodeHelper, 100,
+ c(params_bool, noise2), map_bool, distribution_bool,
+ 10^6,
+ tolerance_l1 = 4 * num_values,
+ tolerance_linf = 80)
+
+ noise1 <- list(p = .4, q = .6, f = .5) # substantial noise => 7 stddevs error
+ CheckDecodeAveAndStds("Testing .DecodeBoolean (2/3)", CheckDecodeHelper, 100,
+ c(params_bool, noise1), map_bool, distribution_bool,
+ 10^6,
+ tolerance_l1 = 7 * num_values,
+ tolerance_linf = 80)
+
+ distribution_near_zero <- c(0.999, 0.001)
+ names(distribution_near_zero) <- colnames(map_bool)
+
+ CheckDecodeAveAndStds("Testing .DecodeBoolean (3/3)", CheckDecodeHelper, 100,
+ c(params_bool, noise2), map_bool,
+ distribution_near_zero, 10^6,
+ tolerance_l1 = 4 * num_values,
+ tolerance_linf = 80)
+}
+
+RunAll <- function() {
+ TestEstimateBloomCounts()
+ TestDecode()
+ TestDecodeBool()
+}
+
+RunAll()
diff --git a/analysis/R/encode.R b/analysis/R/encode.R
new file mode 100755
index 0000000..c1d0782
--- /dev/null
+++ b/analysis/R/encode.R
@@ -0,0 +1,128 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Encode <- function(value, map, strs, params, N, id = NULL,
+ cohort = NULL, B = NULL, BP = NULL) {
+ # Encode value to RAPPOR and return a report.
+ #
+ # Input:
+ # value: value to be encoded
+  #   map: a list of per-cohort mapping matrices describing where each
+  #        element of strs maps in each cohort
+ # strs: a vector of possible values with value being one of them
+ # params: a list of RAPPOR parameters described in decode.R
+ # N: sample size
+ # Optional parameters:
+ # id: user ID (smaller than N)
+ # cohort: specifies cohort number (smaller than m)
+ # B: input Bloom filter itself, in which case value is ignored
+  #   BP: input Permanent Randomized Response (memoized for multiple
+  #       collections from the same user)
+
+ k <- params$k
+ p <- params$p
+ q <- params$q
+ f <- params$f
+ h <- params$h
+ m <- params$m
+ if (is.null(cohort)) {
+ cohort <- sample(1:m, 1)
+ }
+
+ if (is.null(id)) {
+ id <- sample(N, 1)
+ }
+
+ ind <- which(value == strs)
+
+ if (is.null(B)) {
+ B <- as.numeric(map[[cohort]][, ind])
+ }
+
+ if (is.null(BP)) {
+ BP <- sapply(B, function(x) sample(c(0, 1, x), 1,
+ prob = c(0.5 * f, 0.5 * f, 1 - f)))
+ }
+ rappor <- sapply(BP, function(x) rbinom(1, 1, ifelse(x == 1, q, p)))
+
+ list(value = value, rappor = rappor, B = B, BP = BP, cohort = cohort, id = id)
+}
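+
+# A minimal usage sketch (hypothetical values; assumes a map built by
+# CreateMap from simulation.R):
+#
+# > params <- list(k = 8, h = 2, m = 2, p = 0.5, q = 0.75, f = 0.5)
+# > strs <- c("apple", "banana", "cherry")
+# > map <- CreateMap(strs, params)$map_by_cohort
+# > r <- Encode("apple", map, strs, params, N = 100)
+# > r$rappor  # one noisy k-bit report, e.g. 0 1 0 0 1 0 0 1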
+
+ExamplePlot <- function(res, k, ebs = 1, title = "", title_cex = 4,
+ voff = .17, acex = 1.5, posa = 2, ymin = 1,
+ horiz = FALSE) {
+ PC <- function(k, report) {
+ char <- as.character(report)
+ if (k > 128) {
+ char[char != ""] <- "|"
+ }
+ char
+ }
+
+ # Annotation settings
+ anc <- "darkorange2"
+ colors <- c("lavenderblush3", "maroon4")
+
+ par(omi = c(0, .55, 0, 0))
+ # Setup plotting.
+ plot(1:k, rep(1, k), ylim = c(ymin, 4), type = "n",
+ xlab = "Bloom filter bits",
+ yaxt = "n", ylab = "", xlim = c(0, k), bty = "n", xaxt = "n")
+ mtext(paste0("Participant ", res$id, " in cohort ", res$cohort), 3, 2,
+ adj = 1, col = anc, cex = acex)
+ axis(1, 2^(0:15), 2^(0:15))
+ abline(v = which(res$B == 1), lty = 2, col = "grey")
+
+ # First row with the true value.
+ text(k / 2, 4, paste0('"', paste0(title, as.character(res$value)), '"'),
+ cex = title_cex, col = colors[2], xpd = NA)
+
+ # Second row with BF: B.
+ points(1:k, rep(3, k), pch = PC(k, res$B), col = colors[res$B + 1],
+ cex = res$B + 1)
+ text(k, 3 + voff, paste0(sum(res$B), " signal bits"), cex = acex,
+ col = anc, pos = posa)
+
+ # Third row: B'.
+ points(1:k, rep(2, k), pch = PC(k, res$BP), col = colors[res$BP + 1],
+ cex = res$BP + 1)
+ text(k, 2 + voff, paste0(sum(res$BP), " bits on"),
+ cex = acex, col = anc, pos = posa)
+
+ # Row 4: actual RAPPOR report.
+ report <- res$rappor
+ points(1:k, rep(1, k), pch = PC(k, as.character(report)),
+ col = colors[report + 1], cex = report + 1)
+ text(k, 1 + voff, paste0(sum(res$rappor), " bits on"), cex = acex,
+ col = anc, pos = posa)
+
+ mtext(c("True value:", "Bloom filter (B):",
+ "Fake Bloom \n filter (B'):", "Report sent\n to server:"),
+ 2, 1, at = 4:1, las = 2)
+ legend("topright", legend = c("0", "1"), fill = colors, bty = "n",
+ cex = 1.5, horiz = horiz)
+ legend("topleft", legend = ebs, plot = FALSE)
+}
+
+PlotPopulation <- function(probs, detected, detection_frequency) {
+ cc <- c("gray80", "darkred")
+ color <- rep(cc[1], length(probs))
+ color[detected] <- cc[2]
+ bp <- barplot(probs, col = color, border = color)
+ inds <- c(1, c(max(which(probs > 0)), length(probs)))
+ axis(1, bp[inds], inds)
+ legend("topright", legend = c("Detected", "Not-detected"),
+ fill = rev(cc), bty = "n")
+ abline(h = detection_frequency, lty = 2, col = "grey")
+}
diff --git a/analysis/R/fast_em.R b/analysis/R/fast_em.R
new file mode 100755
index 0000000..c19862c
--- /dev/null
+++ b/analysis/R/fast_em.R
@@ -0,0 +1,137 @@
+# fast_em.R: Wrapper around analysis/cpp/fast_em.cc.
+#
+# This serializes the input, shells out, and deserializes the output.
+
+.Flatten <- function(list_of_matrices) {
+  list_of_vectors <- lapply(list_of_matrices, as.vector)
+
+  # unlist takes a list to a vector.
+  unlist(list_of_vectors)
+}
+
+.WriteListOfMatrices <- function(list_of_matrices, f) {
+ flattened <- .Flatten(list_of_matrices)
+
+ # NOTE: UpdateJointConditional does outer product of dimensions!
+
+ # 3 letter strings are null terminated
+ writeBin('ne ', con = f)
+ num_entries <- length(list_of_matrices)
+ writeBin(num_entries, con = f)
+
+ Log('Wrote num_entries = %d', num_entries)
+
+ # For 2x3, this is 6
+ writeBin('es ', con = f)
+
+ entry_size <- as.integer(prod(dim(list_of_matrices[[1]])))
+ writeBin(entry_size, con = f)
+
+ Log('Wrote entry_size = %d', entry_size)
+
+ # now write the data
+ writeBin('dat', con = f)
+ writeBin(flattened, con = f)
+}
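+
+# Resulting byte layout (inferred from the writes above; writeBin appends a
+# NUL to each character string, so each 3-letter tag occupies 4 bytes):
+#
+#   "ne \0" | int32 num_entries | "es \0" | int32 entry_size |
+#   "dat\0" | num_entries * entry_size doubles (8 bytes each)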
+
+.ExpectTag <- function(f, tag) {
+ # Read a single NUL-terminated character string.
+ actual <- readBin(con = f, what = "char", n = 1)
+
+ # Assert that we got what was expected.
+ if (length(actual) != 1) {
+ stop(sprintf("Failed to read a tag '%s'", tag))
+ }
+ if (actual != tag) {
+ stop(sprintf("Expected '%s', got '%s'", tag, actual))
+ }
+}
+
+.ReadResult <- function (f, entry_size, matrix_dims) {
+ .ExpectTag(f, "emi")
+ # NOTE: assuming R integers are 4 bytes (uint32_t)
+ num_em_iters <- readBin(con = f, what = "int", n = 1)
+
+ .ExpectTag(f, "pij")
+ pij <- readBin(con = f, what = "double", n = entry_size)
+
+ # Adjust dimensions
+ dim(pij) <- matrix_dims
+
+ Log("Number of EM iterations: %d", num_em_iters)
+ Log("PIJ read from external implementation:")
+ print(pij)
+
+ # est, sd, var_cov, hist
+ list(est = pij, num_em_iters = num_em_iters)
+}
+
+.SanityChecks <- function(joint_conditional) {
+ # Display some stats before sending it over to C++.
+
+ inf_counts <- lapply(joint_conditional, function(m) {
+ sum(m == Inf)
+ })
+ total_inf <- sum(as.numeric(inf_counts))
+
+ nan_counts <- lapply(joint_conditional, function(m) {
+ sum(is.nan(m))
+ })
+ total_nan <- sum(as.numeric(nan_counts))
+
+ zero_counts <- lapply(joint_conditional, function(m) {
+ sum(m == 0.0)
+ })
+ total_zero <- sum(as.numeric(zero_counts))
+
+ Log('total inf: %s', total_inf)
+ Log('total nan: %s', total_nan)
+ Log('total zero: %s', total_zero)
+}
+
+ConstructFastEM <- function(em_executable, tmp_dir) {
+
+ return(function(joint_conditional, max_em_iters = 1000,
+ epsilon = 10 ^ -6, verbose = FALSE,
+ estimate_var = FALSE) {
+ matrix_dims <- dim(joint_conditional[[1]])
+ # Check that number of dimensions is 2.
+ if (length(matrix_dims) != 2) {
+ Log('FATAL: Expected 2 dimensions, got %d', length(matrix_dims))
+ stop()
+ }
+
+ entry_size <- prod(matrix_dims)
+ Log('entry size: %d', entry_size)
+
+ .SanityChecks(joint_conditional)
+
+ input_path <- file.path(tmp_dir, 'list_of_matrices.bin')
+ Log("Writing flattened list of matrices to %s", input_path)
+ f <- file(input_path, 'wb') # binary file
+ .WriteListOfMatrices(joint_conditional, f)
+ close(f)
+ Log("Done writing %s", input_path)
+
+ output_path <- file.path(tmp_dir, 'pij.bin')
+
+ cmd <- sprintf("%s %s %s %s", em_executable, input_path, output_path,
+ max_em_iters)
+
+ Log("Shell command: %s", cmd)
+ exit_code <- system(cmd)
+
+ Log("Done running shell command")
+ if (exit_code != 0) {
+ stop(sprintf("Command failed with code %d", exit_code))
+ }
+
+ f <- file(output_path, 'rb')
+ result <- .ReadResult(f, entry_size, matrix_dims)
+ close(f)
+
+ result
+ })
+}
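+
+# Usage sketch (hypothetical paths; the em_executable is assumed to be built
+# from analysis/cpp/fast_em.cc):
+#
+# > EmFn <- ConstructFastEM("./fast_em", "/tmp")
+# > result <- EmFn(joint_conditional, max_em_iters = 500)
+# > result$est            # the estimated joint distribution (pij)
+# > result$num_em_iters   # EM iterations performed by the external binary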
diff --git a/analysis/R/ngrams_simulation.R b/analysis/R/ngrams_simulation.R
new file mode 100755
index 0000000..ca7ce49
--- /dev/null
+++ b/analysis/R/ngrams_simulation.R
@@ -0,0 +1,271 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Authors: vpihur@google.com (Vasyl Pihur) and fanti@google.com (Giulia Fanti)
+#
+# Tools used to simulate sending partial ngrams to the server for estimating the
+# dictionary of terms over which we want to learn a distribution. This
+# mostly contains functions that aid in the generation of synthetic data.
+
+library(RUnit)
+library(parallel)
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/association.R")
+source("analysis/R/decode_ngrams.R")
+
+# The alphabet is the set of all possible characters that will appear in a
+# string. Here we use the English alphabet, but one might want to include
+# numbers or punctuation marks.
+alphabet <- letters
+
+GenerateCandidates <- function(alphabet, ngram_size = 2) {
+  # Enumerates all possible ngrams of a given size over the alphabet.
+  #
+  # Args:
+  #   alphabet: Vector of characters from which ngrams are built
+  #   ngram_size: Number of characters per ngram
+  #
+  # Returns:
+  #   Vector of all possible ngram strings
+
+ cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet))
+ apply(cands, 1, function(x) paste0(x, collapse = ""))
+}
+
+GenerateString <- function(n) {
+ # Generates a string of a given length from the alphabet.
+ #
+ # Args:
+ # n: Number of characters in the string
+ #
+ # Returns:
+ # String of length n
+ paste0(sample(alphabet, n, replace = TRUE), collapse = "")
+}
+
+GeneratePopulation <- function(N, num_strs, str_len = 10,
+ distribution = 1) {
+ # Generates a string for each individual in the population from distribution.
+ #
+ # Args:
+ # N: Number of individuals in the population
+  #   num_strs: Number of distinct strings to draw from
+ # str_len: Length of each string
+ # distribution: which type of distribution to use
+ # 1: Zipfian
+ # 2: Geometric (exponential)
+  #      3: Uniform
+ #
+ # Returns:
+ # Vector of strings for each individual in the population
+
+ strs <- sapply(1:num_strs, function(i) GenerateString(str_len))
+
+ if (distribution == 1) {
+ # Zipfian-ish distribution
+ prob <- (1:num_strs)^20
+ prob <- prob / sum(prob) + 0.001
+ prob <- prob / sum(prob)
+ } else if (distribution == 2) {
+ # Geometric distribution (discrete approximation to exponential)
+ p <- 0.3
+    prob <- p * (1 - p)^(0:(num_strs - 1))
+ prob <- prob / sum(prob)
+ } else {
+ # Uniform
+ prob <- rep(1 / num_strs, num_strs)
+ }
+
+ sample(strs, N, replace = TRUE, prob = prob)
+}
+
+SelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) {
+ # Selects which ngrams each user will encode and then submit.
+ #
+ # Args:
+ # str: String from which ngram is built.
+ # num_ngrams: Number of ngrams to choose
+ # size: Number of characters per ngram
+ # max_str_len: Maximum number of characters in the string
+ #
+ # Returns:
+ # List of each individual's ngrams and which positions the ngrams
+ # were drawn from.
+
+ start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams))
+ ngrams <- mapply(function(x, y, str) substr(str, x, y),
+ start, start + size - 1,
+ MoreArgs = list(str = str))
+ list(ngrams = ngrams, starts = start)
+}
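+
+# For example (one possible random draw), SelectNGrams("abcdef",
+# num_ngrams = 2, size = 2, max_str_len = 6) samples two of the starting
+# positions {1, 3, 5} and might return
+# list(ngrams = c("ab", "ef"), starts = c(1, 5)).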
+
+UpdateMapWithCandidates <- function(str_candidates, sim, params) {
+ # Generates a new map based on the returned candidates.
+ # Normally this would be created on the spot by having the
+ # aggregator hash the string candidates. But since we already have
+ # the map from simulation, we'll just choose the appropriate
+  # columns.
+ #
+ # Arguments:
+ # str_candidates: Vector of string candidates
+ # sim: Simulation object containing the original map
+ # params: RAPPOR parameter list
+
+ k <- params$k
+ h <- params$h
+ m <- params$m
+
+ # First add the real candidates to the map
+ valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
+ updated_map <- sim$full_map
+ updated_map$map_by_cohort <- lapply(1:m, function(i) {
+ sim$full_map$map_by_cohort[[i]][, valid_cands]
+ })
+
+ # Now add the false positives (we can just draw random strings for
+ # these since they didn't appear in the original dataset anyway)
+ new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
+ M <- length(new_cands)
+ if (M > 0) {
+ for (i in 1:m) {
+ ones <- sample(1:k, M * h, replace = TRUE)
+ cols <- rep(1:M, each = h)
+ strs <- c(sort(valid_cands), new_cands)
+ updated_map$map_by_cohort[[i]] <-
+ do.call(cBind, list(updated_map$map_by_cohort[[i]],
+ sparseMatrix(ones, cols, dims = c(k, M))))
+ colnames(updated_map$map_by_cohort[[i]]) <- strs
+ }
+ }
+ if (class(updated_map$map_by_cohort[[1]]) == "logical") {
+ updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort)
+ updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE)
+ colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands)
+ } else {
+ updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort)
+ }
+ updated_map
+}
+
+SimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10,
+ alphabet, params, distribution = 1) {
+ # Simulates the creation and encoding of ngrams for each individual.
+ #
+ # Args:
+ # N: Number of individuals in the population
+ # ngram_params: Parameters about ngram size, etc.
+ # str_len: Length of each string
+  #   num_strs: Number of strings in the dictionary
+ # alphabet: Alphabet used to generate strings
+  #   params: RAPPOR parameters, like noise and cohorts
+  #   distribution: Type of distribution to use (see GeneratePopulation)
+ #
+ # Returns:
+ # List containing all the information needed for estimating and
+ # verifying the results.
+
+ # Get the list of strings for each user
+ strs <- GeneratePopulation(N, num_strs = num_strs,
+ str_len = str_len,
+ distribution)
+
+ # Split them into ngrams and encode
+ ngram <- lapply(strs, function(i)
+ SelectNGrams(i,
+ num_ngrams = ngram_params$num_ngrams_collected,
+ size = ngram_params$ngram_size,
+ max_str_len = str_len))
+
+ cands <- GenerateCandidates(alphabet, ngram_params$ngram_size)
+ map <- CreateMap(cands, params, FALSE)
+ cohorts <- sample(1:params$m, N, replace = TRUE)
+
+ g <- sapply(ngram, function(x) paste(x$starts, sep = "_",
+ collapse = "_"))
+ ug <- sort(unique(g))
+ pairings <- t(sapply(ug, function(x)
+ sapply(strsplit(x, "_"), function(y) as.numeric(y))))
+
+  inds <- lapply(1:length(ug), function(i) which(g == ug[i]))
+
+ reports <- lapply(1:length(ug), function(k) {
+ # Generate the ngram reports
+ lapply(1:ngram_params$num_ngrams_collected, function(x) {
+ EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]),
+ cohorts[inds[[k]]], map$map_by_cohort, params)})
+ })
+ cat("Encoded the ngrams.\n")
+ # Now generate the full string reports
+ full_map <- CreateMap(sort(unique(strs)), params, FALSE)
+ full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params)
+
+ list(reports = reports, cohorts = cohorts, ngram = ngram, map = map,
+ strs = strs, pairings = pairings, inds = inds, cands = cands,
+ full_reports = full_reports, full_map = full_map)
+
+}
+
+
+EstimateDictionaryTrial <- function(N, str_len, num_strs,
+ params, ngram_params,
+ distribution = 3) {
+ # Runs a single trial for simulation. Generates simulated reports,
+ # decodes them, and returns the result.
+ #
+ # Arguments:
+  #   N: Number of users to simulate
+ # str_len: The length of strings to estimate
+ # num_strs: The number of strings in the dictionary
+ # params: RAPPOR parameter list
+ # ngram_params: Parameters related to the size of ngrams
+ # distribution: Tells what kind of distribution to use:
+ # 1: Zipfian
+ # 2: Geometric
+ # 3: Uniform (default)
+ #
+ # Returns:
+ # List with recovered and true marginals.
+
+ # We call the needed libraries here in order to make them available when this
+ # function gets called by BorgApply. Otherwise, they do not get included.
+ library(glmnet)
+ library(parallel)
+ sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
+ alphabet, params, distribution)
+
+ res <- EstimateDictionary(sim, N, ngram_params, params)
+ str_candidates <- res$found_candidates
+ pairwise_candidates <- res$pairwise_candidates
+
+ if (length(str_candidates) == 0) {
+ return (NULL)
+ }
+ updated_map <- UpdateMapWithCandidates(str_candidates, sim, params)
+
+ # Compute the marginal for this new set of strings
+ variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params)
+ # Our dictionary estimate
+ marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit
+ # Estimate given full dictionary knowledge
+ marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit
+ # The true (sampled) data distribution
+ truth <- sort(table(sim$strs)) / N
+
+ list(marginal = marginal, marginal_full = marginal_full,
+ truth = truth, pairwise_candidates = pairwise_candidates)
+}
diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R
new file mode 100755
index 0000000..47f8be5
--- /dev/null
+++ b/analysis/R/read_input.R
@@ -0,0 +1,154 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# Read parameter, counts and map files.
+
+library(Matrix)
+
+source.rappor <- function(rel_path) {
+ abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+ source(abs_path)
+}
+
+source.rappor("analysis/R/util.R") # for Log
+
+
+ReadParameterFile <- function(params_file) {
+ # Read parameter file. Format:
+ # k, h, m, p, q, f
+ # 128, 2, 8, 0.5, 0.75, 0.75
+
+ params <- as.list(read.csv(params_file))
+ if (length(params) != 6) {
+ stop("There should be exactly 6 columns in the parameter file.")
+ }
+ if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) {
+ stop("Parameter names must be k,h,m,p,q,f.")
+ }
+ params
+}
+
+# Handle the case of redundant cohorts, i.e. the counts file needs to be
+# further aggregated to obtain counts for the number of cohorts specified in
+# the params file.
+#
+# NOTE: Why is this happening?
+AdjustCounts <- function(counts, params) {
+ apply(counts, 2, function(x) {
+ tapply(x, rep(1:params$m, nrow(counts) / params$m), sum)
+ })
+}
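+
+# For example (hypothetical sizes), with params$m = 8 and a 16-row counts
+# matrix, rep(1:8, 2) labels the rows 1..8, 1..8, so tapply sums row i with
+# row i + 8, column by column, leaving an 8-row matrix.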
+
+ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) {
+ # Read in the counts file.
+ if (!file.exists(counts_file)) {
+ return(NULL)
+ }
+ counts <- read.csv(counts_file, header = FALSE)
+
+ if (adjust_counts) {
+ counts <- AdjustCounts(counts, params)
+ }
+
+ if (nrow(counts) != params$m) {
+ stop(sprintf("Got %d rows in the counts file, expected m = %d",
+ nrow(counts), params$m))
+ }
+
+ if ((ncol(counts) - 1) != params$k) {
+ stop(paste0("Counts file: number of columns should equal to k + 1: ",
+ ncol(counts)))
+ }
+
+ if (any(counts < 0)) {
+ stop("Counts file: all counts must be positive.")
+ }
+
+ # Turn counts from a data frame into a matrix. (In R a data frame and matrix
+ # are sometimes interchangeable, but sometimes we need it to be matrix.)
+ as.matrix(counts)
+}
+
+ReadMapFile <- function(map_file, params) {
+ # Read in the map file which is in the following format (two hash functions):
+ # str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ...
+ # str2, ...
+ # Output:
+ # map: a sparse representation of set bits for each candidate string.
+ # strs: a vector of all candidate strings.
+
+ Log("Parsing %s", map_file)
+
+ map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE)
+ strs <- map_pos[, 1]
+ strs[strs == ""] <- "Empty"
+
+ # Remove duplicated strings.
+ ind <- which(!duplicated(strs))
+ strs <- strs[ind]
+ map_pos <- map_pos[ind, ]
+
+ n <- ncol(map_pos) - 1
+ if (n != (params$h * params$m)) {
+ stop(paste0("Map file: number of columns should equal hm + 1:",
+ n, "_", params$h * params$m))
+ }
+
+ row_pos <- unlist(map_pos[, -1], use.names = FALSE)
+ col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1)
+
+ # TODO: When would this ever happen?
+ removed <- which(is.na(row_pos))
+ if (length(removed) > 0) {
+ Log("Removed %d entries", length(removed))
+ row_pos <- row_pos[-removed]
+ col_pos <- col_pos[-removed]
+ }
+
+ map <- sparseMatrix(row_pos, col_pos,
+ dims = c(params$m * params$k, length(strs)))
+
+ colnames(map) <- strs
+ list(map = map, strs = strs, map_pos = map_pos)
+}
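+
+# Sketch of the expected CSV layout for k = 4, h = 2, m = 2 (hypothetical
+# strings and bit positions):
+#
+#   apple,1,3,6,7
+#   banana,2,4,5,8
+#
+# Here "apple" sets bits 1 and 3 in cohort 1, and bits 2 and 3 of cohort 2
+# (stored as 6 = 2 + k and 7 = 3 + k).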
+
+LoadMapFile <- function(map_file, params) {
+ # Reads the map file, caching an .rda (R binary data) version of it to speed
+ # up future loads.
+
+ rda_path <- sub(".csv", ".rda", map_file, fixed = TRUE)
+ # This must be unique per process, so concurrent processes don't try to
+ # write the same file.
+ tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid())
+
+ # First save to a temp file, and then atomically rename to the destination.
+ if (file.exists(rda_path)) {
+ Log("Loading %s", rda_path)
+ load(rda_path, .GlobalEnv) # creates the 'map' variable in the global env
+ } else {
+ map <- ReadMapFile(map_file, params)
+
+ Log("Saving %s as an rda file for faster access", map_file)
+ tryCatch({
+ save(map, file = tmp_path)
+ file.rename(tmp_path, rda_path)
+ }, warning = function(w) {
+ Log("WARNING: %s", w)
+ }, error = function(e) {
+ Log("ERROR: %s", e)
+ })
+ }
+ return(map)
+}
diff --git a/analysis/R/run_tests.R b/analysis/R/run_tests.R
new file mode 100755
index 0000000..8a4692f
--- /dev/null
+++ b/analysis/R/run_tests.R
@@ -0,0 +1,48 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# Run unit tests for RAPPOR R code.
+
+library(RUnit)
+
+run_tests <- function() {
+ dirs <- "analysis/R" # Run from root
+ test_suite <- defineTestSuite("rappor", dirs, testFileRegexp = "_test.R$",
+ testFuncRegexp = "^Test")
+ stopifnot(isValidTestSuite(test_suite))
+
+ test_result <- runTestSuite(test_suite)
+
+ printTextProtocol(test_result) # print to stdout
+
+ result <- test_result[[1]] # Result for our only suite
+
+ # Sanity check: fail if there were no tests found.
+ if (result$nTestFunc == 0) {
+ cat("No tests found.\n")
+ return(FALSE)
+ }
+ if (result$nFail != 0 || result$nErr != 0) {
+ cat("Some tests failed.\n")
+ return(FALSE)
+ }
+ return(TRUE)
+}
+
+if (!run_tests()) {
+ quit(status = 1)
+}
diff --git a/analysis/R/simulation.R b/analysis/R/simulation.R
new file mode 100755
index 0000000..251c595
--- /dev/null
+++ b/analysis/R/simulation.R
@@ -0,0 +1,268 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# RAPPOR simulation library. Contains code for encoding simulated data and
+# creating the map used to encode and decode reports.
+
+library(glmnet)
+library(parallel) # mclapply
+
+SetOfStrings <- function(num_strings = 100) {
+ # Generates a set of strings for simulation purposes.
+ strs <- paste0("V_", as.character(1:num_strings))
+ strs
+}
+
+GetSampleProbs <- function(params) {
+  # Generate different underlying distributions for simulation purposes.
+  # Args:
+  #  - params: a list describing the shape of the true distribution:
+  #            c(num_strings, prop_nonzero_strings, decay_type,
+  #              rate_exponential, background).
+ nstrs <- params[[1]]
+ nonzero <- params[[2]]
+ decay <- params[[3]]
+ expo <- params[[4]]
+ background <- params[[5]]
+
+ probs <- rep(0, nstrs)
+ ind <- floor(nstrs * nonzero)
+ if (decay == "Linear") {
+ probs[1:ind] <- (ind:1) / sum(1:ind)
+ } else if (decay == "Constant") {
+ probs[1:ind] <- 1 / ind
+ } else if (decay == "Exponential") {
+ temp <- seq(0, nonzero, length.out = ind)
+ temp <- exp(-temp * expo)
+ temp <- temp + background
+ temp <- temp / sum(temp)
+ probs[1:ind] <- temp
+ } else {
+    stop('params[[3]] must be in c("Linear", "Exponential", "Constant")')
+ }
+ probs
+}
+
+EncodeAll <- function(x, cohorts, map, params, num_cores = 1) {
+ # Encodes the ground truth into RAPPOR reports.
+ #
+ # Args:
+ # x: Observed strings for each report, Nx1 vector
+  #   cohorts: Cohort assignment for each report, Nx1 vector
+ # map: list of matrices encoding locations of hashes for each
+ # string, for each cohort
+ # params: System parameters
+ #
+ # Returns:
+ # RAPPOR reports for each piece of data.
+
+ p <- params$p
+ q <- params$q
+ f <- params$f
+ k <- params$k
+
+ qstar <- (1 - f / 2) * q + (f / 2) * p
+ pstar <- (1 - f / 2) * p + (f / 2) * q
+
+ candidates <- colnames(map[[1]])
+ if (!all(x %in% candidates)) {
+ stop("Some strings are not in the map. set(X) - set(candidates): ",
+ paste(setdiff(unique(x), candidates), collapse=" "), "\n")
+ }
+ bfs <- mapply(function(x, y) y[, x], x, map[cohorts], SIMPLIFY = FALSE,
+ USE.NAMES = FALSE)
+ reports <- mclapply(bfs, function(x) {
+ noise <- sample(0:1, k, replace = TRUE, prob = c(1 - pstar, pstar))
+ ind <- which(x)
+ noise[ind] <- sample(0:1, length(ind), replace = TRUE,
+ prob = c(1 - qstar, qstar))
+ noise
+ }, mc.cores = num_cores)
+
+ reports
+}
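+
+# For example, with p = 0.5, q = 0.75 and f = 0.5, the effective bit
+# probabilities after permanent randomization are
+#   pstar = 0.75 * 0.5  + 0.25 * 0.75 = 0.5625
+#   qstar = 0.75 * 0.75 + 0.25 * 0.5  = 0.6875
+# so each report bit is drawn with probability qstar where the Bloom filter
+# is set and pstar elsewhere, as in the sampling above.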
+
+CreateMap <- function(strs, params, generate_pos = TRUE, basic = FALSE) {
+ # Creates a list of 0/1 matrices corresponding to mapping between the strs and
+ # Bloom filters for each instance of the RAPPOR.
+ # Ex. for 3 strings, 2 instances, 1 hash function and Bloom filter of size 4,
+ # the result could look this:
+ # [[1]]
+ # 1 0 0 0
+ # 0 1 0 0
+ # 0 0 0 1
+ # [[2]]
+ # 0 1 0 0
+ # 0 0 0 1
+ # 0 0 1 0
+ #
+ # Args:
+ # strs: a vector of strings
+ # params: a list of parameters in the following format:
+ # (k, h, m, p, q, f).
+ # generate_pos: Tells whether to generate an object storing the
+ # positions of the nonzeros in the matrix
+ # basic: Tells whether to use basic RAPPOR (only works if h=1).
+
+ M <- length(strs)
+ map_by_cohort <- list()
+ k <- params$k
+ h <- params$h
+ m <- params$m
+
+ for (i in 1:m) {
+ if (basic && (h == 1) && (k == M)) {
+ ones <- 1:M
+ } else {
+ ones <- sample(1:k, M * h, replace = TRUE)
+ }
+ cols <- rep(1:M, each = h)
+ map_by_cohort[[i]] <- sparseMatrix(ones, cols, dims = c(k, M))
+ colnames(map_by_cohort[[i]]) <- strs
+ }
+
+ all_cohorts_map <- do.call("rBind", map_by_cohort)
+ if (generate_pos) {
+ map_pos <- t(apply(all_cohorts_map, 2, function(x) {
+ ind <- which(x == 1)
+ n <- length(ind)
+ if (n < h * m) {
+ ind <- c(ind, rep(NA, h * m - n))
+ }
+ ind
+ }))
+ } else {
+ map_pos <- NULL
+ }
+
+ list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map,
+ map_pos = map_pos)
+}
+
+GetSample <- function(N, strs, probs) {
+ # Sample for the strs population with distribution probs.
+ sample(strs, N, replace = TRUE, prob = probs)
+}
+
+GetTrueBits <- function(samp, map, params) {
+ # Convert sample generated by GetSample() to Bloom filters where mapping
+ # is defined in map.
+ # Output:
+  #   - reports: a matrix of size [m x (k + 1)] where each row represents
+  #              the number of times each bit in the Bloom filter was set
+  #              for a particular instance (cohort).
+  #   Note: reports[, 1] contains the total number of reports per instance.
+
+ N <- length(samp)
+ k <- params$k
+ m <- params$m
+ strs <- colnames(map[[1]])
+ reports <- matrix(0, m, k + 1)
+ inst <- sample(1:m, N, replace = TRUE)
+ for (i in 1:m) {
+ tab <- table(samp[inst == i])
+ tab2 <- rep(0, length(strs))
+ tab2[match(names(tab), strs)] <- tab
+ counts <- apply(map[[i]], 1, function(x) x * tab2)
+ reports[i, ] <- c(sum(tab2), apply(counts, 2, sum))
+ }
+ reports
+}
+
+GetNoisyBits <- function(truth, params) {
+ # Applies RAPPOR to the Bloom filters.
+ # Args:
+ # - truth: a matrix generated by GetTrueBits().
+
+ k <- params$k
+ p <- params$p
+ q <- params$q
+ f <- params$f
+
+ rappors <- apply(truth, 1, function(x) {
+    # The following samples each bit, considering 4 cases:
+ # 1. Signal and we lie on the bit.
+ # 2. Signal and we tell the truth.
+ # 3. Noise and we lie.
+ # 4. Noise and we tell the truth.
+
+    # Lies when there is signal, sampled from the binomial distribution.
+ lied_signal <- rbinom(k, x[-1], f)
+
+    # The remainder are the truthful bits when there is signal; sampled with q.
+ truth_signal <- x[-1] - lied_signal
+
+ # Lies when there is no signal which happens x[1] - x[-1] times.
+ lied_nosignal <- rbinom(k, x[1] - x[-1], f)
+
+    # Truth when there's no signal. These are sampled with p.
+ truth_nosignal <- x[1] - x[-1] - lied_nosignal
+
+ # Total lies and sampling lies with 50/50 for either p or q.
+ lied <- lied_signal + lied_nosignal
+ lied_p <- rbinom(k, lied, .5)
+ lied_q <- lied - lied_p
+
+ # Generating the report where sampling of either p or q occurs.
+ rbinom(k, lied_q + truth_signal, q) + rbinom(k, lied_p + truth_nosignal, p)
+ })
+
+ cbind(truth[, 1], t(rappors))
+}
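+
+# Sanity sketch for the sampler above: the expected noisy count for bit j in
+# instance i is
+#   qstar * truth[i, j + 1] + pstar * (truth[i, 1] - truth[i, j + 1])
+# where qstar = (1 - f / 2) * q + (f / 2) * p and pstar is defined
+# symmetrically, matching the per-report encoder earlier in this file.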
+
+GenerateSamples <- function(N = 10^5, params, pop_params, alpha = .05,
+ prop_missing = 0,
+ correction = "Bonferroni") {
+ # Simulate N reports with pop_params describing the population and
+ # params describing the RAPPOR configuration.
+  num_strings <- pop_params[[1]]
+
+ strs <- SetOfStrings(num_strings)
+ probs <- GetSampleProbs(pop_params)
+ samp <- GetSample(N, strs, probs)
+ map <- CreateMap(strs, params)
+ truth <- GetTrueBits(samp, map$map_by_cohort, params)
+ rappors <- GetNoisyBits(truth, params)
+
+ strs_apprx <- strs
+ map_apprx <- map$all_cohorts_map
+  # Remove a proportion (prop_missing) of the strings to simulate candidates
+  # missing from the map.
+ if (prop_missing > 0) {
+ ind <- which(probs > 0)
+ removed <- sample(ind, ceiling(prop_missing * length(ind)))
+ map_apprx <- map$all_cohorts_map[, -removed]
+ strs_apprx <- strs[-removed]
+ }
+
+ # Randomize the columns.
+ ind <- sample(1:length(strs_apprx), length(strs_apprx))
+ map_apprx <- map_apprx[, ind]
+ strs_apprx <- strs_apprx[ind]
+
+ fit <- Decode(rappors, map_apprx, params, alpha = alpha,
+ correction = correction)
+
+ # Add truth column.
+ fit$fit$Truth <- table(samp)[fit$fit$string]
+ fit$fit$Truth[is.na(fit$fit$Truth)] <- 0
+
+ fit$map <- map$map_by_cohort
+ fit$truth <- truth
+ fit$strs <- strs
+ fit$probs <- probs
+
+ fit
+}
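+
+# Usage sketch (SetOfStrings() and GetSampleProbs() are defined earlier in
+# this file; the tail of pop_params below is hypothetical and depends on
+# GetSampleProbs()):
+#
+#   params <- list(k = 128, h = 2, m = 8, p = 0.5, q = 0.75, f = 0.5)
+#   pop_params <- list(100, 1)  # first element is the number of strings
+#   fit <- GenerateSamples(10^5, params, pop_params)
+#   head(fit$fit)  # per-string estimates with the Truth column appended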
diff --git a/analysis/R/unknowns_test.R b/analysis/R/unknowns_test.R
new file mode 100755
index 0000000..5efd738
--- /dev/null
+++ b/analysis/R/unknowns_test.R
@@ -0,0 +1,139 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Author: fanti@google.com (Giulia Fanti)
+#
+# Tests the unknown unknowns dictionary estimation functions.
+# There are two main components involved in estimating this unknown
+# distribution:
+# a) Find the pairwise ngrams that co-occur often.
+#     b) Determine which full strings are consistent with all pairwise
+# relations.
+#
+# TestEstimateDictionary() tests the full pipeline, including parts (a)
+# and (b).
+# TestFindFeasibleStrings() tests only part (b).
+# Both tests generate their own data.
+
+library(parallel)
+library(RUnit)  # for checkTrue()
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/association.R")
+source("analysis/R/decode_ngrams.R")
+source("analysis/R/ngrams_simulation.R")
+alphabet <- letters
+options(warn = -1)
+
+GeneratePopulation <- function(N, num_strs, str_len = 10,
+ distribution = NULL) {
+ # Generates a /deterministic/ string for each individual in the
+ # population from distribution.
+ #
+ # Args:
+ # N: Number of individuals in the population
+  #   num_strs: Number of distinct strings in the population to draw from
+ # str_len: Length of each string
+ # distribution: Just here for compatibility with original
+ # GeneratePopulation function in ngrams_simulation.R
+ #
+ # Returns:
+ # Vector of strings for each individual in the population
+
+ strs <- sapply(1:num_strs, function(i) {
+ paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "")
+ })
+
+ # Uniform distribution
+ prob <- rep(1 / num_strs, num_strs)
+ sample(strs, N, replace = TRUE, prob = prob)
+}
+
+TestEstimateDictionary <- function() {
+ # Tests that the algorithm without noise recovers a uniform
+ # string population correctly.
+
+ # Compute the strings from measuring only 2 ngrams
+ N <- 100
+ str_len <- 6
+ ngram_size <- 2
+ num_ngrams <- str_len / ngram_size
+ num_strs <- 1
+
+ params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
+
+ ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
+ num_ngrams_collected = 2)
+
+ sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
+ alphabet, params, distribution = 3)
+
+ res <- EstimateDictionary(sim, N, ngram_params, params)
+
+ # Check that the correct strings are found
+ if (num_strs == 1) {
+ checkTrue(res$found_candidates == sort(unique(sim$strs)))
+ } else {
+ checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs))))
+ }
+}
+
+TestFindFeasibleStrings <- function() {
+  # Tests that FindFeasibleStrings weeds out false positives.
+ # We test this by adding false positives to the pairwise estimates.
+ N <- 100
+ str_len <- 6
+ ngram_size <- 2
+ num_ngrams <- str_len / ngram_size
+ num_strs <- 2
+
+ params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
+
+ ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
+ num_ngrams_collected = 2)
+
+ sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
+ alphabet, params)
+
+ pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params,
+ params)$candidate_strs
+ cat("Found the pairwise candidates. \n")
+
+  if (is.null(pairwise_candidates)) {
+    return(FALSE)
+  }
+
+  # Add a known false positive that FindFeasibleStrings should weed out.
+  pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le"))
+
+ conn <- file('graph.txt', 'w+')
+ WriteKPartiteGraph(conn,
+ pairwise_candidates,
+ sim$pairings,
+ ngram_params$num_ngrams,
+ ngram_params$ngram_size)
+
+ close(conn)
+ cat("Wrote graph.txt\n")
+
+ found_candidates <- FindFeasibleStrings(pairwise_candidates,
+ sim$pairings,
+ ngram_params$num_ngrams,
+ ngram_params$ngram_size)
+ # Check that the correct strings are found
+ if (num_strs == 1) {
+ checkTrue(found_candidates == sort(unique(sim$strs)))
+ } else {
+ checkTrue(all.equal(found_candidates, sort(unique(sim$strs))))
+ }
+}
diff --git a/analysis/R/util.R b/analysis/R/util.R
new file mode 100755
index 0000000..7fa75fe
--- /dev/null
+++ b/analysis/R/util.R
@@ -0,0 +1,18 @@
+#!/usr/bin/Rscript
+#
+# Common utility library for all R scripts.
+
+# Log message with timing. Example:
+#
+# _____ 1.301 My message
+#
+# The prefix makes it stand out (vs R's print()), and the number is the time so
+# far.
+#
+# NOTE: The shell script log uses hyphens.
+
+Log <- function(...) {
+ cat(sprintf('_____ %.3f ', proc.time()[['elapsed']]))
+ cat(sprintf(...))
+ cat('\n')
+}
diff --git a/analysis/cpp/README.md b/analysis/cpp/README.md
new file mode 100644
index 0000000..bb14832
--- /dev/null
+++ b/analysis/cpp/README.md
@@ -0,0 +1,12 @@
+find_cliques
+============
+
+This tool does part of the analysis for unknown dictionaries. To run it:
+
+ $ ./run.sh demo
+
+This compiles and runs it on files in the testdata/ directory.
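+
+The input is a k-partite graph in a small text format; for example,
+`testdata/graph3.txt` begins:
+
+    num_partitions 3
+    ngram_size 2
+    edge 0.ab 1.cd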
+
+See comments in find_cliques.cc for information on how it works.
+
+
diff --git a/analysis/cpp/fast_em.cc b/analysis/cpp/fast_em.cc
new file mode 100644
index 0000000..5bdfedb
--- /dev/null
+++ b/analysis/cpp/fast_em.cc
@@ -0,0 +1,309 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <assert.h>
+#include <stdarg.h> // va_list, etc.
+#include <stdio.h> // fread()
+#include <stdlib.h> // exit()
+#include <stdint.h> // uint16_t
+#include <string.h> // strcmp()
+#include <cmath> // std::abs operates on doubles
+#include <cstdlib> // strtol
+#include <vector>
+
+using std::vector;
+
+// Log messages to stdout.
+void log(const char* fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ vprintf(fmt, args);
+ va_end(args);
+ printf("\n");
+}
+
+const int kTagLen = 4; // 4 byte tags in the file format
+
+bool ExpectTag(FILE* f, const char* tag) {
+ char buf[kTagLen];
+
+ if (fread(buf, sizeof buf[0], kTagLen, f) != kTagLen) {
+ return false;
+ }
+  if (memcmp(buf, tag, kTagLen) != 0) {  // buf may not be NUL-terminated
+ log("Error: expected '%s'", tag);
+ return false;
+ }
+ return true;
+}
+
+static bool ReadListOfMatrices(
+ FILE* f, uint32_t* num_entries_out, uint32_t* entry_size_out,
+ vector<double>* v_out) {
+ if (!ExpectTag(f, "ne ")) {
+ return false;
+ }
+
+ // R integers are serialized as uint32_t
+ uint32_t num_entries;
+ if (fread(&num_entries, sizeof num_entries, 1, f) != 1) {
+ return false;
+ }
+
+ log("num entries: %d", num_entries);
+
+ if (!ExpectTag(f, "es ")) {
+ return false;
+ }
+
+ uint32_t entry_size;
+ if (fread(&entry_size, sizeof entry_size, 1, f) != 1) {
+ return false;
+ }
+ log("entry_size: %d", entry_size);
+
+ if (!ExpectTag(f, "dat")) {
+ return false;
+ }
+
+ // Now read dynamic data
+ size_t vec_length = num_entries * entry_size;
+
+ vector<double>& v = *v_out;
+ v.resize(vec_length);
+
+ if (fread(&v[0], sizeof v[0], vec_length, f) != vec_length) {
+ return false;
+ }
+
+ // Print out head for sanity
+ size_t n = 20;
+ for (size_t i = 0; i < n && i < v.size(); ++i) {
+ log("%d: %f", i, v[i]);
+ }
+
+ *num_entries_out = num_entries;
+ *entry_size_out = entry_size;
+
+ return true;
+}
+
+void PrintEntryVector(const vector<double>& cond_prob, size_t m,
+ size_t entry_size) {
+ size_t c_base = m * entry_size;
+ log("cond_prob[m = %d] = ", m);
+ for (size_t i = 0; i < entry_size; ++i) {
+ printf("%e ", cond_prob[c_base + i]);
+ }
+ printf("\n");
+}
+
+void PrintPij(const vector<double>& pij) {
+ double sum = 0.0;
+ printf("PIJ:\n");
+ for (size_t i = 0; i < pij.size(); ++i) {
+ printf("%f ", pij[i]);
+ sum += pij[i];
+ }
+ printf("\n");
+ printf("SUM: %f\n", sum); // sum is 1.0 after normalization
+ printf("\n");
+}
+
+// EM algorithm to iteratively estimate parameters.
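+//
+// Sketch of one iteration, matching the loop below:
+//   for each report m with conditional-probability row c_m (entry_size wide):
+//     z[i]     = c_m[i] * pij[i]    (elementwise product)
+//     new_pij += z / sum(z)         (normalize, then accumulate)
+//   new_pij /= num_entries          (average over reports)
+// and iterate until max_i |new_pij[i] - pij[i]| < epsilon.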
+
+static int ExpectationMaximization(
+ uint32_t num_entries, uint32_t entry_size, const vector<double>& cond_prob,
+ int max_em_iters, double epsilon, vector<double>* pij_out) {
+ // Start out with uniform distribution.
+ vector<double> pij(entry_size, 0.0);
+ double init = 1.0 / entry_size;
+ for (size_t i = 0; i < pij.size(); ++i) {
+ pij[i] = init;
+ }
+ log("Initialized %d entries with %f", pij.size(), init);
+
+ vector<double> prev_pij(entry_size, 0.0); // pij on previous iteration
+
+ log("Starting up to %d EM iterations", max_em_iters);
+
+ int em_iter = 0; // visible after loop
+ for (; em_iter < max_em_iters; ++em_iter) {
+ //
+ // lapply() step.
+ //
+
+ // Computed below as a function of old Pij and conditional probability for
+ // each report.
+ vector<double> new_pij(entry_size, 0.0);
+
+ // m is the matrix index, giving the conditional probability matrix for a
+ // single report.
+ for (size_t m = 0; m < num_entries; ++m) {
+ vector<double> z(entry_size, 0.0);
+
+ double sum_z = 0.0;
+
+ // base index for the matrix corresponding to a report.
+ size_t c_base = m * entry_size;
+
+ for (size_t i = 0; i < entry_size; ++i) { // multiply and running sum
+ size_t c_index = c_base + i;
+ z[i] = cond_prob[c_index] * pij[i];
+ sum_z += z[i];
+ }
+
+ // Normalize and Reduce("+", wcp) step. These two steps are combined for
+ // memory locality.
+ for (size_t i = 0; i < entry_size; ++i) {
+ new_pij[i] += z[i] / sum_z;
+ }
+ }
+
+ // Divide outside the loop
+ for (size_t i = 0; i < entry_size; ++i) {
+ new_pij[i] /= num_entries;
+ }
+
+ //PrintPij(new_pij);
+
+ //
+ // Check for termination
+ //
+ double max_dif = 0.0;
+ for (size_t i = 0; i < entry_size; ++i) {
+ double dif = std::abs(new_pij[i] - pij[i]);
+ if (dif > max_dif) {
+ max_dif = dif;
+ }
+ }
+
+ pij = new_pij; // copy
+
+ log("fast EM iteration %d, dif = %e", em_iter, max_dif);
+
+ if (max_dif < epsilon) {
+ log("Early EM termination: %e < %e", max_dif, epsilon);
+ break;
+ }
+ }
+
+ *pij_out = pij;
+  // If the loop broke out early at index i, then i + 1 iterations ran; if it
+  // ran to completion, em_iter equals max_em_iters.
+ return em_iter;
+}
+
+bool WriteTag(const char* tag, FILE* f_out) {
+ assert(strlen(tag) == 3); // write 3 byte tags with NUL byte
+ return fwrite(tag, 1, 4, f_out) == 4;
+}
+
+// Write the probabilities as a flat list of doubles. The caller knows what
+// the dimensions are.
+bool WriteResult(const vector<double>& pij, uint32_t num_em_iters,
+ FILE* f_out) {
+ if (!WriteTag("emi", f_out)) {
+ return false;
+ }
+ if (fwrite(&num_em_iters, sizeof num_em_iters, 1, f_out) != 1) {
+ return false;
+ }
+
+ if (!WriteTag("pij", f_out)) {
+ return false;
+ }
+ size_t n = pij.size();
+ if (fwrite(&pij[0], sizeof pij[0], n, f_out) != n) {
+ return false;
+ }
+ return true;
+}
+
+// Like atoi, but with basic (not exhaustive) error checking.
+bool StringToInt(const char* s, int* result) {
+ bool ok = true;
+ char* end; // mutated by strtol
+
+ *result = strtol(s, &end, 10); // base 10
+  // If strtol didn't consume any characters, it failed.
+ if (end == s) {
+ ok = false;
+ }
+ return ok;
+}
+
+int main(int argc, char **argv) {
+ if (argc < 4) {
+ log("Usage: read_numeric INPUT OUTPUT max_em_iters");
+ return 1;
+ }
+
+ char* in_filename = argv[1];
+ char* out_filename = argv[2];
+
+ int max_em_iters;
+ if (!StringToInt(argv[3], &max_em_iters)) {
+ log("Error parsing max_em_iters");
+ return 1;
+ }
+
+ FILE* f = fopen(in_filename, "rb");
+  if (f == NULL) {
+    log("Error opening input file %s", in_filename);
+    return 1;
+  }
+
+ // Try opening first so we don't do a long computation and then fail.
+ FILE* f_out = fopen(out_filename, "wb");
+  if (f_out == NULL) {
+    log("Error opening output file %s", out_filename);
+    return 1;
+  }
+
+ uint32_t num_entries;
+ uint32_t entry_size;
+ vector<double> cond_prob;
+ if (!ReadListOfMatrices(f, &num_entries, &entry_size, &cond_prob)) {
+ log("Error reading list of matrices");
+ return 1;
+ }
+
+ fclose(f);
+
+ // Sanity check
+ double debug_sum = 0.0;
+ for (size_t m = 0; m < num_entries; ++m) {
+ // base index for the matrix corresponding to a report.
+ size_t c_base = m * entry_size;
+ for (size_t i = 0; i < entry_size; ++i) { // multiply and running sum
+ debug_sum += cond_prob[c_base + i];
+ }
+ }
+ log("Debug sum: %f", debug_sum);
+
+ double epsilon = 1e-6;
+ log("epsilon: %f", epsilon);
+
+ vector<double> pij(entry_size);
+ int num_em_iters = ExpectationMaximization(
+ num_entries, entry_size, cond_prob, max_em_iters, epsilon, &pij);
+
+ if (!WriteResult(pij, num_em_iters, f_out)) {
+ log("Error writing result matrix");
+ return 1;
+ }
+ fclose(f_out);
+
+ log("fast EM done");
+ return 0;
+}
diff --git a/analysis/cpp/find_cliques.cc b/analysis/cpp/find_cliques.cc
new file mode 100644
index 0000000..ea2dda9
--- /dev/null
+++ b/analysis/cpp/find_cliques.cc
@@ -0,0 +1,546 @@
+// Copyright 2014 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <assert.h>
+#include <stdarg.h> // va_list, etc.
+#include <stdio.h>
+#include <stdint.h> // uint16_t
+#include <string>
+// Using unordered_{set,map} and not the older set,map since they only require
+// implementing equality, not comparison. They require a C++ 11 compiler.
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+// find_cliques.cc: Find k-cliques in a k-partite graph. This is part of the
+// RAPPOR analysis for unknown dictionaries.
+//
+// A clique is a complete subgraph; it has (|N| choose 2) edges.
+//
+// This does the same computation as FindFeasibleStrings in
+// analysis/R/decode_ngrams.R.
+
+// Graph format:
+//
+// num_partitions 3
+// 0.ab 1.bc
+// 0.ab 2.de
+//
+// See WriteKPartiteGraph in analysis/R/decode_ngrams.R for details.
+//
+// PERFORMANCE
+//
+// The code is optimized in terms of memory locality. Nodes are 4 bytes; Edges
+// are 8 bytes; PathArray is a contiguous block of memory.
+
+using std::unordered_map;
+using std::unordered_set;
+using std::string;
+using std::vector;
+
+// TODO: log to stderr. Add VERBOSE logging.
+void log(const char* fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ vprintf(fmt, args);
+ va_end(args);
+ printf("\n");
+}
+
+// Nodes and Edges are value types. A node is 4 bytes. 2^16 = 65536
+// partitions is plenty.
+struct Node {
+ uint16_t partition;
+ // Right now we support bigrams. We may want to support trigrams or
+ // arbitrary n-grams, although there will be a performance hit.
+ char ngram[2];
+
+ // for debugging only
+ string ToString() const {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d.%c%c", partition, ngram[0], ngram[1]);
+ return string(buf); // copies buf
+ }
+};
+
+// Implement hash and equality functors for unordered_set.
+struct NodeHash {
+ int operator() (const Node& node) const {
+ // DJB hash: http://floodyberry.com/noncryptohashzoo/DJB.html
+ int h = 5381;
+ h = (h << 5) + h + node.partition;
+ h = (h << 5) + h + node.ngram[0];
+ h = (h << 5) + h + node.ngram[1];
+ // log("hash %s = %d", node.ToString().c_str(), h);
+ return h;
+ }
+};
+
+struct NodeEq {
+ bool operator() (const Node& x, const Node& y) const {
+ // TODO: optimize to 4 byte comparison with memcmp(&x, &y, sizeof(Node))?
+ // NOTE: x.ngram == y.ngram is wrong; it compares pointers!
+ return x.partition == y.partition &&
+ x.ngram[0] == y.ngram[0] &&
+ x.ngram[1] == y.ngram[1];
+ }
+};
+
+// This is an undirected edge, but we still call them "left" and "right"
+// because the partition of "left" must be less than that of "right".
+//
+// NOTE: To reduce the size further, we could have a NodePool, and then typedef
+// uint16_t NodeId. Edge and Path can both use a 2 byte NodeId instead of a 4
+// byte Node. ToString() can take the NodePool for pretty printing.
+//
+// This will be better for the EnumeratePaths stage, but it will be
+// worse for the CheckForCliques stage (doing the lookups may reduce memory
+// locality).
+
+struct Edge {
+ Node left;
+ Node right;
+
+ // for debugging only
+ string ToString() const {
+ return left.ToString() + " - " + right.ToString();
+ }
+};
+
+// Implement hash and equality functors for unordered_set.
+struct EdgeHash {
+ int operator() (const Edge& edge) const {
+ // DJB hash
+ int h = 5381;
+ h = (h << 5) + h + NodeHash()(edge.left);
+ h = (h << 5) + h + NodeHash()(edge.right);
+ return h;
+ }
+};
+
+struct EdgeEq {
+ bool operator() (const Edge& x, const Edge& y) const {
+ // TODO: optimize to 8 byte comparison with memcmp(&x, &y, sizeof(Edge))?
+    // This is in the inner loop for removing candidates.
+ return NodeEq()(x.left, y.left) && NodeEq()(x.right, y.right);
+ }
+};
+
+typedef unordered_set<Edge, EdgeHash, EdgeEq> EdgeSet;
+
+// The full graph. It is k-partite, which can be seen by the node naming
+// convention.
+struct Graph {
+ int num_partitions;
+ vector<Edge> edges;
+};
+
+// Given a Node, look up Nodes in the adjacent partition that it is connected
+// to.
+typedef unordered_map<Node, vector<Node>, NodeHash, NodeEq> Adjacency;
+
+// for debugging only
+string AdjacencyToString(const Adjacency& a) {
+ string s;
+ for (auto& kv : a) {
+ s += kv.first.ToString();
+ s += " : <";
+ for (auto& node : kv.second) {
+ s += node.ToString();
+ s += " ";
+ }
+ s += "> ";
+ }
+ return s;
+}
+
+// Subgraph where only edges between adjacent partitions are included.
+//
+// We have k partitions, numbered 0 to k-1. This means we have k-1 "columns",
+// numbered 0 to k-2.
+//
+// A column is subgraph containing edges between adjacent partitions of the
+// k-partite graph.
+//
+// The ColumnSubgraph class represents ALL columns (and is itself a subgraph).
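+//
+// Example: with k = 4 partitions there are 3 columns, holding the edges
+// between partitions 0-1, 1-2, and 2-3 respectively.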
+
+class ColumnSubgraph {
+ public:
+ explicit ColumnSubgraph(int num_columns)
+ : num_columns_(num_columns),
+ adj_list_(new Adjacency[num_columns]) {
+ }
+ ~ColumnSubgraph() {
+ delete[] adj_list_;
+ }
+ void AddEdge(Edge e) {
+ int part = e.left.partition;
+ assert(part < num_columns_);
+
+ adj_list_[part][e.left].push_back(e.right);
+ }
+ void GetColumn(int part, vector<Edge>* out) const {
+ const Adjacency& a = adj_list_[part];
+ for (auto& kv : a) {
+ for (auto& right : kv.second) {
+ Edge e;
+ e.left = kv.first;
+ e.right = right;
+ out->push_back(e);
+ }
+ }
+ }
+ // Get the nodes in the next partition adjacent to node N
+ void GetAdjacentNodes(Node n, vector<Node>* out) const {
+ int part = n.partition;
+ const Adjacency& a = adj_list_[part];
+
+ // log("GetAdjacentNodes %s, part %d", n.ToString().c_str(), part);
+
+ auto it = a.find(n);
+ if (it == a.end()) {
+ return;
+ }
+ // TODO: it would be better not to copy these.
+ for (auto node : it->second) {
+ out->push_back(node);
+ }
+ }
+
+ // accessor
+ int num_columns() const { return num_columns_; }
+
+ // for debugging only
+ string ToString() const {
+ string s("[\n");
+ char buf[100];
+ for (int i = 0; i < num_columns_; ++i) {
+ const Adjacency& a = adj_list_[i];
+ snprintf(buf, sizeof(buf), "%d (%zu) ", i, a.size());
+ s += string(buf);
+ s += AdjacencyToString(a);
+ s += "\n";
+ }
+ s += " ]";
+ return s;
+ }
+
+ private:
+ int num_columns_;
+ // Adjacency list. An array of k-1 maps.
+ // Lookup goes from nodes in partition i to nodes in partition i+1.
+ Adjacency* adj_list_;
+};
+
+void BuildColumnSubgraph(const Graph& g, ColumnSubgraph* a) {
+ for (const auto& e : g.edges) {
+ if (e.left.partition + 1 == e.right.partition) {
+ a->AddEdge(e);
+ }
+ }
+}
+
+// A 2D array of paths. It's an array because all paths are the same length.
+// We use a single vector<> to represent it, to reduce memory allocation.
+class PathArray {
+ public:
+ explicit PathArray(int path_length)
+ : path_length_(path_length),
+ num_paths_(0) {
+ }
+ void AddEdgeAsPath(Edge e) {
+ // Can only initialize PathArray with edges when path length is 2
+ assert(path_length_ == 2);
+
+ nodes_.push_back(e.left);
+ nodes_.push_back(e.right);
+ num_paths_++;
+ }
+ Node LastNodeInPath(int index) const {
+ int start = index * path_length_;
+    return nodes_[start + path_length_ - 1];
+ }
+ // Pretty print a single path in this array. For debugging only.
+ string PathDebugString(int index) const {
+ string s("[ ");
+ for (int i = index * path_length_; i < (index + 1) * path_length_; ++i) {
+ s += nodes_[i].ToString();
+ s += " - ";
+ }
+ s += " ]";
+ return s;
+ }
+ // Print the word implied by the path.
+ string PathAsString(int index) const {
+ string s;
+ for (int i = index * path_length_; i < (index + 1) * path_length_; ++i) {
+ s += nodes_[i].ngram[0];
+ s += nodes_[i].ngram[1];
+ }
+ return s;
+ }
+ const Node* GetPathStart(int index) const {
+ return &nodes_[index * path_length_];
+ }
+ void AddPath(const Node* start, int prefix_length, Node right) {
+ // Make sure it is one less
+ assert(prefix_length == path_length_-1);
+
+ // TODO: replace with memcpy? Is it faster?
+ for (int i = 0; i < prefix_length; ++i) {
+ nodes_.push_back(start[i]);
+ }
+ nodes_.push_back(right);
+ num_paths_++;
+ }
+
+ // accessors
+ int num_paths() const { return num_paths_; }
+ int path_length() const { return path_length_; }
+
+ private:
+ int path_length_;
+ int num_paths_;
+ vector<Node> nodes_;
+};
+
+// Given a PathArray of length i, produce one of length i+1.
+//
+// NOTE: It would be more efficient to filter 'right_nodes' here, and only add
+// a new path if it forms a "partial clique" (at step i+1). This amounts to
+// doing the membership tests in edge_set for each "column", instead of waiting
+// until the end.
+//
+// This will reduce the exponential blowup of EnumeratePaths (although it
+// doesn't change the worst case).
+
+void EnumerateStep(
+ const ColumnSubgraph& subgraph, const PathArray& in, PathArray* out) {
+
+ int prefix_length = in.path_length();
+
+ for (int i = 0; i < in.num_paths(); ++i) {
+ // log("col %d, path %d", col, i);
+
+ // last node in every path
+ Node last_node = in.LastNodeInPath(i);
+
+ // TODO: avoid copying of nodes?
+ vector<Node> right_nodes;
+ subgraph.GetAdjacentNodes(last_node, &right_nodes);
+
+ // Get a pointer to the start of the path
+ const Node* start = in.GetPathStart(i);
+
+ for (Node right : right_nodes) {
+ out->AddPath(start, prefix_length, right);
+ }
+ }
+}
+
+// Given a the column subgraph, produce an array of all possible paths of
+// length k. These will be subsequently checked to see if they are cliques.
+void EnumeratePaths(
+ const ColumnSubgraph& subgraph, PathArray* candidates) {
+ // edges between partitions 0 and 1, a "column" of edges
+ vector<Edge> edges0;
+ subgraph.GetColumn(0, &edges0);
+
+ int num_columns = subgraph.num_columns();
+ PathArray** arrays = new PathArray*[num_columns];
+
+ // Initialize using column 0.
+ int path_length = 2;
+ arrays[0] = new PathArray(path_length);
+ for (auto& e : edges0) {
+ arrays[0]->AddEdgeAsPath(e);
+ }
+
+ // Iterate over columns 1 to k-1.
+ for (int i = 1; i < num_columns; ++i) {
+ log("--- Column %d", i);
+
+ path_length++;
+ if (i == num_columns - 1) {
+ arrays[i] = candidates; // final result, from output argument!
+ } else {
+ arrays[i] = new PathArray(path_length); // intermediate result
+ }
+ PathArray* in = arrays[i - 1];
+ PathArray* out = arrays[i];
+
+ EnumerateStep(subgraph, *in, out);
+
+ log("in num paths: %d", in->num_paths());
+ log("out num paths: %d", out->num_paths());
+
+  // We create and destroy a PathArray on every iteration. On each
+  // iteration, the PathArray grows in both rows and columns, so it's hard
+  // to avoid this.
+ delete in;
+ }
+}
+
+// Returns true if the path of k nodes forms a complete subgraph (a k-clique).
+bool IsClique(const Node* path, int k, const EdgeSet& edge_set) {
+ // We need to ensure that (k choose 2) edges are all in edge_set.
+ // We already know that k-1 of them are present, so we need to check (k
+ // choose 2) - (k-1).
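+  // Example: k = 4 gives (4 choose 2) = 6 edges; the path already supplies
+  // the 3 consecutive edges, leaving 3 membership tests.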
+ for (int i = 0; i < k; ++i) {
+ for (int j = i + 1; j < k; ++j) {
+ if (i + 1 == j) {
+ // Already know this edge exists. NOTE: does this even speed things
+ // up? It's a branch in the middle of an inner loop.
+ continue;
+ }
+ Edge e;
+ e.left = path[i];
+ e.right = path[j];
+ if (edge_set.find(e) == edge_set.end()) {
+ log("Didn't find edge %s", e.ToString().c_str());
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+void CheckForCliques(const PathArray& candidates,
+ const EdgeSet& edge_set,
+ unordered_set<int>* incomplete) {
+ int k = candidates.path_length();
+ for (int p = 0; p < candidates.num_paths(); ++p) {
+ const Node* path = candidates.GetPathStart(p);
+ // NOTE: We could run many IsClique invocations in parallel. It reads from
+ // edge_set. The different 'incomplete' sets can be merged.
+    if (!IsClique(path, k, edge_set)) {
+      // Record every incomplete path; an early return here would leave later
+      // non-cliques unmarked, and main() would print them as cliques.
+      incomplete->insert(p);
+    }
+ }
+}
+
+// Parse text on stdin into a graph, and do some validation.
+bool ParseGraph(Graph* g, EdgeSet* edge_set) {
+ // NOTE: It's possible that there NO k-cliques.
+
+ int ret = fscanf(stdin, "num_partitions %d\n", &(g->num_partitions));
+ if (ret != 1) {
+ log("ERROR: Expected 'num_partitions <integer>'\n");
+ return false;
+ }
+ log("num_partitions = %d", g->num_partitions);
+
+ int ngram_size;
+ ret = fscanf(stdin, "ngram_size %d\n", &ngram_size);
+ if (ret != 1) {
+ log("ERROR: Expected 'ngram_size <integer>'\n");
+ return false;
+ }
+ if (ngram_size != 2) {
+ log("ERROR: Only bigrams are currently supported (got n = %d)\n", ngram_size);
+ return false;
+ }
+
+ int num_edges = 0;
+ while (true) {
+ int part1, part2;
+ char c1, c2, c3, c4;
+ int ret = fscanf(stdin, "edge %d.%c%c %d.%c%c\n",
+ &part1, &c1, &c2, &part2, &c3, &c4);
+ if (ret == EOF) {
+ log("Read %d edges", num_edges);
+ break;
+ }
+ if (ret != 6) {
+ log("ERROR: Expected 6 values for edge, got %d", ret);
+ return false;
+ }
+ // log("%d -> %d", part1, part2);
+ if (part1 >= part2) {
+ log("ERROR: edge in wrong order (%d >= %d)", part1, part2);
+ return false;
+ }
+
+ Edge e;
+ e.left.partition = part1;
+ e.left.ngram[0] = c1;
+ e.left.ngram[1] = c2;
+
+ e.right.partition = part2;
+ e.right.ngram[0] = c3;
+ e.right.ngram[1] = c4;
+
+ g->edges.push_back(e);
+
+ // For lookup in CheckForCliques
+ edge_set->insert(e);
+
+ num_edges++;
+ }
+ return true;
+}
+
+int main() {
+ log("sizeof(Node) = %zu", sizeof(Node));
+ log("sizeof(Edge) = %zu", sizeof(Edge));
+ // This should be true no matter what platform we use, e.g. since we use
+ // uint16_t.
+ assert(sizeof(Node) == 4);
+ assert(sizeof(Edge) == 8);
+
+ Graph g;
+ EdgeSet edge_set;
+
+ log("ParseGraph");
+ if (!ParseGraph(&g, &edge_set)) {
+ log("Fatal error parsing graph.");
+ return 1;
+ }
+
+ // If there are k partitions, there are k-1 edge "columns".
+ ColumnSubgraph subgraph(g.num_partitions - 1);
+ log("BuildColumnSubgraph");
+ BuildColumnSubgraph(g, &subgraph);
+ log("%s", subgraph.ToString().c_str());
+
+  log("EnumeratePaths");
+  PathArray candidates(g.num_partitions);
+ EnumeratePaths(subgraph, &candidates);
+
+ log("EnumeratePaths produced %d candidates", candidates.num_paths());
+ for (int i = 0; i < candidates.num_paths(); ++i) {
+ log("%d %s", i, candidates.PathDebugString(i).c_str());
+ }
+
+ // array of indices of incomplete paths, i.e. paths that are not complete
+ // subgraphs
+ log("CheckForCliques");
+ unordered_set<int> incomplete;
+ CheckForCliques(candidates, edge_set, &incomplete);
+ for (auto p : incomplete) {
+ log("Path %d is incomplete", p);
+ }
+
+ log("Found the following cliques/words:");
+ // Now print all the complete ones to stdout
+ for (int i = 0; i < candidates.num_paths(); i++) {
+ if (incomplete.find(i) == incomplete.end()) {
+ log("%d %s", i, candidates.PathAsString(i).c_str());
+ }
+ }
+ log("Done");
+}
diff --git a/analysis/cpp/run.sh b/analysis/cpp/run.sh
new file mode 100755
index 0000000..7710027
--- /dev/null
+++ b/analysis/cpp/run.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+#
+# Usage:
+# ./run.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+# Call gcc with the flags we like.
+# NOTE: -O3 does a lot for fast_em. (More than 5x speedup over unoptimized)
+
+cpp-compiler() {
+ g++ -Wall -Wextra -O3 "$@"
+ #clang++ -Wall -Wextra -O3 "$@"
+}
+
+build-find-cliques() {
+ mkdir -p _tmp
+ # C++ 11 for unordered_{map,set}
+ cpp-compiler -std=c++0x -o _tmp/find_cliques find_cliques.cc
+}
+
+find-cliques() {
+ _tmp/find_cliques "$@"
+}
+
+test-bad-edge() {
+ # Edge should go from lesser partition number to greater
+ find-cliques <<EOF
+num_partitions 3
+ngram_size 2
+edge 1.ab 0.cd
+EOF
+}
+
+test-bad-size() {
+  # Only n = 2 is supported for now
+ find-cliques <<EOF
+num_partitions 3
+ngram_size 3
+edge 0.ab 1.cd
+EOF
+}
+
+demo() {
+ local graph=${1:-testdata/graph1.txt}
+ build-find-cliques
+
+  time cat "$graph" | find-cliques
+}
+
+get-lint() {
+ mkdir -p _tmp
+ wget --directory _tmp \
+ http://google-styleguide.googlecode.com/svn/trunk/cpplint/cpplint.py
+ chmod +x _tmp/cpplint.py
+}
+
+lint() {
+ _tmp/cpplint.py find_cliques.cc fast_em.cc
+}
+
+build-fast-em() {
+ mkdir -p _tmp
+ local out=_tmp/fast_em
+
+ cpp-compiler -o $out fast_em.cc
+ ls -l $out
+}
+
+fast-em() {
+ build-fast-em
+ time _tmp/fast_em "$@"
+}
+
+"$@"
diff --git a/analysis/cpp/testdata/graph1.txt b/analysis/cpp/testdata/graph1.txt
new file mode 100644
index 0000000..0bfde00
--- /dev/null
+++ b/analysis/cpp/testdata/graph1.txt
@@ -0,0 +1,23 @@
+num_partitions 4
+ngram_size 2
+edge 0.ab 1.cd
+edge 0.xx 1.cd
+edge 0.ij 1.kl
+edge 0.qr 1.st
+edge 0.ab 1.le
+edge 0.qr 2.uv
+edge 0.ab 2.ef
+edge 0.ij 2.mn
+edge 0.ij 3.op
+edge 0.qr 3.wx
+edge 0.ab 3.gh
+edge 1.cd 2.ef
+edge 1.kl 2.mn
+edge 1.st 2.uv
+edge 1.kl 3.op
+edge 1.cd 3.gh
+edge 1.st 3.wx
+edge 2.uv 3.wx
+edge 2.ef 3.gh
+edge 2.ef 3.zz
+edge 2.mn 3.op
diff --git a/analysis/cpp/testdata/graph3.txt b/analysis/cpp/testdata/graph3.txt
new file mode 100644
index 0000000..c5e2f94
--- /dev/null
+++ b/analysis/cpp/testdata/graph3.txt
@@ -0,0 +1,7 @@
+num_partitions 3
+ngram_size 2
+edge 0.ab 1.cd
+edge 1.cd 2.ef
+edge 0.ab 2.ef
+edge 0.AB 1.CD
+edge 1.CD 2.EF
diff --git a/analysis/tensorflow/README.md b/analysis/tensorflow/README.md
new file mode 100644
index 0000000..eb96f27
--- /dev/null
+++ b/analysis/tensorflow/README.md
@@ -0,0 +1,10 @@
+RAPPOR in TensorFlow
+====================
+
+This directory contains an experimental implementation of the EM algorithm in
+[TensorFlow](http://tensorflow.org).
+
+Currently the C++ implementation in `analysis/cpp` is faster and can be used
+in production.
+
+
diff --git a/analysis/tensorflow/fast_em.py b/analysis/tensorflow/fast_em.py
new file mode 100755
index 0000000..ea001e4
--- /dev/null
+++ b/analysis/tensorflow/fast_em.py
@@ -0,0 +1,180 @@
+#!/usr/bin/python
+"""
+fast_em.py: Tensorflow implementation of expectation maximization for RAPPOR
+association analysis.
+
+TODO:
+ - Use TensorFlow ops for reading input (so that reading input can be
+ distributed)
+ - Reduce the number of ops (currently proportional to the number of reports).
+ May require new TensorFlow ops.
+ - Fix performance bug (v_split is probably being recomputed on every
+ iteration):
+ bin$ ./test.sh decode-assoc-cpp - 1.1 seconds (single-threaded C++)
+ bin$ ./test.sh decode-assoc-tensorflow - 226 seconds on GPU
+"""
+
+import sys
+
+import numpy as np
+import tensorflow as tf
+
+
+def log(msg, *args):
+ if args:
+ msg = msg % args
+ print >>sys.stderr, msg
+
+
+def ExpectTag(f, expected):
+ """Read and consume a 4 byte tag from the given file."""
+ b = f.read(4)
+ if b != expected:
+ raise RuntimeError('Expected %r, got %r' % (expected, b))
+
+
+def ReadListOfMatrices(f):
+ """
+ Read a big list of conditional probability matrices from a binary file.
+ """
+ ExpectTag(f, 'ne \0')
+ num_entries = np.fromfile(f, np.uint32, count=1)[0]
+ log('Number of entries: %d', num_entries)
+
+ ExpectTag(f, 'es \0')
+ entry_size = np.fromfile(f, np.uint32, count=1)[0]
+ log('Entry size: %d', entry_size)
+
+ ExpectTag(f, 'dat\0')
+ vec_length = num_entries * entry_size
+ v = np.fromfile(f, np.float64, count=vec_length)
+
+ log('Values read: %d', len(v))
+ log('v: %s', v[:10])
+ #print 'SUM', sum(v)
+
+ # NOTE: We're not reshaping because we're using one TensorFlow tensor object
+ # per matrix, since it makes the algorithm expressible with current
+ # TensorFlow ops.
+ #v = v.reshape((num_entries, entry_size))
+
+ return num_entries, entry_size, v
+
+
+def WriteTag(f, tag):
+ if len(tag) != 3:
+ raise AssertionError("Tags should be 3 bytes. Got %r" % tag)
+ f.write(tag + '\0') # NUL terminated
+
+
+def WriteResult(f, num_em_iters, pij):
+ WriteTag(f, 'emi')
+ emi = np.array([num_em_iters], np.uint32)
+ emi.tofile(f)
+
+ WriteTag(f, 'pij')
+ pij.tofile(f)
+
+
+def DebugSum(num_entries, entry_size, v):
+ """Sum the entries as a sanity check."""
+ cond_prob = tf.placeholder(tf.float64, shape=(num_entries * entry_size,))
+ debug_sum = tf.reduce_sum(cond_prob)
+ with tf.Session() as sess:
+ s = sess.run(debug_sum, feed_dict={cond_prob: v})
+ log('Debug sum: %f', s)
+
+
+def BuildEmIter(num_entries, entry_size, v):
+ # Placeholder for the value from the previous iteration.
+ pij_in = tf.placeholder(tf.float64, shape=(entry_size,))
+
+ # split along dimension 0
+ # TODO:
+ # - make sure this doesn't get run for every EM iteration
+ # - investigate using tf.tile() instead? (this may cost more memory)
+ v_split = tf.split(0, num_entries, v)
+
+ z_numerator = [report * pij_in for report in v_split]
+ sum_z = [tf.reduce_sum(report) for report in z_numerator]
+ z = [z_numerator[i] / sum_z[i] for i in xrange(num_entries)]
+
+ # Concat per-report tensors and reshape. This is probably inefficient?
+ z_concat = tf.concat(0, z)
+ z_concat = tf.reshape(z_concat, [num_entries, entry_size])
+
+ # This whole expression represents an EM iteration. Bind the pij_in
+ # placeholder, and get a new estimation of Pij.
+ em_iter_expr = tf.reduce_sum(z_concat, 0) / num_entries
+
+ return pij_in, em_iter_expr
+
+
+def RunEm(pij_in, entry_size, em_iter_expr, max_em_iters, epsilon=1e-6):
+ """Run the iterative EM algorithm (using the TensorFlow API).
+
+ Args:
+    pij_in: placeholder bound to the Pij estimate from the previous iteration
+    entry_size: total number of cells in each matrix
+    em_iter_expr: expression for one EM iteration, built by BuildEmIter()
+    max_em_iters: maximum number of EM iterations
+    epsilon: convergence threshold for early termination
+
+  Returns:
+    A (num_em_iters, pij) tuple, where pij is a numpy.ndarray (e.g. a vector
+    of length 8)
+ """
+ # Initial value is the uniform distribution
+ pij = np.ones(entry_size) / entry_size
+
+ i = 0 # visible outside loop
+
+ # Do EM iterations.
+ with tf.Session() as sess:
+ for i in xrange(max_em_iters):
+ print 'PIJ', pij
+ new_pij = sess.run(em_iter_expr, feed_dict={pij_in: pij})
+ dif = max(abs(new_pij - pij))
+ log('EM iteration %d, dif = %e', i, dif)
+ pij = new_pij
+
+ if dif < epsilon:
+        log('Early EM termination: %e < %e', dif, epsilon)
+ break
+
+  # If i = 9, then we did 10 iterations.
+ return i + 1, pij
+
+
+def sep():
+ print '-' * 80
+
+
+def main(argv):
+ input_path = argv[1]
+ output_path = argv[2]
+ max_em_iters = int(argv[3])
+
+ sep()
+ with open(input_path) as f:
+ num_entries, entry_size, cond_prob = ReadListOfMatrices(f)
+
+ sep()
+ DebugSum(num_entries, entry_size, cond_prob)
+
+ sep()
+ pij_in, em_iter_expr = BuildEmIter(num_entries, entry_size, cond_prob)
+ num_em_iters, pij = RunEm(pij_in, entry_size, em_iter_expr, max_em_iters)
+
+ sep()
+ log('Final Pij: %s', pij)
+
+ with open(output_path, 'wb') as f:
+ WriteResult(f, num_em_iters, pij)
+ log('Wrote %s', output_path)
+
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/analysis/tensorflow/fast_em.sh b/analysis/tensorflow/fast_em.sh
new file mode 100755
index 0000000..d1ddb79
--- /dev/null
+++ b/analysis/tensorflow/fast_em.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+#
+# Wrapper to run fast_em.py using TensorFlow configured for a GPU. CUDA
+# environment variables must be set.
+#
+# Usage:
+# ./fast_em.sh <args>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname "$0")
+
+fast-em() {
+ # Never returns
+ LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 \
+ CUDA_HOME=/usr/local/cuda-7.0 \
+ exec $THIS_DIR/fast_em.py "$@"
+}
+
+fast-em "$@"
diff --git a/apps/README.md b/apps/README.md
new file mode 100644
index 0000000..32c00d5
--- /dev/null
+++ b/apps/README.md
@@ -0,0 +1,58 @@
+RAPPOR Shiny Apps
+=================
+
+This directory contains web apps written using the [Shiny][shiny] web framework
+from [RStudio][rstudio].
+
+To run them, first install Shiny:
+
+ $ R
+ ...
+ > install.packages('shiny')
+ ...
+
+(You can view Shiny's platform requirements on
+[CRAN](http://cran.r-project.org/web/packages/shiny/index.html).)
+
+Then change to the app directory, and execute the `run_app.sh` script:
+
+ $ cd rappor/apps/rappor-analysis
+ $ ./run_app.sh
+ ...
+  Listening on http://0.0.0.0:6789
+
+Visit http://localhost:6789/ in your browser.
+
+This code has been tested on Ubuntu Linux, but should work on other platforms
+that Shiny supports.
+
+Both of these apps use the underlying analysis code in `analysis/R`, just like
+the command line demo `demo.sh` does.
+
+rappor-analysis
+---------------
+
+This app "decodes" a RAPPOR data set. In other words, you can upload the
+`params`, `counts`, and `map` files, and view the inferred distribution, as
+well as debug info.
+
+These files are discussed in the RAPPOR [Data Flow][data-flow] doc.
+
+rappor-sim
+----------
+
+This app lets you simulate RAPPOR runs with different populations and
+parameters. This can help you choose collection parameters for a given
+situation / variable.
+
+Help
+----
+
+If you need help with these apps, please send a message to
+[rappor-discuss][group].
+
+
+[shiny]: http://shiny.rstudio.com/
+[rstudio]: http://rstudio.com/
+[data-flow]: http://google.github.io/rappor/doc/data-flow.html
+[group]: https://groups.google.com/forum/#!forum/rappor-discuss
diff --git a/apps/rappor-analysis/counts.csv b/apps/rappor-analysis/counts.csv
new file mode 100644
index 0000000..60eeb54
--- /dev/null
+++ b/apps/rappor-analysis/counts.csv
@@ -0,0 +1,8 @@
+1250276,626224,629830,638341,625107,629059,637733,627237,624856,626511,639218,625191,636727,652194,625005,626731,624685,627324,626537,625196,630818,632501,626862,628955,634134,625457,625492,630140,625310,625480,626724,625815,625012,626208,633380,627120,626236,630380,626096,643400,624581,633043,637383,637500,638889,638850,631609,629736,629781,631053,626856,625761,632677,626132,626524,640677,626872,625908,627774,645562,629118,626503,624811,625076,642209,624358,631656,625994,629471,627062,626615,628072,625104,634685,625712,627828,636920,628467,633497,625136,638445,640503,625158,626136,628873,628966,637418,636806,625767,625513,625005,629592,629663,625826,625291,629936,629981,633462,626834,634088,637785,626311,626268,631188,631489,627571,624682,625826,633909,625579,629158,627053,645113,632824,632781,629234,642502,626189,630451,627350,626339,625728,625176,633351,625507,624950,629719,629840,633691
+1250323,625902,627628,632084,633514,631008,628084,635489,626732,626260,634849,630196,630422,632669,628821,633097,626636,625641,628467,625592,632206,629610,625690,634899,626677,624795,643035,631852,627271,630507,628361,626473,628526,625776,625730,643589,629456,626836,626582,643557,628080,624552,627590,635085,638493,637141,626796,640932,625243,625471,636482,627287,626797,629338,626896,629384,628924,634787,626067,640203,627410,627300,630067,626616,631823,626115,627773,638473,636225,627615,625744,625323,625527,624919,625326,626282,631728,625763,625117,634744,626183,627635,635496,625418,625878,625656,632471,638729,624696,628542,633674,631028,625470,629210,625481,629249,626665,625463,642764,633562,625800,626148,625704,632663,626100,637283,628144,630706,637841,648419,629031,630624,629286,637650,633796,629327,625004,636779,626399,636405,627180,633733,624644,634934,633892,629981,624763,625490,626905
+1249338,627195,626020,630380,640061,624400,638641,625450,628965,627574,633128,627024,628953,624679,630959,629477,625165,624683,630564,629084,626893,627642,638929,628032,624605,624784,624866,625057,626514,625119,629974,636058,631212,632188,630549,633040,626200,635410,630642,625288,638902,625661,626883,631472,625034,628598,631335,633431,626116,625614,623893,624451,630719,625543,628438,629185,636862,624480,646320,633045,625899,631646,624480,632985,629712,627226,624685,630739,631352,624473,637188,626034,632661,642084,625588,623221,624143,627064,630239,632201,654697,623959,627282,634094,629300,629071,627907,628366,635018,629682,625013,625243,625787,625923,626455,624202,632402,626223,626239,629648,633212,627390,634006,624844,625376,623740,625066,624661,638579,627998,633964,626940,638640,625384,623877,634034,630904,627611,634589,641012,629763,639537,631585,633523,635568,625207,625550,627737,626509
+1250153,626278,632891,631440,628057,628399,632155,625425,630851,635905,628782,625907,626233,637552,646036,626268,625907,627308,624871,632340,632054,635781,626448,625755,629482,627888,628903,626740,625991,633765,627295,636278,631528,628661,624947,632013,625776,631390,629058,628824,626829,624329,638907,628410,625438,624753,634174,625978,637687,635135,636697,631489,641863,629839,626623,628555,636126,625301,630757,623565,641466,626623,625080,627400,640478,632570,625320,631842,628536,631022,640929,629782,626027,626572,625598,627779,626217,626792,633894,626197,625890,630099,628750,648340,625980,637857,625096,633319,633900,626561,625530,625522,635756,624511,627024,630723,630942,625503,631642,635463,625264,628220,634951,638631,632815,632761,626389,639287,630862,625804,628154,629082,632816,623961,625637,625300,634069,624859,626082,627940,631644,628774,627405,627828,627791,627485,635126,625856,625356
+1250771,626134,634334,635292,633584,629653,627784,630316,636331,635721,634720,625729,626505,625530,643654,627007,628882,636509,634843,627260,632665,630581,625935,626325,626154,626445,630192,631350,625497,626602,625785,635212,629901,631934,644065,625364,630139,625888,640241,642082,625550,628504,628584,625236,631908,634277,633968,628403,626574,628714,628817,626215,625343,626530,636299,625761,630238,635510,625792,626638,626019,625385,632164,642123,626560,631990,626524,629389,635020,642185,630844,641785,624691,625537,627970,637709,624936,632891,627287,624731,635772,626065,628167,639165,625399,627285,635614,624716,629091,635905,632939,644740,627730,636927,628454,626539,625964,629538,624913,627352,626349,634596,649781,633051,625859,641258,626367,626550,632163,631919,626057,626404,633310,626017,625033,625715,628007,627435,625799,629168,625984,627762,630920,636178,628996,636303,625454,626447,624878
+1249304,625217,634782,633262,627543,626709,626259,631738,625249,627135,624159,631445,623885,625673,627572,625728,637424,629831,631160,626828,625299,624976,632627,629163,633376,642828,628996,628394,625273,635767,634001,625608,626359,630964,629573,626843,631347,637799,632082,626361,625640,633128,625114,625259,624855,623835,627899,629085,633190,627906,625598,628426,628591,635388,627417,625277,626112,625438,629344,624804,635266,631869,627757,629650,634634,635120,626939,631203,624800,628294,629751,625873,625454,626784,643152,624555,626007,634075,624735,634702,635161,631006,633192,624404,645213,630656,626274,628042,626383,627361,626584,626495,634171,626099,645988,623858,633567,627545,627303,632595,625658,635634,627582,637475,632164,627445,624430,624834,634950,627360,625178,635287,628042,627459,625008,625474,624222,632271,631753,625880,634756,627221,634187,625343,629496,633121,638032,643509,627915
+1249371,629603,627151,624917,626164,635408,629423,628587,625809,628232,627395,633229,624598,625802,625063,624578,625570,625994,640692,627609,626231,624600,624999,624317,627324,627285,635594,628657,624688,634166,625095,635016,627523,625254,625486,625502,625508,628331,639758,625131,625854,632769,632613,626049,623943,625238,626959,644100,631137,631036,627617,638092,631651,633351,635808,626236,631018,623797,640254,625197,637365,630925,631001,629339,624785,629808,625059,625946,625454,626813,628688,638763,633084,630962,642853,633396,625614,625457,653628,627484,624393,624605,624069,631721,624595,629449,632112,630484,624621,638576,641526,627252,640009,632488,626154,627389,635583,630346,628182,641448,625741,626562,624782,629489,624792,625233,627673,636352,628509,629039,632497,627705,625475,625157,633092,627202,639182,624828,635046,628816,625704,629771,627364,625016,632477,623903,625941,630072,636475
+1250464,631025,631173,631404,626236,625778,628353,626059,628664,626525,626935,626403,635178,632979,625756,627707,634003,624578,631404,632468,627611,626047,625666,625608,638402,626190,626682,624400,624847,635537,645362,625061,626119,625017,626119,629502,630321,644285,627412,626713,627653,633124,624801,634043,630719,625353,625964,629839,647639,630311,640621,625757,624985,630278,628313,625287,628762,625189,627223,635090,629585,634519,627744,625372,631604,631082,633258,624555,629062,635039,629732,646917,627305,629446,625210,628958,626225,635697,650677,633385,628222,626950,632745,628532,624715,628624,632778,636588,633436,628344,628059,628093,628365,625570,625747,626402,629726,627742,630508,625999,641140,632459,631071,625998,626277,630781,629860,635807,635853,635068,634698,628617,625904,626756,624358,625808,628317,628376,625252,626063,624903,627357,647659,633890,636314,629070,630367,643689,628442
diff --git a/apps/rappor-analysis/map.csv b/apps/rappor-analysis/map.csv
new file mode 100644
index 0000000..09e8f55
--- /dev/null
+++ b/apps/rappor-analysis/map.csv
@@ -0,0 +1,300 @@
+"V_1",73,116,175,185,287,380,448,469,583,635,656,693,819,884,933,1018
+"V_2",13,76,154,226,329,344,467,483,521,529,724,734,815,846,983,1004
+"V_3",39,59,195,245,314,377,433,491,550,566,705,767,794,799,908,926
+"V_4",81,112,138,237,296,336,454,510,551,575,665,720,797,815,967,1023
+"V_5",45,108,241,252,260,364,467,487,515,520,669,751,858,886,946,955
+"V_6",3,43,135,236,319,356,426,486,598,614,664,766,842,875,944,1006
+"V_7",13,113,167,172,278,375,398,496,526,603,724,765,846,864,967,1020
+"V_8",64,226,242,312,374,397,472,581,601,739,762,773,867,926,1018,
+"V_9",10,39,178,237,288,336,434,500,614,624,714,734,822,860,974,1023
+"V_10",55,100,171,207,262,266,404,432,522,595,681,757,810,851,933,978
+"V_11",86,97,154,210,375,379,444,488,575,602,704,719,786,826,982,1005
+"V_12",78,112,151,247,314,335,390,436,558,614,662,748,826,843,909,996
+"V_13",21,114,163,231,278,326,416,440,514,543,678,701,786,854,915,974
+"V_14",6,59,163,249,336,378,398,451,539,557,665,743,830,860,937,944
+"V_15",81,87,196,214,289,299,430,449,617,621,643,688,828,867,920,997
+"V_16",12,128,141,237,303,315,421,462,595,617,741,758,779,841,925,1003
+"V_17",10,52,131,251,336,371,387,405,546,577,658,677,842,846,898,984
+"V_18",42,104,155,227,317,328,386,435,532,603,736,767,806,878,957,961
+"V_19",6,123,133,233,329,368,436,442,530,589,700,707,829,858,899,914
+"V_20",44,80,164,187,334,339,419,476,580,587,714,760,809,817,920,946
+"V_21",20,55,163,215,291,293,403,482,569,605,670,766,824,839,965,969
+"V_22",44,46,132,233,294,377,392,444,546,637,691,764,831,896,939,962
+"V_23",45,116,192,219,302,324,415,476,546,550,657,721,806,896,945,973
+"V_24",24,127,167,190,352,358,405,415,526,581,666,717,882,895,965,1001
+"V_25",12,64,167,187,358,366,437,471,538,615,732,744,816,857,996,1019
+"V_26",41,115,143,253,341,372,432,479,568,605,703,714,775,871,1002,1022
+"V_27",34,87,139,204,323,339,426,452,551,587,673,760,846,884,974,1021
+"V_28",91,96,172,218,262,368,448,454,554,582,642,741,821,833,994,998
+"V_29",92,99,142,173,264,293,389,393,544,545,700,725,840,866,897,992
+"V_30",34,80,149,215,268,314,398,504,551,553,663,722,795,861,950,1019
+"V_31",13,47,240,251,260,355,408,505,519,609,642,647,857,887,944,960
+"V_32",48,49,140,239,345,364,397,444,580,583,676,752,777,861,944,974
+"V_33",5,85,148,184,274,321,393,413,562,592,677,743,819,821,967,1012
+"V_34",64,103,181,196,270,311,394,465,516,620,742,745,820,865,912,1003
+"V_35",23,42,183,227,301,366,423,467,556,569,674,687,839,855,966,1024
+"V_36",42,58,173,238,259,376,409,436,574,579,677,734,792,839,956,1013
+"V_37",66,118,148,195,260,286,422,491,543,574,710,727,809,882,912,964
+"V_38",68,126,234,236,275,340,410,492,526,592,689,732,853,877,975,996
+"V_39",100,128,143,188,271,365,430,487,517,548,704,719,769,864,931,940
+"V_40",27,41,156,207,296,333,434,489,613,636,654,722,787,828,973,975
+"V_41",35,59,241,247,290,312,466,480,620,634,649,667,838,842,932,988
+"V_42",44,80,179,221,291,354,494,508,533,603,665,710,857,876,973,984
+"V_43",75,95,173,235,257,326,393,459,556,637,651,698,800,892,902,958
+"V_44",3,43,210,243,286,373,427,504,528,583,728,736,820,835,949,986
+"V_45",84,99,134,157,267,308,413,489,533,561,651,748,779,855,925,1007
+"V_46",110,123,209,243,303,320,471,491,569,611,709,717,805,806,940,952
+"V_47",86,98,171,178,290,310,388,413,600,603,642,717,840,889,957,960
+"V_48",2,77,170,175,265,368,403,417,546,606,669,675,774,822,939,987
+"V_49",99,105,217,223,259,296,455,485,514,545,644,692,814,892,904,971
+"V_50",22,112,187,197,258,276,401,455,518,637,670,721,778,818,954,962
+"V_51",24,95,173,187,308,343,480,489,631,761,768,770,883,957,979,
+"V_52",2,71,218,235,308,383,469,481,600,615,647,713,874,890,949,968
+"V_53",1,13,132,194,279,298,453,495,526,592,645,726,828,892,943,1017
+"V_54",27,56,136,215,352,371,453,577,617,686,751,879,896,1015,1018,
+"V_55",3,66,140,224,310,367,414,439,516,548,659,737,793,857,976,1020
+"V_56",7,37,146,151,275,315,462,509,613,634,679,711,820,907,934,
+"V_57",24,55,168,212,277,342,438,485,516,530,767,768,774,865,966,1018
+"V_58",49,76,160,203,270,350,424,440,530,613,706,729,859,880,925,981
+"V_59",13,111,215,249,316,326,386,507,535,557,690,692,875,889,936,1005
+"V_60",33,121,130,174,371,376,465,503,559,581,702,744,786,874,897,998
+"V_61",84,103,157,158,326,353,419,448,607,613,643,753,788,858,905,920
+"V_62",37,86,146,223,274,324,440,506,566,604,722,725,769,869,993,1006
+"V_63",60,118,239,248,304,372,492,495,520,578,656,712,867,894,983,985
+"V_64",13,119,175,180,270,340,398,433,556,597,651,672,773,823,916,928
+"V_65",9,17,217,256,271,357,395,453,563,630,673,724,816,843,939,977
+"V_66",71,110,183,189,328,379,447,492,629,634,676,738,784,882,931,932
+"V_67",55,124,218,221,284,302,415,439,587,628,688,729,839,840,935,
+"V_68",60,69,158,233,279,366,385,388,571,622,646,670,818,838,932,941
+"V_69",46,86,137,187,320,359,417,472,565,631,743,759,806,847,934,974
+"V_70",52,103,135,208,342,349,406,503,594,615,698,731,798,817,926,943
+"V_71",66,100,132,157,329,338,471,506,543,594,670,694,828,857,916,935
+"V_72",36,56,168,247,287,327,468,473,516,601,686,744,853,877,921,1002
+"V_73",13,116,245,256,317,346,411,431,522,532,644,678,822,839,992,1009
+"V_74",50,70,166,194,272,277,435,478,537,581,687,734,773,858,909,911
+"V_75",78,100,178,245,264,357,460,461,520,633,687,696,826,876,906,991
+"V_76",47,60,141,205,266,342,412,449,524,633,740,743,840,886,902,985
+"V_77",44,54,204,223,314,345,440,479,534,586,648,717,784,842,906,974
+"V_78",80,126,182,230,329,338,392,457,581,635,662,688,863,885,926,979
+"V_79",64,112,159,192,320,345,487,489,592,605,672,694,771,781,985,1007
+"V_80",30,120,198,200,343,368,436,454,598,636,647,749,794,847,993,997
+"V_81",105,126,196,238,271,292,471,490,613,639,643,730,808,865,951,956
+"V_82",49,70,152,182,265,338,421,433,527,610,671,724,847,856,908,939
+"V_83",30,37,166,187,262,314,386,454,593,609,712,733,813,831,943,1000
+"V_84",15,24,218,244,323,384,447,507,587,590,656,722,878,892,988,1009
+"V_85",38,72,189,204,295,348,480,486,576,637,686,705,785,867,936,952
+"V_86",88,126,145,246,352,360,393,456,538,629,660,749,776,862,967,981
+"V_87",26,74,175,191,296,365,434,453,580,633,657,702,786,795,975,996
+"V_88",37,87,181,247,274,311,389,401,579,601,681,693,824,892,905,959
+"V_89",39,40,160,194,268,367,429,471,559,617,673,737,803,805,987,1003
+"V_90",16,64,175,242,301,369,399,411,555,558,650,655,837,863,912,1018
+"V_91",7,117,211,216,289,292,387,457,589,637,713,748,772,845,949,1001
+"V_92",14,20,184,253,268,380,408,413,526,527,699,737,799,889,951,996
+"V_93",29,32,140,224,312,383,426,442,559,629,652,681,828,890,993,999
+"V_94",37,46,143,155,320,378,473,586,623,716,749,780,805,956,968,
+"V_95",57,68,147,234,262,369,472,509,550,616,676,730,816,845,937,939
+"V_96",7,53,182,235,262,384,478,500,530,570,709,732,778,804,912,952
+"V_97",38,125,144,242,274,286,456,493,529,610,674,694,770,862,908,994
+"V_98",58,77,193,248,335,336,410,503,582,593,687,743,843,855,947,977
+"V_99",6,59,129,233,265,302,414,426,519,568,653,755,821,868,928,1019
+"V_100",99,108,150,170,328,374,427,507,516,559,753,759,783,834,975,979
+"V_101",69,125,204,222,347,373,420,505,567,609,698,752,869,885,1022,1023
+"V_102",59,76,135,189,291,352,441,461,530,618,732,767,772,858,992,1023
+"V_103",15,53,154,184,261,308,462,466,557,571,700,720,813,888,933,940
+"V_104",49,120,168,191,353,376,445,490,566,604,646,763,796,834,947,967
+"V_105",6,101,141,229,269,317,399,486,530,546,653,749,815,868,951,972
+"V_106",76,83,193,237,376,377,494,508,582,630,704,710,774,794,947,1001
+"V_107",17,89,135,251,292,308,455,495,581,586,645,743,886,889,962,1021
+"V_108",77,91,172,209,274,330,424,487,549,632,649,658,782,788,903,996
+"V_109",6,127,157,192,271,353,445,456,619,621,647,653,803,833,897,1005
+"V_110",82,116,173,231,326,357,469,492,625,628,705,721,803,881,917,946
+"V_111",61,130,183,277,370,391,445,517,594,708,742,793,819,928,964,
+"V_112",76,114,189,237,341,378,486,508,519,528,739,756,869,879,974,1004
+"V_113",5,15,144,160,333,358,400,417,569,596,694,727,881,893,916,933
+"V_114",15,70,153,239,259,348,389,502,567,618,684,695,781,830,943,956
+"V_115",69,112,217,240,263,362,447,458,549,597,739,746,853,969,1000,
+"V_116",18,94,216,221,357,374,427,458,595,636,647,654,829,832,929,944
+"V_117",47,60,203,225,282,343,403,428,513,640,719,730,773,859,906,955
+"V_118",17,51,135,180,319,374,463,482,561,627,700,744,806,872,906,980
+"V_119",38,48,129,244,343,364,442,456,515,606,667,763,845,876,992,1022
+"V_120",69,101,219,242,315,378,491,495,597,602,670,748,770,784,930,1016
+"V_121",14,101,159,178,265,279,409,486,545,606,674,737,790,871,983,1013
+"V_122",55,110,221,233,316,342,454,480,597,602,644,762,787,866,912,921
+"V_123",72,124,133,160,304,308,424,458,586,601,678,747,769,834,1010,1024
+"V_124",44,96,156,158,374,384,399,455,558,569,676,697,802,823,906,950
+"V_125",52,116,173,175,349,359,499,500,513,529,732,749,849,867,926,976
+"V_126",18,50,165,176,310,322,499,507,557,613,655,699,814,868,998,1004
+"V_127",51,56,153,211,257,377,418,427,544,621,755,767,796,810,919,922
+"V_128",29,83,222,230,288,354,386,511,519,562,669,689,842,869,913,990
+"V_129",6,47,215,230,266,301,418,507,585,602,731,765,797,875,968,1009
+"V_130",35,97,138,144,326,383,471,502,532,554,659,711,779,782,972,999
+"V_131",44,99,154,219,300,307,386,427,531,631,686,723,815,852,904,917
+"V_132",67,86,151,225,294,353,391,407,574,575,720,754,782,813,914,999
+"V_133",32,84,151,164,293,302,404,437,593,625,660,756,773,803,942,960
+"V_134",73,80,143,234,318,355,412,451,524,537,655,688,782,842,920,1018
+"V_135",4,60,138,229,345,380,395,397,613,635,714,746,844,896,952,1023
+"V_136",9,62,161,227,267,290,395,485,516,628,744,755,793,802,942,975
+"V_137",6,53,134,189,271,373,395,447,533,589,641,652,817,869,901,923
+"V_138",9,107,206,223,261,339,499,511,613,632,653,767,775,801,919,931
+"V_139",26,41,136,158,328,343,434,453,540,575,727,752,790,861,942,1017
+"V_140",27,32,146,152,325,354,415,451,592,617,647,649,808,887,957,982
+"V_141",4,72,200,249,264,287,392,425,572,576,744,746,788,896,908,967
+"V_142",7,75,167,249,374,381,440,473,574,576,662,747,834,838,908,1001
+"V_143",44,58,147,167,303,369,409,427,595,619,741,756,824,878,908,945
+"V_144",12,41,247,253,335,375,406,459,579,607,740,763,772,826,919,1017
+"V_145",71,75,186,210,299,380,395,452,535,624,643,725,787,851,1005,1018
+"V_146",74,111,162,213,293,381,417,457,515,560,654,726,790,839,954,966
+"V_147",17,90,144,212,322,352,414,416,546,635,710,731,815,872,913,940
+"V_148",3,74,132,174,288,323,467,498,524,588,658,676,824,846,915,985
+"V_149",69,85,236,237,328,364,411,474,563,565,757,758,798,839,948,971
+"V_150",30,113,183,234,275,367,447,466,519,523,719,754,848,888,1000,1010
+"V_151",40,77,198,226,290,307,437,500,519,625,659,764,802,854,992,994
+"V_152",38,56,177,241,277,378,387,461,538,570,687,717,821,859,930,956
+"V_153",79,125,194,255,317,364,397,422,518,591,748,759,796,853,934,965
+"V_154",28,101,151,242,316,365,448,476,549,561,645,750,777,787,1008,1015
+"V_155",99,113,212,254,267,273,457,506,540,578,722,754,789,827,990,1022
+"V_156",6,97,201,208,272,369,444,509,516,525,707,759,829,865,938,1001
+"V_157",106,107,150,191,272,318,420,504,550,571,644,716,866,885,933,990
+"V_158",26,53,161,226,312,353,397,423,574,628,729,762,804,861,903,930
+"V_159",33,126,153,252,259,296,435,442,568,599,641,687,812,890,899,1003
+"V_160",96,117,157,201,288,344,389,467,620,628,692,760,827,864,910,973
+"V_161",103,109,184,204,271,321,412,433,528,626,667,735,769,821,930,955
+"V_162",12,123,150,170,306,381,407,448,580,613,654,711,832,918,956,
+"V_163",34,51,172,307,319,390,479,561,615,695,741,824,878,933,1002,
+"V_164",13,25,131,204,282,347,489,509,565,630,732,768,842,889,981,1012
+"V_165",8,100,146,236,279,323,441,483,529,545,699,711,796,880,959,992
+"V_166",79,126,166,255,324,362,458,507,543,581,668,686,781,787,951,952
+"V_167",53,125,202,214,275,343,493,505,589,622,670,672,825,868,904,971
+"V_168",33,80,129,191,284,378,441,453,548,585,671,680,806,893,987,997
+"V_169",48,75,155,183,278,365,453,491,537,590,677,752,769,786,926,989
+"V_170",77,102,137,181,270,344,387,446,513,629,728,736,808,818,922,999
+"V_171",30,78,162,177,373,378,414,478,620,637,718,752,788,824,955,988
+"V_172",67,126,177,179,321,366,455,474,580,612,654,742,789,852,971,1008
+"V_173",11,20,129,182,285,361,405,470,630,744,757,842,847,906,940,
+"V_174",59,69,130,189,277,361,473,510,615,635,731,749,771,786,922,1012
+"V_175",32,107,142,256,308,320,457,511,528,535,753,766,774,874,915,959
+"V_176",66,112,146,212,342,373,404,498,544,560,700,713,781,796,899,1002
+"V_177",88,124,201,228,268,361,395,441,580,604,645,751,775,801,908,929
+"V_178",41,89,139,233,268,370,424,496,519,563,650,658,805,863,898,1012
+"V_179",126,127,200,252,275,343,434,496,568,589,678,711,801,876,908,1024
+"V_180",5,116,134,210,314,354,412,511,522,620,738,746,835,845,951,992
+"V_181",109,122,137,177,257,308,490,505,577,589,653,676,784,894,904,936
+"V_182",57,86,172,180,335,368,410,452,525,577,708,717,810,846,918,1005
+"V_183",31,61,139,225,261,356,429,506,533,630,696,749,769,799,986,1005
+"V_184",16,70,173,198,314,316,482,508,605,614,656,699,818,868,950,987
+"V_185",14,126,137,186,287,345,385,487,593,612,679,702,815,817,897,943
+"V_186",87,115,150,216,321,375,475,494,603,624,664,767,797,865,965,1021
+"V_187",110,122,176,247,302,350,401,403,593,635,708,721,815,833,940,952
+"V_188",59,128,150,213,261,332,478,508,515,533,663,695,788,869,922,932
+"V_189",71,104,163,252,299,377,385,475,573,641,731,828,830,956,989,
+"V_190",56,86,135,189,282,374,391,496,538,623,723,724,818,820,977,989
+"V_191",47,72,158,166,259,290,484,496,605,612,690,746,822,866,939,949
+"V_192",26,118,194,210,270,372,394,421,588,601,659,725,792,835,993,1001
+"V_193",17,64,151,169,293,381,397,444,556,620,722,810,876,958,986,
+"V_194",87,88,139,176,290,339,395,492,533,558,710,744,790,827,917,978
+"V_195",3,112,181,214,367,375,399,412,524,564,647,724,820,886,1000,1014
+"V_196",81,96,156,207,312,332,413,436,554,626,694,741,822,891,939,1022
+"V_197",37,82,150,211,261,379,420,479,537,589,655,673,803,885,938,954
+"V_198",54,64,159,216,382,384,393,447,565,571,681,729,769,823,963,1021
+"V_199",91,118,135,235,357,383,465,480,544,639,693,733,872,891,909,936
+"V_200",31,91,168,172,259,354,436,473,545,563,652,694,803,874,922,946
+"V_201",74,93,152,218,296,328,475,490,570,609,643,768,784,801,968,1020
+"V_202",6,23,154,224,323,365,461,467,558,619,710,717,839,869,983,1011
+"V_203",72,126,216,255,296,381,395,431,591,600,702,704,870,885,944,1000
+"V_204",56,60,154,182,315,383,385,468,524,602,641,725,807,869,963,1008
+"V_205",68,79,141,215,258,370,389,485,623,633,695,795,882,944,955,
+"V_206",105,123,134,219,330,361,463,485,538,575,717,748,782,826,906,981
+"V_207",44,107,192,240,303,311,412,430,526,583,716,761,825,829,1008,1018
+"V_208",4,14,136,202,300,384,404,503,558,565,651,696,845,878,903,1017
+"V_209",53,107,173,204,304,381,393,396,625,627,666,684,795,896,900,941
+"V_210",25,71,167,272,302,400,453,537,600,668,738,786,847,922,996,
+"V_211",69,84,161,172,301,328,443,479,520,577,660,664,777,817,926,1021
+"V_212",14,25,134,212,339,348,486,491,569,580,699,740,802,815,927,982
+"V_213",72,89,199,246,258,305,391,430,604,625,668,724,826,857,987,1016
+"V_214",19,125,142,194,269,277,418,429,529,581,693,740,807,890,924,992
+"V_215",71,89,132,220,257,375,434,487,520,625,736,763,881,885,929,995
+"V_216",8,81,189,225,286,326,409,505,579,594,641,757,794,891,903,907
+"V_217",16,60,135,186,284,383,432,498,557,576,646,703,842,877,966,998
+"V_218",6,67,184,245,307,366,423,468,527,558,646,726,816,835,897,952
+"V_219",4,12,158,228,323,352,446,456,569,616,644,709,845,880,951,988
+"V_220",74,80,133,249,334,364,387,473,567,614,673,702,787,889,993,1017
+"V_221",43,120,187,212,289,302,392,409,560,622,654,722,778,834,970,1019
+"V_222",106,110,146,164,302,352,392,404,527,618,694,736,791,800,997,1012
+"V_223",97,110,190,225,261,275,419,512,544,637,644,764,795,878,917,925
+"V_224",48,60,161,243,277,302,419,506,551,588,705,713,803,884,1001,1002
+"V_225",91,123,147,214,325,331,403,447,532,573,693,748,790,871,954,965
+"V_226",8,102,132,234,282,355,428,458,538,640,684,701,870,876,976,984
+"V_227",39,71,137,192,286,319,447,477,531,586,738,745,772,846,935,976
+"V_228",54,108,219,223,296,302,428,487,514,565,684,725,775,776,949,1013
+"V_229",54,123,130,136,286,377,422,435,594,628,675,688,842,893,1014,1017
+"V_230",28,99,162,225,291,337,390,410,543,590,700,742,801,804,933,1013
+"V_231",70,113,160,208,278,318,423,496,549,626,752,758,852,853,935,959
+"V_232",27,81,154,234,265,349,388,459,602,610,655,662,821,829,907,944
+"V_233",77,106,133,194,280,333,403,485,527,635,687,727,803,884,922,1024
+"V_234",1,110,177,211,284,382,415,467,584,620,724,727,838,878,909,920
+"V_235",47,64,155,255,297,364,469,500,557,634,689,764,820,884,908,975
+"V_236",38,116,158,240,293,358,477,487,547,630,722,733,838,865,923,966
+"V_237",79,122,169,195,304,359,409,504,569,581,690,748,798,810,978,
+"V_238",1,109,170,199,263,365,424,496,582,625,680,704,770,801,942,951
+"V_239",66,121,139,217,299,345,420,494,554,600,649,733,812,852,930,969
+"V_240",89,104,190,240,272,307,391,399,529,557,699,745,786,815,921,996
+"V_241",82,93,152,250,260,355,401,410,531,554,690,724,815,848,935,1018
+"V_242",7,116,234,251,277,310,395,437,579,632,687,747,823,883,905,949
+"V_243",82,110,140,215,282,378,389,463,521,589,746,763,804,831,905,931
+"V_244",57,78,160,199,305,368,435,472,544,573,688,713,833,847,919,942
+"V_245",91,112,150,186,287,383,385,400,532,579,668,753,778,839,916,953
+"V_246",69,121,148,172,335,359,429,432,591,638,686,725,794,873,910,940
+"V_247",4,86,237,252,260,378,448,503,634,636,731,756,815,825,1003,1008
+"V_248",15,111,166,224,274,283,407,436,531,554,645,688,833,887,987,1004
+"V_249",4,121,160,170,341,349,494,506,517,619,724,734,775,866,897,1000
+"V_250",43,91,138,159,263,328,393,456,522,614,746,757,808,854,1010,1012
+"V_251",16,31,138,162,270,374,428,510,542,637,726,740,820,861,984,1012
+"V_252",19,26,129,223,307,314,389,503,513,609,711,737,787,806,944,1000
+"V_253",38,50,170,250,261,337,415,474,543,605,642,674,769,785,988,1019
+"V_254",42,85,159,175,306,329,415,464,555,574,685,688,821,836,992,1011
+"V_255",35,126,162,197,258,269,407,473,562,637,661,753,795,817,939,1007
+"V_256",29,37,133,174,258,339,463,511,517,613,717,744,811,888,918,1010
+"V_257",29,88,210,252,334,351,438,467,601,639,720,744,770,892,960,1012
+"V_258",102,105,140,243,281,339,434,501,561,640,645,696,795,796,969,1023
+"V_259",5,122,210,216,278,353,418,447,545,559,716,768,806,892,964,970
+"V_260",81,108,131,138,274,330,419,488,617,633,668,718,804,867,957,1014
+"V_261",17,119,192,226,289,298,427,445,541,611,690,721,808,851,953,954
+"V_262",18,67,158,168,301,384,457,496,519,565,678,733,803,832,971,982
+"V_263",53,64,166,185,307,356,395,473,515,553,671,707,788,829,937,947
+"V_264",39,116,187,189,287,355,424,471,552,622,680,748,830,848,940,993
+"V_265",103,126,163,170,354,374,467,482,550,605,649,672,823,824,976,1006
+"V_266",14,83,150,253,348,355,437,490,554,557,642,692,795,858,956,974
+"V_267",22,108,151,166,333,374,421,503,526,585,682,710,800,820,943,996
+"V_268",18,33,181,198,266,310,420,458,585,612,645,697,866,883,918,929
+"V_269",39,53,239,246,285,311,420,425,540,546,679,750,779,823,898,979
+"V_270",16,37,155,222,267,377,474,509,543,639,643,710,800,895,925,1005
+"V_271",5,87,138,204,272,371,405,476,525,567,744,762,792,861,986,1011
+"V_272",12,15,218,219,260,293,399,426,598,637,666,741,799,835,918,938
+"V_273",20,106,140,242,355,366,392,499,514,601,729,732,790,847,927,938
+"V_274",13,70,137,251,272,377,416,420,632,633,660,753,865,869,956,
+"V_275",15,81,161,195,273,286,388,444,525,615,675,706,803,853,922,940
+"V_276",25,39,174,213,336,351,398,435,556,567,759,762,804,808,906,936
+"V_277",42,46,143,158,282,368,464,500,522,616,686,715,845,883,986,1002
+"V_278",25,30,152,194,260,360,390,477,566,634,690,707,814,879,995,1024
+"V_279",6,22,224,345,367,410,481,532,569,694,721,854,859,1019,1023,
+"V_280",72,97,179,182,299,378,442,498,529,542,642,725,780,802,948,1008
+"V_281",34,125,190,214,269,338,434,447,550,631,653,749,858,893,958,980
+"V_282",35,73,232,245,310,341,418,511,534,639,721,747,859,878,966,1018
+"V_283",32,101,176,196,298,351,402,411,562,618,664,715,814,869,993,1013
+"V_284",44,111,166,243,273,382,414,467,522,561,670,682,773,823,897,964
+"V_285",25,100,165,171,289,380,398,469,537,636,735,745,805,874,998,1021
+"V_286",42,91,136,229,309,363,397,445,556,591,722,756,774,780,928,929
+"V_287",23,35,186,201,331,341,457,503,540,571,674,691,845,877,928,940
+"V_288",4,10,129,218,288,367,391,418,556,594,684,695,857,880,969,1008
+"V_289",18,35,142,208,274,330,414,436,609,613,683,737,856,891,917,943
+"V_290",101,107,162,241,373,378,392,434,523,568,668,749,795,801,911,987
+"V_291",33,97,149,225,277,346,419,428,526,590,696,745,829,896,926,959
+"V_292",84,86,191,200,310,318,412,457,542,626,742,744,808,851,938,1014
+"V_293",51,82,152,215,330,379,393,457,557,561,643,681,791,837,957,971
+"V_294",13,118,170,247,261,265,465,495,558,584,650,767,847,869,916,995
+"V_295",98,113,164,206,342,376,477,509,623,624,701,737,829,895,943,981
+"V_296",72,115,214,221,318,337,446,486,520,605,660,694,865,874,990,1022
+"V_297",32,109,161,191,263,371,469,474,575,581,703,743,868,885,933,987
+"V_298",66,76,181,238,341,373,401,446,517,560,690,768,856,884,961,
+"V_299",18,66,210,220,340,376,458,488,515,611,694,753,774,888,976,993
+"V_300",17,45,170,195,273,290,421,440,543,604,739,765,776,868,916,962
diff --git a/apps/rappor-analysis/params.csv b/apps/rappor-analysis/params.csv
new file mode 100644
index 0000000..b9107d5
--- /dev/null
+++ b/apps/rappor-analysis/params.csv
@@ -0,0 +1,2 @@
+"k","h","m","p","q","f"
+128,2,8,0.5,0.75,0
diff --git a/apps/rappor-analysis/run_app.sh b/apps/rappor-analysis/run_app.sh
new file mode 100755
index 0000000..b605730
--- /dev/null
+++ b/apps/rappor-analysis/run_app.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+#
+# Run the Shiny app in this directory.
+#
+# Usage:
+# ./run_app.sh [port]
+
+app_dir=$(dirname "$0")
+port=${1:-6789}
+
+# host='0.0.0.0' makes the app serve to other machines, not just localhost.
+exec R --vanilla --slave -e "shiny::runApp('$app_dir', host='0.0.0.0', port=$port)"
diff --git a/apps/rappor-analysis/server.R b/apps/rappor-analysis/server.R
new file mode 100755
index 0000000..1f3b868
--- /dev/null
+++ b/apps/rappor-analysis/server.R
@@ -0,0 +1,206 @@
+library(shiny)
+
+source("../../analysis/R/read_input.R")
+source("../../analysis/R/decode.R")
+
+# Random number associated with the session; used in exported file names.
+seed <- sample(10^6, 1)
+
+PlotCohorts <- function(x, highlighted, color = "grey") {
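+  # Draws one barplot per cohort (one row of x per cohort). 'highlighted'
+  # holds 1-based indices into the flattened m x k bit grid; the matching
+  # bars are drawn in greenyellow.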
+ n <- nrow(x)
+ k <- ncol(x)
+ if (n < 16) {
+ par(mfrow = c(n, 1), mai = c(0, .5, .5, 0))
+ } else if (n < 64) {
+ par(mfrow = c(n / 2, 2), mai = c(0, .5, .5, 0))
+ } else {
+ par(mfrow = c(n / 4, 4), mai = c(0, .5, .5, 0))
+ }
+ for (i in 1:n) {
+ cc <- rep(color, k)
+ if (!is.null(highlighted)) {
+      # Map the flat 1-based indices onto positions 1..k within this cohort.
+      # (A plain %% k would send position k to 0 and silently drop it.)
+      ind <- (highlighted[which(ceiling(highlighted / k) == i)] - 1) %% k + 1
+ cc[ind] <- "greenyellow"
+ }
+ barplot(x[i, ], main = paste0("Cohort ", i), col = cc, border = cc,
+ names.arg = "")
+ }
+}
+
+shinyServer(function(input, output, session) {
+ Params <- reactive({
+ param_file <- input$params
+ if (!is.null(param_file)) {
+ params <- ReadParameterFile(param_file$datapath)
+ updateSelectInput(session, "size", selected = params$k)
+ updateSelectInput(session, "hashes", selected = params$h)
+ updateSelectInput(session, "instances", selected = params$m)
+ updateSliderInput(session, "p", value = params$p)
+ updateSliderInput(session, "q", value = params$q)
+ updateSliderInput(session, "f", value = params$f)
+ } else {
+ params <- list(k = as.numeric(input$size),
+ h = as.numeric(input$hashes),
+ m = as.numeric(input$instances),
+ p = as.numeric(input$p),
+ q = as.numeric(input$q),
+ f = as.numeric(input$f))
+ }
+ params
+ })
+
+ Counts <- reactive({
+ params <- Params()
+ counts_file <- input$counts
+ if (is.null(counts_file)) {
+ return(NULL)
+ }
+
+ counts <- ReadCountsFile(counts_file$datapath, params)
+ updateNumericInput(session, "N", value = sum(counts[, 1]))
+ counts
+ })
+
+ output$countsUploaded <- reactive({
+    !is.null(input$counts)
+ })
+ outputOptions(output, 'countsUploaded', suspendWhenHidden=FALSE)
+
+ Map <- reactive({
+ params <- Params()
+ map_file <- input$map
+ if (is.null(map_file)) {
+ return(NULL)
+ }
+
+ map <- ReadMapFile(map_file$datapath, params)
+ updateSelectInput(session, "selected_string", choices = map$strs, selected = map$strs[1])
+ map
+ })
+
+ output$mapUploaded <- reactive({
+    !is.null(input$map)
+ })
+ outputOptions(output, 'mapUploaded', suspendWhenHidden=FALSE)
+
+ DecodingParams <- reactive({
+ list(alpha = as.numeric(input$alpha),
+ correction = input$correction)
+ })
+
+ Analyze <- reactive({
+ if (is.null(input$map) || is.null(input$counts)) {
+ return()
+ }
+ params <- Params()
+ map <- Map()
+ counts <- Counts()
+ decoding_params <- DecodingParams()
+
+ fit <- Decode(counts, map$map, params,
+ alpha = decoding_params$alpha,
+ correction = decoding_params$correction)
+ fit
+ })
+
+ # Results summary.
+ output$pr <- renderTable({
+ Analyze()$summary
+ },
+ include.rownames = FALSE, include.colnames = FALSE)
+
+ # Results table.
+ output$tab <- renderDataTable({
+ Analyze()$fit
+ },
+ options = list(iDisplayLength = 100))
+
+ # Results barplot.
+ output$res_barplot <- renderPlot({
+ fit <- Analyze()$fit
+
+ par(mai = c(2, 1, 1, .5))
+
+ bp <- barplot(fit$proportion, col = "palegreen",
+ main = "Discovered String Distribution")
+ abline(h = Analyze()$privacy[7, 2], col = "darkred", lty = 2, lwd = 2)
+ text(bp[, 1], 0, paste(fit$strings, " "), srt = 45, adj = c(1, 1), xpd = NA)
+ legend("topright", legend = "Detection Frequency", lty = 2, lwd = 2, col = "darkred",
+ bty = "n")
+ })
+
+ # Epsilon.
+ output$epsilon <- renderTable({
+ Analyze()$privacy
+ },
+ include.rownames = FALSE, include.colnames = FALSE, digits = 4)
+
+ output$map <- renderPlot({
+ image(as.matrix(Map()$map), col = c("white", "darkred"), xaxt = "n", yaxt = "n", bty = "n")
+ })
+
+ # Estimated bits patterns.
+ output$ests <- renderPlot({
+ ests <- Analyze()$ests
+ ind <- which(input$selected_string == Map()$strs)
+ high <- unlist(Map()$map_pos[ind, -1])
+ PlotCohorts(ests, high, color = "darkred")
+ })
+
+ # Collisions.
+ output$collisions <- renderPlot({
+ params <- Params()
+ map <- Map()
+ tab <- table(unlist(map$map_pos[, -1]))
+ tab <- tab[as.character(1:(params$k * params$m))]
+ tab[is.na(tab)] <- 0
+ tab <- matrix(tab, nrow = params$m, byrow = TRUE)
+
+ ind <- which(input$selected_string == map$strs)
+ high <- unlist(map$map_pos[ind, -1])
+
+ PlotCohorts(tab, high, color = "navajowhite")
+ })
+
+ # Observed counts.
+ output$counts <- renderPlot({
+ counts <- as.matrix(Analyze()$counts)
+ ind <- which(input$selected_string == Map()$strs)
+ high <- unlist(Map()$map_pos[ind, -1])
+ PlotCohorts(counts, high, color = "darkblue")
+ })
+
+ # Downloadable datasets.
+ output$download_fit <- downloadHandler(
+ filename = function() { paste("results_", seed, "_", date(), '.csv', sep='') },
+ content = function(file) {
+ write.csv(Analyze()$fit, file, row.names = FALSE)
+ }
+ )
+
+ output$download_summary <- downloadHandler(
+ filename = function() { paste("summary_", seed, "_", date(), '.csv', sep='') },
+ content = function(file) {
+ write.csv(rbind(Analyze()$summary, Analyze()$privacy, Analyze()$params),
+ file, row.names = FALSE)
+ }
+ )
+
+ output$example_params <- renderTable({
+ as.data.frame(ReadParameterFile("params.csv"))
+ },
+ include.rownames = FALSE)
+
+ output$example_counts <- renderTable({
+ counts <- ReadCountsFile("counts.csv")[, 1:15]
+ cbind(counts, rep("...", nrow(counts)))
+ },
+ include.rownames = FALSE, include.colnames = FALSE)
+
+ output$example_map <- renderTable({
+ map <- ReadMapFile("map.csv", ReadParameterFile("params.csv"))
+ map$map_pos[1:10, ]
+ },
+ include.rownames = FALSE, include.colnames = FALSE)
+
+})
diff --git a/apps/rappor-analysis/test.csv b/apps/rappor-analysis/test.csv
new file mode 100644
index 0000000..d6105ae
--- /dev/null
+++ b/apps/rappor-analysis/test.csv
@@ -0,0 +1,20 @@
+[1] String Estimate St.Dev P.value Proportion SNR
+<0 rows> (or 0-length row.names)
+SUMMARY
+ parameters values
+1 Candidate strings 300.000
+2 Detected strings 0.000
+3 Discovered Prop (out of N) 0.000
+4 Explained Variance 0.000
+5 Missing Variance 0.988
+6 Noise Variance 0.012
+7 Theoretical Noise Std. Dev. 2236.068
+PRIVACY
+ parameters values
+1 Effective p 0.500000000
+2 Effective q 0.750000000
+3 exp(e_1) 9.000000000
+4 e_1 2.197224577
+5 exp(e_inf) Inf
+6 e_inf Inf
+7 Detection frequency 0.001040297
diff --git a/apps/rappor-analysis/ui.R b/apps/rappor-analysis/ui.R
new file mode 100755
index 0000000..3230726
--- /dev/null
+++ b/apps/rappor-analysis/ui.R
@@ -0,0 +1,96 @@
+library(shiny)
+
+shinyUI(pageWithSidebar(
+ headerPanel("RAPPOR Analysis"),
+ sidebarPanel(
+ tabsetPanel(tabPanel("Input",
+ fileInput('params', 'Select Params File (optional)',
+                                  accept=c('text/csv', 'text/comma-separated-values,text/plain', '.csv')),
+ fileInput('counts', 'Select Counts File',
+                                  accept=c('text/csv', 'text/comma-separated-values,text/plain', '.csv')),
+ fileInput('map', 'Select Map File',
+                                  accept=c('text/csv', 'text/comma-separated-values,text/plain', '.csv')),
+ br(),
+ br()
+ ),
+ tabPanel("RAPPOR",
+ selectInput("size", "Bloom filter size:",
+ c(64, 128, 256, 512, 1024, 2048, 4096),
+ selected = 128),
+ selectInput("hashes", "Number of hash functions:",
+ c(1, 2, 4, 8, 16, 32),
+ selected = 2),
+ selectInput("instances", "Number of cohorts:",
+ c(1, 2, 4, 8, 16, 32, 64),
+ selected = 8),
+ numericInput("N", "Number of reports", 0),
+ br(),
+ br(),
+ br()
+ ),
+ tabPanel("Privacy",
+ sliderInput("p", "Probability of reporting noise (p):",
+ min = .01, max = .99, value = .5, step = .01),
+ sliderInput("q", "Probability of reporting signal (q):",
+ min = .01, max = .99, value = .75, step = .01),
+ sliderInput("f", "Probability of lies (f):",
+ min = 0, max = .99, value = .5, step = .01),
+ br(),
+ htmlOutput("epsilon"),
+ br(),
+                        helpText("* In addition to p, q and f, the number of hash functions (set in the RAPPOR tab) also affects privacy guarantees."),
+ br(),
+ br(),
+ br()
+ ),
+ tabPanel("Decoding",
+ sliderInput("alpha", "Alpha - probability of false positive:",
+ min = .01, max = 1, value = .05, step = .01),
+ br(),
+ selectInput("correction", "Multiple testing correction",
+ c("None", "Bonferroni", "FDR"),
+ selected = "FDR"),
+ br(),
+ br()
+ )
+ ),
+ conditionalPanel(
+ condition = "output.countsUploaded && output.mapUploaded",
+ helpText(actionButton("run", "Run Analysis"), align = "center")
+ ),
+ br(),
+ br(),
+ helpText("Version 0.1", align = "center"),
+ helpText(a("RAPPOR Paper", href="http://arxiv.org/abs/1407.6981"), align = "center")),
+ mainPanel(
+ conditionalPanel(
+ condition = "!output.countsUploaded || !output.mapUploaded",
+ helpText(h2("Welcome to the RAPPOR Analysis Tool")),
+ helpText("To analyze a RAPPOR collection, please upload three files:"),
+ helpText(h3("1. Params file"), "This file specifies the 6 parameters that were used to encode RAPPOR reports. An example is shown below. It must have column names in the header line, 6 columns in this order, and 1 row. "),
+ htmlOutput("example_params"),
+ helpText(h3("2. Counts file"), "Required. This file must have as many rows as cohorts. The first column contains the number of reports in the cohort. The remaining k columns specify the number of times the corresponding bit was set in all reports (in the corresponding cohort). This file cannot have a CSV header line."),
+ htmlOutput("example_counts"),
+ helpText(h3("3. Map file"), "Required. The first column contains candidate strings. The remaining columns show which bit each string is hashed to within each cohort. Indices are specified in the extended format, starting with index 1 (not 0!). Because we do not specify a cohort in the map file, indices must be adjusted in the following way. For example, if bits i and j are set in cohort 2, then their corresponding indices are i + k and j + k in the map file. The number of columns is equal to (h * m). This file cannot have a CSV header line."),
+ htmlOutput("example_map")
+ ),
+ conditionalPanel(
+ condition = "output.countsUploaded && output.mapUploaded",
+ tabsetPanel(
+ tabPanel("Results",
+ helpText(h3("Summary")), htmlOutput("pr"), br(),
+ downloadButton('download_summary', 'Download Summary'),
+ downloadButton('download_fit', 'Download Results'),
+ br(), br(), dataTableOutput("tab")),
+ tabPanel("Distribution", plotOutput("res_barplot", height = "800px")),
+ tabPanel("Observed Counts",
+ selectInput("selected_string", "Select String",
+ ""),
+ plotOutput("counts", height = "800px")),
+ tabPanel("Estimated Counts", plotOutput("ests", height = "800px")),
+ tabPanel("Collision Counts", plotOutput("collisions", height = "800px")),
+ tabPanel("Map", plotOutput("map", height = "800px"))
+ )
+ )
+ )
+ ))
diff --git a/apps/rappor-sim/run_app.sh b/apps/rappor-sim/run_app.sh
new file mode 100755
index 0000000..ae38c61
--- /dev/null
+++ b/apps/rappor-sim/run_app.sh
@@ -0,0 +1,15 @@
+#!/bin/sh
+#
+# Run the Shiny app in this directory.
+#
+# Usage:
+# ./run_app.sh [port]
+
+app_dir=$(dirname "$0")
+port=${1:-6788}
+
+# Needed by source.rappor in analysis/R/*.R
+export RAPPOR_REPO=../../
+
+# host='0.0.0.0' makes the app serve to other machines, not just localhost.
+exec R --vanilla --slave -e "shiny::runApp('$app_dir', host='0.0.0.0', port=$port)"
diff --git a/apps/rappor-sim/server.R b/apps/rappor-sim/server.R
new file mode 100755
index 0000000..f4a847b
--- /dev/null
+++ b/apps/rappor-sim/server.R
@@ -0,0 +1,156 @@
+library(shiny)
+source("../../analysis/R/decode.R")
+source("../../analysis/R/simulation.R")
+source("../../analysis/R/encode.R")
+
+Plot <- function(x, color = "grey") {
+ n <- nrow(x)
+ if (n < 16) {
+ par(mfrow = c(n, 1), mai = c(0, .5, .5, 0))
+ } else if (n < 64) {
+ par(mfrow = c(n / 2, 2), mai = c(0, .5, .5, 0))
+ } else {
+ par(mfrow = c(n / 4, 4), mai = c(0, .5, .5, 0))
+ }
+ for (i in 1:nrow(x)) {
+ barplot(x[i, ], main = paste0("Cohort ", i), col = color, border = color)
+ }
+}
+
+shinyServer(function(input, output) {
+ # Example state global variable.
+ es <- list()
+
+ # Example buttons states.
+ ebs <- rep(0, 3)
+
+ Params <- reactive({
+ list(k = as.numeric(input$size),
+ h = as.numeric(input$hashes),
+ m = as.numeric(input$instances),
+ p = as.numeric(input$p),
+ q = as.numeric(input$q),
+ f = as.numeric(input$f))
+ })
+
+ PopParams <- reactive({
+ list(as.numeric(input$nstrs),
+ as.numeric(input$nonzero),
+ input$decay,
+ as.numeric(input$expo),
+ as.numeric(input$background)
+ )
+ })
+
+ DecodingParams <- reactive({
+ list(as.numeric(input$alpha),
+ input$correction)
+ })
+
+ Sample <- reactive({
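+    # Referencing input$sample makes this reactive re-run whenever the
+    # "Rerun Simulations" button is pressed.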
+ input$sample
+ N <- input$N
+ params <- Params()
+ pop_params <- PopParams()
+ decoding_params <- DecodingParams()
+ prop_missing <- input$missing
+ fit <- GenerateSamples(N, params, pop_params,
+ alpha = decoding_params[[1]],
+ correction = decoding_params[[2]],
+ prop_missing = prop_missing)
+ fit
+ })
+
+ # Results summary.
+ output$pr <- renderTable({
+ Sample()$summary
+ },
+ include.rownames = FALSE, include.colnames = FALSE)
+
+ # Results table.
+ output$tab <- renderDataTable({
+ Sample()$fit
+ },
+ options = list(iDisplayLength = 100))
+
+ # Epsilon.
+ output$epsilon <- renderTable({
+ Sample()$privacy
+ },
+ include.rownames = FALSE, include.colnames = FALSE, digits = 4)
+
+ # True distribution.
+ output$probs <- renderPlot({
+ samp <- Sample()
+ probs <- samp$probs
+ detected <- match(samp$fit[, 1], samp$strs)
+ detection_frequency <- samp$privacy[7, 2]
+ PlotPopulation(probs, detected, detection_frequency)
+ })
+
+ # True bits patterns.
+ output$truth <- renderPlot({
+ truth <- Sample()$truth
+ Plot(truth[, -1, drop = FALSE], color = "darkblue")
+ })
+
+ # Lasso plot.
+ output$lasso <- renderPlot({
+ fit <- Sample()$lasso
+ if (!is.null(fit)) {
+ plot(fit)
+ }
+ })
+
+ output$resid <- renderPlot({
+ resid <- Sample()$residual
+ params <- Params()
+ plot(resid, xlab = "Bloom filter bits", ylab = "Residuals")
+ abline(h = c(-1.96, 1.96), lty = 2, col = 2)
+ sq <- qnorm(.025 / length(resid))
+ abline(h = c(sq, -sq), lty = 2, col = 3, lwd = 2)
+ abline(h = c(-3, 3), lty = 2, col = 4, lwd = 2)
+ abline(v = params$k * (0:params$m), lty = 2, col = "blue")
+ legend("topright", legend = paste0("SD = ", round(sd(resid), 2)), bty = "n")
+ })
+
+ # Estimated bits patterns.
+ output$ests <- renderPlot({
+ ests <- Sample()$ests
+ Plot(ests, color = "darkred")
+ })
+
+ # Estimated vs truth.
+ output$ests_truth <- renderPlot({
+ plot(unlist(Sample()$ests), unlist(Sample()$truth[, -1]),
+ xlab = "Estimates", ylab = "Truth", pch = 19)
+ abline(0, 1, lwd = 4, col = "darkred")
+ })
+
+ output$example <- renderPlot({
+ params <- Params()
+ strs <- Sample()$strs
+ map <- Sample()$map
+ samp <- Sample()
+
+ # First run on app start.
+ value <- sample(strs, 1)
+ res <- Encode(value, map, strs, params, N = input$N)
+
+ if (input$new_user > ebs[1]) {
+ res <- Encode(es$value, map, strs, params, N = input$N)
+ ebs[1] <<- input$new_user
+ } else if (input$new_value > ebs[2]) {
+ res <- Encode(value, map, strs, params, cohort = es$cohort, id = es$id,
+ N = input$N)
+ ebs[2] <<- input$new_value
+ } else if (input$new_report > ebs[3]) {
+ res <- Encode(es$value, map, strs, params, B = es$B,
+ BP = es$BP, cohort = es$cohort, id = es$id, N = input$N)
+ ebs[3] <<- input$new_report
+ }
+ es <<- res
+ ExamplePlot(res, params$k, c(ebs, input$new_user, input$new_value, input$new_report))
+ })
+
+})
diff --git a/apps/rappor-sim/ui.R b/apps/rappor-sim/ui.R
new file mode 100755
index 0000000..9a1eb63
--- /dev/null
+++ b/apps/rappor-sim/ui.R
@@ -0,0 +1,92 @@
+library(shiny)
+
+shinyUI(pageWithSidebar(
+ headerPanel("RAPPOR Simulation"),
+ sidebarPanel(
+ tabsetPanel(
+ tabPanel("RAPPOR",
+ selectInput("size", "Bloom filter size:",
+ c(4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096),
+ selected = 128),
+ selectInput("hashes", "Number of hash functions:",
+ c(1, 2, 4, 8, 16, 32),
+ selected = 2),
+ selectInput("instances", "Number of cohorts:",
+ c(1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024),
+ selected = 8),
+ br(),
+ br(),
+ sliderInput("N", "Number of samples to generate:",
+ min = 100000, max = 10000000,
+ value = 1000000, step = 100000),
+ br(),
+ helpText(actionButton("sample", "Rerun Simulations"), align = "center"),
+ br(),
+ br(),
+ helpText("Version 0.1", align = "center"),
+ helpText(a("RAPPOR Repository", href="http://github.com/google/rappor"), align = "center")),
+ tabPanel("Privacy",
+ sliderInput("p", "Probability of reporting noise (p):",
+ min = .01, max = .99, value = .5, step = .01),
+ sliderInput("q", "Probability of reporting signal (q):",
+ min = .01, max = .99, value = .75, step = .01),
+ sliderInput("f", "Probability of lies (f):",
+ min = 0, max = .99, value = .5, step = .01),
+ br(),
+ htmlOutput("epsilon"),
+ br(),
+                      helpText("* In addition to p, q and f, the number of hash functions (set in the RAPPOR tab) also affects privacy guarantees."),
+ br(),
+ br(),
+ br()
+ ),
+ tabPanel("Population",
+ sliderInput("nstrs", "Number of strings:",
+ min = 100, max = 10000, value = 300, step = 100),
+ br(),
+ sliderInput("nonzero", "Proportion of non-zero strings:",
+ min = .1, max = 1, value = 0.5, step = .1),
+ br(),
+ selectInput("decay", "Decay of non-zero strings",
+ c("Linear", "Exponential", "Constant"),
+ selected = "Exponential"),
+ br(),
+ conditionalPanel(condition = "input.decay == 'Exponential'",
+ sliderInput("expo", "Rate of exponential decay",
+ min = 1, max = 200, value = 10, step = 1)),
+ sliderInput("background", "Frequency of background strings:",
+ min = 0, max = .2, value = .05, step = .01),
+ br(),
+ br(),
+ br()
+ ),
+ tabPanel("Decoding",
+ sliderInput("alpha", "Alpha - probability of false positive:",
+ min = .01, max = .3, value = .05, step = .01),
+ br(),
+ selectInput("correction", "Multiple testing correction",
+ c("None", "Bonferroni", "FDR"),
+ selected = "FDR"),
+ br(),
+ sliderInput("missing", "Proportion of non-zero strings missing from decoding:",
+ min = 0, max = 1, value = 0, step = .1),
+ br()
+ )
+ )),
+ mainPanel(
+ tabsetPanel(
+ tabPanel("Life of a Report",
+ actionButton("new_user", "New Participant"),
+ actionButton("new_value", "New Value"),
+ actionButton("new_report", "New Report"),
+ plotOutput("example", height = "600px")),
+ tabPanel("Population", plotOutput("probs", height = "600px")),
+ tabPanel("Results", helpText(h3("Summary")), htmlOutput("pr"), br(), br(), dataTableOutput("tab")),
+ tabPanel("True Bits", plotOutput("truth", height = "800px")),
+ tabPanel("Estimated Bits", plotOutput("ests", height = "800px")),
+ tabPanel("Estimates vs Truth", plotOutput("ests_truth", height = "600px")),
+ # tabPanel("Lasso", plotOutput("lasso", height = "600px")),
+ tabPanel("Residuals", plotOutput("resid", height = "600px"))
+ )
+ )
+ ))
diff --git a/bin/README.md b/bin/README.md
new file mode 100644
index 0000000..f4262bf
--- /dev/null
+++ b/bin/README.md
@@ -0,0 +1,51 @@
+Command Line Tools
+==================
+
+This directory contains command line tools for RAPPOR analysis.
+
+Analysis Tools
+--------------
+
+### decode-dist
+
+Decode a distribution -- requires a "counts" file (summed bits from reports),
+map file, and a params file. See `test.sh decode-dist` in this dir for an
+example.
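+
+A sketch of an invocation (file names are placeholders; the flags are the ones
+declared in `decode_dist.R`). It writes `results.csv`, `residual.png`, and
+`metrics.json` to the output directory:
+
+    ./decode-dist --counts counts.csv --map map.csv --params params.csv \
+      --output-dir results/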
+
+### decode-assoc
+
+Decode a joint distribution between 2 variables ("association analysis"). See
+`test.sh decode-assoc-R` or `test.sh decode-assoc-cpp` in this dir for an
+example.
+
+Currently it only supports associating strings vs. booleans.
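+
+A sketch of an invocation (paths and variable names are placeholders; the
+flags are the ones declared in `decode_assoc.R`, and `--create-bool-map` is
+required while the boolean variable has no native map). It writes
+`assoc-results.csv` and `assoc-metrics.json` to the output directory:
+
+    ./decode-assoc --metric-name exampleMetric \
+      --reports reports.csv --schema schema.csv --params-dir params \
+      --var1 domain --var2 flag --map1 map.csv \
+      --create-bool-map --output-dir results/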
+
+### Setup
+
+Both of these tools are written in R, and require several R libraries to be
+installed (see `../setup.sh r-packages`).
+
+`decode-assoc` also shells out to a native binary written in C++ if
+`--em-executable` is passed. This requires a C++ compiler (see
+`analysis/cpp/run.sh`). You can run `test.sh decode-assoc-cpp` to test it.
+
+
+Helper Tools
+------------
+
+These are simple Python implementations of tools needed for analysis. At
+Google, Chrome uses alternative C++/Go implementations of these tools.
+
+### sum-bits
+
+Given a CSV file with RAPPOR reports (IRRs), produce a "counts" CSV file on
+stdout. This is the `m x (k+1)` matrix that is used in the R analysis (where m
+= #cohorts and k = report width in bits).
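+
+A usage sketch (illustrative file names; the wrapper passes the params file on
+to `sum_bits.py`, which reads the report CSV on stdin):
+
+    ./sum-bits params.csv < reports.csv > counts.csv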
+
+### hash-candidates
+
+Given a list of candidates on stdin, produce a CSV file of hashes (the "map
+file"). Each row has `m x h` cells (where m = #cohorts and h = #hashes)
+
+See the `regtest.sh` script for examples of how these tools are invoked.
+
diff --git a/bin/decode-assoc b/bin/decode-assoc
new file mode 100644
index 0000000..aaa2050
--- /dev/null
+++ b/bin/decode-assoc
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Decode multidimensional reports.
+#
+# This is a tiny shell wrapper around R.
+
+readonly THIS_DIR=$(dirname "$0")
+
+# NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string
+# concatenation to form the absolute path. (file.path() in R doesn't do what
+# we want.)
+
+readonly RAPPOR_REPO=$THIS_DIR/../
+
+# RAPPOR_REPO is used by source() statements to find .R files.
+export RAPPOR_REPO
+
+# exec replaces this shell with the R process, so there is a single process
+# that can be killed easily.
+exec $THIS_DIR/decode_assoc.R "$@"
diff --git a/bin/decode-dist b/bin/decode-dist
new file mode 100644
index 0000000..147e41c
--- /dev/null
+++ b/bin/decode-dist
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Decode a distribution from summed RAPPOR reports.
+#
+# This is a tiny shell wrapper around R.
+
+readonly THIS_DIR=$(dirname "$0")
+
+# NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string
+# concatenation to form the absolute path. (file.path() in R doesn't do what
+# we want.)
+
+readonly RAPPOR_REPO=$THIS_DIR/../
+
+# RAPPOR_REPO is used by source() statements to find .R files.
+export RAPPOR_REPO
+
+# exec replaces this shell with the R process, so there is a single process
+# that can be killed easily.
+exec $THIS_DIR/decode_dist.R "$@"
diff --git a/bin/decode_assoc.R b/bin/decode_assoc.R
new file mode 100755
index 0000000..58e35f2
--- /dev/null
+++ b/bin/decode_assoc.R
@@ -0,0 +1,429 @@
+#!/usr/bin/env Rscript
+#
+# Command line tool to decode multidimensional reports. It's a simple wrapper
+# around functions in association.R.
+
+library(optparse)
+
+#
+# Command line parsing. Do this first before loading libraries to catch errors
+# quickly. Loading libraries in R is slow.
+#
+
+# Display an error string and quit.
+UsageError <- function(...) {
+ cat(sprintf(...))
+ cat('\n')
+ quit(status = 1)
+}
+
+option_list <- list(
+ make_option(
+ "--metric-name", dest="metric_name", default="",
+ help="Name of the metric; metrics contain variables (required)"),
+ make_option(
+ "--reports", default="",
+ help="CSV file with reports; each variable is a column (required)"),
+ make_option(
+ "--schema", default="",
+ help="CSV file with variable types and metadata (required)"),
+ make_option(
+ "--params-dir", dest="params_dir", default="",
+ help="Directory where parameter CSV files are stored (required)"),
+
+ make_option(
+ "--var1", default="",
+ help="Name of first variable (required)"),
+ make_option(
+ "--var2", default="",
+ help="Name of second variable (required)"),
+
+ make_option(
+ "--map1", default="",
+ help="Path to map file, if var1 is a string"),
+ make_option(
+ "--map2", default="",
+ help="Path to map file, if var2 is a string"),
+
+ make_option(
+ "--output-dir", dest="output_dir", default=".",
+ help="Output directory (default .)"),
+
+ make_option(
+ "--create-bool-map", dest="create_bool_map", default=FALSE,
+ action="store_true",
+ help="Hack to use string RAPPOR to analyze boolean variables."),
+ make_option(
+ "--remove-bad-rows", dest="remove_bad_rows", default=FALSE,
+ action="store_true",
+ help="Whether we should remove rows where any value is missing (by
+ default, the program aborts with an error)"),
+
+ # Options that speed it up
+ make_option(
+ "--reports-sample-size", dest="reports_sample_size", default=-1,
+ help="Only analyze a random sample of this size. This is for
+ limiting the execution time at the expense of accuracy."),
+ make_option(
+ "--num-cores", dest="num_cores", default=1,
+ help="Number of cores for mclapply to use. Speeds up the parts
+ of the computation proportional to the number of reports,
+ EXCEPT the EM step, which can be sped up by native code."),
+ make_option(
+ "--max-em-iters", dest="max_em_iters", default=1000,
+ help="Maximum number of EM iterations"),
+ make_option(
+ "--em-executable", dest="em_executable", default="",
+ help="Shell out to this executable for an accelerated implementation
+ of EM."),
+ make_option(
+ "--tmp-dir", dest="tmp_dir", default="/tmp",
+ help="Use this tmp dir to communicate with the EM executable")
+)
+
+ParseOptions <- function() {
+ # NOTE: This API is bad; if you add positional_arguments, the return value
+ # changes!
+ parser <- OptionParser(option_list = option_list)
+ opts <- parse_args(parser)
+
+ if (opts$metric_name == "") {
+ UsageError("--metric-name is required.")
+ }
+  if (opts$reports == "") {
+ UsageError("--reports is required.")
+ }
+ if (opts$schema == "") {
+ UsageError("--schema is required.")
+ }
+ if (opts$params_dir == "") {
+ UsageError("--params-dir is required.")
+ }
+ if (opts$var1 == "") {
+ UsageError("--var1 is required.")
+ }
+ if (opts$var2 == "") {
+ UsageError("--var2 is required.")
+ }
+
+ return(opts)
+}
+
+if (!interactive()) {
+ opts <- ParseOptions()
+}
+
+#
+# Load libraries and source our own code.
+#
+
+library(RJSONIO) # toJSON()
+
+# So we don't have to change pwd
+source.rappor <- function(rel_path) {
+ abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+ source(abs_path)
+}
+
+source.rappor("analysis/R/association.R")
+source.rappor("analysis/R/fast_em.R")
+source.rappor("analysis/R/read_input.R")
+source.rappor("analysis/R/util.R")
+
+options(stringsAsFactors = FALSE)
+options(max.print = 100) # So our structure() debug calls look better
+
+CreateAssocStringMap <- function(all_cohorts_map, params) {
+  # Processes the map loaded by ReadMapFile and turns it into something that
+  # association.R can use: namely, a map per cohort.
+ #
+ # Arguments:
+ # all_cohorts_map: map matrix, as for single variable analysis
+ # params: encoding parameters
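+  #
+  # Returns:
+  #   A list with all_cohorts_map (the full (m * k) x candidates matrix) and
+  #   map_by_cohort (a list of m slices, each holding one cohort's k rows).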
+
+ if (nrow(all_cohorts_map) != (params$m * params$k)) {
+ stop(sprintf(
+ "Map matrix has invalid dimensions: m * k = %d, nrow(map) = %d",
+ params$m * params$k, nrow(all_cohorts_map)))
+ }
+
+ k <- params$k
+ map_by_cohort <- lapply(0 : (params$m-1), function(cohort) {
+ begin <- cohort * k
+ end <- (cohort + 1) * k
+ all_cohorts_map[(begin+1) : end, ]
+ })
+
+ list(all_cohorts_map = all_cohorts_map, map_by_cohort = map_by_cohort)
+}
+
+# Hack to create a map for booleans. We should use closed-form formulas instead.
+CreateAssocBoolMap <- function(params) {
+ names <- c("FALSE", "TRUE")
+
+ map_by_cohort <- lapply(1:params$m, function(unused_cohort) {
+ # The (1,1) cell is false and the (1,2) cell is true.
+ m <- sparseMatrix(c(1), c(2), dims = c(1, 2))
+ colnames(m) <- names
+ m
+ })
+
+ all_cohorts_map <- sparseMatrix(1:params$m, rep(2, params$m))
+ colnames(all_cohorts_map) <- names
+
+ list(map_by_cohort = map_by_cohort, all_cohorts_map = all_cohorts_map)
+}
+
+ResultMatrixToDataFrame <- function(m, string_var_name, bool_var_name) {
+ # Args:
+ # m: A 2D matrix as output by ComputeDistributionEM, e.g.
+ # bing.com yahoo.com google.com Other
+ # TRUE 0.2718526 0.1873424 0.19637704 0.003208933
+ # Other 0.1404581 0.1091826 0.08958427 0.001994163
+ # Returns:
+ # A flattened data frame, e.g.
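+  #   (illustrative, continuing the input above; the first two columns take
+  #   their names from bool_var_name and string_var_name, and the boolean
+  #   labels are forced to TRUE/FALSE below):
+  #
+  #     TRUE   bing.com  0.2718526
+  #     FALSE  bing.com  0.1404581
+  #     TRUE   yahoo.com 0.1873424
+  #     ...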
+
+ # Name the dimensions of the matrix.
+ dim_names <- list()
+ # TODO: generalize this. Right now we're assuming the first dimension is
+ # boolean.
+ dim_names[[bool_var_name]] <- c('TRUE', 'FALSE')
+ dim_names[[string_var_name]] <- dimnames(m)[[2]]
+
+ dimnames(m) <- dim_names
+
+ # http://stackoverflow.com/questions/15885111/create-data-frame-from-a-matrix-in-r
+ fit_df <- as.data.frame(as.table(m))
+
+ # The as.table conversion gives you a Freq column. Call it "proportion" to
+ # be consistent with single variable analysis.
+ colnames(fit_df)[colnames(fit_df) == "Freq"] <- "proportion"
+
+ fit_df
+}
+
+main <- function(opts) {
+ Log("decode-assoc")
+ Log("argv:")
+ print(commandArgs(TRUE))
+
+ schema <- read.csv(opts$schema)
+ Log("Read %d vars from schema", nrow(schema))
+
+ schema1 <- schema[schema$metric == opts$metric_name &
+ schema$var == opts$var1, ]
+ if (nrow(schema1) == 0) {
+ UsageError("Couldn't find metric '%s', field '%s' in schema",
+ opts$metric_name, opts$var1)
+ }
+ schema2 <- schema[schema$metric == opts$metric_name &
+                    schema$var == opts$var2, ]
+ if (nrow(schema2) == 0) {
+ UsageError("Couldn't find metric '%s', field '%s' in schema",
+ opts$metric_name, opts$var2)
+ }
+
+ if (schema1$params != schema2$params) {
+ UsageError('var1 and var2 should have the same params (%s != %s)',
+ schema1$params, schema2$params)
+ }
+ params_name <- schema1$params
+ params_path <- file.path(opts$params_dir, paste0(params_name, '.csv'))
+ params <- ReadParameterFile(params_path)
+
+ var1_type <- schema1$var_type
+ var2_type <- schema2$var_type
+
+ # Right now we're assuming that --var1 is a string and --var2 is a boolean.
+ # TODO: Remove these limitations.
+ if (var1_type != "string") {
+ UsageError("Variable 1 should be a string (%s is of type %s)", opts$var1,
+ var1_type)
+ }
+ if (var2_type != "boolean") {
+ UsageError("Variable 2 should be a boolean (%s is of type %s)", opts$var2,
+ var2_type)
+ }
+
+ if (opts$map1 == "") {
+ UsageError("--map1 must be provided when --var1 is a string (var = %s)",
+ opts$var1)
+ }
+
+ # Example cache speedup for 100k map file: 31 seconds to load map and write
+ # cache; vs 2.2 seconds to read cache.
+ string_params <- params
+ map <- LoadMapFile(opts$map1, string_params)
+
+ # Important: first column is cohort (integer); the rest are variables, which
+ # are ASCII bit strings.
+ reports <- read.csv(opts$reports, colClasses=c("character"), as.is = TRUE)
+
+ Log("Read %d reports. Preview:", nrow(reports))
+ print(head(reports))
+ cat('\n')
+
+ # Filter bad reports first
+ is_empty1 <- reports[[opts$var1]] == ""
+ is_empty2 <- reports[[opts$var2]] == ""
+ Log('Found %d blank values in %s', sum(is_empty1), opts$var1)
+ Log('Found %d blank values in %s', sum(is_empty2), opts$var2)
+
+ is_empty <- is_empty1 | is_empty2 # boolean vectors
+ Log('%d bad rows', sum(is_empty))
+ if (sum(is_empty) > 0) {
+ if (opts$remove_bad_rows) {
+ reports <- reports[!is_empty, ]
+ Log('Removed %d rows, giving %d rows', sum(is_empty), nrow(reports))
+ } else {
+ stop("Found bad rows and --remove-bad-rows wasn't passed")
+ }
+ }
+
+ N <- nrow(reports)
+
+ if (N == 0) {
+ # Use an arbitrary error code when there is nothing to analyze, so we can
+ # distinguish this from more serious failures.
+ Log("No reports to analyze. Exiting with code 9.")
+ quit(status = 9)
+ }
+
+ # Sample reports if specified.
+ if (opts$reports_sample_size != -1) {
+ if (N > opts$reports_sample_size) {
+ indices <- sample(1:N, opts$reports_sample_size)
+ reports <- reports[indices, ]
+ Log("Created a sample of %d reports", nrow(reports))
+ } else {
+ Log("Got less than %d reports, not sampling", opts$reports_sample_size)
+ }
+ }
+
+ num_vars <- 2 # hard-coded for now, since there is --var1 and --var2.
+
+ # Convert strings to integers
+ cohorts <- as.integer(reports$cohort)
+
+ # Hack for Chrome: like AdjustCounts in decode_dist.R.
+ cohorts <- cohorts %% params$m
+
+ # Assume the input has 0-based cohorts, and change to 1-based cohorts.
+ cohorts <- cohorts + 1
+
+ # i.e. create a list of length 2, with identical cohorts.
+ # NOTE: Basic RAPPOR doesn't need cohorts.
+ cohorts_list <- rep(list(cohorts), num_vars)
+
+ # TODO: We should use the closed-form formulas rather than calling the
+ # solver, and not require this flag.
+ if (!opts$create_bool_map) {
+ stop("ERROR: pass --create-bool-map to analyze booleans.")
+ }
+
+ bool_params <- params
+  # HACK: Treat the boolean as a 1-bit string report; the Decode() step uses
+  # k. (R copies params on modification, so the original is untouched.)
+ bool_params$k <- 1
+
+ params_list <- list(bool_params, string_params)
+
+ Log('CreateAssocStringMap')
+ string_map <- CreateAssocStringMap(map$map, params)
+
+ Log('CreateAssocBoolMap')
+ bool_map <- CreateAssocBoolMap(params)
+
+ map_list <- list(bool_map, string_map)
+
+ string_var <- reports[[opts$var1]]
+ bool_var <- reports[[opts$var2]]
+
+ Log('Preview of string var:')
+ print(head(table(string_var)))
+ cat('\n')
+
+ Log('Preview of bool var:')
+ print(head(table(bool_var)))
+ cat('\n')
+
+ # Split ASCII strings into array of numerics (as required by association.R)
+
+ Log('Splitting string reports (%d cores)', opts$num_cores)
+ string_reports <- mclapply(string_var, function(x) {
+ # function splits strings and converts them to numeric values
+ # rev needed for endianness
+ rev(as.integer(strsplit(x, split = "")[[1]]))
+ }, mc.cores = opts$num_cores)
+
+ Log('Splitting bool reports (%d cores)', opts$num_cores)
+  # Has to be a list of length-1 integer vectors
+ bool_reports <- mclapply(bool_var, function(x) {
+ as.integer(x)
+ }, mc.cores = opts$num_cores)
+
+ reports_list <- list(bool_reports, string_reports)
+
+ Log('Association for %d vars', length(reports_list))
+
+ if (opts$em_executable != "") {
+ Log('Will shell out to %s for native EM implementation', opts$em_executable)
+ em_iter_func <- ConstructFastEM(opts$em_executable, opts$tmp_dir)
+ } else {
+ Log('Will use R implementation of EM (slow)')
+ em_iter_func <- EM
+ }
+
+ assoc_result <- ComputeDistributionEM(reports_list, cohorts_list, map_list,
+ ignore_other = FALSE,
+ params_list = params_list,
+ marginals = NULL,
+ estimate_var = FALSE,
+ num_cores = opts$num_cores,
+ em_iter_func = em_iter_func,
+ max_em_iters = opts$max_em_iters)
+
+ # This happens if the marginal can't be decoded.
+ if (is.null(assoc_result)) {
+ stop("ComputeDistributionEM failed.")
+ }
+
+ # NOTE: It would be nicer if reports_list, cohorts_list, etc. were indexed by
+ # names like 'domain' rather than numbers, and the result assoc_result$fit
+ # matrix had corresponding named dimensions. Instead we call
+ # ResultMatrixToDataFrame to do this.
+
+ fit <- assoc_result$fit
+ fit_df <- ResultMatrixToDataFrame(fit, opts$var1, opts$var2)
+
+ Log("Association results:")
+ print(fit_df)
+ cat('\n')
+
+ results_csv_path <- file.path(opts$output_dir, 'assoc-results.csv')
+ write.csv(fit_df, file = results_csv_path, row.names = FALSE)
+ Log("Wrote %s", results_csv_path)
+
+ # Measure elapsed time as close to the end as possible
+ total_elapsed_time <- proc.time()[['elapsed']]
+
+ metrics <- list(num_reports = N,
+ reports_sample_size = opts$reports_sample_size,
+ # fit is a matrix
+ estimate_dimensions = dim(fit),
+ # should sum to near 1.0
+ sum_estimates = sum(fit),
+ total_elapsed_time = total_elapsed_time,
+ em_elapsed_time = assoc_result$em_elapsed_time,
+ num_em_iters = assoc_result$num_em_iters)
+
+ metrics_json_path <- file.path(opts$output_dir, 'assoc-metrics.json')
+ writeLines(toJSON(metrics), con = metrics_json_path)
+ Log("Wrote %s", metrics_json_path)
+
+ Log('DONE decode-assoc')
+}
+
+if (!interactive()) {
+ main(opts)
+}
diff --git a/bin/decode_dist.R b/bin/decode_dist.R
new file mode 100755
index 0000000..5c83f74
--- /dev/null
+++ b/bin/decode_dist.R
@@ -0,0 +1,144 @@
+#!/usr/bin/env Rscript
+#
+# Command line tool to decode a RAPPOR data set. It is a simple wrapper for
+# Decode() in decode.R.
+
+library(optparse)
+
+#
+# Command line parsing. Do this first before loading libraries to catch errors
+# quickly. Loading libraries in R is slow.
+#
+
+# For command line error checking.
+UsageError <- function(...) {
+ cat(sprintf(...))
+ cat('\n')
+ quit(status = 1)
+}
+
+option_list <- list(
+ # Inputs
+ make_option("--map", default="", help="Map file (required)"),
+ make_option("--counts", default="", help="Counts file (required)"),
+ make_option("--params", default="", help="Params file (required)"),
+ make_option("--output-dir", dest="output_dir", default=".",
+ help="Output directory (default .)"),
+
+ make_option("--correction", default="FDR", help="Correction method"),
+ make_option("--alpha", default=.05, help="Alpha level"),
+
+ make_option("--adjust-counts-hack", dest="adjust_counts_hack",
+ default=FALSE, action="store_true",
+ help="Allow the counts file to have more rows than cohorts.
+ Most users should not use this.")
+)
+
+ParseOptions <- function() {
+ # NOTE: This API is bad; if you add positional_arguments, the return value
+ # changes!
+ parser <- OptionParser(option_list = option_list)
+ opts <- parse_args(parser)
+
+ if (opts$map == "") {
+ UsageError("--map is required.")
+ }
+ if (opts$counts == "") {
+ UsageError("--counts is required.")
+ }
+ if (opts$params == "") {
+ UsageError("--params is required.")
+ }
+ return(opts)
+}
+
+if (!interactive()) {
+ opts <- ParseOptions()
+}
+
+#
+# Load libraries and source our own code.
+#
+
+library(RJSONIO)
+
+# So we don't have to change pwd
+source.rappor <- function(rel_path) {
+ abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path)
+ source(abs_path)
+}
+
+source.rappor("analysis/R/read_input.R")
+source.rappor("analysis/R/decode.R")
+source.rappor("analysis/R/util.R")
+
+source.rappor("analysis/R/alternative.R")
+
+options(stringsAsFactors = FALSE)
+
+
+main <- function(opts) {
+ Log("decode-dist")
+ Log("argv:")
+ print(commandArgs(TRUE))
+
+ Log("Loading inputs")
+
+  # Run a single model if all inputs are specified.
+ params <- ReadParameterFile(opts$params)
+ counts <- ReadCountsFile(opts$counts, params, adjust_counts = opts$adjust_counts_hack)
+ counts <- AdjustCounts(counts, params)
+
+
+ # The left-most column has totals.
+ num_reports <- sum(counts[, 1])
+
+ map <- LoadMapFile(opts$map, params)
+
+ Log("Decoding %d reports", num_reports)
+ res <- Decode(counts, map$map, params, correction = opts$correction,
+ alpha = opts$alpha)
+ Log("Done decoding")
+
+ if (nrow(res$fit) == 0) {
+ Log("FATAL: Analysis returned no strings.")
+ quit(status = 1)
+ }
+
+ # Write analysis results as CSV.
+ results_csv_path <- file.path(opts$output_dir, 'results.csv')
+ write.csv(res$fit, file = results_csv_path, row.names = FALSE)
+
+  # Write the residual histogram as a PNG.
+ results_png_path <- file.path(opts$output_dir, 'residual.png')
+ png(results_png_path)
+ breaks <- pretty(res$residual, n = 200)
+ histogram <- hist(res$residual, breaks, plot = FALSE)
+ histogram$counts <- histogram$counts / sum(histogram$counts) # convert the histogram to frequencies
+ plot(histogram, main = "Histogram of the residual",
+ xlab = sprintf("Residual (observed - explained, %d x %d values)", params$m, params$k))
+ dev.off()
+
+ res$metrics$total_elapsed_time <- proc.time()[['elapsed']]
+
+ # Write summary as JSON (scalar values).
+ metrics_json_path <- file.path(opts$output_dir, 'metrics.json')
+ m <- toJSON(res$metrics)
+ writeLines(m, con = metrics_json_path)
+ Log("Wrote %s, %s, and %s", results_csv_path, results_png_path, metrics_json_path)
+
+ # TODO:
+  # - These are in a 2-column 'parameters' and 'values' format. Should these
+ # just be a plain list?
+ # - Should any of these privacy params be in metrics.json?
+
+ Log("Privacy summary:")
+ print(res$privacy)
+ cat("\n")
+
+ Log('DONE')
+}
+
+if (!interactive()) {
+ main(opts)
+}
diff --git a/bin/hash-candidates b/bin/hash-candidates
new file mode 100644
index 0000000..ed65fcb
--- /dev/null
+++ b/bin/hash-candidates
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Shell wrapper around hash_candidates.py.
+
+readonly THIS_DIR=$(dirname "$0")
+
+PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/hash_candidates.py "$@"
diff --git a/bin/hash_candidates.py b/bin/hash_candidates.py
new file mode 100755
index 0000000..e59295e
--- /dev/null
+++ b/bin/hash_candidates.py
@@ -0,0 +1,64 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Given a list of candidates on stdin, produce a file of hashes ("map file").
+"""
+
+import csv
+import sys
+
+import rappor
+
+
+def HashCandidates(params, stdin, stdout):
+ num_bloombits = params.num_bloombits
+ csv_out = csv.writer(stdout)
+
+ for line in stdin:
+ word = line.strip()
+ row = [word]
+ for cohort in xrange(params.num_cohorts):
+ bloom_bits = rappor.get_bloom_bits(word, cohort, params.num_hashes,
+ num_bloombits)
+ for bit_to_set in bloom_bits:
+ # bits are indexed from 1. Add a fixed offset for each cohort.
+ # NOTE: This detail could be omitted from the map file format, and done
+ # in R.
+ row.append(cohort * num_bloombits + (bit_to_set + 1))
+ csv_out.writerow(row)
+
+
+def main(argv):
+ try:
+ filename = argv[1]
+ except IndexError:
+ raise RuntimeError('Usage: hash_candidates.py <params file>')
+ with open(filename) as f:
+ try:
+ params = rappor.Params.from_csv(f)
+ except rappor.Error as e:
+ raise RuntimeError(e)
+
+ HashCandidates(params, sys.stdin, sys.stdout)
+
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, e.args[0]
+ sys.exit(1)
diff --git a/bin/hash_candidates_test.py b/bin/hash_candidates_test.py
new file mode 100755
index 0000000..2d0c4f1
--- /dev/null
+++ b/bin/hash_candidates_test.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python -S
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+hash_candidates_test.py: Tests for hash_candidates.py
+"""
+
+import cStringIO
+import unittest
+
+import rappor
+import hash_candidates # module under test
+
+
+STDIN = """\
+apple
+banana
+carrot
+"""
+
+EXPECTED_CSV_OUT = """\
+apple,5,1,26,26,38,34,63,62\r
+banana,12,14,28,24,37,34,62,49\r
+carrot,4,12,25,21,48,38,61,54\r
+"""
+
+
+class HashCandidatesTest(unittest.TestCase):
+
+ def setUp(self):
+ self.params = rappor.Params()
+ self.params.num_bloombits = 16
+ self.params.num_cohorts = 4
+ self.params.num_hashes = 2
+
+ def testHash(self):
+ stdin = cStringIO.StringIO(STDIN)
+ stdout = cStringIO.StringIO()
+
+ hash_candidates.HashCandidates(self.params, stdin, stdout)
+
+ self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue())
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/bin/sum-bits b/bin/sum-bits
new file mode 100644
index 0000000..bfa9b44
--- /dev/null
+++ b/bin/sum-bits
@@ -0,0 +1,7 @@
+#!/bin/bash
+#
+# Shell wrapper around sum_bits.py.
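+#
+# Usage:
+#   sum-bits <params file> < reports.csv > counts.csv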
+
+readonly THIS_DIR=$(dirname $0)
+
+PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/sum_bits.py "$@"
diff --git a/bin/sum_bits.py b/bin/sum_bits.py
new file mode 100755
index 0000000..f211656
--- /dev/null
+++ b/bin/sum_bits.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting Bloom
+filter by cohort. This can then be analyzed by R.
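+
+Each output row corresponds to one cohort: the first column is the number of
+reports in that cohort, followed by one sum per bloom filter bit, starting
+with bit 0 (the last character of each report string).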
+"""
+
+import csv
+import sys
+
+import rappor
+
+
+def SumBits(params, stdin, stdout):
+ csv_in = csv.reader(stdin)
+ csv_out = csv.writer(stdout)
+
+ num_cohorts = params.num_cohorts
+ num_bloombits = params.num_bloombits
+
+ sums = [[0] * num_bloombits for _ in xrange(num_cohorts)]
+ num_reports = [0] * num_cohorts
+
+  for i, row in enumerate(csv_in):
+    if i == 0:
+      continue  # skip the header row
+
+    try:
+      (user_id, cohort, unused_bloom, unused_prr, irr) = row
+    except ValueError:
+      raise RuntimeError('Error parsing row %r' % row)
+
+ cohort = int(cohort)
+ num_reports[cohort] += 1
+
+    if len(irr) != params.num_bloombits:
+      raise RuntimeError(
+          "Expected %d bits, got %r" % (params.num_bloombits, len(irr)))
+    # Use a separate index so the row index i isn't shadowed.
+    for j, c in enumerate(irr):
+      bit_num = num_bloombits - j - 1  # e.g. char 0 = bit 15, char 15 = bit 0
+ if c == '1':
+ sums[cohort][bit_num] += 1
+ else:
+ if c != '0':
+ raise RuntimeError('Invalid IRR -- digits should be 0 or 1')
+
+ for cohort in xrange(num_cohorts):
+ # First column is the total number of reports in the cohort.
+ row = [num_reports[cohort]] + sums[cohort]
+ csv_out.writerow(row)
+
+
+def main(argv):
+ try:
+ filename = argv[1]
+ except IndexError:
+ raise RuntimeError('Usage: sum_bits.py <params file>')
+ with open(filename) as f:
+ try:
+ params = rappor.Params.from_csv(f)
+ except rappor.Error as e:
+ raise RuntimeError(e)
+
+ SumBits(params, sys.stdin, sys.stdout)
+
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, e.args[0]
+ sys.exit(1)
diff --git a/bin/sum_bits_test.py b/bin/sum_bits_test.py
new file mode 100755
index 0000000..91c109f
--- /dev/null
+++ b/bin/sum_bits_test.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python -S
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+sum_bits_test.py: Tests for sum_bits.py
+"""
+
+import cStringIO
+import unittest
+
+import rappor
+import sum_bits # module under test
+
+
+CSV_IN = """\
+user_id,cohort,bloom,prr,rappor
+5,1,dummy,dummy,0000111100001111
+5,1,dummy,dummy,0000000000111100
+"""
+
+# NOTE: bit order is reversed.
+EXPECTED_CSV_OUT = """\
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r
+2,1,1,2,2,1,1,0,0,1,1,1,1,0,0,0,0\r
+"""
+
+TOO_MANY_COLUMNS = """\
+user_id,cohort,rappor
+5,1,0000111100001111,extra
+"""
+
+
+class SumBitsTest(unittest.TestCase):
+
+ def setUp(self):
+ self.params = rappor.Params()
+ self.params.num_bloombits = 16
+ self.params.num_cohorts = 2
+
+ def testSum(self):
+ stdin = cStringIO.StringIO(CSV_IN)
+ stdout = cStringIO.StringIO()
+
+ sum_bits.SumBits(self.params, stdin, stdout)
+
+ self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue())
+
+ def testErrors(self):
+ stdin = cStringIO.StringIO(TOO_MANY_COLUMNS)
+ stdout = cStringIO.StringIO()
+
+ self.assertRaises(
+ RuntimeError, sum_bits.SumBits, self.params, stdin, stdout)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/bin/test.sh b/bin/test.sh
new file mode 100755
index 0000000..6b0381e
--- /dev/null
+++ b/bin/test.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+usage() {
+echo "
+
+  Simple smoke tests for the decode-dist and decode-assoc tools. These will
+  fail if your machine doesn't have the right R libraries.
+
+ Usage:
+ ./test.sh <function name>
+
+ Example:
+ ./test.sh decode-assoc-R-smoke # test pure R implementation
+ ./test.sh decode-assoc-cpp-smoke # test with analysis/cpp/fast_em.cc
+ ./test.sh decode-assoc-cpp-converge # run for longer with C++
+ ./test.sh decode-assoc-tensorflow
+"
+}
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname $0)
+readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
+readonly EM_CPP_EXECUTABLE=$RAPPOR_SRC/analysis/cpp/_tmp/fast_em
+
+source $RAPPOR_SRC/util.sh
+
+readonly ASSOC_TESTDATA_DIR=_tmp/decode-assoc-test
+readonly DIST_TESTDATA_DIR=_tmp/decode-dist-test
+
+# Clear the R cache for the map files.
+clear-cached-files() {
+ local dir=$1
+ find $dir -name '*.rda' | xargs --no-run-if-empty -- rm --verbose
+}
+
+write-dist-testdata() {
+ local input_dir=$DIST_TESTDATA_DIR/input
+
+ mkdir -p $input_dir
+
+ clear-cached-files $DIST_TESTDATA_DIR
+
+ # Right now, we copy a case from regtest.sh. (./demo.sh quick-python creates
+ # just this case)
+ local case_dir=$RAPPOR_SRC/_tmp/python/demo3
+
+ cp --verbose $case_dir/1/case_counts.csv $input_dir/counts.csv
+ cp --verbose $case_dir/case_map.csv $input_dir/map.csv
+ cp --verbose $case_dir/case_params.csv $input_dir/params.csv
+}
+
+decode-dist() {
+ write-dist-testdata
+
+ local output_dir=$DIST_TESTDATA_DIR
+
+ local input_dir=$DIST_TESTDATA_DIR/input
+
+ # Uses the ./demo.sh regtest files
+ time $RAPPOR_SRC/bin/decode-dist \
+ --counts $input_dir/counts.csv \
+ --map $input_dir/map.csv \
+ --params $input_dir/params.csv \
+ --output-dir $output_dir
+
+ echo
+ head $output_dir/results.csv
+ echo
+ cat $output_dir/metrics.json
+}
+
+write-assoc-testdata() {
+ # 'build' has intermediate build files, 'input' is the final input to the
+ # decode-assoc tool.
+ local build_dir=$ASSOC_TESTDATA_DIR/build
+ local input_dir=$ASSOC_TESTDATA_DIR/input
+
+ mkdir -p $build_dir $input_dir
+
+ clear-cached-files $ASSOC_TESTDATA_DIR
+
+ cat >$build_dir/true_values.csv <<EOF
+domain,flag..HTTPS
+google.com,1
+google.com,1
+google.com,1
+google.com,1
+google.com,0
+yahoo.com,1
+yahoo.com,0
+bing.com,1
+bing.com,1
+bing.com,0
+EOF
+
+ local num_bits=8
+ local num_hashes=1
+ local num_cohorts=128
+
+ local prob_p=0.25
+ local prob_q=0.75
+ local prob_f=0.5
+
+  # There are 10 rows of true values above; replicating each one 5,000 times
+  # yields 50,000 reports, which is enough to eyeball the accuracy of the
+  # results.
+ local assoc_testdata_count=5000
+
+ PYTHONPATH=$RAPPOR_SRC/client/python \
+ $RAPPOR_SRC/tests/rappor_sim.py \
+ --assoc-testdata $assoc_testdata_count \
+ --num-bits $num_bits \
+ --num-hashes $num_hashes \
+ --num-cohorts $num_cohorts \
+ -p $prob_p \
+ -q $prob_q \
+ -f $prob_f \
+ < $build_dir/true_values.csv \
+ > $input_dir/reports.csv
+
+ # Output two bad rows: each row is missing one of the columns.
+ cat >$build_dir/bad_rows.txt <<EOF
+c0,0,10101010,
+c0,0,,0
+EOF
+
+ # Make CSV file with the header
+ cat - $build_dir/bad_rows.txt > $input_dir/bad_rows.csv <<EOF
+client,cohort,domain,flag..HTTPS
+EOF
+
+ # Make reports file with bad rows
+ cat $input_dir/reports.csv $build_dir/bad_rows.txt > $input_dir/reports_bad_rows.csv
+
+  # Define a string variable and a boolean variable.
+ cat >$input_dir/rappor-vars.csv <<EOF
+metric, var, var_type, params
+m,domain,string,m_params
+m,flag..HTTPS,boolean,m_params
+EOF
+
+ cat >$input_dir/m_params.csv <<EOF
+k,h,m,p,q,f
+$num_bits,$num_hashes,$num_cohorts,$prob_p,$prob_q,$prob_f
+EOF
+
+ # Add a string with a double quote to test quoting behavior
+ cat >$build_dir/domain_candidates.csv <<EOF
+google.com
+yahoo.com
+bing.com
+q"q
+EOF
+
+ # Hash candidates to create map.
+ $RAPPOR_SRC/bin/hash-candidates $input_dir/m_params.csv \
+ < $build_dir/domain_candidates.csv \
+ > $input_dir/domain_map.csv
+
+ banner "Wrote testdata in $input_dir (intermediate files in $build_dir)"
+}
+
+# Helper function to run decode-assoc with testdata.
+decode-assoc-helper() {
+ write-assoc-testdata
+
+ local output_dir=$1
+ shift
+
+ local build_dir=$ASSOC_TESTDATA_DIR/build
+ local input_dir=$ASSOC_TESTDATA_DIR/input
+
+ time $RAPPOR_SRC/bin/decode-assoc \
+ --metric-name m \
+ --schema $input_dir/rappor-vars.csv \
+ --reports $input_dir/reports.csv \
+ --params-dir $input_dir \
+ --var1 domain \
+ --var2 flag..HTTPS \
+ --map1 $input_dir/domain_map.csv \
+ --create-bool-map \
+ --max-em-iters 10 \
+ --num-cores 2 \
+ --output-dir $output_dir \
+ --tmp-dir $output_dir \
+ "$@"
+
+ head $output_dir/assoc-*
+
+ # Print true values for comparison
+ echo
+ echo "$build_dir/true_values.csv:"
+ cat "$build_dir/true_values.csv"
+}
+
+# Quick smoke test for R version.
+decode-assoc-R-smoke() {
+ local output_dir=_tmp/R
+ mkdir -p $output_dir
+ decode-assoc-helper $output_dir
+}
+
+# Test what happens when there are bad rows.
+decode-assoc-bad-rows() {
+ local output_dir=_tmp/bad
+ mkdir -p $output_dir
+
+ # Later flags override earlier ones
+
+ # Reports + bad rows
+ decode-assoc-helper $output_dir \
+    --reports $ASSOC_TESTDATA_DIR/input/reports_bad_rows.csv \
+ --remove-bad-rows \
+ "$@"
+
+ # ONLY bad rows
+ decode-assoc-helper $output_dir \
+    --reports $ASSOC_TESTDATA_DIR/input/bad_rows.csv \
+ --remove-bad-rows \
+ "$@"
+}
+
+build-em-executable() {
+ pushd $RAPPOR_SRC/analysis/cpp >/dev/null
+ ./run.sh build-fast-em
+ popd >/dev/null
+}
+
+decode-assoc-cpp-smoke() {
+ local output_dir=_tmp/cpp
+ mkdir -p $output_dir
+
+ build-em-executable
+
+ decode-assoc-helper $output_dir \
+ --em-executable "$EM_CPP_EXECUTABLE" "$@"
+}
+
+decode-assoc-cpp-converge() {
+ # With the data we have, this converges and exits before 1000 iterations.
+ decode-assoc-cpp-smoke --max-em-iters 1000
+}
+
+decode-assoc-tensorflow() {
+ local output_dir=_tmp/tensorflow
+ mkdir -p $output_dir
+
+ decode-assoc-helper $output_dir \
+ --em-executable $RAPPOR_SRC/analysis/tensorflow/fast_em.sh "$@"
+}
+
+decode-assoc-tensorflow-converge() {
+ decode-assoc-tensorflow --max-em-iters 1000
+}
+
+if test $# -eq 0 ; then
+ usage
+else
+ "$@"
+fi
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..44c5cd5
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+#
+# Build automation.
+#
+# Usage:
+# ./build.sh [function name]
+#
+# Important targets are:
+# cpp-client: Build the C++ client
+# doc: build docs with Markdown
+# fastrand: build Python extension module to speed up the client simulation
+#
+# If no function is specified, all three targets are built.
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+log() {
+ echo 1>&2 "$@"
+}
+
+die() {
+ log "FATAL: $@"
+ exit 1
+}
+
+run-markdown() {
+ local md=`which markdown || echo "cat"`
+
+ # Markdown is output unstyled; make it a little more readable.
+ cat <<EOF
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="UTF-8">
+ <style type="text/css">
+ code { color: green; }
+ pre { margin-left: 3em; }
+ </style>
+ <!-- INSERT LATCH JS -->
+ </head>
+ <body style="margin: 0 auto; width: 40em; text-align: left;">
+ <!-- INSERT LATCH HTML -->
+EOF
+
+ $md "$@"
+
+ cat <<EOF
+ </body>
+ </html>
+EOF
+}
+
+run-dot() {
+ local in=$1
+ local out=$2
+
+ local msg="dot not found (perhaps 'sudo apt-get install graphviz')"
+ which dot >/dev/null || die "$msg"
+
+ log "Running dot"
+ # width, height
+ dot \
+ -Tpng -Gsize='2,4!' -Gdpi=300 \
+ -o $out $in
+}
+
+# Scan for TODOs. Does this belong somewhere else?
+todo() {
+ find . -name \*.py -o -name \*.R -o -name \*.sh -o -name \*.md \
+ | xargs --verbose -- grep -w TODO
+}
+
+#
+# Targets: build "doc" or "fastrand"
+#
+
+# Build dependencies: markdown tool.
+doc() {
+ mkdir -p _tmp _tmp/doc
+
+ # For now, just one file.
+ # TODO: generated docs
+ run-markdown <README.md >_tmp/README.html
+ run-markdown <doc/randomness.md >_tmp/doc/randomness.html
+
+ run-markdown <doc/data-flow.md >_tmp/doc/data-flow.html
+ run-dot doc/data-flow.dot _tmp/doc/data-flow.png
+
+ log 'Wrote docs to _tmp'
+}
+
+# Build dependencies: Python development headers. Most systems should have
+# this. On Ubuntu/Debian, the 'python-dev' package contains headers.
+fastrand() {
+ pushd tests >/dev/null
+ python setup.py build
+ # So we can 'import _fastrand' without installing
+ ln -s --force build/*/_fastrand.so .
+ ./fastrand_test.py
+
+ log 'fastrand built and tests PASSED'
+ popd >/dev/null
+}
+
+cpp-client() {
+ pushd client/cpp
+ mkdir --verbose -p _tmp
+  make _tmp/rappor_sim  # builds an executable that uses the client library
+ popd
+}
+
+if test $# -eq 0 ; then
+ cpp-client
+ doc
+ fastrand
+else
+ "$@"
+fi
diff --git a/client/cpp/Makefile b/client/cpp/Makefile
new file mode 100644
index 0000000..b4c7f7e
--- /dev/null
+++ b/client/cpp/Makefile
@@ -0,0 +1,158 @@
+# Build RAPPOR C++ code.
+
+default : \
+ _tmp/rappor_sim \
+ _tmp/encoder_demo \
+ _tmp/protobuf_encoder_demo \
+ _tmp/openssl_hash_impl_test
+
+# All intermediate files live in _tmp/
+clean :
+ rm -f --verbose _tmp/*
+
+# Use protobuf compiler to generate .cc and .h files. The .o and the .d depend
+# on .cc, so that is the target of this rule.
+
+_tmp/%.pb.cc : ../proto/%.proto
+ protoc --cpp_out _tmp --proto_path=../proto $<
+
+#
+# Generate .d Makefile fragments.
+#
+
+# CXX flags:
+# -MM: exclude system headers
+# -I _tmp: so that generated protobuf headers are found
+#
+# Makefile stuff:
+# $*: the part that matched the wildcard, e.g. 'rappor_sim' for '%.cc'
+# matching 'rappor_sim.cc'
+#
+# We use $< (first prereq) to generate .d and .o files from .cc, because
+# it only needs the .cc file. We use $^ (all prereqs) to pass ALL the .o
+# files to the link step.
+
+_tmp/%.d : %.cc
+ ./dotd.sh $* $@ \
+ $(CXX) -I _tmp/ -MM $(CPPFLAGS) $<
+
+# Special case for .d file of generated source.
+_tmp/%.pb.d : _tmp/%.pb.cc
+ ./dotd.sh $*.pb $@ \
+ $(CXX) -I _tmp/ -MM $(CPPFLAGS) $<
+
+#
+# Include the Makefile fragments we generated, so that changes to headers will
+# rebuild both .d files and .o files. ('-include' suppresses the error if they
+# don't exist.)
+#
+# NOTE: We have to list them explicitly. Every time you add a source file, add
+# the corresponding .d file here.
+#
+
+-include \
+ _tmp/encoder.d \
+ _tmp/libc_rand_impl.d \
+ _tmp/openssl_hash_impl.d \
+ _tmp/openssl_hash_impl_test.d \
+ _tmp/protobuf_encoder.d \
+ _tmp/protobuf_encoder_demo.d \
+ _tmp/rappor_sim.d \
+ _tmp/unix_kernel_rand_impl.d \
+ _tmp/rappor.pb.d \
+ _tmp/example_app.pb.d
+
+# For example, -Wextra warns about unused params, but -Wall doesn't.
+CXXFLAGS = -Wall -Wextra #-Wpedantic
+
+#
+# Build object files (-c: compile only)
+#
+
+# NOTE: More prerequisites to _tmp/%.o (header files) are added by the .d
+# files, so we need $<.
+_tmp/%.o : %.cc
+ $(CXX) $(CXXFLAGS) -I _tmp/ -c -o $@ $<
+
+_tmp/%.pb.o : _tmp/%.pb.cc
+ $(CXX) $(CXXFLAGS) -I _tmp/ -c -o $@ $<
+
+#
+# Build executables
+#
+
+# CXX flag notes:
+# -lcrypto from openssl
+# -g for debug info
+#
+# You can add -std=c++0x for std::array, etc.
+
+# $^ : all prerequisites
+_tmp/rappor_sim : \
+ _tmp/encoder.o \
+ _tmp/libc_rand_impl.o \
+ _tmp/unix_kernel_rand_impl.o \
+ _tmp/openssl_hash_impl.o \
+ _tmp/rappor_sim.o
+ $(CXX) \
+ $(CXXFLAGS) \
+ -o $@ \
+ $^ \
+ -lcrypto \
+ -g
+
+# $^ : all prerequisites
+_tmp/encoder_demo: \
+ _tmp/encoder.o \
+ _tmp/unix_kernel_rand_impl.o \
+ _tmp/openssl_hash_impl.o \
+ _tmp/encoder_demo.o
+ $(CXX) \
+ $(CXXFLAGS) \
+ -o $@ \
+ $^ \
+ -lcrypto \
+ -g
+
+# -I _tmp for protobuf headers
+_tmp/protobuf_encoder_demo : \
+ _tmp/encoder.o \
+ _tmp/libc_rand_impl.o \
+ _tmp/unix_kernel_rand_impl.o \
+ _tmp/openssl_hash_impl.o \
+ _tmp/protobuf_encoder.o \
+ _tmp/protobuf_encoder_demo.o \
+ _tmp/example_app.pb.o \
+ _tmp/rappor.pb.o
+ $(CXX) \
+ $(CXXFLAGS) \
+ -I _tmp \
+ -o $@ \
+ $^ \
+ -lprotobuf \
+ -lcrypto \
+ -g
+
+_tmp/openssl_hash_impl_test : \
+ _tmp/openssl_hash_impl.o \
+ _tmp/openssl_hash_impl_test.o
+ $(CXX) \
+ $(CXXFLAGS) \
+ -o $@ \
+ $^ \
+ -lcrypto \
+ -g
+
+# Unittests are currently run manually, and require the Google gtest
+# framework version 1.7.0 or greater, found at
+# https://github.com/google/googletest/releases
+# TODO(mdeshon-google): Installer script
+unittest: _tmp/openssl_hash_impl_unittest _tmp/encoder_unittest
+ _tmp/openssl_hash_impl_unittest
+ _tmp/encoder_unittest
+
+_tmp/openssl_hash_impl_unittest: openssl_hash_impl_unittest.cc openssl_hash_impl.cc
+ $(CXX) -g -o $@ $^ -lssl -lcrypto -lgtest
+
+_tmp/encoder_unittest: encoder_unittest.cc encoder.cc unix_kernel_rand_impl.cc openssl_hash_impl.cc
+ $(CXX) -g -o $@ $^ -lssl -lcrypto -lgtest
diff --git a/client/cpp/README.md b/client/cpp/README.md
new file mode 100644
index 0000000..7ac73da
--- /dev/null
+++ b/client/cpp/README.md
@@ -0,0 +1,129 @@
+RAPPOR C++ Client
+=================
+
+We provide both a low level and high level client API. The low level API
+implements just the RAPPOR encoding algorithm on strings, with few
+dependencies.
+
+The high level API provides wrappers that bundle encoded values into Protocol
+Buffer messages.
+
+Build Instructions
+------------------
+
+You'll need a C++ compiler, the protobuf compiler, and a library that
+implements common hash functions (e.g. OpenSSL).
+
+On Ubuntu or Debian, the protobuf compiler and header files can be installed
+with:
+
+ sudo apt-get install protobuf-compiler libprotobuf-dev
+
+OpenSSL can be installed with:
+
+ sudo apt-get install libssl-dev
+
+Test
+----
+
+After installing the dependencies, you can test it out easily on your machine:
+
+ ./demo.sh quick-cpp
+
+This builds the test harness using a Makefile, and then runs the regtest.sh
+simulation. The last few lines of output will look like this:
+
+ Done running all test instances
+ Instances succeeded: 1 failed: 0 running: 0 total: 1
+ Wrote _tmp/cpp/results.html
+ URL: file:///usr/local/google/home/andychu/git/rappor/_tmp/cpp/results.html
+
+Open the HTML file to see a plot and stats.
+
+
+Encoder
+-------
+
+The low level API is `Encoder`. You instantiate it with RAPPOR encoding
+parameters and application dependencies. It has a method `EncodeString()` that
+takes an input string (no other types), sets an output parameter of type
+`rappor::Bits`, and returns success or failure.
+
+```cpp
+#include <cassert>
+
+#include "encoder.h"
+#include "openssl_hash_impl.h"
+#include "unix_kernel_rand_impl.h"
+
+int main(int argc, char** argv) {
+ FILE* fp = fopen("/dev/urandom", "r");
+ rappor::UnixKernelRand irr_rand(fp);
+
+ rappor::Deps deps(rappor::Md5, "client-secret", rappor::HmacSha256,
+ irr_rand);
+ rappor::Params params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+
+ const char* encoder_id = "metric-name";
+ rappor::Encoder encoder(encoder_id, params, deps);
+
+ // Now use it to encode values. The 'out' value can be sent over the
+ // network.
+ rappor::Bits out;
+ assert(encoder.EncodeString("foo", &out)); // returns false on error
+ printf("'foo' encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
+
+ // Raw bits
+ assert(encoder.EncodeBits(0x123, &out)); // returns false on error
+ printf("0x123 encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
+}
+```
+
+Dependencies
+------------
+
+`rappor::Deps` is a struct-like object that holds the dependencies needed by
+the API.
+
+The application must provide the following values:
+
+- cohort: An integer between 0 and `num_cohorts - 1`. Each value is assigned
+ with equal probability to a client process.
+- client_secret: A persistent client secret (used for deterministic randomness
+ in the PRR, i.e. "memoization" requirement).
+- hash_func - string hash function implementation (e.g. MD5)
+- hmac_func - HMAC-SHA256 implementation
+- irr_rand - randomness for the IRR
+
+We provide implementations of `hash_func` and `hmac_func` using OpenSSL.
+If your application already has a different implementation of these functions,
+you can implement the `HashFunc` and `HmacFunc` interfaces.
+
+We provide two example implementations of `irr_rand`: one based on libc
+`rand()` (insecure, for demo only), and one based on Unix `/dev/urandom`.
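+
+For example, here is a minimal sketch of a custom `irr_rand`, assuming the
+`IrrRandInterface` shape used by the provided implementations (requires
+C++11; `std::mt19937` is a stand-in generator and is *not* cryptographically
+secure):
+
+```cpp
+#include <random>
+
+#include "rappor_deps.h"
+
+namespace rappor {
+
+class StdRand : public IrrRandInterface {
+ public:
+  StdRand() : engine_(std::random_device()()) {}
+  virtual ~StdRand() {}
+
+  // Set each of num_bits bits independently with probability prob.
+  virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const {
+    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+    Bits mask = 0;
+    for (int i = 0; i < num_bits; ++i) {
+      Bits bit = (dist(engine_) < prob);
+      mask |= (bit << i);
+    }
+    *mask_out = mask;
+    return true;
+  }
+
+ private:
+  mutable std::mt19937 engine_;  // mutable because GetMask() is const
+};
+
+}  // namespace rappor
+```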
+
+Error Handling
+--------------
+
+Note that incorrect usage of the `SimpleEncoder` and `Protobuf` constructors
+may cause *runtime assertions* (using `assert()`). For example, if
+Params.num\_bits is more than 32, the process will crash.
+
+Encoders should be initialized at application startup, with constant
+parameters, so this type of error should be seen early.
+
+The various `Encode()` members do *not* raise assertions. If those are used
+incorrectly, then the return value will be `false` to indicate an error. These
+failures should be handled by the application.
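+
+A minimal sketch of handling such a failure (reusing the `encoder` from the
+example above):
+
+```cpp
+rappor::Bits out;
+if (!encoder.EncodeString("foo", &out)) {
+  // Encoding failed (e.g. the randomness source couldn't be read);
+  // drop this report rather than sending a garbage value.
+}
+```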
+
+Memory Management
+-----------------
+
+The `Encoder` instances contain pointers to `Params` and `Deps` instances, but
+don't own them. In the examples, all instances live on the stack of `main()`, so
+you don't have to worry about them being destroyed.
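+
+If you allocate an `Encoder` elsewhere, one option is to keep the `Params`
+and `Deps` alive alongside it as members of a single owning object. A sketch,
+assuming the constructors shown above (`ReportingClient` is a hypothetical
+name):
+
+```cpp
+struct ReportingClient {
+  explicit ReportingClient(FILE* fp)
+      : irr_rand(fp),
+        deps(rappor::Md5, "client-secret", rappor::HmacSha256, irr_rand),
+        params(32, 2, 128, 0.25, 0.75, 0.5),
+        encoder("metric-name", params, deps) {}
+
+  // Declaration order matters: members are initialized top to bottom, and
+  // encoder holds references to params and deps.
+  rappor::UnixKernelRand irr_rand;
+  rappor::Deps deps;
+  rappor::Params params;
+  rappor::Encoder encoder;
+};
+```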
diff --git a/client/cpp/dotd.sh b/client/cpp/dotd.sh
new file mode 100755
index 0000000..989d928
--- /dev/null
+++ b/client/cpp/dotd.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+#
+# dotd.sh
+#
+# Generate .d Makefile fragments, so we can use #include statements in source
+# for dependency info. Adapted from the GNU make manual:
+#
+# http://www.gnu.org/software/make/manual/html_node/Automatic-Prerequisites.html
+#
+# We are putting this in shell, so we just have 'sed in bash'. Not an unholy
+# mix of 'sed in bash in Make'.
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+# Munge gcc -MM output into .d files.
+main() {
+ if [ ! -d _tmp ]; then mkdir _tmp; fi
+ local basename=$1
+ local dotd=$2 # .d output name
+ shift 2 # rest of args are gcc invocation
+
+ rm --verbose -f $dotd # in case of failure?
+
+ # Execute the gcc -MM invocation.
+ #
+ # Change
+  # rappor_sim.o: rappor_sim.cc
+  # to
+  # _tmp/rappor_sim.o _tmp/rappor_sim.d : rappor_sim.cc
+ "$@" | sed "s|\($basename\).o|_tmp/\1.o _tmp/\1.d |" > $dotd
+}
+
+main "$@"
diff --git a/client/cpp/encoder.cc b/client/cpp/encoder.cc
new file mode 100644
index 0000000..c50e035
--- /dev/null
+++ b/client/cpp/encoder.cc
@@ -0,0 +1,416 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "encoder.h"
+#include "openssl_hash_impl.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdarg.h> // va_list, etc.
+#include <vector>
+
+namespace rappor {
+
+void log(const char* fmt, ...) {
+ va_list args;
+ va_start(args, fmt);
+ vfprintf(stderr, fmt, args);
+ va_end(args);
+ fprintf(stderr, "\n");
+}
+
+//
+// Functions for debugging
+//
+
+static void PrintHex(const std::vector<uint8_t>& h) {
+ for (size_t i = 0; i < h.size(); ++i) {
+ fprintf(stderr, "%02x", h[i]);
+ }
+ fprintf(stderr, "\n");
+}
+
+// We use 1 *byte* of a HMAC-SHA256 value per BIT to generate the PRR. SHA256
+// has 32 bytes, so the max is 32 bits.
+static const int kMaxBits = 32;
+
+// Can't be more than the number of bytes in MD5.
+static const int kMaxHashes = 16;
+
+// Probabilities should be in the interval [0.0, 1.0].
+static void CheckValidProbability(float prob, const char* var_name) {
+ if (prob < 0.0f || prob > 1.0f) {
+ log("%s should be between 0.0 and 1.0 inclusive (got %.2f)", var_name,
+ prob);
+ assert(false);
+ }
+}
+
+// Used to 1) turn the cohort into a string, and 2) turn raw bits into a string.
+// Return by value since it's small.
+static std::string ToBigEndian(uint32_t u) {
+ std::string result(4, '\0');
+
+ // rely on truncation to char
+ result[0] = u >> 24;
+ result[1] = u >> 16;
+ result[2] = u >> 8;
+ result[3] = u;
+
+ return result;
+}
+
+static const char* kHmacCohortPrefix = "\x00";
+static const char* kHmacPrrPrefix = "\x01";
+
+
+//
+// Encoder
+//
+
+uint32_t Encoder::AssignCohort(const Deps& deps, int num_cohorts) {
+ std::vector<uint8_t> sha256;
+ if (!deps.hmac_func_(deps.client_secret_, kHmacCohortPrefix, &sha256)) {
+ log("HMAC failed");
+ assert(false);
+ }
+
+ // Either we are using SHA256 to have exactly 32 bytes,
+ // or we're using HmacDrbg for any number of bytes.
+ if ((sha256.size() == kMaxBits)
+ || (deps.hmac_func_ == rappor::HmacDrbg)) {
+ // Hash size ok.
+ } else {
+ log("Bad hash size.");
+ assert(false);
+ }
+
+ // Interpret first 4 bytes of sha256 as a uint32_t.
+ uint32_t c = *(reinterpret_cast<uint32_t*>(sha256.data()));
+ // e.g. for 128 cohorts, 0x80 - 1 = 0x7f
+ uint32_t cohort_mask = num_cohorts - 1;
+ return c & cohort_mask;
+}
+
+Encoder::Encoder(const std::string& encoder_id, const Params& params,
+ const Deps& deps)
+ : encoder_id_(encoder_id),
+ params_(params),
+ deps_(deps),
+ cohort_(AssignCohort(deps, params.num_cohorts_)),
+ cohort_str_(ToBigEndian(cohort_)) {
+
+ if (params_.num_bits_ <= 0) {
+ log("num_bits must be positive");
+ assert(false);
+ }
+ if (params_.num_hashes_ <= 0) {
+ log("num_hashes must be positive");
+ assert(false);
+ }
+ if (params_.num_cohorts_ <= 0) {
+ log("num_cohorts must be positive");
+ assert(false);
+ }
+
+ // Check Maximum values.
+ if (deps_.hmac_func_ == rappor::HmacDrbg) {
+ // Using HmacDrbg
+ if (params_.num_bits_ % 8 != 0) {
+ log("num_bits (%d) must be divisible by 8 when using HmacDrbg.",
+ params.num_bits_);
+ assert(false);
+ }
+ } else {
+ // Using SHA256
+ if (params_.num_bits_ > kMaxBits) {
+ log("num_bits (%d) can't be greater than %d", params_.num_bits_,
+ kMaxBits);
+ assert(false);
+ }
+ }
+
+ if (params_.num_hashes_ > kMaxHashes) {
+ log("num_hashes (%d) can't be greater than %d", params_.num_hashes_,
+ kMaxHashes);
+ assert(false);
+ }
+ int m = params_.num_cohorts_;
+ if ((m & (m - 1)) != 0) {
+ log("num_cohorts (%d) must be a power of 2 (and not 0)", m);
+ assert(false);
+ }
+ // TODO: check max cohorts?
+
+ CheckValidProbability(params_.prob_f_, "prob_f");
+ CheckValidProbability(params_.prob_p_, "prob_p");
+ CheckValidProbability(params_.prob_q_, "prob_q");
+}
+
+bool Encoder::MakeBloomFilter(const std::string& value, Bits* bloom_out) const {
+ const int num_bits = params_.num_bits_;
+ const int num_hashes = params_.num_hashes_;
+
+ Bits bloom = 0;
+
+ // 4 byte cohort string + true value
+ std::string hash_input(cohort_str_ + value);
+
+ // First do hashing.
+ std::vector<uint8_t> hash_output;
+ deps_.hash_func_(hash_input, &hash_output);
+
+ // Error check
+ if (hash_output.size() < static_cast<size_t>(num_hashes)) {
+ log("Hash function didn't return enough bytes");
+ return false;
+ }
+
+ // To determine which bit to set in the bloom filter, use a byte of the MD5.
+ for (int i = 0; i < num_hashes; ++i) {
+ int bit_to_set = hash_output[i] % num_bits;
+ bloom |= 1 << bit_to_set;
+ }
+
+ *bloom_out = bloom;
+ return true;
+}
+
+// Write a Bloom filter into a vector of bytes, used for num_bits > 32.
+bool Encoder::MakeBloomFilter(const std::string& value,
+ std::vector<uint8_t>* bloom_out) const {
+ const int num_bits = params_.num_bits_;
+ const int num_hashes = params_.num_hashes_;
+
+ bloom_out->resize(params_.num_bits_ / 8, 0);
+
+ // Generate the hash.
+ std::vector<uint8_t> hash_output;
+ deps_.hash_func_(std::string(cohort_str_ + value), &hash_output);
+
+ // Check that we have enough bytes of hash available.
+ int exponent = 0;
+ int bytes_needed = 0;
+ while ((1 << exponent) < num_bits) {
+ exponent++;
+ }
+ bytes_needed = ((exponent - 1) / 8) + 1;
+ if (bytes_needed > 4) {
+ log("Can only use 4 bytes of hash at a time, needed %d "
+ "to address %d bits.", bytes_needed, num_bits);
+ return false;
+ }
+ if (hash_output.size() < static_cast<size_t>(bytes_needed * num_hashes)) {
+ log("Hash function returned %d bytes, but we needed "
+ "%d bytes * %d hashes. Choose lower num_hashes or "
+ "a different hash function.",
+        static_cast<int>(hash_output.size()), bytes_needed, num_hashes);
+ return false;
+ }
+
+ // To determine which bit to set in the Bloom filter, use 1 or more
+ // bytes of the MD5.
+ int hash_byte = 0;
+ for (int i = 0; i < num_hashes; ++i) {
+ int bit_to_set = 0;
+ for (int j = 0; j < bytes_needed; ++j) {
+ bit_to_set |= hash_output[hash_byte] << (j * 8);
+ ++hash_byte;
+ }
+ bit_to_set %= num_bits;
+ // Start at end of array to be consistent with the Bits implementation.
+ int index = (bloom_out->size() - 1) - (bit_to_set / 8);
+ (*bloom_out)[index] |= 1 << (bit_to_set % 8);
+ }
+ return true;
+}
+
+// Helper method for PRR
+bool Encoder::GetPrrMasks(const Bits bits, Bits* uniform_out,
+ Bits* f_mask_out) const {
+ // Create HMAC(secret, value), and use its bits to construct f_mask and
+ // uniform bits.
+ std::vector<uint8_t> sha256;
+
+ std::string hmac_value = kHmacPrrPrefix + encoder_id_ + ToBigEndian(bits);
+
+ deps_.hmac_func_(deps_.client_secret_, hmac_value, &sha256);
+ if (sha256.size() != kMaxBits) { // sanity check
+ return false;
+ }
+
+ // We should have already checked this.
+ if (params_.num_bits_ > kMaxBits) {
+ log("num_bits exceeds maximum.");
+ assert(false);
+ }
+
+ uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
+
+ Bits uniform = 0;
+ Bits f_mask = 0;
+
+ for (int i = 0; i < params_.num_bits_; ++i) {
+ uint8_t byte = sha256[i];
+
+ uint8_t u_bit = byte & 0x01; // 1 bit of entropy
+ uniform |= (u_bit << i); // maybe set bit in mask
+
+ uint8_t rand128 = byte >> 1; // 7 bits of entropy
+ uint8_t noise_bit = (rand128 < threshold128);
+ f_mask |= (noise_bit << i); // maybe set bit in mask
+ }
+
+ *uniform_out = uniform;
+ *f_mask_out = f_mask;
+ return true;
+}
+
+bool Encoder::_EncodeBitsInternal(const Bits bits, Bits* prr_out,
+ Bits* irr_out) const {
+ // Compute Permanent Randomized Response (PRR).
+ Bits uniform;
+ Bits f_mask;
+ if (!GetPrrMasks(bits, &uniform, &f_mask)) {
+ log("GetPrrMasks failed");
+ return false;
+ }
+
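+  // Where f_mask is set (each bit with probability ~f), take a uniform
+  // random bit; otherwise keep the true bit.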
+ Bits prr = (bits & ~f_mask) | (uniform & f_mask);
+ *prr_out = prr;
+
+ // Compute Instantaneous Randomized Response (IRR).
+
+ // NOTE: These can fail if say a read() from /dev/urandom fails.
+ Bits p_bits;
+ Bits q_bits;
+ if (!deps_.irr_rand_.GetMask(params_.prob_p_, params_.num_bits_, &p_bits)) {
+ log("PMask failed");
+ return false;
+ }
+ if (!deps_.irr_rand_.GetMask(params_.prob_q_, params_.num_bits_, &q_bits)) {
+ log("QMask failed");
+ return false;
+  }
+
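+  // Each IRR bit is drawn from q_bits (set with probability q) when the PRR
+  // bit is 1, and from p_bits (set with probability p) when it is 0.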
+ Bits irr = (p_bits & ~prr) | (q_bits & prr);
+ *irr_out = irr;
+
+ return true;
+}
+
+bool Encoder::_EncodeStringInternal(const std::string& value, Bits* bloom_out,
+ Bits* prr_out, Bits* irr_out) const {
+ if (!MakeBloomFilter(value, bloom_out)) {
+ log("Bloom filter calculation failed");
+ return false;
+ }
+ return _EncodeBitsInternal(*bloom_out, prr_out, irr_out);
+}
+
+bool Encoder::EncodeBits(const Bits bits, Bits* irr_out) const {
+ Bits unused_prr;
+ return _EncodeBitsInternal(bits, &unused_prr, irr_out);
+}
+
+bool Encoder::EncodeString(const std::string& value, Bits* irr_out) const {
+ Bits unused_bloom;
+ Bits unused_prr;
+ return _EncodeStringInternal(value, &unused_bloom, &unused_prr, irr_out);
+}
+
+static uint8_t shifted(const Bits& bits, const int& index) {
+  // For an array of bytes, select the appropriate byte from a 4-byte
+ // integer value. Bytes are enumerated in big-endian order, i.e.
+ // index = 0 is the MSB, index = 3 is the LSB.
+ int shift = 8 * (3 - (index % 4)); // Byte 0 shifts by 24 bits, 1 by 16, etc.
+ return (uint8_t)((bits >> shift) & 0xFF); // Return the correct byte.
+}
+
+bool Encoder::EncodeString(const std::string& value,
+ std::vector<uint8_t>* irr_out) const {
+ std::vector<uint8_t> bloom_out;
+ std::vector<uint8_t> hmac_out;
+ std::vector<uint8_t> uniform;
+ std::vector<uint8_t> f_mask;
+ const int num_bits = params_.num_bits_;
+
+ uniform.resize(num_bits / 8, 0);
+ f_mask.resize(num_bits / 8, 0);
+ irr_out->resize(num_bits / 8, 0);
+
+ // Set bloom_out.
+ if (!MakeBloomFilter(value, &bloom_out)) {
+ log("Bloom filter calculation failed");
+ return false;
+ }
+
+ // Set hmac_out.
+ hmac_out.resize(num_bits); // Signal to HmacDrbg about desired output size.
+ // Call HmacDrbg
+ std::string hmac_value = kHmacPrrPrefix + encoder_id_;
+  for (size_t i = 0; i < bloom_out.size(); ++i) {
+ hmac_value.append(reinterpret_cast<char *>(&bloom_out[i]), 1);
+ }
+ deps_.hmac_func_(deps_.client_secret_, hmac_value, &hmac_out);
+  if (hmac_out.size() != static_cast<size_t>(num_bits)) {
+    log("Needed %d bytes from Hmac function, received %d bytes.",
+        num_bits, static_cast<int>(hmac_out.size()));
+ return false;
+ }
+
+ // We'll be using 7 bits of each byte of the MAC as our random
+ // number for the f_mask.
+ uint8_t threshold128 = static_cast<uint8_t>(params_.prob_f_ * 128);
+
+ // Construct uniform and f_mask bitwise.
+ for (int i = 0; i < num_bits; i++) {
+ uint8_t byte = hmac_out[i];
+ uint8_t u_bit = byte & 0x01; // 1 bit of entropy.
+ int vector_index = (num_bits - 1 - i) / 8;
+ uint8_t rand128 = byte >> 1; // 7 bits of entropy.
+ uint8_t noise_bit = (rand128 < threshold128);
+ uniform[vector_index] |= (u_bit << (i % 8));
+ f_mask[vector_index] |= (noise_bit << (i % 8));
+ }
+
+  // NOTE: p_bits and q_bits are generated every 4 bytes and reused for the
+  // next 3, so they must live outside the loop (otherwise they would be
+  // uninitialized on most iterations).
+  Bits p_bits = 0;
+  Bits q_bits = 0;
+  for (int i = 0; i < static_cast<int>(bloom_out.size()); i++) {
+    uint8_t prr = (bloom_out[i] & ~f_mask[i]) | (uniform[i] & f_mask[i]);
+ // GetMask operates on Uint32, so we generate a new p_bits every 4
+ // bytes, and use each of its bytes once.
+ if (i % 4 == 0) {
+ // Need new p_bits, q_bits values to work with.
+ if (!deps_.irr_rand_.GetMask(params_.prob_p_, 32, &p_bits)) {
+ log("PMask failed");
+ return false;
+ }
+ if (!deps_.irr_rand_.GetMask(params_.prob_q_, 32, &q_bits)) {
+ log("QMask failed");
+ return false;
+ }
+ }
+ (*irr_out)[i] = (shifted(p_bits, i) & ~prr)
+ | (shifted(q_bits, i) & prr);
+ }
+  return true;
+}
+
+void Encoder::set_cohort(uint32_t cohort) {
+ cohort_ = cohort;
+ cohort_str_ = ToBigEndian(cohort_);
+}
+
+} // namespace rappor
diff --git a/client/cpp/encoder.h b/client/cpp/encoder.h
new file mode 100644
index 0000000..323e5a2
--- /dev/null
+++ b/client/cpp/encoder.h
@@ -0,0 +1,130 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// RAPPOR encoder.
+//
+// See README.md and encoder_demo.cc for an example.
+
+#ifndef RAPPOR_H_
+#define RAPPOR_H_
+
+#include <string>
+
+#include "rappor_deps.h" // for dependency injection
+
+namespace rappor {
+
+// For debug logging
+void log(const char* fmt, ...);
+
+// RAPPOR encoding parameters.
+class Params {
+ public:
+ Params(int num_bits, int num_hashes, int num_cohorts,
+ float prob_f, float prob_p, float prob_q)
+ : num_bits_(num_bits),
+ num_hashes_(num_hashes),
+ num_cohorts_(num_cohorts),
+ prob_f_(prob_f),
+ prob_p_(prob_p),
+ prob_q_(prob_q) {
+ }
+
+ // Accessors
+ int num_bits() { return num_bits_; }
+ int num_hashes() { return num_hashes_; }
+ int num_cohorts() { return num_cohorts_; }
+ float prob_f() { return prob_f_; }
+ float prob_p() { return prob_p_; }
+ float prob_q() { return prob_q_; }
+
+ private:
+ friend class Encoder;
+
+  // k: size of bloom filter, PRR, and IRR. 0 < k <= 32 with HmacSha256;
+  // with HmacDrbg, k may be any positive multiple of 8.
+ int num_bits_;
+
+ // number of bits set in the Bloom filter ("h")
+ int num_hashes_;
+
+ // Total number of cohorts ("m"). Note that the cohort assignment is what
+ // is used in the client, not m. We include it here for documentation (it
+  // can be unset, unlike the other params).
+ int num_cohorts_;
+
+ float prob_f_; // noise probability for PRR, quantized to 1/128
+
+ float prob_p_; // noise probability for IRR, quantized to 1/128
+ float prob_q_; // noise probability for IRR, quantized to 1/128
+};
+
+// Encoder: take client values and transform them with the RAPPOR privacy
+// algorithm.
+class Encoder {
+ public:
+ // Note that invalid parameters cause runtime assertions in the constructor.
+ // Encoders are intended to be created at application startup with constant
+ // arguments, so errors should be caught early.
+
+ // encoder_id: A unique ID for this encoder -- typically the name of the
+ // metric being encoded, so that different metrics have different PRR
+ // mappings.
+ // params: RAPPOR encoding parameters, which affect privacy and decoding.
+ // (held by reference; it must outlive the Encoder)
+ // deps: application-supplied dependencies.
+ // (held by reference; it must outlive the Encoder)
+ Encoder(const std::string& encoder_id, const Params& params,
+ const Deps& deps);
+
+ // Encode raw bits (represented as an integer), setting output parameter
+ // irr_out. Only valid when the return value is 'true' (success).
+ bool EncodeBits(const Bits bits, Bits* irr_out) const;
+
+ // Encode a string, setting output parameter irr_out. Only valid when the
+ // return value is 'true' (success).
+ bool EncodeString(const std::string& value, Bits* irr_out) const;
+ // For use with HmacDrbg hash function and any num_bits divisible by 8.
+ bool EncodeString(const std::string& value,
+ std::vector<uint8_t>* irr_out) const;
+
+ // For testing/simulation use only.
+ bool _EncodeBitsInternal(const Bits bits, Bits* prr_out, Bits* irr_out)
+ const;
+ bool _EncodeStringInternal(const std::string& value, Bits* bloom_out,
+ Bits* prr_out, Bits* irr_out) const;
+
+ // Accessor for the assigned cohort.
+ uint32_t cohort() { return cohort_; }
+ // Set a cohort manually, if previously generated.
+ void set_cohort(uint32_t cohort);
+
+ private:
+ bool MakeBloomFilter(const std::string& value, Bits* bloom_out) const;
+ bool MakeBloomFilter(const std::string& value,
+ std::vector<uint8_t>* bloom_out) const;
+ bool GetPrrMasks(const Bits bits, Bits* uniform, Bits* f_mask) const;
+
+ // static helper function for initialization
+ static uint32_t AssignCohort(const Deps& deps, int num_cohorts);
+
+ const std::string encoder_id_;
+ const Params& params_;
+ const Deps& deps_;
+ uint32_t cohort_;
+ std::string cohort_str_;
+};
+
+} // namespace rappor
+
+#endif // RAPPOR_H_
diff --git a/client/cpp/encoder_demo.cc b/client/cpp/encoder_demo.cc
new file mode 100644
index 0000000..1c278cd
--- /dev/null
+++ b/client/cpp/encoder_demo.cc
@@ -0,0 +1,56 @@
+// Copyright 2014 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Sample code for encoder.cc.
+//
+// This is the code in README.md. It's here to make sure it actually builds
+// and runs.
+
+#include <cassert> // assert
+
+#include "encoder.h"
+#include "openssl_hash_impl.h"
+#include "unix_kernel_rand_impl.h"
+
+int main(int argc, char** argv) {
+ // Suppress unused variable warnings
+ (void) argc;
+ (void) argv;
+
+ FILE* fp = fopen("/dev/urandom", "r");
+ rappor::UnixKernelRand irr_rand(fp);
+
+ rappor::Deps deps(rappor::Md5, "client-secret", rappor::HmacSha256,
+ irr_rand);
+ rappor::Params params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+
+ const char* encoder_id = "metric-name";
+ rappor::Encoder encoder(encoder_id, params, deps);
+
+ // Now use it to encode values. The 'out' value can be sent over the
+ // network.
+ rappor::Bits out;
+ assert(encoder.EncodeString("foo", &out)); // returns false on error
+ printf("'foo' encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
+
+ // Raw bits
+ assert(encoder.EncodeBits(0x123, &out)); // returns false on error
+ printf("0x123 encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
+}
+
diff --git a/client/cpp/encoder_unittest.cc b/client/cpp/encoder_unittest.cc
new file mode 100644
index 0000000..0f48604
--- /dev/null
+++ b/client/cpp/encoder_unittest.cc
@@ -0,0 +1,289 @@
+#include <gtest/gtest.h>
+#include <stdexcept>
+
+#include "encoder.h"
+#include "openssl_hash_impl.h"
+#include "unix_kernel_rand_impl.h"
+
+// We need the same "random" inputs to the IRR each time to have
+// reproducible tests.
+FILE* mock_urandom(void) {
+ int i;
+ FILE *fp;
+ fp = tmpfile();
+ for (i = 0; i < 1024; i++) {
+ fputc((i * 17) % 256, fp);
+ }
+ fflush(fp);
+ fp = freopen(NULL, "r", fp);
+ return fp;
+}
+
+class EncoderTest : public ::testing::Test {
+ protected:
+ EncoderTest() {
+    encoder_id = "metric-name";  // a literal, so the pointer stays valid
+ fp = mock_urandom();
+ irr_rand = new rappor::UnixKernelRand(fp);
+ }
+
+ virtual ~EncoderTest() {
+ fclose(fp);
+ delete irr_rand;
+ delete deps;
+ delete params;
+ delete encoder;
+ }
+
+  FILE* fp;
+  const char* encoder_id;
+ rappor::UnixKernelRand *irr_rand;
+ rappor::Deps *deps;
+ rappor::Params *params;
+ rappor::Encoder *encoder;
+ rappor::Bits bits_out;
+ std::vector<uint8_t> bits_vector;
+};
+
+// Uses HmacSha256 and 32-bit outputs.
+class EncoderUint32Test : public EncoderTest {
+ protected:
+ EncoderUint32Test() {
+ deps = new rappor::Deps(rappor::Md5, "client-secret", rappor::HmacSha256,
+ *irr_rand);
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ encoder = new rappor::Encoder(encoder_id, *params, *deps);
+ }
+};
+
+// Uses HmacDrbg and variable-size vector outputs.
+class EncoderUnlimTest : public EncoderTest {
+ protected:
+ EncoderUnlimTest() {
+ deps = new rappor::Deps(rappor::Md5, "client-secret", rappor::HmacDrbg,
+ *irr_rand);
+ params = new rappor::Params(64, // num_bits (k)
+ 2, // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ encoder = new rappor::Encoder(encoder_id, *params, *deps);
+ }
+};
+
+
+///// EncoderUint32Test
+TEST_F(EncoderUint32Test, EncodeStringUint32) {
+ ASSERT_TRUE(encoder->EncodeString("foo", &bits_out));
+ ASSERT_EQ(2281639167, bits_out);
+ ASSERT_EQ(3, encoder->cohort());
+}
+
+TEST_F(EncoderUint32Test, EncodeStringUint32Cohort) {
+ encoder->set_cohort(4); // Set pre-selected cohort.
+ ASSERT_TRUE(encoder->EncodeString("foo", &bits_out));
+ ASSERT_EQ(2281637247, bits_out);
+ ASSERT_EQ(4, encoder->cohort());
+}
+
+TEST_F(EncoderUint32Test, EncodeBitsUint32) {
+ ASSERT_TRUE(encoder->EncodeBits(0x123, &bits_out));
+ ASSERT_EQ(2784956095, bits_out);
+ ASSERT_EQ(3, encoder->cohort());
+}
+
+// Negative tests
+// num_bits is negative.
+TEST_F(EncoderUint32Test, NumBitsMustBePositiveDeathTest) {
+ ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+ delete params;
+ params = new rappor::Params(-1, // num_bits (k) [BAD]
+ 2, // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+}
+
+// num_hashes is negative.
+TEST_F(EncoderUint32Test, NumHashesMustBePositiveDeathTest) {
+ ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ -1, // num_hashes (h) [BAD]
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+}
+
+// num_cohorts is negative.
+TEST_F(EncoderUint32Test, NumCohortsMustBePositiveDeathTest) {
+ ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ -1, // num_cohorts (m) [BAD]
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Encoder.*Assertion.*failed");
+}
+
+// Invalid probabilities.
+TEST_F(EncoderUint32Test, InvalidProbabilitiesDeathTest) {
+ ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+ // prob_f negative.
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 1, // num_cohorts (m)
+ -0.1, // probability f for PRR [BAD]
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+ // prob_f > 1.
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 1, // num_cohorts (m)
+ 1.1, // probability f for PRR [BAD]
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+ // prob_p < 0.
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 1, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ -0.1, // probability p for IRR [BAD]
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+ // prob_p > 1.
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 1, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 1.1, // probability p for IRR [BAD]
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+ // prob_q < 0.
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 1, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ -0.1); // probability q for IRR [BAD]
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+ // prob_q > 1.
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 2, // num_hashes (h)
+ 1, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 1.1); // probability q for IRR [BAD]
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+}
+
+// num_bits 64 when only 32 bits are possible.
+TEST_F(EncoderUint32Test, Sha256NoMoreThan32BitsDeathTest) {
+ ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+ delete params;
+ params = new rappor::Params(64, // num_bits (k)
+ 2, // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+}
+
+// num_hashes too high.
+TEST_F(EncoderUint32Test, NumHashesNoMoreThan16DeathTest) {
+ ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+ delete params;
+ params = new rappor::Params(32, // num_bits (k)
+ 17, // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+}
+
+// EncoderString with 4-byte vector and HMACSHA256 and
+// EncoderString with Uint32 and HMACSHA256 should match.
+TEST_F(EncoderUint32Test, StringUint32AndStringVectorMatch) {
+ ASSERT_TRUE(encoder->EncodeString("foo", &bits_out));
+ ASSERT_EQ(2281639167, bits_out);
+ std::vector<uint8_t> expected_out(4);
+ expected_out[0] = (bits_out & 0xFF000000) >> 24;
+ expected_out[1] = (bits_out & 0x00FF0000) >> 16;
+ expected_out[2] = (bits_out & 0x0000FF00) >> 8;
+ expected_out[3] = bits_out & 0x000000FF;
+
+ // Reset the mock randomizer.
+ delete irr_rand;
+ delete deps;
+ delete encoder;
+ fclose(fp);
+ fp = mock_urandom();
+ irr_rand = new rappor::UnixKernelRand(fp);
+ deps = new rappor::Deps(rappor::Md5, "client-secret", rappor::HmacSha256,
+ *irr_rand);
+ encoder = new rappor::Encoder(encoder_id, *params, *deps);
+ ASSERT_TRUE(encoder->EncodeString("foo", &bits_vector));
+ ASSERT_EQ(expected_out, bits_vector);
+}
+
+///// EncoderUnlimTest
+
+TEST_F(EncoderUnlimTest, EncodeStringUint64) {
+ static const uint8_t ex[] = { 134, 255, 11, 255, 252, 119, 240, 223 };
+ std::vector<uint8_t> expected_vector(ex, ex + sizeof(ex));
+
+ ASSERT_TRUE(encoder->EncodeString("foo", &bits_vector));
+ ASSERT_EQ(expected_vector, bits_vector);
+ ASSERT_EQ(93, encoder->cohort());
+}
+
+// Negative tests.
+TEST_F(EncoderUnlimTest, NumBitsNotMultipleOf8DeathTest) {
+ ::testing::FLAGS_gtest_death_test_style = "threadsafe";
+ delete params;
+ params = new rappor::Params(63, // num_bits (k) [BAD]
+                              2,    // num_hashes (h)
+ 128, // num_cohorts (m)
+ 0.25, // probability f for PRR
+ 0.75, // probability p for IRR
+ 0.5); // probability q for IRR
+ EXPECT_DEATH(rappor::Encoder(encoder_id, *params, *deps),
+ "Assertion.*failed");
+}
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/client/cpp/libc_rand_impl.cc b/client/cpp/libc_rand_impl.cc
new file mode 100644
index 0000000..e4d4016
--- /dev/null
+++ b/client/cpp/libc_rand_impl.cc
@@ -0,0 +1,44 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// IMPORTANT: This is for demo/simulation purposes only. Use a better random
+// function in production applications.
+
+#include "libc_rand_impl.h"
+
+#include <assert.h>
+#include <stdint.h> // uint64_t
+#include <stdlib.h> // srand
+
+namespace rappor {
+
+//
+// LibcRand
+//
+
+// Similar to client/python/fastrand.c
+bool LibcRand::GetMask(float prob, int num_bits, Bits* mask_out) const {
+ int rand_threshold = static_cast<int>(prob * RAND_MAX);
+ Bits mask = 0;
+
+ for (int i = 0; i < num_bits; ++i) {
+ // NOTE: could use rand_r(), which is more thread-safe
+ Bits bit = (rand() < rand_threshold);
+ mask |= (bit << i);
+ }
+ *mask_out = mask;
+ return true; // no possible failure
+}
+
+} // namespace rappor
diff --git a/client/cpp/libc_rand_impl.h b/client/cpp/libc_rand_impl.h
new file mode 100644
index 0000000..370168c
--- /dev/null
+++ b/client/cpp/libc_rand_impl.h
@@ -0,0 +1,36 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A RAPPOR random implementation using libc's rand().
+//
+// IMPORTANT: This is for demo/simulation purposes only. Use a better random
+// function in production applications.
+
+#ifndef LIBC_RAND_IMPL_H_
+#define LIBC_RAND_IMPL_H_
+
+#include "rappor_deps.h"
+
+namespace rappor {
+
+class LibcRand : public IrrRandInterface {
+ public:
+ virtual ~LibcRand() {}
+
+ virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const;
+};
+
+} // namespace rappor
+
+#endif // LIBC_RAND_IMPL_H_
diff --git a/client/cpp/openssl_hash_impl.cc b/client/cpp/openssl_hash_impl.cc
new file mode 100644
index 0000000..82993d8
--- /dev/null
+++ b/client/cpp/openssl_hash_impl.cc
@@ -0,0 +1,119 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "openssl_hash_impl.h"
+
+#include <stdlib.h>
+#include <string>
+
+#include <openssl/evp.h> // EVP_sha256
+#include <openssl/hmac.h> // HMAC
+#include <openssl/md5.h> // MD5
+#include <openssl/sha.h> // SHA256_DIGEST_LENGTH
+
+namespace rappor {
+
+// of type HmacFunc in rappor_deps.h
+bool HmacSha256(const std::string& key, const std::string& value,
+ std::vector<uint8_t>* output) {
+ output->resize(SHA256_DIGEST_LENGTH, 0);
+
+ // Returns a pointer on success, or NULL on failure.
+ unsigned char* result = HMAC(
+ EVP_sha256(), key.c_str(), key.size(),
+ // std::string has 'char', OpenSSL wants unsigned char.
+ reinterpret_cast<const unsigned char*>(value.c_str()),
+ value.size(),
+ output->data(),
+ NULL);
+
+ return (result != NULL);
+}
+
+// Of type HmacFunc in rappor_deps.h
+//
+// The length of the passed-in output vector determines how many
+// bytes are returned.
+//
+// No reseed operation, but recommended reseed_interval <= 2^48 updates.
+// Since we're seeding for each value and typically don't need
+// so many bytes, we should be OK.
+bool HmacDrbg(const std::string& key, const std::string& value,
+ std::vector<uint8_t>* output) {
+ const unsigned char k_array[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+ };
+ std::string v;
+ std::vector<uint8_t> temp_output;
+  size_t num_bytes = output->size();
+ if (num_bytes == 0) {
+ // By default return 32 bytes for Uint32 applications.
+ num_bytes = 32;
+ }
+
+ v.append(32u, 0x01);
+ temp_output.resize(32, 0);
+
+ std::string temp_str(v);
+ temp_str.append(std::string("\0", 1));
+ // provided_data is key|value.
+ temp_str.append(key);
+ temp_str.append(value);
+
+ output->resize(0);
+
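+  // This follows the HMAC_DRBG instantiate/update pattern (NIST SP 800-90A):
+  // two rounds of K = HMAC(K, V || round_byte || provided_data) and
+  // V = HMAC(K, V), with round bytes 0x00 and 0x01.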
+ // Instantiate.
+ if (!HmacSha256(std::string(k_array, k_array + 32), temp_str, &temp_output)) {
+ return false;
+ }
+ std::string k(temp_output.begin(), temp_output.end());
+ if (!HmacSha256(k, v, &temp_output)) {
+ return false;
+ }
+ v = std::string(temp_output.begin(), temp_output.end());
+ if (!HmacSha256(k, v + std::string("\1", 1) + key + value, &temp_output)) {
+ return false;
+ }
+ k = std::string(temp_output.begin(), temp_output.end());
+ if (!HmacSha256(k, v, &temp_output)) {
+ return false;
+ }
+ v = std::string(temp_output.begin(), temp_output.end());
+
+ while (output->size() < num_bytes) {
+ // Generate.
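+    // HMAC_DRBG_Generate: V = HMAC(K, V); append V to the output.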
+ if (!HmacSha256(k, v, &temp_output)) {
+ return false;
+ }
+ v = std::string(temp_output.begin(), temp_output.end());
+ output->insert(output->end(), temp_output.begin(), temp_output.end());
+ }
+ output->resize(num_bytes);
+ return true;
+}
+
+// of type HashFunc in rappor_deps.h
+bool Md5(const std::string& value, std::vector<uint8_t>* output) {
+ output->resize(MD5_DIGEST_LENGTH, 0);
+
+ // std::string has 'char', OpenSSL wants unsigned char.
+ MD5(reinterpret_cast<const unsigned char*>(value.c_str()),
+ value.size(), output->data());
+ return true; // OpenSSL MD5 doesn't return an error code
+}
+
+} // namespace rappor
diff --git a/client/cpp/openssl_hash_impl.h b/client/cpp/openssl_hash_impl.h
new file mode 100644
index 0000000..dd37844
--- /dev/null
+++ b/client/cpp/openssl_hash_impl.h
@@ -0,0 +1,33 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// OpenSSL implementation of RAPPOR dependencies.
+
+#ifndef OPENSSL_IMPL_H_
+#define OPENSSL_IMPL_H_
+
+#include "rappor_deps.h"
+
+namespace rappor {
+
+bool HmacSha256(const std::string& key, const std::string& value,
+ std::vector<uint8_t>* output);
+// Pass output vector of desired length.
+bool HmacDrbg(const std::string& key, const std::string& value,
+ std::vector<uint8_t>* output);
+bool Md5(const std::string& value, std::vector<uint8_t>* output);
+
+} // namespace rappor
+
+#endif // OPENSSL_IMPL_H_
diff --git a/client/cpp/openssl_hash_impl_unittest.cc b/client/cpp/openssl_hash_impl_unittest.cc
new file mode 100644
index 0000000..947e139
--- /dev/null
+++ b/client/cpp/openssl_hash_impl_unittest.cc
@@ -0,0 +1,145 @@
+#include <gtest/gtest.h>
+
+#include "openssl_hash_impl.h"
+
+
+TEST(OpensslHashImplTest, Md5) {
+ std::vector<uint8_t> output;
+ rappor::Md5("test", &output);
+ static const uint8_t ex[] = {
+ 0x09, 0x8f, 0x6b, 0xcd, 0x46, 0x21, 0xd3, 0x73,
+ 0xca, 0xde, 0x4e, 0x83, 0x26, 0x27, 0xb4, 0xf6
+ };
+ std::vector<uint8_t> expected(ex, ex + sizeof(ex));
+ ASSERT_EQ(expected, output);
+}
+
+TEST(OpensslHashImplTest, HmacSha256) {
+ std::vector<uint8_t> output;
+ rappor::HmacSha256("key", "value", &output);
+ static const uint8_t ex[] = {
+ 0x90, 0xfb, 0xfc, 0xf1, 0x5e, 0x74, 0xa3, 0x6b,
+ 0x89, 0xdb, 0xdb, 0x2a, 0x72, 0x1d, 0x9a, 0xec,
+ 0xff, 0xdf, 0xdd, 0xdc, 0x5c, 0x83, 0xe2, 0x7f,
+ 0x75, 0x92, 0x59, 0x4f, 0x71, 0x93, 0x24, 0x81, };
+ std::vector<uint8_t> expected(ex, ex + sizeof(ex));
+ ASSERT_EQ(expected, output);
+
+ // Make sure nulls are handled properly.
+ //
+ // An empty value with key "key"
+ // $ echo -n -e "" | openssl dgst -hmac "key" -sha256 -binary | xxd
+ // 00000000: 5d5d 1395 63c9 5b59 67b9 bd9a 8c9b 233a ]]..c.[Yg.....#:
+ // 00000010: 9ded b450 7279 4cd2 32dc 1b74 8326 07d0 ...PryL.2..t.&..
+ rappor::HmacSha256("key", "", &output);
+ static const uint8_t exempty[] = {
+ 0x5d, 0x5d, 0x13, 0x95, 0x63, 0xc9, 0x5b, 0x59,
+ 0x67, 0xb9, 0xbd, 0x9a, 0x8c, 0x9b, 0x23, 0x3a,
+ 0x9d, 0xed, 0xb4, 0x50, 0x72, 0x79, 0x4c, 0xd2,
+ 0x32, 0xdc, 0x1b, 0x74, 0x83, 0x26, 0x07, 0xd0
+ };
+ std::vector<uint8_t> expected_empty(exempty, exempty + sizeof(exempty));
+ ASSERT_EQ(expected_empty, output);
+
+ // A single null value with key "key"
+ // $ echo -n -e "\x00" | openssl dgst -hmac "key" -sha256 -binary | xxd
+ // 00000000: 8a8d fb96 56dc cf21 b7ea 5269 1124 3b75 ....V..!..Ri.$;u
+ // 00000010: 68f4 3281 5f1c d43a 4277 1f2d b4aa a525 h.2._..:Bw.-...%
+ rappor::HmacSha256("key", std::string("\0", 1), &output);
+ static const uint8_t exnull[] = {
+ 0x8a, 0x8d, 0xfb, 0x96, 0x56, 0xdc, 0xcf, 0x21,
+ 0xb7, 0xea, 0x52, 0x69, 0x11, 0x24, 0x3b, 0x75,
+ 0x68, 0xf4, 0x32, 0x81, 0x5f, 0x1c, 0xd4, 0x3a,
+ 0x42, 0x77, 0x1f, 0x2d, 0xb4, 0xaa, 0xa5, 0x25
+ };
+ std::vector<uint8_t> expected_null(exnull, exnull + sizeof(exnull));
+ ASSERT_EQ(expected_null, output);
+
+ // A null value with something after it, with key "key"
+ // $ echo -n -e "\x00a" | openssl dgst -hmac "key" -sha256 -binary | xxd
+ // 00000000: 5787 df47 c2c4 8664 5a6a f898 44c3 4636 W..G...dZj..D.F6
+ // 00000010: fc5b b78b 1b87 29a0 6ca8 7556 7b75 c05a .[....).l.uV{u.Z
+ rappor::HmacSha256("key", std::string("\0a", 2), &output);
+ static const uint8_t exnulltrail[] = {
+ 0x57, 0x87, 0xdf, 0x47, 0xc2, 0xc4, 0x86, 0x64,
+ 0x5a, 0x6a, 0xf8, 0x98, 0x44, 0xc3, 0x46, 0x36,
+ 0xfc, 0x5b, 0xb7, 0x8b, 0x1b, 0x87, 0x29, 0xa0,
+ 0x6c, 0xa8, 0x75, 0x56, 0x7b, 0x75, 0xc0, 0x5a
+ };
+ std::vector<uint8_t> expected_null_trailing(
+ exnulltrail, exnulltrail + sizeof(exnulltrail));
+ ASSERT_EQ(expected_null_trailing, output);
+ std::string s = std::string("\0a", 2);
+ rappor::HmacSha256("key", s, &output);
+ ASSERT_EQ(expected_null_trailing, output);
+}
+
+TEST(OpensslHashImplTest, HmacDrbgNist) {
+ std::vector<uint8_t> output;
+ // Expected output for NIST tests.
+ static const uint8_t exnist[] = {
+ 0xD6, 0x7B, 0x8C, 0x17, 0x34, 0xF4, 0x6F, 0xA3,
+ 0xF7, 0x63, 0xCF, 0x57, 0xC6, 0xF9, 0xF4, 0xF2,
+ 0xDC, 0x10, 0x89, 0xBD, 0x8B, 0xC1, 0xF6, 0xF0,
+ 0x23, 0x95, 0x0B, 0xFC, 0x56, 0x17, 0x63, 0x52,
+ 0x08, 0xC8, 0x50, 0x12, 0x38, 0xAD, 0x7A, 0x44,
+ 0x00, 0xDE, 0xFE, 0xE4, 0x6C, 0x64, 0x0B, 0x61,
+ 0xAF, 0x77, 0xC2, 0xD1, 0xA3, 0xBF, 0xAA, 0x90,
+ 0xED, 0xE5, 0xD2, 0x07, 0x40, 0x6E, 0x54, 0x03
+ };
+ std::vector<uint8_t> expected_nist(
+ exnist, exnist + sizeof(exnist));
+
+ // NIST test data, from
+ // http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/HMAC_DRBG.pdf
+  // p. 148: requested security strength 128, hash algorithm SHA-256.
+ output.resize(64);
+ rappor::HmacDrbg(
+ std::string(
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
+ "\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13"
+ "\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D"
+ "\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27"
+ "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31"
+ "\x32\x33\x34\x35\x36\x20\x21\x22\x23\x24"
+ "\x25\x26\x27", 63), // provided_data
+ "", &output);
+ ASSERT_EQ(expected_nist, output);
+
+ // Since in our use case we concatenate the key and value
+ // to produce the provided_data portion of the DRBG, let's
+ // split the above key into key|value as an additional
+ // test case.
+ output.resize(64);
+ rappor::HmacDrbg(
+ std::string(
+ "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
+ "\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13"
+ "\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D"
+ "\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27", 40),
+ std::string(
+ "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31"
+ "\x32\x33\x34\x35\x36\x20\x21\x22\x23\x24"
+ "\x25\x26\x27", 23), // provided_data
+ &output);
+ ASSERT_EQ(expected_nist, output);
+}
+
+TEST(OpensslHashImplTest, HmacDrbgTextStrings) {
+ std::vector<uint8_t> output;
+ output.resize(30);
+ rappor::HmacDrbg("key", "value", &output); // Truncated to 30 bytes.
+ static const uint8_t ex[] = {
+ 0x89, 0xD7, 0x1B, 0xB8, 0xA3, 0x7D, 0x80, 0xC2,
+ 0x6E, 0x63, 0x9C, 0xBD, 0x68, 0xF3, 0x60, 0x7A,
+ 0xA9, 0x4D, 0xEE, 0xF4, 0x25, 0xA7, 0xAF, 0xBB,
+ 0xF8, 0xD0, 0x09, 0x92, 0xAF, 0x92
+ };
+ std::vector<uint8_t> expected(ex, ex + sizeof(ex));
+ ASSERT_EQ(expected, output);
+}
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/client/cpp/rappor_deps.h b/client/cpp/rappor_deps.h
new file mode 100644
index 0000000..264a066
--- /dev/null
+++ b/client/cpp/rappor_deps.h
@@ -0,0 +1,75 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This header declares the dependencies that the application must provide to
+// the RAPPOR encoder.
+
+#ifndef RAPPOR_DEPS_H_
+#define RAPPOR_DEPS_H_
+
+#include <stdint.h> // for uint32_t
+#include <string>
+#include <vector>
+
+namespace rappor {
+
+// rappor::Bits type is used for Bloom Filter, PRR, and IRR
+typedef uint32_t Bits;
+
+// rappor::Encoder needs a hash function for the bloom filter, and an HMAC
+// function for the PRR.
+
+typedef bool HashFunc(const std::string& value, std::vector<uint8_t>* output);
+typedef bool HmacFunc(const std::string& key, const std::string& value,
+ std::vector<uint8_t>* output);
+
+// Interface that the encoder uses to generate randomness for the IRR.
+// Applications should implement this based on their platform and requirements.
+class IrrRandInterface {
+ public:
+ virtual ~IrrRandInterface() {}
+ // Compute a bitmask with each bit set to 1 with probability 'prob'.
+ // Returns false if there is an error.
+ virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const = 0;
+};
+
+// Dependencies
+// - hash_func: hash function for the Bloom Filter client step
+// - client_secret: key for deterministic randomness in the PRR
+// - hmac_func: function for deterministic randomness in the PRR
+// - irr_rand: randomness for the IRR
+
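+// A minimal usage sketch (see rappor_sim.cc for a complete program):
+//
+//   rappor::Deps deps(rappor::Md5, client_secret, rappor::HmacSha256,
+//                     irr_rand);
+//   rappor::Encoder encoder("metric-name", params, deps);
+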
+class Deps {
+ public:
+ Deps(HashFunc* const hash_func, const std::string& client_secret,
+ HmacFunc* const hmac_func, const IrrRandInterface& irr_rand)
+ : hash_func_(hash_func),
+ client_secret_(client_secret),
+ hmac_func_(hmac_func),
+ irr_rand_(irr_rand) {
+ }
+
+ private:
+ friend class Encoder;
+
+ HashFunc* hash_func_; // for bloom filter
+ const std::string client_secret_; // for PRR; copy of constructor param
+ HmacFunc* hmac_func_; // PRR
+ const IrrRandInterface& irr_rand_; // IRR
+};
+
+} // namespace rappor
+
+#endif // RAPPOR_DEPS_H_
+
diff --git a/client/cpp/rappor_sim.cc b/client/cpp/rappor_sim.cc
new file mode 100644
index 0000000..6e91630
--- /dev/null
+++ b/client/cpp/rappor_sim.cc
@@ -0,0 +1,229 @@
+// Copyright 2014 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdio.h>
+#include <time.h> // time
+
+#include <cassert> // assert
+#include <cstdlib> // strtol, strtof
+#include <iostream>
+#include <vector>
+
+#include "encoder.h"
+#include "libc_rand_impl.h"
+#include "unix_kernel_rand_impl.h"
+#include "openssl_hash_impl.h"
+
+// Like atoi, but with basic (not exhaustive) error checking.
+bool StringToInt(const char* s, int* result) {
+ bool ok = true;
+ char* end; // mutated by strtol
+
+ *result = strtol(s, &end, 10); // base 10
+  // If strtol didn't consume any characters, it failed.
+ if (end == s) {
+ ok = false;
+ }
+ return ok;
+}
+
+// Like atof, but with basic (not exhaustive) error checking.
+bool StringToFloat(const char* s, float* result) {
+ bool ok = true;
+ char* end; // mutated by strtof
+
+ *result = strtof(s, &end);
+  // If strtof didn't consume any characters, it failed.
+ if (end == s) {
+ ok = false;
+ }
+ return ok;
+}
+
+// Copy a report into a string, which can go in a protobuf.
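+// Bytes are emitted least-significant first, e.g. 0x0201 -> "\x01\x02".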
+void BitsToString(rappor::Bits b, std::string* output, int num_bytes) {
+ output->assign(num_bytes, '\0');
+ for (int i = 0; i < num_bytes; ++i) {
+ // "little endian" string
+ (*output)[i] = b & 0xFF; // last byte
+ b >>= 8;
+ }
+}
+
+// Print a report, with the most significant bit first.
+void PrintBitString(const std::string& s) {
+ for (int i = s.size() - 1; i >= 0; --i) {
+ unsigned char byte = s[i];
+ for (int j = 7; j >= 0; --j) {
+ bool bit = byte & (1 << j);
+ std::cout << (bit ? "1" : "0");
+ }
+ }
+}
+
+int main(int argc, char** argv) {
+ if (argc != 7) {
+ rappor::log(
+ "Usage: rappor_encode <num bits> <num hashes> <num cohorts> p q f");
+ exit(1);
+ }
+
+ int num_bits, num_hashes, num_cohorts;
+ float prob_p, prob_q, prob_f;
+
+ bool ok1 = StringToInt(argv[1], &num_bits);
+ bool ok2 = StringToInt(argv[2], &num_hashes);
+ bool ok3 = StringToInt(argv[3], &num_cohorts);
+
+ bool ok4 = StringToFloat(argv[4], &prob_p);
+ bool ok5 = StringToFloat(argv[5], &prob_q);
+ bool ok6 = StringToFloat(argv[6], &prob_f);
+
+ if (!ok1) {
+ rappor::log("Invalid number of bits: '%s'", argv[1]);
+ exit(1);
+ }
+ if (!ok2) {
+ rappor::log("Invalid number of hashes: '%s'", argv[2]);
+ exit(1);
+ }
+ if (!ok3) {
+ rappor::log("Invalid number of cohorts: '%s'", argv[3]);
+ exit(1);
+ }
+ if (!ok4) {
+ rappor::log("Invalid float p: '%s'", argv[4]);
+ exit(1);
+ }
+ if (!ok5) {
+ rappor::log("Invalid float q: '%s'", argv[5]);
+ exit(1);
+ }
+ if (!ok6) {
+ rappor::log("Invalid float f: '%s'", argv[6]);
+ exit(1);
+ }
+
+ rappor::Params params(num_bits, num_hashes, num_cohorts, prob_f, prob_p,
+ prob_q);
+
+ //rappor::log("k: %d, h: %d, m: %d", params.num_bits(), params.num_hashes(), params.num_cohorts());
+ //rappor::log("f: %f, p: %f, q: %f", prob_f, prob_p, prob_q);
+
+ int num_bytes = params.num_bits() / 8;
+
+ // TODO: Add a flag for
+ // - -r libc / kernel
+  // - -c openssl / NaCl crypto
+
+ rappor::IrrRandInterface* irr_rand;
+ if (false) {
+ FILE* fp = fopen("/dev/urandom", "r");
+ irr_rand = new rappor::UnixKernelRand(fp);
+ } else {
+ int seed = time(NULL);
+    srand(seed); // seed with the current time, in seconds
+ irr_rand = new rappor::LibcRand();
+ }
+
+ std::string line;
+
+ // CSV header
+ std::cout << "client,cohort,bloom,prr,irr\n";
+
+ // Consume header line
+ std::getline(std::cin, line);
+ if (line != "client,cohort,value") {
+ rappor::log("Expected CSV header 'client,cohort,value'");
+ return 1;
+ }
+
+ while (true) {
+ std::getline(std::cin, line); // no trailing newline
+ // rappor::log("Got line %s", line.c_str());
+
+ if (line.empty()) {
+ break; // EOF
+ }
+
+ size_t comma1_pos = line.find(',');
+ if (comma1_pos == std::string::npos) {
+ rappor::log("Expected , in line '%s'", line.c_str());
+ return 1;
+ }
+ size_t comma2_pos = line.find(',', comma1_pos + 1);
+ if (comma2_pos == std::string::npos) {
+ rappor::log("Expected second , in line '%s'", line.c_str());
+ return 1;
+ }
+
+    // Note: std::string::substr takes (pos, length), not (pos, end).
+
+ // everything before comma
+ std::string client_str = line.substr(0, comma1_pos);
+ // everything between first and second comma.
+ // TODO(andychu): Remove unused second column.
+    std::string unused = line.substr(comma1_pos + 1,
+                                     comma2_pos - comma1_pos - 1);
+ // everything after
+ std::string value = line.substr(comma2_pos + 1);
+
+ rappor::Deps deps(rappor::Md5, client_str /*client_secret*/,
+ rappor::HmacSha256, *irr_rand);
+
+ // For now, construct a new encoder every time. We could construct one for
+ // each client. We are simulating many clients reporting the same metric,
+ // so the encoder ID is constant.
+ rappor::Encoder e("metric-name", params, deps);
+
+ // rappor::log("CLIENT %s VALUE %s COHORT %d", client_str.c_str(),
+ // value.c_str(), cohort);
+
+ rappor::Bits bloom;
+ rappor::Bits prr;
+ rappor::Bits irr;
+ bool ok = e._EncodeStringInternal(value, &bloom, &prr, &irr);
+
+ // NOTE: Are there really encoding errors?
+ if (!ok) {
+ rappor::log("Error encoding string %s", line.c_str());
+ break;
+ }
+
+ std::string bloom_str;
+ BitsToString(bloom, &bloom_str, num_bytes);
+
+ std::string prr_str;
+ BitsToString(prr, &prr_str, num_bytes);
+
+ std::string irr_str;
+ BitsToString(irr, &irr_str, num_bytes);
+
+ // Output CSV row.
+
+ std::cout << client_str;
+ std::cout << ',';
+ std::cout << e.cohort(); // cohort the encoder assigned
+ std::cout << ',';
+ PrintBitString(bloom_str);
+ std::cout << ',';
+ PrintBitString(prr_str);
+ std::cout << ',';
+ PrintBitString(irr_str);
+
+ std::cout << "\n";
+ }
+
+ // Cleanup
+ delete irr_rand;
+}
diff --git a/client/cpp/run.sh b/client/cpp/run.sh
new file mode 100755
index 0000000..cb7bf82
--- /dev/null
+++ b/client/cpp/run.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+#
+# Usage:
+# ./run.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+setup() {
+ # need libprotobuf-dev for headers to compile against.
+ sudo apt-get install protobuf-compiler libprotobuf-dev
+
+ # OpenSSL dev headers
+ sudo apt-get install libssl-dev
+}
+
+init() {
+ mkdir --verbose -p _tmp
+}
+
+rappor-sim() {
+ make _tmp/rappor_sim
+ _tmp/rappor_sim "$@"
+}
+
+protobuf-encoder-demo() {
+ make _tmp/protobuf_encoder_demo
+ _tmp/protobuf_encoder_demo "$@"
+}
+
+rappor-sim-demo() {
+ rappor-sim 16 2 128 0.25 0.75 0.5 <<EOF
+client,cohort,value
+c1,1,v1
+c1,1,v2
+c2,2,v3
+c2,2,v4
+EOF
+}
+
+empty-input() {
+ echo -n '' | rappor-sim 58 2 128 .025 0.75 0.5
+}
+
+# This outputs an HMAC and MD5 value. Compare with Python/shell below.
+
+openssl-hash-impl-test() {
+ make _tmp/openssl_hash_impl_test
+ _tmp/openssl_hash_impl_test "$@"
+}
+
+test-hmac-sha256() {
+ #echo -n foo | sha256sum
+ python -c '
+import hashlib
+import hmac
+import sys
+
+secret = sys.argv[1]
+body = sys.argv[2]
+m = hmac.new(secret, body, digestmod=hashlib.sha256)
+print m.hexdigest()
+' "key" "value"
+}
+
+test-md5() {
+ echo -n value | md5sum
+}
+
+# -M: list all headers as make dependencies
+# -MM: like -M, but exclude system headers
+
+# -MF: file to write the dependencies to
+
+# -MD: like -M -MF, produced as a side effect of compilation
+# -MMD: like -MD, but exclude system headers
+
+# -MP: emit a phony target for each header, so make doesn't fail when a
+#      header is deleted
+
+
+deps() {
+ # -MM seems like the one we want.
+ gcc -I _tmp -MM protobuf_encoder_test.cc unix_kernel_rand_impl.cc
+ #gcc -I _tmp -MMD -MP protobuf_encoder_test.cc unix_kernel_rand_impl.cc
+}
+
+count() {
+ wc -l *.h *.cc | sort -n
+}
+
+encoder-demo() {
+ make _tmp/encoder_demo && _tmp/encoder_demo
+}
+
+cpplint() {
+ ../../analysis/cpp/_tmp/cpplint.py "$@"
+}
+
+"$@"
diff --git a/client/cpp/unix_kernel_rand_impl.cc b/client/cpp/unix_kernel_rand_impl.cc
new file mode 100644
index 0000000..e2a848c
--- /dev/null
+++ b/client/cpp/unix_kernel_rand_impl.cc
@@ -0,0 +1,40 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "unix_kernel_rand_impl.h"
+
+#include <stdint.h> // uint64_t
+
+namespace rappor {
+
+const int kMaxBitWidth = 32; // also in encoder.cc
+
+bool UnixKernelRand::GetMask(float prob, int num_bits, Bits* mask_out) const {
+ uint8_t rand_buf[kMaxBitWidth];
+ size_t num_elems = fread(&rand_buf, sizeof(uint8_t), num_bits, fp_);
+ if (num_elems != static_cast<size_t>(num_bits)) { // fread error
+ return false;
+ }
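+  // NOTE: probabilities are quantized to multiples of 1/256, and prob must be
+  // less than 1.0 (a threshold of 256 does not fit in a uint8_t).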
+ uint8_t threshold_256 = static_cast<uint8_t>(prob * 256);
+
+ Bits mask = 0;
+ for (int i = 0; i < num_bits; ++i) {
+ uint8_t bit = (rand_buf[i] < threshold_256);
+ mask |= (bit << i);
+ }
+ *mask_out = mask;
+ return true;
+}
+
+} // namespace rappor
diff --git a/client/cpp/unix_kernel_rand_impl.h b/client/cpp/unix_kernel_rand_impl.h
new file mode 100644
index 0000000..4d191d5
--- /dev/null
+++ b/client/cpp/unix_kernel_rand_impl.h
@@ -0,0 +1,43 @@
+// Copyright 2015 Google Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A RAPPOR random implementation using bytes from a file like /dev/urandom or
+// /dev/random.
+
+#ifndef UNIX_KERNEL_RAND_IMPL_H_
+#define UNIX_KERNEL_RAND_IMPL_H_
+
+#include <stdint.h> // uint8_t
+#include <stdio.h> // FILE*
+
+#include "rappor_deps.h"
+
+namespace rappor {
+
+class UnixKernelRand : public IrrRandInterface {
+ public:
+ explicit UnixKernelRand(FILE* fp)
+ : fp_(fp) {
+ }
+ virtual ~UnixKernelRand() {}
+
+ virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const;
+
+ private:
+ FILE* fp_; // open device, e.g. /dev/urandom
+};
+
+} // namespace rappor
+
+#endif // UNIX_KERNEL_RAND_IMPL_H_
diff --git a/client/java/com/google/android/rappor/Encoder.java b/client/java/com/google/rappor/Encoder.java
index a8fb57c..a8fb57c 100644
--- a/client/java/com/google/android/rappor/Encoder.java
+++ b/client/java/com/google/rappor/Encoder.java
diff --git a/client/java/com/google/android/rappor/HmacDrbg.java b/client/java/com/google/rappor/HmacDrbg.java
index db99700..db99700 100644
--- a/client/java/com/google/android/rappor/HmacDrbg.java
+++ b/client/java/com/google/rappor/HmacDrbg.java
diff --git a/client/javatest/com/google/android/rappor/EncoderTest.java b/client/javatest/com/google/rappor/EncoderTest.java
index 896322f..896322f 100644
--- a/client/javatest/com/google/android/rappor/EncoderTest.java
+++ b/client/javatest/com/google/rappor/EncoderTest.java
diff --git a/client/javatest/com/google/android/rappor/HmacDrbgTest.java b/client/javatest/com/google/rappor/HmacDrbgTest.java
index e8b0f49..e8b0f49 100644
--- a/client/javatest/com/google/android/rappor/HmacDrbgTest.java
+++ b/client/javatest/com/google/rappor/HmacDrbgTest.java
diff --git a/client/python/rappor.py b/client/python/rappor.py
new file mode 100755
index 0000000..721dc3b
--- /dev/null
+++ b/client/python/rappor.py
@@ -0,0 +1,334 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""RAPPOR client library.
+
+Privacy is ensured without a third party by only sending RAPPOR'd data over the
+network (as opposed to raw client data).
+
+Note that we use MD5 for the Bloom filter hash function; cryptographic
+strength is not required there.
+"""
+import csv
+import hashlib
+import hmac
+import json
+import struct
+import sys
+
+from random import SystemRandom
+
+class Error(Exception):
+ pass
+
+
+def log(msg, *args):
+ if args:
+ msg = msg % args
+ print >>sys.stderr, msg
+
+
+class Params(object):
+ """RAPPOR encoding parameters.
+
+ These affect privacy/anonymity. See the paper for details.
+ """
+ def __init__(self):
+ self.num_bloombits = 16 # Number of bloom filter bits (k)
+ self.num_hashes = 2 # Number of bloom filter hashes (h)
+ self.num_cohorts = 64 # Number of cohorts (m)
+ self.prob_p = 0.50 # Probability p
+ self.prob_q = 0.75 # Probability q
+ self.prob_f = 0.50 # Probability f
+
+ # For testing
+ def __eq__(self, other):
+ return self.__dict__ == other.__dict__
+
+ def __repr__(self):
+ return repr(self.__dict__)
+
+ def to_json(self):
+ """Convert this instance to JSON.
+
+    The names are compatible with the apps/api server.
+ """
+ return json.dumps({
+ 'numBits': self.num_bloombits,
+ 'numHashes': self.num_hashes,
+ 'numCohorts': self.num_cohorts,
+ 'probPrr': self.prob_f,
+ 'probIrr0': self.prob_p,
+ 'probIrr1': self.prob_q,
+ })
+
+ # NOTE:
+ # - from_csv is currently used in sum_bits.py
+ # - to_csv is in rappor_sim.print_params
+ @staticmethod
+ def from_csv(f):
+ """Read the RAPPOR parameters from a CSV file.
+
+ Args:
+ f: file handle
+
+ Returns:
+ Params instance.
+
+ Raises:
+ rappor.Error: when the file is malformed.
+ """
+ c = csv.reader(f)
+ ok = False
+ p = Params()
+ for i, row in enumerate(c):
+
+ if i == 0:
+ if row != ['k', 'h', 'm', 'p', 'q', 'f']:
+ raise Error('Header %s is malformed; expected k,h,m,p,q,f' % row)
+
+ elif i == 1:
+ try:
+ # NOTE: May raise exceptions
+ p.num_bloombits = int(row[0])
+ p.num_hashes = int(row[1])
+ p.num_cohorts = int(row[2])
+ p.prob_p = float(row[3])
+ p.prob_q = float(row[4])
+ p.prob_f = float(row[5])
+ except (ValueError, IndexError) as e:
+ raise Error('Row is malformed: %s' % e)
+ ok = True
+
+ else:
+ raise Error('Params file should only have two rows')
+
+ if not ok:
+ raise Error("Expected second row with params")
+
+ return p
+
+
+class _SecureRandom(object):
+ """Returns an integer where each bit has probability p of being 1."""
+
+ def __init__(self, prob_one, num_bits):
+ self.prob_one = prob_one
+ self.num_bits = num_bits
+
+ def __call__(self):
+ p = self.prob_one
+ rand = SystemRandom()
+ r = 0
+
+ for i in xrange(self.num_bits):
+ bit = rand.random() < p
+ r |= (bit << i) # using bool as int
+ return r
+
+
+class SecureIrrRand(object):
+ """Python's os.random()"""
+
+ def __init__(self, params):
+ """
+ Args:
+ params: rappor.Params
+ """
+ num_bits = params.num_bloombits
+ # IRR probabilities
+
+ self.p_gen = _SecureRandom(params.prob_p, num_bits)
+ self.q_gen = _SecureRandom(params.prob_q, num_bits)
+
+
+def to_big_endian(i):
+ """Convert an integer to a 4 byte big endian string. Used for hashing."""
+ # https://docs.python.org/2/library/struct.html
+ # - Big Endian (>) for consistent network byte order.
+ # - L means 4 bytes when using >
+ return struct.pack('>L', i)
+
+
+def get_bloom_bits(word, cohort, num_hashes, num_bloombits):
+ """Return an array of bits to set in the bloom filter.
+
+ In the real report, we bitwise-OR them together. In hash candidates, we put
+ them in separate entries in the "map" matrix.
+ """
+ value = to_big_endian(cohort) + word # Cohort is 4 byte prefix.
+ md5 = hashlib.md5(value)
+
+ digest = md5.digest()
+
+  # Each hash is a byte, which means we could have up to 256 bit Bloom filters.
+ # There are 16 bytes in an MD5, in which case we can have up to 16 hash
+ # functions per Bloom filter.
+ if num_hashes > len(digest):
+ raise RuntimeError("Can't have more than %d hashes" % md5)
+
+ #log('hash_input %r', value)
+ #log('Cohort %d', cohort)
+ #log('MD5 %s', md5.hexdigest())
+
+ return [ord(digest[i]) % num_bloombits for i in xrange(num_hashes)]
+
+
+def get_prr_masks(secret, word, prob_f, num_bits):
+ h = hmac.new(secret, word, digestmod=hashlib.sha256)
+ #log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest())
+
+ # Now go through each byte
+ digest_bytes = h.digest()
+ assert len(digest_bytes) == 32
+
+ # Use 32 bits. If we want 64 bits, it may be fine to generate another 32
+ # bytes by repeated HMAC. For arbitrary numbers of bytes it's probably
+ # better to use the HMAC-DRBG algorithm.
+ if num_bits > len(digest_bytes):
+    raise RuntimeError('%d bits is more than the max of %d'
+                       % (num_bits, len(digest_bytes)))
+
+ threshold128 = prob_f * 128
+
+ uniform = 0
+ f_mask = 0
+
+ for i in xrange(num_bits):
+ ch = digest_bytes[i]
+ byte = ord(ch)
+
+ u_bit = byte & 0x01 # 1 bit of entropy
+ uniform |= (u_bit << i) # maybe set bit in mask
+
+ rand128 = byte >> 1 # 7 bits of entropy
+ noise_bit = (rand128 < threshold128)
+ f_mask |= (noise_bit << i) # maybe set bit in mask
+
+ return uniform, f_mask
+
+
+def bit_string(irr, num_bloombits):
+ """Like bin(), but uses leading zeroes, and no '0b'."""
+ bits = []
+ for bit_num in xrange(num_bloombits):
+ if irr & (1 << bit_num):
+ bits.append('1')
+ else:
+ bits.append('0')
+ return ''.join(reversed(bits))
+
+
+class Encoder(object):
+ """Obfuscates values for a given user using the RAPPOR privacy algorithm."""
+
+ def __init__(self, params, cohort, secret, irr_rand):
+ """
+ Args:
+ params: RAPPOR Params() controlling privacy
+ cohort: integer cohort, for Bloom hashing.
+ secret: secret string, for the PRR to be a deterministic function of the
+ reported value.
+ irr_rand: IRR randomness interface.
+ """
+ # RAPPOR params. NOTE: num_cohorts isn't used. p and q are used by
+ # irr_rand.
+ self.params = params
+ self.cohort = cohort # associated: MD5
+ self.secret = secret # associated: HMAC-SHA256
+ self.irr_rand = irr_rand # p and q used
+
+ def _internal_encode_bits(self, bits):
+ """Helper function for simulation / testing.
+
+ Returns:
+ The PRR and IRR. The PRR should never be sent over the network.
+ """
+ # Compute Permanent Randomized Response (PRR).
+ uniform, f_mask = get_prr_masks(
+ self.secret, to_big_endian(bits), self.params.prob_f,
+ self.params.num_bloombits)
+
+ # Suppose bit i of the Bloom filter is B_i. Then bit i of the PRR is
+ # defined as:
+ #
+ # 1 with prob f/2
+ # 0 with prob f/2
+ # B_i with prob 1-f
+
+ # Uniform bits are 1 with probability 1/2, and f_mask bits are 1 with
+ # probability f. So in the expression below:
+ #
+ # - Bits in (uniform & f_mask) are 1 with probability f/2.
+ # - (bloom_bits & ~f_mask) clears a bloom filter bit with probability
+ # f, so we get B_i with probability 1-f.
+ # - The remaining bits are 0, with remaining probability f/2.
+
+ prr = (bits & ~f_mask) | (uniform & f_mask)
+
+ #log('U %s / F %s', bit_string(uniform, num_bits),
+ # bit_string(f_mask, num_bits))
+
+ #log('B %s / PRR %s', bit_string(bloom_bits, num_bits),
+ # bit_string(prr, num_bits))
+
+ # Compute Instantaneous Randomized Response (IRR).
+ # If PRR bit is 0, IRR bit is 1 with probability p.
+ # If PRR bit is 1, IRR bit is 1 with probability q.
+ p_bits = self.irr_rand.p_gen()
+ q_bits = self.irr_rand.q_gen()
+
+ irr = (p_bits & ~prr) | (q_bits & prr)
+
+    return prr, irr  # the IRR is the RAPPOR report
+
+ def _internal_encode(self, word):
+ """Helper function for simulation / testing.
+
+ Returns:
+ The Bloom filter bits, PRR, and IRR. The first two values should never
+ be sent over the network.
+ """
+ bloom_bits = get_bloom_bits(word, self.cohort, self.params.num_hashes,
+ self.params.num_bloombits)
+
+ bloom = 0
+ for bit_to_set in bloom_bits:
+ bloom |= (1 << bit_to_set)
+
+ prr, irr = self._internal_encode_bits(bloom)
+ return bloom, prr, irr
+
+ def encode_bits(self, bits):
+ """Encode a string with RAPPOR.
+
+ Args:
+ bits: An integer representing bits to encode.
+
+ Returns:
+ An integer that is the IRR (Instantaneous Randomized Response).
+ """
+ _, irr = self._internal_encode_bits(bits)
+ return irr
+
+ def encode(self, word):
+ """Encode a string with RAPPOR.
+
+ Args:
+ word: the string that should be privately transmitted.
+
+ Returns:
+ An integer that is the IRR (Instantaneous Randomized Response).
+ """
+ _, _, irr = self._internal_encode(word)
+ return irr
diff --git a/client/python/rappor_test.py b/client/python/rappor_test.py
new file mode 100755
index 0000000..1aa6288
--- /dev/null
+++ b/client/python/rappor_test.py
@@ -0,0 +1,124 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+rappor_test.py: Tests for rappor.py
+"""
+import cStringIO
+import copy
+import math
+import random
+import unittest
+
+import rappor # module under test
+
+
+class RapporParamsTest(unittest.TestCase):
+
+ def setUp(self):
+ self.typical_instance = rappor.Params()
+ ti = self.typical_instance # For convenience
+ ti.num_cohorts = 64 # Number of cohorts
+ ti.num_hashes = 2 # Number of bloom filter hashes
+ ti.num_bloombits = 16 # Number of bloom filter bits
+ ti.prob_p = 0.40 # Probability p
+ ti.prob_q = 0.70 # Probability q
+ ti.prob_f = 0.30 # Probability f
+
+ def testFromCsv(self):
+ f = cStringIO.StringIO('k,h,m,p,q,f\n32,2,64,0.5,0.75,0.6\n')
+ params = rappor.Params.from_csv(f)
+ self.assertEqual(32, params.num_bloombits)
+ self.assertEqual(64, params.num_cohorts)
+
+ # Malformed header
+ f = cStringIO.StringIO('k,h,m,p,q\n32,2,64,0.5,0.75,0.6\n')
+ self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
+
+ # Missing second row
+ f = cStringIO.StringIO('k,h,m,p,q,f\n')
+ self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
+
+ # Too many rows
+ f = cStringIO.StringIO('k,h,m,p,q,f\n32,2,64,0.5,0.75,0.6\nextra')
+ self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
+
+ def testGetBloomBits(self):
+ for cohort in xrange(0, 64):
+ b = rappor.get_bloom_bits('foo', cohort, 2, 16)
+ #print 'cohort', cohort, 'bloom', b
+
+ def testGetPrr(self):
+ bloom = 1
+ num_bits = 8
+ for word in ('v1', 'v2', 'v3'):
+ masks = rappor.get_prr_masks('secret', word, 0.5, num_bits)
+ print 'masks', masks
+
+ def testToBigEndian(self):
+ b = rappor.to_big_endian(1)
+ print repr(b)
+ self.assertEqual(4, len(b))
+
+ def testEncoder(self):
+ # Test encoder with deterministic random function.
+ params = copy.copy(self.typical_instance)
+ params.prob_f = 0.5
+ params.prob_p = 0.5
+ params.prob_q = 0.75
+
+ # return these 3 probabilities in sequence.
+ rand = MockRandom([0.0, 0.6, 0.0], params)
+
+ e = rappor.Encoder(params, 0, 'secret', rand)
+
+ irr = e.encode("abc")
+
+ self.assertEquals(64493, irr) # given MockRandom, this is what we get
+
+
+class MockRandom(object):
+ """Returns one of three random values in a cyclic manner.
+
+ Mock random function that involves *some* state, as needed for tests that
+ call randomness several times. This makes it difficult to deal exclusively
+ with stubs for testing purposes.
+ """
+
+ def __init__(self, cycle, params):
+ self.p_gen = MockRandomCall(params.prob_p, cycle, params.num_bloombits)
+ self.q_gen = MockRandomCall(params.prob_q, cycle, params.num_bloombits)
+
+class MockRandomCall:
+ def __init__(self, prob, cycle, num_bits):
+ self.cycle = cycle
+ self.n = len(self.cycle)
+ self.prob = prob
+ self.num_bits = num_bits
+
+ def __call__(self):
+ counter = 0
+ r = 0
+ for i in xrange(0, self.num_bits):
+ rand_val = self.cycle[counter]
+ counter += 1
+ counter %= self.n # wrap around
+ r |= ((rand_val < self.prob) << i)
+ return r
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/demo.sh b/demo.sh
new file mode 100755
index 0000000..93da78d
--- /dev/null
+++ b/demo.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+#
+# Demo of RAPPOR. Automating Python and R scripts. See README.
+#
+# Usage:
+# ./demo.sh [function name]
+#
+# End to end demo of rappor. Notable functions include:
+# quick-python: Runs a demo using the python client
+# quick-cpp: Runs a demo using the c++ client
+# If no function is specified, the above two will be run consecutively.
+#
+# This takes a minute or so. It runs a subset of tests from regtest.sh and
+# writes an HTML summary.
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+. util.sh
+
+readonly THIS_DIR=$(dirname $0)
+readonly REPO_ROOT=$THIS_DIR
+readonly CLIENT_DIR=$REPO_ROOT/client/python
+
+# All the Python tools need this
+export PYTHONPATH=$CLIENT_DIR
+
+#
+# Semi-automated demos
+#
+
+# Run rappor-sim through the Python profiler.
+rappor-sim-profile() {
+ local dist=$1
+ shift
+
+ # For now, just dump it to a text file. Sort by cumulative time.
+ time python -m cProfile -s cumulative \
+ tests/rappor_sim.py \
+ -i _tmp/$dist.csv \
+ "$@" \
+ | tee _tmp/profile.txt
+}
+
+quick-python() {
+ ./regtest.sh run-seq '^demo3' python
+}
+
+quick-cpp() {
+ # For now we build it first. Don't want to build it in parallel.
+ ./build.sh cpp-client
+
+ ./regtest.sh run-seq '^demo3' cpp
+}
+
+quick() {
+ quick-python
+ quick-cpp
+}
+
+# TODO: Port these old bad cases to regtest_spec.py.
+
+# Running the demo of the exponential distribution with 10000 reports (x7,
+# which is 70000 values).
+#
+# - There are 50 real values, but we add 1000 more candidates, to get 1050 candidates.
+# - And then we remove the two most common strings, v1 and v2.
+# - With the current analysis, we are getting sum(proportion) = 1.1 to 1.7
+
+# TODO: Make this sharper by including only one real value?
+
+bad-case() {
+ local num_additional=${1:-1000}
+ run-dist exp 10000 $num_additional 'v1|v2'
+}
+
+# Force it to be less than 1
+pcls-test() {
+ USE_PCLS=1 bad-case
+}
+
+# Only add 10 more candidates. Then we properly get the 0.48 proportion.
+ok-case() {
+ run-dist exp 10000 10 'v1|v2'
+}
+
+if test $# -eq 0 ; then
+ quick
+else
+ "$@"
+fi
diff --git a/doc/data-flow.dot b/doc/data-flow.dot
new file mode 100644
index 0000000..aa4bcda
--- /dev/null
+++ b/doc/data-flow.dot
@@ -0,0 +1,83 @@
+// Based on http://graphviz.org/content/cluster
+
+// Node types:
+// Rectangle: process
+// Oval: data
+// Diamond: debug/simulation data
+
+digraph G {
+ //rankdir="LR"; // left to right layout
+
+ // http://www.graphviz.org/content/color-names
+ colorscheme=pastel13;
+
+ subgraph cluster_0 {
+ graph [ fontsize=24 ];
+ label = "Reporting";
+ style=filled;
+ color=2;
+
+ node [style=filled, color=white, fontsize=12];
+
+ gen_sim_input -> dist_csv -> rappor_sim;
+
+ rappor_sim -> out;
+ rappor_sim -> params;
+ rappor_sim -> hist;
+ rappor_sim -> true_inputs;
+
+ // Process
+ rappor_sim [label="rappor_sim"];
+
+ // Data
+ dist_csv [shape=box, label="dist.csv"];
+ out [shape=box, label="dist_out.csv"];
+ params [shape=box, label="dist_params.csv"];
+
+ // simulation data
+ hist [shape=box, style=dotted, color=black, label="dist_hist.csv"];
+ true_inputs [shape=box, style=dotted, color=black, label="dist_true_inputs.txt"];
+ }
+
+ subgraph cluster_1 {
+ graph [ fontsize=24 ];
+ label = "Analysis";
+ style = filled;
+ color=3;
+
+ node [style=filled, color=white, fontsize=12];
+
+ sum_bits -> counts;
+
+ // sum_bits needs the params to construct the matrix. Technically it could
+ // infer it, but this is simple.
+ params -> sum_bits;
+
+ // only in the simulation
+ true_inputs -> demo_sh -> candidates [style=dotted];
+
+ candidates -> hash_candidates -> map;
+ params -> hash_candidates;
+
+ params -> analyze;
+ map -> analyze;
+ counts -> analyze;
+ hist -> analyze [style=dotted]; // only for comparison
+
+ analyze -> plot_png;
+
+ // Processes
+ analyze [label="analyze.R"];
+ demo_sh [label="demo.sh", style=dotted, color=black];
+
+ // Data
+ counts [shape=box, label="dist_count.csv"];
+ candidates [shape=box, label="dist_candidates.txt"];
+ map [shape=box, label="dist_map.csv"];
+
+ plot_png [shape=box, label="dist.png"];
+
+ }
+
+ out -> sum_bits;
+}
diff --git a/doc/data-flow.md b/doc/data-flow.md
new file mode 100644
index 0000000..1b58cbf
--- /dev/null
+++ b/doc/data-flow.md
@@ -0,0 +1,239 @@
+RAPPOR Data Flow
+================
+
+This doc explains the simulation tools and data formats in the [RAPPOR
+repository](https://github.com/google/rappor). We'll focus on the code, and
+describe the algorithm only informally. For details, see the [paper][].
+
+Overview
+--------
+
+Start with this command:
+
+ $ ./demo.sh run
+
+It takes a minute or so to run. The dependencies listed in the
+[README][] must be installed. At the end, it will say:
+
+ Wrote _tmp/report.html. Open this in your browser.
+
+It should look like [this][example].
+
+The following diagram shows what processes and files are involved in the demo.
+Ovals represent **processes**; rectangles represent **data**. The dotted lines
+denote components that are involved in the simulation, but wouldn't be used in
+a "real" setting.
+
+In most configurations, reporting (in blue) is done by client machines, while
+analysis (in green) is done by a server.
+
+<img src="data-flow.png" alt="Diagram of RAPPOR Data Flow" />
+
+In the simulation, reporting consists of these steps:
+
+ 1. Generate simulated input data with different distributions.
+ 2. Obscure each value with the RAPPOR privacy-preserving reporting mechanism.
+
+Analysis consists of these steps:
+
+ 1. Aggregate the reports by summing bits (i.e. make a counting Bloom filter)
+ 2. Come up with candidate strings, and hash them in the same manner as the
+ client.
+ 3. Using the reports, RAPPOR parameters, and candidate strings as input,
+ infer the distribution of true values. We don't see the values themselves.
+ We plot the true and inferred distributions side by side for comparison.
+
+This process is described in detail below.
+
+1. Generating Simulated Input
+-----------------------------
+
+The `tests/gen_sim_input.py` tool generates CSV data, like this:
+
+<!-- TODO: a realistic data set would be nice? How could we generate one? -->
+
+**exp.csv**
+
+ client, true_value
+ 1, v6
+ 1, v3
+ 1, v3
+ 1, v5
+ 1, v13
+ 1, v1
+ 1, v8
+ 2, v2
+ 2, v3
+ 2, v1
+ 2, v8
+ 2, v1
+ 2, v30
+ 2, v10
+ 3, v4
+ ...
+
+*(spaces added for clarity)*
+
+By default we generate 700,000 rows: 7 random values from `v1` to `v50` for
+each client. These can be thought of as a variable being reported over time.
+
+We're simulating an environment where there are many RAPPOR clients, and a
+single server does the RAPPOR analysis on the accumulated data.
+
+The `client` is represented by an integer ID. The `true_value` should **not**
+be sent over the network because we wish to preserve the client's privacy.
+
+
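+A minimal sketch of producing input in this shape (the real tool,
+`tests/gen_sim_input.py`, supports several distributions; the uniform choice
+below is only for illustration):
+
+    import csv, random, sys
+
+    writer = csv.writer(sys.stdout)
+    writer.writerow(['client', 'true_value'])
+    for client in xrange(1, 100001):  # 100,000 clients
+        for _ in xrange(7):           # 7 reports per client
+            writer.writerow([client, 'v%d' % random.randint(1, 50)])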
+2. RAPPOR Reporting
+-------------------
+
+The `tests/rappor_sim.py` tool uses the Python client library
+(`client/python/rappor.py`) to obscure the `v1` .. `vN` strings. We want to
+infer the distribution of these strings over the entire population, but we
+don't want to know any individual values.
+
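+Each report is produced roughly as follows (a sketch against
+`client/python/rappor.py`; the cohort assignment shown is an assumption, since
+`rappor_sim.py` makes its own choice):
+
+    import rappor
+
+    params = rappor.Params()  # defaults: k=16, h=2, m=64, p=0.5, q=0.75, f=0.5
+    client_id = 1             # hypothetical client
+    cohort = client_id % params.num_cohorts
+    e = rappor.Encoder(params, cohort, 'client-secret',
+                       rappor.SecureIrrRand(params))
+    irr = e.encode('v13')     # integer whose low k bits are the report
+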
+After the RAPPOR transformation, we get another CSV file with 700,000 rows.
+Each client is assigned a cohort.
+
+**exp_out.csv**
+
+ client, cohort, rappor
+ 1, 63, 1111101011110111
+ 1, 15, 1110110011111100
+ 1, 12, 0110101111100101
+ 1, 0, 1111100111110111
+ 1, 3, 1001110111110011
+ 1, 14, 1011111010110011
+ 1, 33, 0111010100101011
+ 2, 40, 0011011010101001
+ 2, 35, 1010110101110100
+ 2, 58, 1110110110111110
+ 2, 38, 0010001111001010
+ 2, 5, 1110111011100101
+ 2, 36, 0111010100111111
+ 2, 39, 0101101000101101
+ 3, 32, 0011100111111110
+ ...
+
+*(spaces added for clarity)*
+
+We also get a one-row CSV file that contains the RAPPOR parameters:
+
+**exp_params.csv**
+
+ k,h,m,p,q,f
+ 16,2,64,0.5,0.75,0.5
+
+These are described in the [paper][]. The parameters that the clients use
+must be known to the server, or the decoding will fail. In addition, all
+clients must use the same parameters for a given variable.
+
+The `rappor_sim.py` process also writes these files:
+
+- `exp_hist.csv`: The true histogram of input values. This is used only for
+ comparison. In the real world you obviously won't have this.
+- `exp_true_inputs.txt`: A list of the unique values reported, e.g. `v1` ..
+ `v50`. You won't have this either, in general. To use RAPPOR, you must
+ supply *candidate strings*, described below.
+
+3. Report Aggregation
+---------------------
+
+`sum_bits.py` takes the `exp_out.csv` output, and produces the "counts" file:
+
+**exp_counts.csv**
+
+ 11116,6273,6433,6347,6385,6290,6621,6359,6747,6623,6321,6696,6282,6652,6368,6286,6222
+ 10861,6365,6263,6170,6258,6107,6633,6171,6226,6123,6286,6254,6408,6182,6442,6195,6187
+ ...
+
+The file has 64 rows, because the simulation has 64 cohorts by default (`m =
+64`). This parameter should be adjusted based on the number of unique true
+values expected. <!-- TODO: more detail -->
+
+There are 17 columns. The left-most column is the total number of reports in
+the cohort. The remaining 16 columns correspond to the `k = 16` bits in the
+Bloom filter. Each column contains the number of reports with that bit set
+to 1.
+
+So, in general, the "counts" file is a `(k+1) * m` matrix.
+
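+A sketch of this aggregation, assuming each report has been parsed into a
+k-bit integer (the column ordering here is an assumption for illustration):
+
+    k, m = 16, 64
+    reports = [(5, 0b1110111011100101)]  # hypothetical (cohort, irr) pairs
+    counts = [[0] * (k + 1) for _ in xrange(m)]
+    for cohort, irr in reports:
+        counts[cohort][0] += 1           # column 0: total reports in cohort
+        for i in xrange(k):
+            if irr & (1 << i):
+                counts[cohort][i + 1] += 1
+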
+4. Candidate Strings
+--------------------
+
+In the simulation, we assume that the analyst will come up with a *superset* of
+the candidate strings. This is done in the `more-candidates` /
+`print-candidates` functions in `demo.sh`.
+
+You can also test what happens if you omit true strings from the candidates, by
+editing the invocation of `print-candidates` in `run-dist`:
+
+ # Example of omitting true values. Generate candidates from
+ # exp_true_inputs.txt, omitting values v1 and v2.
+
+ print-candidates $dist 'v1|v2' > _tmp/${dist}_candidates.txt
+
+In general, coming up with candidates is an application- or metric-specific
+process.
+
+The candidates are hashed by `hash_candidates.py` to create the "map" file,
+before being passed to R for analysis.
+
+**exp_map.csv**
+
+ v1,8,13,30,22,37,37,53,53,77,67,89,86,97,97,118,128,139,136,157,<truncated>
+ v10,13,2,25,28,42,45,58,60,68,66,91,89,108,102,113,125,130,131,<truncated>
+
+The map file has one row per candidate. In this case, there are 60 rows:
+50 for the true values and 10 for "fake" values, which make the candidates a
+superset of the true input.
+
+The left-most column is the raw candidate string. Then there are 128 more
+columns: for `m = 64` cohorts times `k = 2` hash functions in the Bloom filter.
+
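+A row can be sketched by reusing the client's hashing; the 1-based global bit
+index below is consistent with the example rows above, but it is an assumption
+about `hash_candidates.py`, not a specification:
+
+    import rappor
+
+    k, h, m = 16, 2, 64
+    candidates = ['v1', 'v10']  # hypothetical candidate strings
+    for word in candidates:
+        row = [word]
+        for cohort in xrange(m):
+            for bit in rappor.get_bloom_bits(word, cohort, h, k):
+                row.append(cohort * k + bit + 1)  # 1-based global bit index
+        print ','.join(str(cell) for cell in row)
+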
+<!-- TODO: more detail about setting params? Examples of coming up with
+candidate strings? -->
+
+5. RAPPOR Analysis
+------------------
+
+Once you have the `counts`, `params`, and `map` files, you can pass it to the
+`tests/analyze.R` tool, which is a small wrapper around the `analyze/R`
+library.
+
+You will get a plot of the true distribution vs. the distribution recovered
+with the RAPPOR privacy algorithm.
+
+[View the example output][example].
+
+You can change the simulation parameters and RAPPOR parameters via flags, and
+compare the resulting distributions.
+
+For example, if you expect more unique values from clients, you should also use
+more cohorts (i.e. raise `m`), to prevent hash function collisions from
+degrading the result quality.
+
+<!-- TODO:
+ - how to change flags
+ - more detail on what the various parameters do
+ - association analysis
+ - basic RAPPOR
+ - longitudinal privacy
+-->
+
+Conclusion
+----------
+
+RAPPOR allows you to infer statistics about populations while preserving the
+privacy of individual clients. In this doc, we walked through a simple demo.
+However, there are other variations of RAPPOR and settings in which you can use
+RAPPOR, which we'll write more about.
+
+Feel free to send feedback on this doc to
+[rappor-discuss@googlegroups.com](https://groups.google.com/forum/#!forum/rappor-discuss).
+
+
+[README]: https://github.com/google/rappor/blob/master/README.md
+[paper]: http://arxiv.org/abs/1407.6981
+[example]: http://google.github.io/rappor/examples/report.html
+
diff --git a/doc/randomness.md b/doc/randomness.md
new file mode 100644
index 0000000..7136cdc
--- /dev/null
+++ b/doc/randomness.md
@@ -0,0 +1,38 @@
+Generating Random Bits for RAPPOR
+=================================
+
+To ensure privacy, an application using RAPPOR must generate random bits in an
+unpredictable manner. Otherwise, an adversary that can predict the
+sequence of random bits used can determine the true values being reported.
+
+Generating random numbers is highly platform-specific -- even
+language-specific. So, libraries implementing RAPPOR should be parameterized
+by an interface to generate random bits. (This can be thought of as
+"dependency injection".)
+
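+As one concrete shape for such an interface, the Python client in this
+repository (`client/python/rappor.py`) accepts any object exposing `p_gen()`
+and `q_gen()` callables that return k-bit random masks. A minimal sketch of
+one such callable, backed by `SystemRandom`:
+
+    from random import SystemRandom
+
+    class MaskGen(object):  # hypothetical name
+        def __init__(self, prob_one, num_bits):
+            self.prob_one = prob_one
+            self.num_bits = num_bits
+
+        def __call__(self):
+            rand = SystemRandom()  # reads from os.urandom
+            r = 0
+            for i in xrange(self.num_bits):
+                # Set bit i with probability prob_one.
+                r |= ((rand.random() < self.prob_one) << i)
+            return r
+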
+<!-- TODO: details on the interfaces, once we have them in more than one
+ language -->
+
+
+For now, we have collected some useful links.
+
+Linux
+-----
+
+* [Myths about /dev/urandom](http://www.2uo.de/myths-about-urandom/) -- Nice
+ article explaining implementation aspects of `/dev/urandom` and `/dev/random`
+ on Linux. (Summary: just use `/dev/urandom`, with caveats explained)
+
+* [LWN on getrandom](http://lwn.net/Articles/606141/)
+ ([patch](http://lwn.net/Articles/605828/)) -- A very recent addition to the
+ Linux kernel. As of this writing (11/2014), it's safe to say that very few
+  applications use it. It addresses an issue mentioned in the first link:
+  the situation at system boot, when little entropy is available.
+
+
+<!-- TODO: other platforms. Chrome uses /dev/urandom on Linux. What about
+ other platforms? -->
+
+<!-- TODO: when we have a C/C++ client, provide a sample implementation
+ using simple C functions -->
diff --git a/docs.sh b/docs.sh
new file mode 100755
index 0000000..3754261
--- /dev/null
+++ b/docs.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+
+build() {
+ ./build.sh doc
+}
+
+copy() {
+ cp -a ./_tmp/doc/* ./gh-pages/doc/
+ echo "After commiting changes, you can publish them by running: ./docs.sh publish"
+}
+
+publish() {
+ git subtree push --prefix gh-pages origin gh-pages
+}
+
+if test $# -eq 0 ; then
+ build
+ copy
+else
+ "$@"
+fi
+
+
diff --git a/gh-pages/doc/data-flow.html b/gh-pages/doc/data-flow.html
new file mode 100644
index 0000000..9462282
--- /dev/null
+++ b/gh-pages/doc/data-flow.html
@@ -0,0 +1,252 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="UTF-8">
+ <style type="text/css">
+ code { color: green; }
+ pre { margin-left: 3em; }
+ </style>
+ <!-- INSERT LATCH JS -->
+ </head>
+ <body style="margin: 0 auto; width: 40em; text-align: left;">
+ <!-- INSERT LATCH HTML -->
+<h1>RAPPOR Data Flow</h1>
+
+<p>This doc explains the simulation tools and data formats in the <a href="https://github.com/google/rappor">RAPPOR
+repository</a>. We'll focus on the code, and
+describe the algorithm only informally. For details, see the <a href="http://arxiv.org/abs/1407.6981">paper</a>.</p>
+
+<h2>Overview</h2>
+
+<p>Start with this command:</p>
+
+<pre><code>$ ./demo.sh run
+</code></pre>
+
+<p>It takes a minute or so to run. The dependencies listed in the
+<a href="https://github.com/google/rappor/blob/master/README.md">README</a> must be installed. At the end, it will say:</p>
+
+<pre><code>Wrote _tmp/report.html. Open this in your browser.
+</code></pre>
+
+<p>It should look like <a href="http://google.github.io/rappor/examples/report.html">this</a>.</p>
+
+<p>The following diagram shows what processes and files are involved in the demo.
+Ovals represent <strong>processes</strong>; rectangles represent <strong>data</strong>. The dotted lines
+denote components that are involved in the simulation, but wouldn't be used in
+a "real" setting.</p>
+
+<p>In most configurations, reporting (in blue) is done by client machines, while
+analysis (in green) is done by a server.</p>
+
+<p><img src="data-flow.png" alt="Diagram of RAPPOR Data Flow" /></p>
+
+<p>In the simulation, reporting consists of these steps:</p>
+
+<ol>
+<li>Generate simulated input data with different distributions.</li>
+<li>Obscure each value with the RAPPOR privacy-preserving reporting mechanism.</li>
+</ol>
+
+<p>Analysis consists of these steps:</p>
+
+<ol>
+<li>Aggregate the reports by summing bits (i.e. make a counting Bloom filter)</li>
+<li>Come up with candidate strings, and hash them in the same manner as the
+client.</li>
+<li>Using the reports, RAPPOR parameters, and candidate strings as input,
+infer the distribution of true values. We don't see the values themselves.
+We plot the true and inferred distributions side by side for comparison.</li>
+</ol>
+
+<p>This process is described in detail below.</p>
+
+<h2>1. Generating Simulated Input</h2>
+
+<p>The <code>tests/gen_sim_input.py</code> tool generates CSV data, like this:</p>
+
+<!-- TODO: a realistic data set would be nice? How could we generate one? -->
+
+<p><strong>exp.csv</strong></p>
+
+<pre><code>client, true_value
+1, v6
+1, v3
+1, v3
+1, v5
+1, v13
+1, v1
+1, v8
+2, v2
+2, v3
+2, v1
+2, v8
+2, v1
+2, v30
+2, v10
+3, v4
+...
+</code></pre>
+
+<p><em>(spaces added for clarity)</em></p>
+
+<p>By default we generate 700,000 rows: 7 random values from <code>v1</code> to <code>v50</code> for
+each client. These can be thought of as a variable being reported over time.</p>
+
+<p>We're simulating an environment where there are many RAPPOR clients, and a
+single server does the RAPPOR analysis on the accumulated data.</p>
+
+<p>The <code>client</code> is represented by an integer ID. The <code>true_value</code> should <strong>not</strong>
+be sent over the network because we wish to preserve the client's privacy.</p>
+
+<h2>2. RAPPOR Reporting</h2>
+
+<p>The <code>tests/rappor_sim.py</code> tool uses the Python client library
+(<code>client/python/rappor.py</code>) to obscure the <code>v1</code> .. <code>vN</code> strings. We want to
+infer the distribution of these strings over the entire population, but we
+don't want to know any individual values.</p>
+
+<p>After the RAPPOR transformation, we get another CSV file with 700,000 rows.
+Each client is assigned a cohort.</p>
+
+<p><strong>exp_out.csv</strong></p>
+
+<pre><code>client, cohort, rappor
+1, 63, 1111101011110111
+1, 15, 1110110011111100
+1, 12, 0110101111100101
+1, 0, 1111100111110111
+1, 3, 1001110111110011
+1, 14, 1011111010110011
+1, 33, 0111010100101011
+2, 40, 0011011010101001
+2, 35, 1010110101110100
+2, 58, 1110110110111110
+2, 38, 0010001111001010
+2, 5, 1110111011100101
+2, 36, 0111010100111111
+2, 39, 0101101000101101
+3, 32, 0011100111111110
+...
+</code></pre>
+
+<p><em>(spaces added for clarity)</em></p>
+
+<p>We also get a one-row CSV file that contains the RAPPOR parameters:</p>
+
+<p><strong>exp_params.csv</strong></p>
+
+<pre><code>k,h,m,p,q,f
+16,2,64,0.5,0.75,0.5
+</code></pre>
+
+<p>These are described in the <a href="http://arxiv.org/abs/1407.6981">paper</a>. The parameters that the clients use
+must be known to the server, or the decoding will fail. In addition, all
+clients must use the same parameters for a given variable.</p>
+
+<p>The <code>rappor_sim.py</code> process also writes these files:</p>
+
+<ul>
+<li><code>exp_hist.csv</code>: The true histogram of input values. This is used only for
+comparison. In the real world you obviously won't have this.</li>
+<li><code>exp_true_inputs.txt</code>: A list of the unique values reported, e.g. <code>v1</code> ..
+<code>v50</code>. You won't have this either, in general. To use RAPPOR, you must
+supply <em>candidate strings</em>, described below.</li>
+</ul>
+
+<h2>3. Report Aggregation</h2>
+
+<p><code>sum_bits.py</code> takes the <code>exp_out.csv</code> output, and produces the "counts" file:</p>
+
+<p><strong>exp_counts.csv</strong></p>
+
+<pre><code>11116,6273,6433,6347,6385,6290,6621,6359,6747,6623,6321,6696,6282,6652,6368,6286,6222
+10861,6365,6263,6170,6258,6107,6633,6171,6226,6123,6286,6254,6408,6182,6442,6195,6187
+...
+</code></pre>
+
+<p>The file has 64 rows, because the simulation has 64 cohorts by default (<code>m =
+64</code>). This parameter should be adjusted based on the number of unique true
+values expected. <!-- TODO: more detail --></p>
+
+<p>There are 17 columns. The left-most column is the total number of reports in
+the cohort. The remaining 16 columns correspond to the <code>k = 16</code> bits in the
+Bloom filter. Each column contains the number of reports with that bit set
+to 1.</p>
+
+<p>So, in general, the "counts" file is a <code>(k+1) * m</code> matrix.</p>
+
+<h2>4. Candidate Strings</h2>
+
+<p>In the simulation, we assume that the analyst will come up with a <em>superset</em> of
+the candidate strings. This is done in the <code>more-candidates</code> /
+<code>print-candidates</code> functions in <code>demo.sh</code>.</p>
+
+<p>You can also test what happens if you omit true strings from the candidates, by
+editing the invocation of <code>print-candidates</code> in <code>run-dist</code>:</p>
+
+<pre><code># Example of omitting true values. Generate candidates from
+# exp_true_inputs.txt, omitting values v1 and v2.
+
+print-candidates $dist 'v1|v2' &gt; _tmp/${dist}_candidates.txt
+</code></pre>
+
+<p>In general, coming up with candidates is an application- or metric-specific
+process.</p>
+
+<p>The candidates are hashed by <code>hash_candidates.py</code> to create the "map" file,
+before being passed to R for analysis.</p>
+
+<p><strong>exp_map.csv</strong></p>
+
+<pre><code>v1,8,13,30,22,37,37,53,53,77,67,89,86,97,97,118,128,139,136,157,&lt;truncated&gt;
+v10,13,2,25,28,42,45,58,60,68,66,91,89,108,102,113,125,130,131,&lt;truncated&gt;
+</code></pre>
+
+<p>The map file has one row per candidate. In this case, there are 60 rows:
+50 for the true values and 10 for "fake" values, which make the candidates a
+superset of the true input.</p>
+
+<p>The left-most column is the raw candidate string. Then there are 128 more
+columns: <code>m = 64</code> cohorts times <code>h = 2</code> hash functions in the Bloom filter.</p>
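+
+<p>A hypothetical sketch of how one such row could be computed (the real
+scheme is in <code>bin/hash_candidates.py</code>; only the shape of the output matters
+here):</p>
+
+<pre><code>import hashlib
+
+def map_row(candidate, m=64, k=16, h=2):
+    cols = [candidate]
+    for cohort in range(m):
+        # Illustrative hash; the real script's scheme differs.
+        digest = hashlib.md5('%d %s' % (cohort, candidate)).digest()
+        for i in range(h):
+            bit = ord(digest[i]) % k
+            # Offset by cohort * k, so the m * h entries address all
+            # m * k columns of the matrix.
+            cols.append(cohort * k + bit + 1)  # 1-based, as in the CSV
+    return cols
+</code></pre>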
+
+<!-- TODO: more detail about setting params? Examples of coming up with
+candidate strings? -->
+
+<h2>5. RAPPOR Analysis</h2>
+
+<p>Once you have the <code>counts</code>, <code>params</code>, and <code>map</code> files, you can pass them to the
+<code>tests/analyze.R</code> tool, which is a small wrapper around the <code>analysis/R</code>
+library.</p>
+
+<p>You will get a plot of the true distribution vs. the distribution recovered
+with the RAPPOR privacy algorithm.</p>
+
+<p><a href="http://google.github.io/rappor/examples/report.html">View the example output</a>.</p>
+
+<p>You can change the simulation parameters and RAPPOR parameters via flags, and
+compare the resulting distributions.</p>
+
+<p>For example, if you expect more unique values from clients, you should also use
+more cohorts (i.e. raise <code>m</code>), to prevent hash function collisions from
+degrading the result quality.</p>
+
+<!-- TODO:
+ - how to change flags
+ - more detail on what the various parameters do
+ - association analysis
+ - basic RAPPOR
+ - longitudinal privacy
+-->
+
+<h2>Conclusion</h2>
+
+<p>RAPPOR allows you to infer statistics about populations while preserving the
+privacy of individual clients. In this doc, we walked through a simple demo.
+However, there are other variations of RAPPOR and settings in which you can use
+RAPPOR, which we'll write more about.</p>
+
+<p>Feel free to send feedback on this doc to
+<a href="https://groups.google.com/forum/#!forum/rappor-discuss">rappor-discuss@googlegroups.com</a>.</p>
+ </body>
+ </html>
diff --git a/gh-pages/doc/data-flow.png b/gh-pages/doc/data-flow.png
new file mode 100644
index 0000000..7d3b315
--- /dev/null
+++ b/gh-pages/doc/data-flow.png
Binary files differ
diff --git a/gh-pages/doc/randomness.html b/gh-pages/doc/randomness.html
new file mode 100644
index 0000000..926ac60
--- /dev/null
+++ b/gh-pages/doc/randomness.html
@@ -0,0 +1,49 @@
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <meta charset="UTF-8">
+ <style type="text/css">
+ code { color: green; }
+ pre { margin-left: 3em; }
+ </style>
+ <!-- INSERT LATCH JS -->
+ </head>
+ <body style="margin: 0 auto; width: 40em; text-align: left;">
+ <!-- INSERT LATCH HTML -->
+<h1>Generating Random Bits for RAPPOR</h1>
+
+<p>To ensure privacy, an application using RAPPOR must generate random bits in an
+unpredictable manner. Otherwise, an adversary that can predict the
+sequence of random bits used can determine the true values being reported.</p>
+
+<p>Generating random numbers is highly platform-specific -- even
+language-specific. So, libraries implementing RAPPOR should be parameterized
+by an interface to generate random bits. (This can be thought of as
+"dependency injection".)</p>
+
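+<p>A minimal Python sketch of what such an interface might look like (this is
+hypothetical; see <code>client/cpp/rappor_deps.h</code> for how the C++ client declares
+its dependencies):</p>
+
+<pre><code>import os
+
+class RandomBits(object):
+    """Interface: return k bits, each set with probability prob."""
+    def bits(self, k, prob):
+        raise NotImplementedError
+
+class UrandomBits(RandomBits):
+    """Implementation backed by the OS entropy pool."""
+    def bits(self, k, prob):
+        threshold = int(prob * 256)  # exact for probabilities like 0.5, 0.75
+        raw = os.urandom(k)          # one byte per output bit
+        r = 0
+        for i in xrange(k):
+            if ord(raw[i]) &lt; threshold:
+                r |= 1 &lt;&lt; i
+        return r
+</code></pre>
+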
+<!-- TODO: details on the interfaces, once we have them in more than one
+ language -->
+
+<p>For now, we have collected some useful links.</p>
+
+<h2>Linux</h2>
+
+<ul>
+<li><p><a href="http://www.2uo.de/myths-about-urandom/">Myths about /dev/urandom</a> -- Nice
+article explaining implementation aspects of <code>/dev/urandom</code> and <code>/dev/random</code>
+on Linux. (Summary: just use <code>/dev/urandom</code>, with caveats explained)</p></li>
+<li><p><a href="http://lwn.net/Articles/606141/">LWN on getrandom</a>
+(<a href="http://lwn.net/Articles/605828/">patch</a>) -- A very recent addition to the
+Linux kernel. As of this writing (11/2014), it's safe to say that very few
+applications use it. The relevant change, related to an issue mentioned in
+the first link, addresses the situation at system boot, when there is little
+entropy available.</p></li>
+</ul>
+
+<!-- TODO: other platforms. Chrome uses /dev/urandom on Linux. What about
+ other platforms? -->
+
+<!-- TODO: when we have a C/C++ client, provide a sample implementation
+ using simple C functions -->
+ </body>
+ </html>
diff --git a/gh-pages/examples/exp_report/dist.png b/gh-pages/examples/exp_report/dist.png
new file mode 100644
index 0000000..42210af
--- /dev/null
+++ b/gh-pages/examples/exp_report/dist.png
Binary files differ
diff --git a/gh-pages/examples/gauss_report/dist.png b/gh-pages/examples/gauss_report/dist.png
new file mode 100644
index 0000000..5103b3c
--- /dev/null
+++ b/gh-pages/examples/gauss_report/dist.png
Binary files differ
diff --git a/gh-pages/examples/report.html b/gh-pages/examples/report.html
new file mode 100644
index 0000000..99c04d0
--- /dev/null
+++ b/gh-pages/examples/report.html
@@ -0,0 +1,75 @@
+<!DOCTYPE html>
+<html>
+<head>
+ <title>RAPPOR Demo</title>
+</head>
+
+<body style="text-align: center">
+ <h2>RAPPOR Demo</h2>
+
+ <!-- These strings will be replaced by a sed script. -->
+
+ <!-- SIM_PARAMS -->
+
+ <h3>Simulation Input</h3>
+ <table align="center">
+ <tr>
+ <td>Number of clients</td>
+ <td align="right">100,000</td>
+ </tr>
+ <tr>
+ <td>Total values reported / obfuscated</td>
+ <td align="right">700,000</td>
+ </tr>
+ <tr>
+ <td>Unique values reported / obfuscated</td>
+ <td align="right">50</td>
+ </tr>
+ </table>
+
+
+ <!-- RAPPOR_PARAMS -->
+
+ <h3>RAPPOR Parameters</h3>
+ <table align="center">
+ <tr>
+ <td><b>k</b></td>
+ <td>Size of Bloom filter in bits</td>
+ <td align="right">16</td>
+ </tr>
+ <tr>
+ <td><b>h</b></td>
+ <td>Hash functions in Bloom filter</td>
+ <td align="right">2</td>
+ </tr>
+ <tr>
+ <td><b>m</b></td>
+ <td>Number of Cohorts</td>
+ <td align="right">64</td>
+ </tr>
+ <tr>
+ <td><b>p</b></td>
+ <td>Probability p</td>
+ <td align="right">0.5</td>
+ </tr>
+ <tr>
+ <td><b>q</b></td>
+ <td>Probability q</td>
+ <td align="right">0.75</td>
+ </tr>
+ <tr>
+ <td><b>f</b></td>
+ <td>Probability f</td>
+ <td align="right">0.5</td>
+ </tr>
+ </table>
+
+
+ <hr/>
+
+ <img src="exp_report/dist.png" alt="exponential distribution" />
+ <img src="gauss_report/dist.png" alt="gauss distribution" />
+ <img src="unif_report/dist.png" alt="uniform distribution" />
+</body>
+
+</html>
diff --git a/gh-pages/examples/unif_report/dist.png b/gh-pages/examples/unif_report/dist.png
new file mode 100644
index 0000000..48bcf79
--- /dev/null
+++ b/gh-pages/examples/unif_report/dist.png
Binary files differ
diff --git a/gh-pages/index.html b/gh-pages/index.html
new file mode 100644
index 0000000..be8baf2
--- /dev/null
+++ b/gh-pages/index.html
@@ -0,0 +1,14 @@
+<!DOCTYPE html>
+<html>
+ <head>
+    <title>RAPPOR GitHub Pages</title>
+ </head>
+
+ <body>
+    <h2>RAPPOR GitHub Pages</h2>
+ <p>
+ <a href="examples/report.html">examples/report.html</a> <br/>
+ <a href="doc/data-flow.html">doc/data-flow.html</a> <br/>
+ </p>
+ </body>
+</html>
diff --git a/pipeline/README.md b/pipeline/README.md
new file mode 100644
index 0000000..052ea9d
--- /dev/null
+++ b/pipeline/README.md
@@ -0,0 +1,52 @@
+pipeline
+========
+
+This directory contains tools and scripts for running a cron job that does
+RAPPOR analysis and generates an HTML dashboard.
+
+It works like this:
+
+1. `task_spec.py` generates a text file where each line corresponds to a process
+ to be run (a "task"). The process is `bin/decode-dist` or
+   `bin/decode-assoc`. The line contains the task parameters (a sample line is
+   shown after this list).
+
+2. `xargs -P` is used to run processes in parallel. Our analysis is generally
+   single-threaded (because R is single-threaded), so this helps utilize
+ the machine fully. Each task places its output in a different subdirectory.
+
+3. `cook.sh` calls `combine_results.py` to combine analysis results into a time
+ series. It also calls `combine_status.py` to keep track of task data for
+ "meta-analysis". `metric_status.R` generates more summary CSV files.
+
+4. `ui.sh` calls `csv_to_html.py` to generate HTML fragments from the CSV
+ files.
+
+5. The JavaScript in `ui/ui.js` is loaded from static HTML, and makes AJAX calls
+ to retrieve the HTML fragments. The page is made interactive with
+ `ui/table-lib.js`.
+
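+For illustration, a `dist` task spec line has seven fields (the paths here are
+hypothetical; see `decode-dist-one` in `dist.sh` for the exact order):
+
+    # num_reports metric date counts params map results_dir
+    110000 Settings.NewTabPage 2015-05-19 /x/counts.csv /x/params.csv /x/map.csv /x/results
+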
+`dist.sh` and `assoc.sh` contain functions which coordinate this process.
+
+`alarm-lib.sh` is used to kill processes that have been running for too long.
+
+Testing
+-------
+
+`pipeline/regtest.sh` contains end-to-end demos of this process. Right now it
+depends on testdata from elsewhere in the tree:
+
+
+ rappor$ ./demo.sh run # prepare dist testdata
+ rappor$ cd bin
+
+ bin$ ./test.sh write-assoc-testdata # prepare assoc testdata
+ bin$ cd ../pipeline
+
+ pipeline$ ./regtest.sh dist
+ pipeline$ ./regtest.sh assoc
+
+ pipeline$ python -m SimpleHTTPServer # start a static web server
+
+ http://localhost:8000/_tmp/
+
+
diff --git a/pipeline/alarm-lib.sh b/pipeline/alarm-lib.sh
new file mode 100755
index 0000000..90495ce
--- /dev/null
+++ b/pipeline/alarm-lib.sh
@@ -0,0 +1,124 @@
+#!/bin/bash
+#
+# Alarm tool.
+#
+# Usage:
+#   ./alarm-lib.sh <function name>
+
+# You can source this file and use the alarm-status function.
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+# Run a command with a timeout, and print its status to a directory.
+#
+# Usage:
+# alarm-status job_dir/STATUS 10 \
+# flaky_command ...
+
+alarm-status() {
+ set +o errexit
+ local status_file=$1
+ shift # everything except the status file goes to perl
+
+ # NOTE: It would be nice to setpgrp() before exec? And then can the signal
+ # be delivered to the entire group, like kill -SIGALRM -PID?
+
+ # NOTE: If we did this in Python, the error message would also be clearer.
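+  # How it works: 'alarm shift' arms a SIGALRM timer using the first
+  # argument (the timeout in seconds); 'exec @ARGV' then replaces the perl
+  # process with the actual command.  If the command outlives the timer, the
+  # uncaught SIGALRM kills it, and the shell sees exit code 128 + 14 = 142
+  # (handled below).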
+ perl -e 'alarm shift; exec @ARGV or die "ERROR: after exec @ARGV"' "$@"
+ local exit_code=$?
+
+ set -o errexit
+
+ local result=''
+ case $exit_code in
+ 0)
+ # Would be nice to show elapsed time?
+ result='OK'
+ ;;
+ 9)
+ # decode_assoc.R will exit 9 if there are no reports AFTER
+ # --remove-bad-rows. A task can also be marked SKIPPED before running
+ # the child process (see backfill.sh).
+ result='SKIPPED by child process'
+ ;;
+    # Exit code 142 means SIGALRM: 128 + 14 = 142. See 'kill -l'.
+ 142)
+ local seconds=$1
+ result="TIMEOUT after $seconds seconds"
+ ;;
+ *)
+ result="FAIL with status $exit_code"
+ ;;
+ esac
+ echo "$result"
+ echo "$result" > $status_file
+}
+
+_work() {
+ local n=10 # 2 seconds
+ for i in $(seq $n); do
+ echo $i - "$@"
+ sleep 0.2
+ done
+}
+
+_succeed() {
+ _work "$@"
+ exit 0
+}
+
+_fail() {
+ _work "$@"
+ exit 1
+}
+
+_skip() {
+ exit 9
+}
+
+# http://perldoc.perl.org/functions/alarm.html
+#
+# Delivers alarm. But how to get the process to have a distinct exit code?
+
+demo() {
+ mkdir -p _tmp
+
+ # timeout
+ alarm-status _tmp/A 1 $0 _succeed foo
+ echo
+
+ # ok
+ alarm-status _tmp/B 3 $0 _succeed bar
+ echo
+
+ # fail
+ alarm-status _tmp/C 3 $0 _fail baz
+ echo
+
+ # skip
+ alarm-status _tmp/D 3 $0 _skip baz
+ echo
+
+ head _tmp/{A,B,C,D}
+}
+
+test-simple() {
+ alarm-status _tmp/status.txt 1 sleep 2
+}
+
+test-bad-command() {
+ alarm-status _tmp/status.txt 1 nonexistent_sleep 2
+}
+
+# BUG
+test-perl() {
+ set +o errexit
+ perl -e 'alarm shift; exec @ARGV or die "ERROR after exec @ARGV"' 1 _sleep 2
+ echo $?
+}
+
+if test $(basename $0) = 'alarm-lib.sh'; then
+ "$@"
+fi
diff --git a/pipeline/assoc.sh b/pipeline/assoc.sh
new file mode 100755
index 0000000..2c6a54d
--- /dev/null
+++ b/pipeline/assoc.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+#
+# Usage:
+# ./assoc.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname $0)
+readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
+
+source $RAPPOR_SRC/util.sh # log, banner
+source $RAPPOR_SRC/pipeline/tools-lib.sh
+source $RAPPOR_SRC/pipeline/alarm-lib.sh
+
+# Change the default location of these tools by setting DEP_*
+readonly DECODE_ASSOC=${DEP_DECODE_ASSOC:-$RAPPOR_SRC/bin/decode-assoc}
+readonly FAST_EM=${DEP_FAST_EM:-$RAPPOR_SRC/analysis/cpp/_tmp/fast_em}
+
+# Run a single decode-assoc process, to analyze one variable pair for one
+# metric. The arguments to this function are one row of the task spec.
+decode-one() {
+ # Job constants, from decode-many
+ local rappor_src=$1
+ local timeout_secs=$2
+ local min_reports=$3
+ local job_dir=$4
+ local sample_size=$5
+
+ # Task spec variables, from task_spec.py
+ local num_reports=$6
+ local metric_name=$7
+ local date=$8 # for output naming only
+ local reports=$9 # file with reports
+ local var1=${10}
+ local var2=${11}
+ local map1=${12}
+ local output_dir=${13}
+
+ local log_file=$output_dir/assoc-log.txt
+ local status_file=$output_dir/assoc-status.txt
+ mkdir --verbose -p $output_dir
+
+  # Flags derived from job constants
+ local schema=$job_dir/config/rappor-vars.csv
+ local params_dir=$job_dir/config
+ local em_executable=$FAST_EM
+
+ # TODO:
+ # - Skip jobs with few reports, like ./backfill.sh analyze-one.
+
+ # Output the spec for combine_status.py.
+ echo "$@" > $output_dir/assoc-spec.txt
+
+ # NOTE: Not passing --num-cores since we're parallelizing already.
+
+ # NOTE: --tmp-dir is the output dir. Then we just delete all the .bin files
+ # afterward so we don't copy them to x20 (they are big).
+
+ { time \
+ alarm-status $status_file $timeout_secs \
+ $DECODE_ASSOC \
+ --create-bool-map \
+ --remove-bad-rows \
+ --em-executable $em_executable \
+ --schema $schema \
+ --params-dir $params_dir \
+ --metric-name $metric_name \
+ --reports $reports \
+ --var1 $var1 \
+ --var2 $var2 \
+ --map1 $map1 \
+ --reports-sample-size $sample_size \
+ --tmp-dir $output_dir \
+ --output-dir $output_dir
+ } >$log_file 2>&1
+}
+
+test-decode-one() {
+ decode-one $RAPPOR_SRC
+}
+
+readonly DEFAULT_MIN_REPORTS=5000
+
+#readonly DEFAULT_TIMEOUT_SECONDS=300 # 5 minutes as a quick test.
+readonly DEFAULT_TIMEOUT_SECONDS=3600 # 1 hour
+
+readonly DEFAULT_MAX_PROCS=6 # TODO: Share with backfill.sh
+
+# Limit to 1M for now. Raise it when we have a full run.
+readonly DEFAULT_SAMPLE_SIZE=1000000
+
+readonly NUM_ARGS=8 # number of tokens in the task spec, used for xargs
+
+# Run many decode-assoc processes in parallel.
+decode-many() {
+ local job_dir=$1
+ local spec_list=$2
+
+ # These 3 params affect speed
+ local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS}
+ local sample_size=${4:-$DEFAULT_SAMPLE_SIZE}
+ local max_procs=${5:-$DEFAULT_MAX_PROCS}
+
+ local rappor_src=${6:-$RAPPOR_SRC}
+ local min_reports=${7:-$DEFAULT_MIN_REPORTS}
+
+ time cat $spec_list \
+ | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \
+ $0 decode-one $rappor_src $timeout_secs $min_reports $job_dir $sample_size
+}
+
+# Combine assoc results and render HTML.
+
+combine-and-render-html() {
+ local jobs_base_dir=$1
+ local job_dir=$2
+
+ banner "Combining assoc task status"
+ TOOLS-cook combine-assoc-task-status $jobs_base_dir $job_dir
+
+ banner "Combining assoc results"
+ TOOLS-cook combine-assoc-results $jobs_base_dir $job_dir
+
+ banner "Splitting out status per metric, and writing overview"
+ TOOLS-cook assoc-metric-status $job_dir
+
+ TOOLS-gen-ui symlink-static assoc $job_dir
+
+ banner "Building overview .part.html from CSV"
+ TOOLS-gen-ui assoc-overview-part-html $job_dir
+
+ banner "Building metric .part.html from CSV"
+ TOOLS-gen-ui assoc-metric-part-html $job_dir
+
+ banner "Building pair .part.html from CSV"
+ TOOLS-gen-ui assoc-pair-part-html $job_dir
+
+ banner "Building day .part.html from CSV"
+ TOOLS-gen-ui assoc-day-part-html $job_dir
+}
+
+# Temp files left over by the fast_em R <-> C++ communication.
+list-and-remove-bin() {
+ local job_dir=$1
+ # If everything failed, we might not have anything to list/delete.
+ find $job_dir -name \*.bin | xargs --no-run-if-empty -- ls -l --si
+ find $job_dir -name \*.bin | xargs --no-run-if-empty -- rm -f --verbose
+}
+
+"$@"
diff --git a/pipeline/combine_results.py b/pipeline/combine_results.py
new file mode 100755
index 0000000..6cb0150
--- /dev/null
+++ b/pipeline/combine_results.py
@@ -0,0 +1,138 @@
+#!/usr/bin/python
+"""Combines results from multiple days of a single metric.
+
+Feed it the STATUS.txt files on stdin. It then finds the corresponding
+results.csv, and takes the top N items.
+
+Example:
+
+Date, "google.com,", yahoo.com
+2015-03-01, 0.0, 0.9
+2015-03-02, 0.1, 0.8
+
+Dygraphs can load this CSV file directly.
+
+TODO: Use different dygraph API?
+
+Also we need error bars.
+
+ new Dygraph(document.getElementById("graphdiv2"),
+ [
+ [1,10,100],
+ [2,20,80],
+ [3,50,60],
+ [4,70,80]
+ ],
+ {
+ labels: [ "Date", "failure", "timeout", "google.com" ]
+ });
+"""
+
+import collections
+import csv
+import json
+import os
+import sys
+
+import util
+
+
+def CombineDistResults(stdin, c_out, num_top):
+ dates = []
+ var_cols = collections.defaultdict(dict) # {name: {date: value}}
+
+ seen_dates = set()
+
+ for line in stdin:
+ status_path = line.strip()
+
+ # Assume it looks like .../2015-03-01/STATUS.txt
+ task_dir = os.path.dirname(status_path)
+ date = os.path.basename(task_dir)
+
+ # Get rid of duplicate dates. These could be caused by retries.
+ if date in seen_dates:
+ continue
+
+ seen_dates.add(date)
+
+ with open(status_path) as f:
+ status = f.readline().split()[0] # OK, FAIL, TIMEOUT, SKIPPED
+
+ dates.append(date)
+
+ if status != 'OK':
+ continue # won't have results.csv
+
+ results_path = os.path.join(task_dir, 'results.csv')
+ with open(results_path) as f:
+ c = csv.reader(f)
+ unused_header = c.next() # header row
+
+ # they are sorted by decreasing "estimate", which is what we want
+ for i in xrange(0, num_top):
+ try:
+ row = c.next()
+ except StopIteration:
+ # It's OK if it doesn't have enough
+ util.log('Stopping early. Fewer than %d results to render.', num_top)
+ break
+
+ string, _, _, proportion, _, prop_low, prop_high = row
+
+ # dygraphs has a weird format with semicolons:
+ # value;lower;upper,value;lower;upper.
+
+ # http://dygraphs.com/data.html#csv
+
+ # Arbitrarily use 4 digits after decimal point (for dygraphs, not
+ # directly displayed)
+ dygraph_triple = '%.4f;%.4f;%.4f' % (
+ float(prop_low), float(proportion), float(prop_high))
+
+ var_cols[string][date] = dygraph_triple
+
+ # Now print CSV on stdout.
+ cols = sorted(var_cols.keys()) # sort columns alphabetically
+ c_out.writerow(['date'] + cols)
+
+ dates.sort()
+
+ for date in dates:
+ row = [date]
+ for col in cols:
+      cell = var_cols[col].get(date)  # None means there is no row
+ row.append(cell)
+ c_out.writerow(row)
+
+ #util.log("Number of dynamic cols: %d", len(var_cols))
+
+
+def CombineAssocResults(stdin, c_out, num_top):
+ header = ('dummy',)
+ c_out.writerow(header)
+
+
+def main(argv):
+ action = argv[1]
+
+ if action == 'dist':
+ num_top = int(argv[2]) # number of values to keep
+ c_out = csv.writer(sys.stdout)
+ CombineDistResults(sys.stdin, c_out, num_top)
+
+ elif action == 'assoc':
+ num_top = int(argv[2]) # number of values to keep
+ c_out = csv.writer(sys.stdout)
+ CombineAssocResults(sys.stdin, c_out, num_top)
+
+ else:
+ raise RuntimeError('Invalid action %r' % action)
+
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/pipeline/combine_results_test.py b/pipeline/combine_results_test.py
new file mode 100755
index 0000000..84c4cb7
--- /dev/null
+++ b/pipeline/combine_results_test.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python -S
+"""
+combine_results_test.py: Tests for combine_results.py
+"""
+
+import csv
+import cStringIO
+import unittest
+
+import combine_results # module under test
+
+
+# TODO: Make these tests check more than the header row.  They rely heavily on
+# the file system!
+
+class CombineResultsTest(unittest.TestCase):
+
+ def testCombineDistResults(self):
+ stdin = cStringIO.StringIO('')
+ out = cStringIO.StringIO()
+ c_out = csv.writer(out)
+
+ combine_results.CombineDistResults(stdin, c_out, 10)
+ actual = out.getvalue()
+ self.assert_(actual.startswith('date'), actual)
+
+ def testCombineAssocResults(self):
+ stdin = cStringIO.StringIO('')
+ out = cStringIO.StringIO()
+ c_out = csv.writer(out)
+
+ combine_results.CombineAssocResults(stdin, c_out, 10)
+ actual = out.getvalue()
+ self.assert_(actual.startswith('dummy'), actual)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pipeline/combine_status.py b/pipeline/combine_status.py
new file mode 100755
index 0000000..4fbb36a
--- /dev/null
+++ b/pipeline/combine_status.py
@@ -0,0 +1,298 @@
+#!/usr/bin/python
+"""Summarize the results of many RAPPOR analysis runs.
+
+Takes a list of STATUS.txt files on stdin, and reads the corresponding spec.txt
+and log.txt files. Writes a CSV to stdout. Row key is (metric, date).
+"""
+
+import collections
+import csv
+import json
+import os
+import re
+import sys
+
+
+# Parse bash 'time' output:
+# real 0m11.578s
+
+# TODO: Parse the time from metrics.json instead.
+TIMING_RE = re.compile(
+ r'real \s+ (\d+) m ([\d.]+) s', re.VERBOSE)
+
+# TODO: Could have decode-dist and decode-assoc output the PID?
+PID_RE = re.compile(
+ r'write_pid.py: PID (\d+)') # not VERBOSE, spaces are literal
+
+
+def ParseMemCsv(f):
+ """Compute summary stats for memory.
+
+ vm5_peak_kib -> max(vm_peak_kib) # over 5 second intervals. Since it uses
+  the kernel, it's accurate except for tasks that spike in their last 4
+ seconds.
+
+ vm5_mean_kib -> mean(vm_size_kib) # over 5 second intervals
+ """
+ peak_by_pid = collections.defaultdict(list)
+ size_by_pid = collections.defaultdict(list)
+
+ # Parse columns we care about, by PID
+ c = csv.reader(f)
+ for i, row in enumerate(c):
+ if i == 0:
+ continue # skip header
+ # looks like timestamp, pid, then (rss, peak, size)
+ _, pid, _, peak, size = row
+ if peak != '':
+ peak_by_pid[pid].append(int(peak))
+ if size != '':
+ size_by_pid[pid].append(int(size))
+
+ mem_by_pid = {}
+
+ # Now compute summaries
+ pids = peak_by_pid.keys()
+ for pid in pids:
+ peaks = peak_by_pid[pid]
+ vm5_peak_kib = max(peaks)
+
+ sizes = size_by_pid[pid]
+ vm5_mean_kib = sum(sizes) / len(sizes)
+
+ mem_by_pid[pid] = (vm5_peak_kib, vm5_mean_kib)
+
+ return mem_by_pid
+
+
+def CheckJobId(job_id, parts):
+ """Sanity check for date or smoke test."""
+ if not job_id.startswith('201') and not job_id.startswith('smoke'):
+ raise RuntimeError(
+ "Expected job ID to start with '201' or 'smoke': got %r (%s)" %
+ (job_id, parts))
+
+
+def ReadStatus(f):
+ status_line = f.readline().strip()
+  return status_line.split()[0]  # OK, TIMEOUT, FAIL, SKIPPED
+
+
+def CombineDistTaskStatus(stdin, c_out, mem_by_pid):
+ """Read status task paths from stdin, write CSV summary to c_out'."""
+
+ #util.log('%s', mem_by_pid)
+
+ # Parses:
+ # - input path for metric name and date
+ # - spec.txt for task params
+ # - STATUS.txt for task success/failure
+ # - metrics.json for output metrics
+ # - log.txt for timing, if it ran to completion
+ # - and for structured data
+ # - join with mem by PID
+
+ header = (
+ 'job_id', 'params_file', 'map_file',
+ 'metric', 'date',
+ 'vm5_peak_kib', 'vm5_mean_kib', # set when not skipped
+ 'seconds', 'status',
+ # only set when OK
+ 'num_reports', 'num_rappor', 'allocated_mass',
+ # only set when failed
+ 'fail_reason')
+ c_out.writerow(header)
+
+ for line in stdin:
+ #
+ # Receive a STATUS.txt path on each line of stdin, and parse it.
+ #
+ status_path = line.strip()
+
+ with open(status_path) as f:
+ status = ReadStatus(f)
+
+ # Path should look like this:
+ # ~/rappor/cron/2015-05-20__19-22-01/raw/Settings.NewTabPage/2015-05-19/STATUS.txt
+ parts = status_path.split('/')
+ job_id = parts[-5]
+ CheckJobId(job_id, parts)
+
+ #
+ # Parse the job spec
+ #
+ result_dir = os.path.dirname(status_path)
+ spec_file = os.path.join(result_dir, 'spec.txt')
+ with open(spec_file) as f:
+ spec_line = f.readline()
+      # See decode-dist-one in dist.sh for the order of these 7 fields.
+ (num_reports, metric_name, date, counts_path, params_path,
+ map_path, _) = spec_line.split()
+
+ # NOTE: These are all constant per metric. Could have another CSV and
+ # join. But denormalizing is OK for now.
+ params_file = os.path.basename(params_path)
+ map_file = os.path.basename(map_path)
+
+ # remove extension
+ params_file, _ = os.path.splitext(params_file)
+ map_file, _ = os.path.splitext(map_file)
+
+ #
+ # Read the log
+ #
+ log_file = os.path.join(result_dir, 'log.txt')
+ with open(log_file) as f:
+ lines = f.readlines()
+
+ # Search lines in reverse order for total time. It could have output from
+ # multiple 'time' statements, and we want the last one.
+ seconds = None # for skipped
+ for i in xrange(len(lines) - 1, -1, -1):
+ # TODO: Parse the R timing too. Could use LOG_RECORD_RE.
+ m = TIMING_RE.search(lines[i])
+ if m:
+ min_part, sec_part = m.groups()
+ seconds = float(min_part) * 60 + float(sec_part)
+ break
+
+ # Extract stack trace
+ if status == 'FAIL':
+ # Stack trace looks like: "Calls: main -> RunOne ..."
+ fail_reason = ''.join(line.strip() for line in lines if 'Calls' in line)
+ else:
+ fail_reason = None
+
+ # Extract PID and join with memory results
+ pid = None
+ vm5_peak_kib = None
+ vm5_mean_kib = None
+ if mem_by_pid:
+ for line in lines:
+ m = PID_RE.match(line)
+ if m:
+ pid = m.group(1)
+          # Could the PID not exist if the process was super short, i.e. less
+          # than 5 seconds?
+ try:
+ vm5_peak_kib, vm5_mean_kib = mem_by_pid[pid]
+ except KeyError: # sometimes we don't add mem-track on the front
+ vm5_peak_kib, vm5_mean_kib = None, None
+ break
+ else:
+ pass # we weren't passed memory.csv
+
+ #
+ # Read the metrics
+ #
+ metrics = {}
+ metrics_file = os.path.join(result_dir, 'metrics.json')
+ if os.path.isfile(metrics_file):
+ with open(metrics_file) as f:
+ metrics = json.load(f)
+
+ num_rappor = metrics.get('num_detected')
+ allocated_mass = metrics.get('allocated_mass')
+
+ # Construct and write row
+ row = (
+ job_id, params_file, map_file,
+ metric_name, date,
+ vm5_peak_kib, vm5_mean_kib,
+ seconds, status,
+ num_reports, num_rappor, allocated_mass,
+ fail_reason)
+
+ c_out.writerow(row)
+
+
+def CombineAssocTaskStatus(stdin, c_out):
+ """Read status task paths from stdin, write CSV summary to c_out'."""
+
+ header = (
+ 'job_id', 'metric', 'date', 'status', 'num_reports',
+ 'total_elapsed_seconds', 'em_elapsed_seconds', 'var1', 'var2', 'd1',
+ 'd2')
+
+ c_out.writerow(header)
+
+ for line in stdin:
+ status_path = line.strip()
+
+ with open(status_path) as f:
+ status = ReadStatus(f)
+
+ parts = status_path.split('/')
+ job_id = parts[-6]
+ CheckJobId(job_id, parts)
+
+ #
+ # Parse the job spec
+ #
+ result_dir = os.path.dirname(status_path)
+ spec_file = os.path.join(result_dir, 'assoc-spec.txt')
+ with open(spec_file) as f:
+ spec_line = f.readline()
+      # See decode-one in assoc.sh for the order of these fields.
+      # There are 5 job constants on the front, which we skip.
+ (_, _, _, _, _,
+ dummy_num_reports, metric_name, date, reports, var1, var2, map1,
+ output_dir) = spec_line.split()
+
+ #
+ # Parse decode-assoc metrics
+ #
+ metrics = {}
+ metrics_file = os.path.join(result_dir, 'assoc-metrics.json')
+ if os.path.isfile(metrics_file):
+ with open(metrics_file) as f:
+ metrics = json.load(f)
+
+ # After we run it we have the actual number of reports
+ num_reports = metrics.get('num_reports')
+ total_elapsed_seconds = metrics.get('total_elapsed_time')
+ em_elapsed_seconds = metrics.get('em_elapsed_time')
+ estimate_dimensions = metrics.get('estimate_dimensions')
+ if estimate_dimensions:
+ d1, d2 = estimate_dimensions
+ else:
+ d1, d2 = (0, 0) # unknown
+
+ row = (
+ job_id, metric_name, date, status, num_reports, total_elapsed_seconds,
+ em_elapsed_seconds, var1, var2, d1, d2)
+ c_out.writerow(row)
+
+
+def main(argv):
+ action = argv[1]
+
+ try:
+ mem_csv = argv[2]
+ except IndexError:
+ mem_by_pid = None
+ else:
+ with open(mem_csv) as f:
+ mem_by_pid = ParseMemCsv(f)
+
+ if action == 'dist':
+ c_out = csv.writer(sys.stdout)
+ CombineDistTaskStatus(sys.stdin, c_out, mem_by_pid)
+
+ elif action == 'assoc':
+ c_out = csv.writer(sys.stdout)
+ CombineAssocTaskStatus(sys.stdin, c_out)
+
+ else:
+ raise RuntimeError('Invalid action %r' % action)
+
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/pipeline/combine_status_test.py b/pipeline/combine_status_test.py
new file mode 100755
index 0000000..4606587
--- /dev/null
+++ b/pipeline/combine_status_test.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python -S
+"""
+combine_status_test.py: Tests for combine_status.py
+"""
+
+import csv
+import cStringIO
+import unittest
+
+import combine_status # module under test
+
+
+# TODO: Make these tests check more than the header row.  They rely heavily on
+# the file system!
+
+class CombineStatusTest(unittest.TestCase):
+
+ def testCombineDistTaskStatus(self):
+ stdin = cStringIO.StringIO('')
+ out = cStringIO.StringIO()
+ c_out = csv.writer(out)
+
+ combine_status.CombineDistTaskStatus(stdin, c_out, {})
+ actual = out.getvalue()
+ self.assert_(actual.startswith('job_id,params_file,'), actual)
+
+ def testCombineAssocTaskStatus(self):
+ stdin = cStringIO.StringIO('')
+ out = cStringIO.StringIO()
+ c_out = csv.writer(out)
+
+ combine_status.CombineAssocTaskStatus(stdin, c_out)
+ actual = out.getvalue()
+ self.assert_(actual.startswith('job_id,metric,'), actual)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pipeline/cook.sh b/pipeline/cook.sh
new file mode 100755
index 0000000..e820d44
--- /dev/null
+++ b/pipeline/cook.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+#
+# Take the raw data from the analysis and massage it into various formats
+# suitable for display.
+#
+# Usage:
+# ./cook.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname $0)
+readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
+
+source $RAPPOR_SRC/pipeline/tools-lib.sh
+
+
+status-files() {
+ local dir=$1
+ find $dir -name STATUS.txt
+}
+
+results-files() {
+ local dir=$1
+ find $dir -name results.csv
+}
+
+count-results() {
+ # first field of each line is one of {OK, TIMEOUT, FAIL, SKIPPED}
+ status-files "$@" \
+ | xargs cat \
+ | cut -d ' ' -f 1 \
+ | sort | uniq -c | sort -n -r
+}
+
+#
+# For dist cron job
+#
+
+# Combine status of tasks over multiple jobs. Each row is a task (decode-dist
+# invocation). This has the number of reports.
+combine-dist-task-status() {
+ local base_dir=${1:-~/rappor/cron}
+ local job_dir=${2:-~/rappor/cron/2015-05-22__05-58-01}
+
+ local out=$job_dir/task-status.csv
+
+ # Ignore memory for now.
+ time status-files $base_dir | TOOLS-combine-status dist > $out
+ echo "Wrote $out"
+}
+
+# Create a single dist.csv time series for a GIVEN metric.
+combine-dist-results-one() {
+ local base_dir=$1
+ local job_dir=$2
+ local metric_name=$3
+ #echo FOO $base_dir $metric_name
+
+ local out_dir=$job_dir/cooked/$metric_name
+ mkdir -p $out_dir
+
+ # Glob to capture this specific metric name over ALL job IDs.
+ find $base_dir/*/raw/$metric_name -name STATUS.txt \
+ | TOOLS-combine-results dist 5 \
+ > $out_dir/dist.csv
+}
+
+# Creates a dist.csv file for EACH metric. TODO: Rename one/many
+combine-dist-results() {
+ local base_dir=${1:-~/rappor/cron}
+ local job_dir=${2:-~/rappor/cron/2015-05-22__05-58-01}
+
+ # Direct subdirs of 'raw' are metrics. Just print filename.
+ find $base_dir/*/raw -mindepth 1 -maxdepth 1 -type d -a -printf '%f\n' \
+ | sort | uniq \
+ | xargs --verbose -n1 -- \
+ $0 combine-dist-results-one $base_dir $job_dir
+}
+
+# Take the task-status.csv file, which has row key (metric, date). Writes
+# num_reports.csv and status.csv per metric, and a single overview.csv for all
+# metrics.
+dist-metric-status() {
+ local job_dir=${1:-_tmp/results-10}
+ local out_dir=$job_dir/cooked
+
+ TOOLS-metric-status dist $job_dir/task-status.csv $out_dir
+}
+
+#
+# For association analysis cron job
+#
+
+combine-assoc-task-status() {
+ local base_dir=${1:-~/rappor/chrome-assoc-smoke}
+ local job_dir=${2:-$base_dir/smoke1}
+
+ local out=$job_dir/assoc-task-status.csv
+
+ time find $base_dir -name assoc-status.txt \
+ | TOOLS-combine-status assoc \
+ > $out
+
+ echo "Wrote $out"
+}
+
+# Create a single assoc.csv time series for a GIVEN (var1, var2) pair.
+combine-assoc-results-one() {
+ local base_dir=$1
+ local job_dir=$2
+ local metric_pair_rel_path=$3
+
+ local out_dir=$job_dir/cooked/$metric_pair_rel_path
+ mkdir -p $out_dir
+
+ # Glob to capture this specific metric name over ALL job IDs.
+ find $base_dir/*/raw/$metric_pair_rel_path -name assoc-status.txt \
+ | TOOLS-combine-results assoc 5 \
+ > $out_dir/assoc-results-series.csv
+}
+
+# Creates an assoc-results-series.csv file for EACH metric pair. TODO: Rename one/many
+combine-assoc-results() {
+ local base_dir=${1:-~/rappor/chrome-assoc-smoke}
+ local job_dir=${2:-$base_dir/smoke3}
+
+ # Direct subdirs of 'raw' are metrics, and subdirs of that are variable
+ # pairs. Print "$metric_name/$pair_name".
+ find $base_dir/*/raw -mindepth 2 -maxdepth 2 -type d -a -printf '%P\n' \
+ | sort | uniq \
+ | xargs --verbose -n1 -- \
+ $0 combine-assoc-results-one $base_dir $job_dir
+}
+
+# Take the assoc-task-status.csv file, which has row key (metric, date). Writes
+# num_reports.csv and status.csv per metric, and a single overview.csv for all
+# metrics.
+assoc-metric-status() {
+ local job_dir=${1:-~/rappor/chrome-assoc-smoke/smoke3}
+ local out_dir=$job_dir/cooked
+
+ TOOLS-metric-status assoc $job_dir/assoc-task-status.csv $out_dir
+}
+
+"$@"
diff --git a/pipeline/csv-to-html-test.sh b/pipeline/csv-to-html-test.sh
new file mode 100755
index 0000000..754d083
--- /dev/null
+++ b/pipeline/csv-to-html-test.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# Test for csv_to_html.py.
+#
+# Usage:
+# ./csv-to-html-test.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+test-basic() {
+ ./csv_to_html.py <<EOF
+a_number,b
+1,2
+3,4
+NA,4
+EOF
+}
+
+test-col-format() {
+ ./csv_to_html.py \
+ --col-format 'b <a href="../{b}/metric.html">{b}</a>' <<EOF
+a,b
+1,2015-05-01
+3,2015-05-02
+EOF
+}
+
+test-var-def() {
+ ./csv_to_html.py \
+ --def 'v VALUE' \
+ --col-format 'b <a href="../{b}/metric.html">{v}</a>' <<EOF
+a,b
+1,2
+3,4
+EOF
+}
+
+test-as-percent() {
+ ./csv_to_html.py \
+ --as-percent b <<EOF
+a,b
+A,0.21
+B,0.001
+C,0.0009
+D,0.0001
+EOF
+}
+
+if test $# -eq 0; then
+ test-basic
+ echo '--'
+ test-col-format
+ echo '--'
+ test-var-def
+ echo '--'
+ test-as-percent
+ echo '--'
+ echo 'OK'
+else
+ "$@"
+fi
diff --git a/pipeline/csv_to_html.py b/pipeline/csv_to_html.py
new file mode 100755
index 0000000..e4d76ae
--- /dev/null
+++ b/pipeline/csv_to_html.py
@@ -0,0 +1,218 @@
+#!/usr/bin/python
+"""Reads a CSV file on stdin, and prints an an HTML table on stdout.
+
+The static HTML can then be made dynamic with JavaScript, e.g. jQuery
+DataTable.
+
+Use Cases:
+
+ - overview.csv -- each row is a metric
+ - links: to metric page
+
+ - status.csv -- each row is a day
+ - links: to log.txt, to results.html
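+
+Example (file names are hypothetical):
+
+    ./csv_to_html.py --table < overview.csv > overview.part.html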
+"""
+
+import cgi
+import csv
+import optparse
+import sys
+
+import util
+
+
+def CreateOptionsParser():
+ p = optparse.OptionParser()
+
+  # NOTE: The CSV itself is read on stdin and buffered in memory (see ReadCsv).
+ p.add_option(
+ '--col-format', dest='col_formats', metavar="'COLNAME FMT'", type='str',
+ default=[], action='append',
+ help='Add HTML links to the named column, using the given Python '
+ '.format() string')
+
+ p.add_option(
+ '--def', dest='defs', metavar="'NAME VALUE'", type='str',
+ default=[], action='append',
+      help='Define variables for use in format strings')
+
+ p.add_option(
+ '--as-percent', dest='percent_cols', metavar="COLNAME", type='str',
+ default=[], action='append',
+ help='Format this floating point column as a percentage string')
+
+ # TODO: We could include this by default, and then change all the HTML to
+ # have <div> placeholders instead of <table>.
+ p.add_option(
+ '--table', dest='table', default=False, action='store_true',
+ help='Add <table></table> tags (useful for testing)')
+
+ return p
+
+
+def ParseSpec(arg_list):
+ """Given an argument list, return a string -> string dictionary."""
+ # The format string is passed the cell value. Escaped as HTML?
+ d = {}
+ for s in arg_list:
+ try:
+ name, value = s.split(' ', 1)
+ except ValueError:
+ raise RuntimeError('Invalid column format %r' % s)
+ d[name] = value
+ return d
+
+
+def PrintRow(row, col_names, col_formats, defs, percent_cols):
+ """Print a CSV row as HTML, using the given formatting.
+
+ Returns:
+ An array of booleans indicating whether each cell is a number.
+ """
+ is_number_flags = [False] * len(col_names)
+
+ for i, cell in enumerate(row):
+ # The cell as a string. By default we leave it as is; it may be mutated
+ # below.
+ cell_str = cell
+ css_class = '' # CSS class for the cell.
+ col_name = col_names[i] # column that the cell is under
+
+ # Does the cell look like a float?
+ try:
+ cell_float = float(cell)
+ if col_name in percent_cols: # Floats can be formatted as percentages.
+ cell_str = '{:.1f}%'.format(cell_float * 100)
+ else:
+ # Arbitrarily use 3 digits of precision for display
+ cell_str = '{:.3f}'.format(cell_float)
+ css_class = 'num'
+ is_number_flags[i] = True
+ except ValueError:
+ pass
+
+    # Does it look like an int?
+ try:
+ cell_int = int(cell)
+ cell_str = '{:,}'.format(cell_int)
+ css_class = 'num'
+ is_number_flags[i] = True
+ except ValueError:
+ pass
+
+ # Special CSS class for R NA values.
+ if cell_str.strip() == 'NA':
+ css_class = 'num na' # num should right justify; na should make it red
+ is_number_flags[i] = True
+
+ if css_class:
+ print ' <td class="{}">'.format(css_class),
+ else:
+ print ' <td>',
+
+ cell_safe = cgi.escape(cell_str)
+
+ # If the cell has a format string, print it this way.
+
+ fmt = col_formats.get(col_name) # e.g. "../{date}.html"
+ if fmt:
+ # Copy variable bindings
+ bindings = dict(defs)
+
+ # Also let the format string use other column names. TODO: Is there a
+ # more efficient way?
+ bindings.update(zip(col_names, [cgi.escape(c) for c in row]))
+
+ bindings[col_name] = cell_safe
+
+ print fmt.format(**bindings), # no newline
+ else:
+ print cell_safe, # no newline
+
+ print '</td>'
+
+ return is_number_flags
+
+
+def ReadCsv(f):
+ """Read the CSV file, returning the column names and rows."""
+ c = csv.reader(f)
+
+ # The first row of the CSV is assumed to be a header. The rest are data.
+ col_names = []
+ rows = []
+ for i, row in enumerate(c):
+ if i == 0:
+ col_names = row
+ continue
+ rows.append(row)
+ return col_names, rows
+
+
+def PrintColGroup(col_names, col_is_numeric):
+ """Print HTML colgroup element, used for JavaScript sorting."""
+ print '<colgroup>'
+ for i, col in enumerate(col_names):
+ # CSS class is used for sorting
+ if col_is_numeric[i]:
+ css_class = 'number'
+ else:
+ css_class = 'case-insensitive'
+
+ # NOTE: id is a comment only; not used
+ print ' <col id="{}" type="{}" />'.format(col, css_class)
+ print '</colgroup>'
+
+
+def main(argv):
+ (opts, argv) = CreateOptionsParser().parse_args(argv)
+
+ col_formats = ParseSpec(opts.col_formats)
+ defs = ParseSpec(opts.defs)
+
+ col_names, rows = ReadCsv(sys.stdin)
+
+ for col in opts.percent_cols:
+ if col not in col_names:
+      raise RuntimeError('--as-percent %s is not a valid column' % col)
+
+ # By default, we don't print the <table> bit -- that's up to the host page
+ if opts.table:
+ print '<table>'
+
+ print '<thead>'
+ for col in col_names:
+ # change _ to space so long column names can wrap
+ print ' <td>%s</td>' % cgi.escape(col.replace('_', ' '))
+ print '</thead>'
+
+ # Assume all columns are numeric at first. Look at each row for non-numeric
+ # values.
+ col_is_numeric = [True] * len(col_names)
+
+ print '<tbody>'
+ for row in rows:
+ print ' <tr>'
+ is_number_flags = PrintRow(row, col_names, col_formats, defs,
+ opts.percent_cols)
+
+    # If one cell in a column is not a number, then the whole column isn't.
+ for (i, is_number) in enumerate(is_number_flags):
+ if not is_number:
+ col_is_numeric[i] = False
+
+ print ' </tr>'
+ print '</tbody>'
+
+ PrintColGroup(col_names, col_is_numeric)
+
+ if opts.table:
+ print '</table>'
+
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/pipeline/csv_to_html_test.py b/pipeline/csv_to_html_test.py
new file mode 100755
index 0000000..5fd5822
--- /dev/null
+++ b/pipeline/csv_to_html_test.py
@@ -0,0 +1,24 @@
+#!/usr/bin/python -S
+"""
+csv_to_html_test.py: Tests for csv_to_html.py
+"""
+
+import unittest
+
+import csv_to_html # module under test
+
+
+class CsvToHtmlTest(unittest.TestCase):
+
+ def testParseSpec(self):
+ self.assertEqual(
+ {'foo': 'bar', 'spam': 'eggs'},
+ csv_to_html.ParseSpec(['foo bar', 'spam eggs']))
+
+ self.assertEqual(
+ {},
+ csv_to_html.ParseSpec([]))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pipeline/dist.sh b/pipeline/dist.sh
new file mode 100755
index 0000000..ad33006
--- /dev/null
+++ b/pipeline/dist.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+#
+# Usage:
+# ./dist.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname $0)
+readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
+
+source $RAPPOR_SRC/util.sh # log, banner
+source $RAPPOR_SRC/pipeline/tools-lib.sh
+source $RAPPOR_SRC/pipeline/alarm-lib.sh
+
+readonly DECODE_DIST=${DEP_DECODE_DIST:-$RAPPOR_SRC/bin/decode-dist}
+
+readonly NUM_ARGS=7 # used for xargs
+
+decode-dist-one() {
+ # Job constants
+ local rappor_src=$1
+ local timeout_secs=$2
+ local min_reports=$3
+ shift 3 # job constants do not vary per task and are not part of the spec
+
+ # 7 spec variables
+ local num_reports=$1 # unused, only for filtering
+ local metric_name=$2
+ local date=$3
+ local counts=$4
+ local params=$5
+ local map=$6
+ local results_dir=$7
+
+ local task_dir=$results_dir/$metric_name/$date
+ mkdir --verbose -p $task_dir
+
+ local log_file=$task_dir/log.txt
+ local status_file=$task_dir/STATUS.txt
+
+ # Record the spec so we know params, counts, etc.
+ echo "$@" > $task_dir/spec.txt
+
+ if test $num_reports -lt $min_reports; then
+ local msg="SKIPPED because $num_reports reports is less than $min_reports"
+ # Duplicate this message
+ echo "$msg" > $status_file
+ echo "$msg" > $log_file
+ return
+ fi
+
+ # Run it with a timeout, and record status in the task dir.
+ { time \
+ alarm-status $status_file $timeout_secs \
+ $DECODE_DIST \
+ --counts $counts \
+ --params $params \
+ --map $map \
+ --output-dir $task_dir \
+ --adjust-counts-hack
+ } >$log_file 2>&1
+
+ # TODO: Don't pass --adjust-counts-hack unless the user asks for it.
+}
+
+# Print the number of processes to use.
+# NOTE: This is copied from google/rappor regtest.sh.
+# It also doesn't take into account the fact that we are memory-bound.
+#
+# 128 GiB / 4GiB would also imply about 32 processes though.
+num-processes() {
+ local processors=$(grep -c ^processor /proc/cpuinfo || echo 4)
+ if test $processors -gt 1; then # leave one CPU for the OS
+ processors=$(expr $processors - 1)
+ fi
+ echo $processors
+}
+
+#readonly DEFAULT_MAX_PROCS=6 # for andychu2.hot, to avoid locking up UI
+#readonly DEFAULT_MAX_PROCS=16 # for rappor-ac.hot, to avoid thrashing
+readonly DEFAULT_MAX_PROCS=$(num-processes)
+
+#readonly DEFAULT_MAX_TASKS=12
+readonly DEFAULT_MAX_TASKS=10000 # more than the max
+
+# NOTE: Since we have 125 GB RAM, and processes can take up to 12 gigs of RAM,
+# only use parallelism of 10, even though we have 31 cores.
+
+readonly DEFAULT_MIN_REPORTS=5000
+
+
+decode-dist-many() {
+ local job_dir=$1
+ local spec_list=$2
+ local timeout_secs=${3:-1200} # default timeout
+ local max_procs=${4:-$DEFAULT_MAX_PROCS}
+ local rappor_src=${5:-$RAPPOR_SRC}
+ local min_reports=${6:-$DEFAULT_MIN_REPORTS}
+
+ local interval_secs=5
+ local pid_dir="$job_dir/pids"
+ local sys_mem="$job_dir/system-mem.csv"
+ mkdir --verbose -p $pid_dir
+
+ time cat $spec_list \
+ | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \
+ $0 decode-dist-one $rappor_src $timeout_secs $min_reports
+}
+
+# Combine/summarize results and task metadata from the parallel decode-dist
+# processes. Render them as HTML.
+combine-and-render-html() {
+ local jobs_base_dir=$1
+ local job_dir=$2
+
+ banner "Combining dist task status"
+ TOOLS-cook combine-dist-task-status $jobs_base_dir $job_dir
+
+ banner "Combining dist results"
+ TOOLS-cook combine-dist-results $jobs_base_dir $job_dir
+
+ banner "Splitting out status per metric, and writing overview"
+ TOOLS-cook dist-metric-status $job_dir
+
+  # The task-status.csv file should have a JOB ID.
+ banner "Building overview.html and per-metric HTML"
+ TOOLS-gen-ui build-html1 $job_dir
+
+ banner "Building individual results.html (for ONE day)"
+ TOOLS-gen-ui results-html $job_dir
+}
+
+"$@"
diff --git a/pipeline/metric_status.R b/pipeline/metric_status.R
new file mode 100755
index 0000000..0774423
--- /dev/null
+++ b/pipeline/metric_status.R
@@ -0,0 +1,343 @@
+#!/usr/bin/Rscript
+#
+# Write an overview of task status, per-metric task status, task histograms.
+
+library(data.table)
+library(ggplot2)
+
+options(stringsAsFactors = FALSE) # get rid of annoying behavior
+
+Log <- function(fmt, ...) {
+ cat(sprintf(fmt, ...))
+ cat('\n')
+}
+
+# max of non-NA values; NA if there are none
+MaybeMax <- function(values) {
+ v <- values[!is.na(values)]
+ if (length(v) == 0) {
+ m <- NA
+ } else {
+ m <- max(v)
+ }
+ as.numeric(m) # data.table requires this; otherwise we get type errors
+}
+
+# mean of non-NA values; NA if there are none
+MaybeMean <- function(values) {
+ v <- values[!is.na(values)]
+ if (length(v) == 0) {
+ m <- NA
+ } else {
+ m <- mean(v)
+ }
+  as.numeric(m)  # data.table requires this; otherwise we get type errors
+}
+
+WriteDistOverview <- function(summary, output_dir) {
+ s <- data.table(summary) # data.table syntax is easier here
+
+ by_metric <- s[ , list(
+ params_file = unique(params_file),
+ map_file = unique(map_file),
+ days = length(date),
+ max_num_reports = MaybeMax(num_reports),
+
+ # summarize status
+ ok = sum(status == 'OK'),
+ fail = sum(status == 'FAIL'),
+ timeout = sum(status == 'TIMEOUT'),
+ skipped = sum(status == 'SKIPPED'),
+
+ # TODO: Need to document the meaning of these metrics.
+ # All could be NA
+ # KiB -> MB
+ #max_vm5_peak_mb = MaybeMax(vm5_peak_kib * 1024 / 1e6),
+ #mean_vm5_mean_mb = MaybeMean(vm5_mean_kib * 1024 / 1e6),
+
+ mean_secs = MaybeMean(seconds),
+ mean_allocated_mass = MaybeMean(allocated_mass)
+
+ # unique failure reasons
+ # This can be used when there are different call stacks.
+ #fail_reasons = length(unique(fail_reason[fail_reason != ""]))
+ ), by=metric]
+
+ # Case insensitive sort by metric name
+ by_metric <- by_metric[order(tolower(by_metric$metric)), ]
+
+ overview_path <- file.path(output_dir, 'overview.csv')
+ write.csv(by_metric, file = overview_path, row.names = FALSE)
+ Log("Wrote %s", overview_path)
+
+ by_metric
+}
+
+WriteDistMetricStatus <- function(summary, output_dir) {
+ # Write status.csv, num_reports.csv, and mass.csv for each metric.
+
+ s <- data.table(summary)
+
+ # loop over unique metrics, and write a CSV for each one
+ for (m in unique(s$metric)) {
+ # Select cols, and convert units. Don't need params / map / metric.
+ subframe <- s[s$metric == m,
+ list(job_id, date, status,
+ #vm5_peak_mb = vm5_peak_kib * 1024 / 1e6,
+ #vm5_mean_mb = vm5_mean_kib * 1024 / 1e6,
+ num_reports,
+ seconds,
+ allocated_mass, num_rappor)]
+
+ # Sort by descending date. Alphabetical sort works fine for YYYY-MM-DD.
+ subframe <- subframe[order(subframe$date, decreasing = TRUE), ]
+
+ out_path = file.path(output_dir, m, 'status.csv')
+ write.csv(subframe, file = out_path, row.names = FALSE)
+ Log("Wrote %s", out_path)
+ }
+
+ # This one is just for plotting with dygraphs. TODO: can dygraphs do
+ # something smarter? Maybe you need to select the column in JavaScript, and
+ # pass it an array, rather than CSV text.
+ for (m in unique(s$metric)) {
+ f1 <- s[s$metric == m, list(date, num_reports)]
+ path1 <- file.path(output_dir, m, 'num_reports.csv')
+ # NOTE: dygraphs (only in Firefox?) doesn't like the quotes around
+ # "2015-04-03". In general, we can't turn off quotes, because strings with
+ # double quotes will be invalid CSV files. But in this case, we only have
+ # date and number columns, so we can. dygraphs is mistaken here.
+ write.csv(f1, file = path1, row.names = FALSE, quote = FALSE)
+ Log("Wrote %s", path1)
+
+ # Write unallocated mass. TODO: Write the other 2 vars too?
+ f2 <- s[s$metric == m,
+ list(date,
+ unallocated_mass = 1.0 - allocated_mass)]
+
+ path2 <- file.path(output_dir, m, 'mass.csv')
+ write.csv(f2, file = path2, row.names = FALSE, quote = FALSE)
+ Log("Wrote %s", path2)
+ }
+}
+
+WritePlot <- function(p, outdir, filename, width = 800, height = 600) {
+ filename <- file.path(outdir, filename)
+ png(filename, width = width, height = height)
+ plot(p)
+ dev.off()
+ Log('Wrote %s', filename)
+}
+
+# Make sure the histogram has some valid input. If we don't do this, ggplot
+# blows up with an unintuitive error message.
+CheckHistogramInput <- function(v) {
+ if (all(is.na(v))) {
+ arg_name <- deparse(substitute(v)) # R idiom to get name
+ Log('FATAL: All values in %s are NA (no successful runs?)', arg_name)
+ quit(status = 1)
+ }
+}
+
+WriteDistHistograms <- function(s, output_dir) {
+ CheckHistogramInput(s$allocated_mass)
+
+ p <- qplot(s$allocated_mass, geom = "histogram")
+ t <- ggtitle("Allocated Mass by Task")
+ x <- xlab("allocated mass")
+ y <- ylab("number of tasks")
+ WritePlot(p + t + x + y, output_dir, 'allocated_mass.png')
+
+ CheckHistogramInput(s$num_rappor)
+
+ p <- qplot(s$num_rappor, geom = "histogram")
+ t <- ggtitle("Detected Strings by Task")
+ x <- xlab("detected strings")
+ y <- ylab("number of tasks")
+ WritePlot(p + t + x + y, output_dir, 'num_rappor.png')
+
+ CheckHistogramInput(s$num_reports)
+
+ p <- qplot(s$num_reports / 1e6, geom = "histogram")
+ t <- ggtitle("Raw Reports by Task")
+ x <- xlab("millions of reports")
+ y <- ylab("number of tasks")
+ WritePlot(p + t + x + y, output_dir, 'num_reports.png')
+
+ CheckHistogramInput(s$seconds)
+
+ p <- qplot(s$seconds, geom = "histogram")
+ t <- ggtitle("Analysis Duration by Task")
+ x <- xlab("seconds")
+ y <- ylab("number of tasks")
+ WritePlot(p + t + x + y, output_dir, 'seconds.png')
+
+ # NOTE: Skipping this for 'series' jobs.
+ if (sum(!is.na(s$vm5_peak_kib)) > 0) {
+ p <- qplot(s$vm5_peak_kib * 1024 / 1e6, geom = "histogram")
+ t <- ggtitle("Peak Memory Usage by Task")
+ x <- xlab("Peak megabytes (1e6 bytes) of memory")
+ y <- ylab("number of tasks")
+ WritePlot(p + t + x + y, output_dir, 'memory.png')
+ }
+}
+
+ProcessAllDist <- function(s, output_dir) {
+ Log('dist: Writing per-metric status.csv')
+ WriteDistMetricStatus(s, output_dir)
+
+ Log('dist: Writing histograms')
+ WriteDistHistograms(s, output_dir)
+
+ Log('dist: Writing aggregated overview.csv')
+ WriteDistOverview(s, output_dir)
+}
+
+# Write the single CSV file loaded by assoc-overview.html.
+WriteAssocOverview <- function(summary, output_dir) {
+ s <- data.table(summary) # data.table syntax is easier here
+
+ by_metric <- s[ , list(
+ #params_file = unique(params_file),
+ #map_file = unique(map_file),
+
+ days = length(date),
+ max_num_reports = MaybeMax(num_reports),
+
+ # summarize status
+ ok = sum(status == 'OK'),
+ fail = sum(status == 'FAIL'),
+ timeout = sum(status == 'TIMEOUT'),
+ skipped = sum(status == 'SKIPPED'),
+
+ mean_total_secs = MaybeMean(total_elapsed_seconds),
+ mean_em_secs = MaybeMean(em_elapsed_seconds)
+
+ ), by=list(metric)]
+
+ # Case insensitive sort by metric name
+ by_metric <- by_metric[order(tolower(by_metric$metric)), ]
+
+ overview_path <- file.path(output_dir, 'assoc-overview.csv')
+ write.csv(by_metric, file = overview_path, row.names = FALSE)
+ Log("Wrote %s", overview_path)
+
+ by_metric
+}
+
+# Write the CSV files loaded by assoc-metric.html -- that is, one
+# metric-status.csv for each metric name.
+WriteAssocMetricStatus <- function(summary, output_dir) {
+ s <- data.table(summary)
+ csv_list <- unique(s[, list(metric)])
+ for (i in 1:nrow(csv_list)) {
+ u <- csv_list[i, ]
+ # Select cols, and convert units. Don't need params / map / metric.
+ by_pair <- s[s$metric == u$metric,
+ list(days = length(date),
+ max_num_reports = MaybeMax(num_reports),
+
+ # summarize status
+ ok = sum(status == 'OK'),
+ fail = sum(status == 'FAIL'),
+ timeout = sum(status == 'TIMEOUT'),
+ skipped = sum(status == 'SKIPPED'),
+
+ mean_total_secs = MaybeMean(total_elapsed_seconds),
+ mean_em_secs = MaybeMean(em_elapsed_seconds)
+ ),
+ by=list(var1, var2)]
+
+ # Case insensitive sort by var1 name
+ by_pair <- by_pair[order(tolower(by_pair$var1)), ]
+
+ csv_path <- file.path(output_dir, u$metric, 'metric-status.csv')
+ write.csv(by_pair, file = csv_path, row.names = FALSE)
+ Log("Wrote %s", csv_path)
+ }
+}
+
+# This naming convention is in task_spec.py AssocTaskSpec.
+FormatAssocRelPath <- function(metric, var1, var2) {
+ v2 <- gsub('..', '_', var2, fixed = TRUE)
+ var_dir <- sprintf('%s_X_%s', var1, v2)
+ file.path(metric, var_dir)
+}
+
+# Write the CSV files loaded by assoc-pair.html -- that is, one pair-status.csv
+# for each (metric, var1, var2) pair.
+WriteAssocPairStatus <- function(summary, output_dir) {
+
+ s <- data.table(summary)
+
+ csv_list <- unique(s[, list(metric, var1, var2)])
+ Log('CSV list:')
+ print(csv_list)
+
+ # loop over unique metrics, and write a CSV for each one
+ for (i in 1:nrow(csv_list)) {
+ u <- csv_list[i, ]
+
+ # Select cols, and convert units. Don't need params / map / metric.
+ subframe <- s[s$metric == u$metric & s$var1 == u$var1 & s$var2 == u$var2,
+ list(job_id, date, status,
+ num_reports, d1, d2,
+ total_elapsed_seconds,
+ em_elapsed_seconds)]
+
+ # Sort by descending date. Alphabetical sort works fine for YYYY-MM-DD.
+ subframe <- subframe[order(subframe$date, decreasing = TRUE), ]
+
+ pair_rel_path <- FormatAssocRelPath(u$metric, u$var1, u$var2)
+
+ csv_path <- file.path(output_dir, pair_rel_path, 'pair-status.csv')
+ write.csv(subframe, file = csv_path, row.names = FALSE)
+ Log("Wrote %s", csv_path)
+
+ # Write a file with the raw variable names. Parsed by ui.sh, to pass to
+ # csv_to_html.py.
+ meta_path <- file.path(output_dir, pair_rel_path, 'pair-metadata.txt')
+
+ # NOTE: The conversion from data.table to character vector requires
+ # stringsAsFactors to work correctly!
+ lines <- as.character(u)
+ writeLines(lines, con = meta_path)
+ Log("Wrote %s", meta_path)
+ }
+}
+
+ProcessAllAssoc <- function(s, output_dir) {
+ Log('assoc: Writing pair-status.csv for each variable pair in each metric')
+ WriteAssocPairStatus(s, output_dir)
+
+ Log('assoc: Writing metric-status.csv for each metric')
+ WriteAssocMetricStatus(s, output_dir)
+
+ Log('assoc: Writing aggregated overview.csv')
+ WriteAssocOverview(s, output_dir)
+}
+
+main <- function(argv) {
+ # increase ggplot font size globally
+ theme_set(theme_grey(base_size = 16))
+
+ action = argv[[1]]
+ input = argv[[2]]
+ output_dir = argv[[3]]
+
+ if (action == 'dist') {
+ summary = read.csv(input)
+ ProcessAllDist(summary, output_dir)
+ } else if (action == 'assoc') {
+ summary = read.csv(input)
+ ProcessAllAssoc(summary, output_dir)
+ } else {
+ stop(sprintf('Invalid action %s', action))
+ }
+
+ Log('Done')
+}
+
+if (length(sys.frames()) == 0) {
+ main(commandArgs(TRUE))
+}
diff --git a/pipeline/regtest.sh b/pipeline/regtest.sh
new file mode 100755
index 0000000..a29a0f0
--- /dev/null
+++ b/pipeline/regtest.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+#
+# End-to-end tests for the dashboard.
+#
+# Usage:
+# ./regtest.sh <function name>
+#
+# NOTE: Must be run in this directory (rappor/pipeline).
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+# Create schema and params.
+create-metadata() {
+ mkdir -p _tmp/metadata
+ echo 'Hello from regtest.sh'
+
+ local params_path=_tmp/metadata/regtest_params.csv
+
+ # Relying on $RAPPOR_SRC/regtest.sh
+ cp --verbose ../_tmp/python/demo1/case_params.csv $params_path
+
+ # For now, use the same map everywhere.
+ cat >_tmp/metadata/dist-analysis.csv <<EOF
+var,map_filename
+unif,map.csv
+gauss,map.csv
+exp,map.csv
+m.domain,domain_map.csv
+EOF
+
+ # Both single dimensional and multi dimensional metrics.
+ cat >_tmp/metadata/rappor-vars.csv <<EOF
+metric,var,var_type,params
+m,domain,string,m_params
+m,flag..HTTPS,boolean,m_params
+unif,,string,regtest_params
+gauss,,string,regtest_params
+exp,,string,regtest_params
+EOF
+}
+
+# Create map files.
+create-maps() {
+ mkdir -p _tmp/maps
+  # For now, use the same map for every metric.
+ local map_path=_tmp/maps/map.csv
+
+  # Relies on output from $RAPPOR_SRC/regtest.sh (run it first).
+ cp --verbose ../_tmp/python/demo1/case_map.csv $map_path
+}
+
+# Simulate different metrics.
+create-counts() {
+ mkdir -p _tmp/counts
+
+ for date in 2015-12-01 2015-12-02 2015-12-03; do
+ mkdir -p _tmp/counts/$date
+
+ # TODO: Change params for each day.
+ cp --verbose \
+ ../_tmp/python/demo1/1/case_counts.csv _tmp/counts/$date/unif_counts.csv
+ cp --verbose \
+ ../_tmp/python/demo2/1/case_counts.csv _tmp/counts/$date/gauss_counts.csv
+ cp --verbose \
+ ../_tmp/python/demo3/1/case_counts.csv _tmp/counts/$date/exp_counts.csv
+ done
+}
+
+dist-task-spec() {
+ local job_dir=$1
+ ./task_spec.py dist \
+ --map-dir _tmp/maps \
+ --config-dir _tmp/metadata \
+ --output-base-dir $job_dir/raw \
+ --bad-report-out _tmp/bad_counts.csv \
+ "$@"
+}
+
+dist-job() {
+ local job_id=$1
+ local pat=$2
+
+ local job_dir=_tmp/$job_id
+ mkdir -p $job_dir/raw
+
+ local spec_list=$job_dir/spec-list.txt
+
+ find _tmp/counts/$pat -name \*_counts.csv \
+ | dist-task-spec $job_dir \
+ | tee $spec_list
+
+ ./dist.sh decode-dist-many $job_dir $spec_list
+ ./dist.sh combine-and-render-html _tmp $job_dir
+}
+
+dist() {
+ create-metadata
+ create-maps
+ create-counts
+
+ dist-job smoke1 '2015-12-01' # one day
+ dist-job smoke2 '2015-12-0[23]' # two days
+}
+
+# Simulate different metrics.
+create-reports() {
+ mkdir -p _tmp/reports
+
+ for date in 2015-12-01 2015-12-02 2015-12-03; do
+ mkdir -p _tmp/reports/$date
+
+ # TODO: Change params for each day.
+ cp --verbose \
+ ../bin/_tmp/reports.csv _tmp/reports/$date/m_reports.csv
+ done
+}
+
+assoc-task-spec() {
+ local job_dir=$1
+ ./task_spec.py assoc \
+ --map-dir _tmp/maps \
+ --config-dir _tmp/metadata \
+ --output-base-dir $job_dir/raw \
+ "$@"
+}
+
+assoc-job() {
+ local job_id=$1
+ local pat=$2
+
+ local job_dir=_tmp/$job_id
+ mkdir -p $job_dir/raw $job_dir/config
+
+ local spec_list=$job_dir/spec-list.txt
+
+ find _tmp/reports/$pat -name \*_reports.csv \
+ | assoc-task-spec $job_dir \
+ | tee $spec_list
+
+ # decode-many calls decode_assoc.R, which expects this schema in the 'config'
+ # dir now. TODO: adjust this.
+ cp --verbose _tmp/metadata/rappor-vars.csv $job_dir/config
+ cp --verbose ../bin/_tmp/m_params.csv $job_dir/config
+
+ ./assoc.sh decode-many $job_dir $spec_list
+ ./assoc.sh combine-and-render-html _tmp $job_dir
+}
+
+# Copy some from bin/test.sh? The input _reports.csv files should be taken
+# from there.
+assoc() {
+ create-reports
+ cp --verbose ../bin/_tmp/domain_map.csv _tmp/maps
+
+ assoc-job smoke1-assoc '2015-12-01' # one day
+ assoc-job smoke2-assoc '2015-12-0[23]' # two days
+}
+
+"$@"
diff --git a/pipeline/task_spec.py b/pipeline/task_spec.py
new file mode 100755
index 0000000..c46bb35
--- /dev/null
+++ b/pipeline/task_spec.py
@@ -0,0 +1,364 @@
+#!/usr/bin/python
+"""Read a list of 'counts' paths on stdin, and write a task spec on stdout.
+
+Each line represents a task, or R process invocation. The params on each line
+are passed to ./dist.sh decode-many or ./assoc.sh decode-many.
+"""
+
+import collections
+import csv
+import errno
+import optparse
+import os
+import pprint
+import re
+import sys
+
+import util
+
+
+def _ReadDistMaps(f):
+ dist_maps = {}
+ c = csv.reader(f)
+ for i, row in enumerate(c):
+ if i == 0:
+ expected = ['var', 'map_filename']
+ if row != expected:
+        raise RuntimeError('Expected CSV header %s, got %s' % (expected, row))
+ continue # skip header
+
+ var_name, map_filename = row
+ dist_maps[var_name] = map_filename
+ return dist_maps
+
+
+class DistMapLookup(object):
+ """Create a dictionary of var -> map to analyze against.
+
+ TODO: Support a LIST of maps. Users should be able to specify more than one.
+ """
+ def __init__(self, f, map_dir):
+ self.dist_maps = _ReadDistMaps(f)
+ self.map_dir = map_dir
+
+ def GetMapPath(self, var_name):
+ filename = self.dist_maps[var_name]
+ return os.path.join(self.map_dir, filename)
+
+
+def CreateFieldIdLookup(f):
+  """Create a dictionary mapping each field ID to its full field name.
+
+  Args:
+    f: file object for the field-IDs CSV (metadata output by update_rappor.par)
+
+  Returns:
+    A dictionary from field ID -> full field name
+
+  NOTE: Right now we're only doing single variable analysis for strings, so we
+  don't have the "type".
+  """
+ field_id_lookup = {}
+ c = csv.reader(f)
+ for i, row in enumerate(c):
+ if i == 0:
+ expected = ['metric', 'field', 'field_type', 'params', 'field_id']
+ if row != expected:
+        raise RuntimeError('Expected CSV header %s, got %s' % (expected, row))
+ continue
+
+ metric, field, field_type, _, field_id = row
+
+ if field_type != 'string':
+ continue
+
+ # Paper over the difference between plain metrics (single variable) and
+ # metrics with fields (multiple variables, for association analysis).
+ if field:
+ full_field_name = '%s.%s' % (metric, field)
+ else:
+ full_field_name = metric
+
+ field_id_lookup[field_id] = full_field_name
+ return field_id_lookup
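+# For example (hypothetical ID), a row 'Settings,HomePage,string,params,0c7a1'
+# maps field ID '0c7a1' to the full field name 'Settings.HomePage'.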
+
+
+def _ReadVarSchema(f):
+  """Given the rappor-vars.csv file, return (metric -> vars, var -> params)."""
+ # metric -> list of (variable name, type)
+ assoc_metrics = collections.defaultdict(list)
+ params_lookup = {}
+
+ c = csv.reader(f)
+ for i, row in enumerate(c):
+ if i == 0:
+ expected = ['metric', 'var', 'var_type', 'params']
+ if row != expected:
+ raise RuntimeError('Expected CSV header %s, got %s' % (expected, row))
+ continue
+
+ metric, var, var_type, params = row
+ if var == '':
+ full_var_name = metric
+ else:
+ full_var_name = '%s.%s' % (metric, var)
+ # Also group multi-dimensional reports
+ assoc_metrics[metric].append((var, var_type))
+
+ params_lookup[full_var_name] = params
+
+ return assoc_metrics, params_lookup
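+# For example, the rappor-vars.csv written by pipeline/regtest.sh contains
+# rows like:
+#
+#   metric,var,var_type,params
+#   m,domain,string,m_params
+#   m,flag..HTTPS,boolean,m_params
+#
+# which yield assoc_metrics['m'] = [('domain', 'string'),
+# ('flag..HTTPS', 'boolean')] and params_lookup['m.domain'] = 'm_params'.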
+
+
+class VarSchema(object):
+ """Object representing rappor-vars.csv.
+
+ Right now we use it for slightly different purposes for dist and assoc
+ analysis.
+ """
+ def __init__(self, f, params_dir):
+ self.assoc_metrics, self.params_lookup = _ReadVarSchema(f)
+ self.params_dir = params_dir
+
+ def GetParamsPath(self, var_name):
+ filename = self.params_lookup[var_name]
+ return os.path.join(self.params_dir, filename + '.csv')
+
+ def GetAssocMetrics(self):
+ return self.assoc_metrics
+
+
+def CountReports(f):
+ num_reports = 0
+ for line in f:
+ first_col = line.split(',')[0]
+ num_reports += int(first_col)
+ return num_reports
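+# For example, the lines '1,2', '3,4', '5,6' count as 1 + 3 + 5 = 9 reports,
+# since the first column holds the per-row report count.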
+
+
+DIST_INPUT_PATH_RE = re.compile(r'.*/(\d+-\d+-\d+)/(\S+)_counts.csv')
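+# e.g. '_tmp/counts/2015-12-01/exp_counts.csv' -> ('2015-12-01', 'exp')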
+
+
+def DistInputIter(stdin):
+ """Read lines from stdin and extract fields to construct analysis tasks."""
+ for line in stdin:
+ m = DIST_INPUT_PATH_RE.match(line)
+ if not m:
+ raise RuntimeError('Invalid path %r' % line)
+
+ counts_path = line.strip()
+ date, field_id = m.groups()
+
+ yield counts_path, date, field_id
+
+
+def DistTaskSpec(input_iter, field_id_lookup, var_schema, dist_maps, bad_c):
+ """Print task spec for single variable RAPPOR to stdout."""
+
+ num_bad = 0
+ unique_ids = set()
+
+ for counts_path, date, field_id in input_iter:
+ unique_ids.add(field_id)
+
+ # num_reports is used for filtering
+ with open(counts_path) as f:
+ num_reports = CountReports(f)
+
+ # Look up field name from field ID
+ if field_id_lookup:
+ field_name = field_id_lookup.get(field_id)
+ if field_name is None:
+ # The metric id is the md5 hash of the name. We can miss some, e.g. due
+ # to debug builds.
+ if bad_c:
+ bad_c.writerow((date, field_id, num_reports))
+ num_bad += 1
+ continue
+ else:
+ field_name = field_id
+
+ # NOTE: We could remove the params from the spec if decode_dist.R took the
+ # --schema flag. The var type is there too.
+ params_path = var_schema.GetParamsPath(field_name)
+    map_path = dist_maps.GetMapPath(field_name)
+
+ yield num_reports, field_name, date, counts_path, params_path, map_path
+
+ util.log('%d unique field IDs', len(unique_ids))
+ if num_bad:
+ util.log('Failed field ID -> field name lookup on %d files '
+ '(check --field-ids file)', num_bad)
+
+
+ASSOC_INPUT_PATH_RE = re.compile(r'.*/(\d+-\d+-\d+)/(\S+)_reports.csv')
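+# e.g. '_tmp/reports/2015-12-01/m_reports.csv' -> ('2015-12-01', 'm')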
+
+
+def AssocInputIter(stdin):
+ """Read lines from stdin and extract fields to construct analysis tasks."""
+ for line in stdin:
+ m = ASSOC_INPUT_PATH_RE.match(line)
+ if not m:
+ raise RuntimeError('Invalid path %r' % line)
+
+ reports_path = line.strip()
+ date, metric_name = m.groups()
+
+ yield reports_path, date, metric_name
+
+
+def CreateAssocVarPairs(rappor_metrics):
+  """Return a dict from metric -> list of (string, boolean) variable pairs.
+
+ For now just do all (string x boolean) analysis.
+ """
+ var_pairs = collections.defaultdict(list)
+
+ for metric, var_list in rappor_metrics.iteritems():
+ string_vars = []
+ boolean_vars = []
+
+ # Separate variables into strings and booleans
+ for var_name, var_type in var_list:
+ if var_type == 'string':
+ string_vars.append(var_name)
+ elif var_type == 'boolean':
+ boolean_vars.append(var_name)
+ else:
+        util.log('Unknown variable type %r', var_type)
+
+ for s in string_vars:
+ for b in boolean_vars:
+ var_pairs[metric].append((s, b))
+ return var_pairs
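+# For example, with the schema above -- metric 'm' with string var 'domain'
+# and boolean var 'flag..HTTPS' -- this returns
+# {'m': [('domain', 'flag..HTTPS')]}.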
+
+
+# For debugging
+def PrintAssocVarPairs(var_pairs):
+ for metric, var_list in var_pairs.iteritems():
+ print metric
+ for var_name, var_type in var_list:
+ print '\t', var_name, var_type
+
+
+def AssocTaskSpec(input_iter, var_pairs, dist_maps, output_base_dir, bad_c):
+ """Print the task spec for multiple variable RAPPOR to stdout."""
+ # Flow:
+ #
+ # Long term: We should have assoc-analysis.xml, next to dist-analysis.xml?
+ #
+ # Short term: update_rappor.py should print every combination of string vs.
+ # bool? Or I guess we have it in rappor-vars.csv
+
+ for reports_path, date, metric_name in input_iter:
+ pairs = var_pairs[metric_name]
+ for var1, var2 in pairs:
+ # Assuming var1 is a string. TODO: Use an assoc file, not dist_maps?
+ field1_name = '%s.%s' % (metric_name, var1)
+ map1_path = dist_maps.GetMapPath(field1_name)
+
+ # e.g. domain_X_flags__DID_PROCEED
+ # Don't use .. in filenames since it could be confusing.
+ pair_name = '%s_X_%s' % (var1, var2.replace('..', '_'))
+ output_dir = os.path.join(output_base_dir, metric_name, pair_name, date)
+
+ yield metric_name, date, reports_path, var1, var2, map1_path, output_dir
+
+
+def CreateOptionsParser():
+ p = optparse.OptionParser()
+
+ p.add_option(
+ '--bad-report-out', dest='bad_report', metavar='PATH', type='str',
+ default='',
+ help='Optionally write a report of input filenames with invalid field '
+ 'IDs to this file.')
+ p.add_option(
+ '--config-dir', dest='config_dir', metavar='PATH', type='str',
+ default='',
+ help='Directory with metadata schema and params files to read.')
+ p.add_option(
+ '--map-dir', dest='map_dir', metavar='PATH', type='str',
+ default='',
+ help='Directory with map files to read.')
+ p.add_option(
+ '--output-base-dir', dest='output_base_dir', metavar='PATH', type='str',
+ default='',
+ help='Root of the directory tree where analysis output will be placed.')
+ p.add_option(
+ '--field-ids', dest='field_ids', metavar='PATH', type='str',
+ default='',
+ help='Optional CSV file with field IDs (generally should not be used).')
+
+ return p
+
+
+def main(argv):
+ (opts, argv) = CreateOptionsParser().parse_args(argv)
+
+ if opts.bad_report:
+ bad_f = open(opts.bad_report, 'w')
+ bad_c = csv.writer(bad_f)
+ else:
+ bad_c = None
+
+ action = argv[1]
+
+ if not opts.config_dir:
+ raise RuntimeError('--config-dir is required')
+ if not opts.map_dir:
+ raise RuntimeError('--map-dir is required')
+ if not opts.output_base_dir:
+ raise RuntimeError('--output-base-dir is required')
+
+ # This is shared between the two specs.
+ path = os.path.join(opts.config_dir, 'dist-analysis.csv')
+ with open(path) as f:
+ dist_maps = DistMapLookup(f, opts.map_dir)
+
+ path = os.path.join(opts.config_dir, 'rappor-vars.csv')
+ with open(path) as f:
+ var_schema = VarSchema(f, opts.config_dir)
+
+ if action == 'dist':
+ if opts.field_ids:
+ with open(opts.field_ids) as f:
+ field_id_lookup = CreateFieldIdLookup(f)
+ else:
+ field_id_lookup = {}
+
+ input_iter = DistInputIter(sys.stdin)
+ for row in DistTaskSpec(input_iter, field_id_lookup, var_schema, dist_maps,
+ bad_c):
+ # The spec is a series of space-separated tokens.
+ tokens = row + (opts.output_base_dir,)
+ print ' '.join(str(t) for t in tokens)
+
+ elif action == 'assoc':
+ # Parse input
+ input_iter = AssocInputIter(sys.stdin)
+
+ # Create M x N association tasks
+ var_pairs = CreateAssocVarPairs(var_schema.GetAssocMetrics())
+
+ # Now add the other constant stuff
+ for row in AssocTaskSpec(
+ input_iter, var_pairs, dist_maps, opts.output_base_dir, bad_c):
+
+ num_reports = 0 # placeholder, not filtering yet
+ tokens = (num_reports,) + row
+ print ' '.join(str(t) for t in tokens)
+
+ else:
+ raise RuntimeError('Invalid action %r' % action)
+
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except IOError, e:
+ if e.errno != errno.EPIPE: # ignore broken pipe
+ raise
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/pipeline/task_spec_test.py b/pipeline/task_spec_test.py
new file mode 100755
index 0000000..94cbac8
--- /dev/null
+++ b/pipeline/task_spec_test.py
@@ -0,0 +1,61 @@
+#!/usr/bin/python -S
+"""
+task_spec_test.py: Tests for task_spec.py
+"""
+
+import cStringIO
+import unittest
+
+import task_spec # module under test
+
+
+class TaskSpecTest(unittest.TestCase):
+
+ def testCountReports(self):
+ f = cStringIO.StringIO("""\
+1,2
+3,4
+5,6
+""")
+ c = task_spec.CountReports(f)
+ self.assertEqual(9, c)
+
+ def testDist(self):
+    # NOTE: These files are opened in order to count the reports. Maybe skip
+ # that step.
+ f = cStringIO.StringIO("""\
+_tmp/counts/2015-12-01/exp_counts.csv
+_tmp/counts/2015-12-01/gauss_counts.csv
+_tmp/counts/2015-12-02/exp_counts.csv
+_tmp/counts/2015-12-02/gauss_counts.csv
+""")
+ input_iter = task_spec.DistInputIter(f)
+ #for row in input_iter:
+ # print row
+
+ field_id_lookup = {}
+
+ # var name -> map filename
+ f = cStringIO.StringIO("""\
+var,map_filename
+exp,map.csv
+unif,map.csv
+gauss,map.csv
+""")
+ dist_maps = task_spec.DistMapLookup(f, '_tmp/maps')
+
+ f2 = cStringIO.StringIO("""\
+metric,var,var_type,params
+exp,,string,params
+unif,,string,params
+gauss,,string,params
+""")
+ var_schema = task_spec.VarSchema(f2, '_tmp/config')
+
+ for row in task_spec.DistTaskSpec(
+ input_iter, field_id_lookup, var_schema, dist_maps, None):
+ print row
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/pipeline/tools-lib.sh b/pipeline/tools-lib.sh
new file mode 100755
index 0000000..c7b3b24
--- /dev/null
+++ b/pipeline/tools-lib.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Library used to refer to open source tools.
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+# NOTE: RAPPOR_SRC defined by the module that sources (cook.sh or ui.sh)
+
+# Caller can override shebang line by setting $DEP_PYTHON.
+readonly PYTHON=${DEP_PYTHON:-}
+
+readonly METRIC_STATUS=${DEP_METRIC_STATUS:-}
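+
+# Example override (hypothetical paths):
+#   DEP_PYTHON=/usr/bin/python2 ./cook.sh ...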
+
+
+# These 3 used by cook.sh.
+
+TOOLS-combine-status() {
+ if test -n "$PYTHON"; then
+ $PYTHON $RAPPOR_SRC/pipeline/combine_status.py "$@"
+ else
+ $RAPPOR_SRC/pipeline/combine_status.py "$@"
+ fi
+}
+
+TOOLS-combine-results() {
+ if test -n "$PYTHON"; then
+ $PYTHON $RAPPOR_SRC/pipeline/combine_results.py "$@"
+ else
+ $RAPPOR_SRC/pipeline/combine_results.py "$@"
+ fi
+}
+
+TOOLS-metric-status() {
+ if test -n "$METRIC_STATUS"; then
+ $METRIC_STATUS "$@"
+ else
+ $RAPPOR_SRC/pipeline/metric_status.R "$@"
+ fi
+}
+
+# Used by ui.sh.
+
+TOOLS-csv-to-html() {
+ if test -n "$PYTHON"; then
+ $PYTHON $RAPPOR_SRC/pipeline/csv_to_html.py "$@"
+ else
+ $RAPPOR_SRC/pipeline/csv_to_html.py "$@"
+ fi
+}
+
+#
+# Higher level scripts
+#
+
+TOOLS-cook() {
+ $RAPPOR_SRC/pipeline/cook.sh "$@"
+}
+
+# TODO: Rename gen-ui.sh.
+TOOLS-gen-ui() {
+ $RAPPOR_SRC/pipeline/ui.sh "$@"
+}
diff --git a/pipeline/ui.sh b/pipeline/ui.sh
new file mode 100755
index 0000000..8fbcde0
--- /dev/null
+++ b/pipeline/ui.sh
@@ -0,0 +1,322 @@
+#!/bin/bash
+#
+# Build the user interface.
+#
+# Usage:
+# ./ui.sh <function name>
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+readonly THIS_DIR=$(dirname $0)
+readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd)
+
+source $RAPPOR_SRC/pipeline/tools-lib.sh
+
+# Change the default location of this file by setting DEP_DYGRAPHS_JS
+readonly DYGRAPHS_JS=${DEP_DYGRAPHS_JS:-$RAPPOR_SRC/third_party/dygraph-combined.js}
+
+_link() {
+ ln --verbose -s -f "$@"
+}
+
+_copy() {
+ cp --verbose -f "$@"
+}
+
+download-dygraphs() {
+ local out=third_party
+ wget --directory $out \
+ http://dygraphs.com/1.1.1/dygraph-combined.js
+}
+
+import-table() {
+ local src=~/git/scratch/ajax/
+ cp --verbose $src/table-sort.{js,css} $src/url-hash.js ui
+ pushd ui
+ # TODO: Could minify it here
+ cat table-sort.js url-hash.js > table-lib.js
+ popd
+}
+
+# Use symlinks so we can edit and reload during development.
+symlink-static() {
+ local kind=$1
+ local job_dir=$2
+
+ local base=$RAPPOR_SRC/ui
+
+ # HTML goes at the top level.
+ if test "$kind" = dist; then
+ _link \
+ $base/overview.html $base/histograms.html $base/metric.html $base/day.html \
+ $job_dir
+ elif test "$kind" = assoc; then
+ _link \
+ $base/assoc-overview.html $base/assoc-metric.html $base/assoc-pair.html \
+ $base/assoc-day.html \
+ $job_dir
+ else
+    echo "Invalid kind $kind" 1>&2
+ exit 1
+ fi
+
+ mkdir --verbose -p $job_dir/static
+
+ # Static subdir.
+ _link \
+ $base/ui.css $base/ui.js \
+ $base/table-sort.css $base/table-lib.js \
+ $DYGRAPHS_JS \
+ $job_dir/static
+}
+
+
+# Write HTML fragment based on overview.csv.
+overview-part-html() {
+ local job_dir=${1:-_tmp/results-10}
+ local out=$job_dir/cooked/overview.part.html
+ # Sort by descending date!
+ TOOLS-csv-to-html \
+ --col-format 'metric <a href="metric.html#metric={metric}">{metric}</a>' \
+ < $job_dir/cooked/overview.csv \
+ > $out
+ echo "Wrote $out"
+}
+
+metric-part-html() {
+ local job_dir=${1:-_tmp/results-10}
+ # Testing it out. This should probably be a different dir.
+
+ for entry in $job_dir/cooked/*; do
+ # Only do it for dirs
+ if ! test -d $entry; then
+ continue
+ fi
+ # Now it's a metric dir
+ echo $entry
+
+ local metric_name=$(basename $entry)
+
+ # Convert status.csv to status.part.html (a fragment)
+
+ # NOTE: counts path could be useful. You need the input tree though. Hash
+ # it? Or point to the git link.
+
+ # Link to raw CSV
+ #--col-format 'date <a href="../../raw/{metric}/{date}/results.csv">{date}</a>' \
+
+ # TODO: Link to ui/results_viewer.html#{metric}_{date}
+ # And that needs some JavaScript to load the correct fragment.
+ # I guess you could do the same with metric.html. Currently it uses a
+ # symlink.
+
+ # Before job ID:
+ # --col-format 'date <a href="{date}.html">{date}</a>' \
+ # --col-format 'status <a href="../../raw/{metric}/{date}/log.txt">{status}</a>' \
+
+ local fmt1='date <a href="day.html#jobId={job_id}&metric={metric}&date={date}">{date}</a>'
+ local fmt2='status <a href="../{job_id}/raw/{metric}/{date}/log.txt">{status}</a>'
+
+ TOOLS-csv-to-html \
+ --def "metric $metric_name" \
+ --col-format "$fmt1" \
+ --col-format "$fmt2" \
+ < $entry/status.csv \
+ > $entry/status.part.html
+ done
+}
+
+results-html-one() {
+ local csv_in=$1
+ echo "$csv_in -> HTML"
+
+ # .../raw/Settings.HomePage2/2015-03-01/results.csv ->
+ # .../cooked/Settings.HomePage2/2015-03-01.part.html
+ # (This saves some directories)
+ local html_out=$(echo $csv_in | sed -e 's|/raw/|/cooked/|; s|/results.csv|.part.html|')
+
+ TOOLS-csv-to-html < $csv_in > $html_out
+}
+
+results-html() {
+ local job_dir=${1:-_tmp/results-10}
+
+ find $job_dir -name results.csv \
+ | xargs -n 1 --verbose --no-run-if-empty -- $0 results-html-one
+}
+
+# Build parts of the HTML
+build-html1() {
+ local job_dir=${1:-_tmp/results-10}
+
+ symlink-static dist $job_dir
+
+ # writes overview.part.html, which is loaded by overview.html
+ overview-part-html $job_dir
+
+ # Writes status.part.html for each metric
+ metric-part-html $job_dir
+}
+
+#
+# Association Analysis
+#
+
+readonly ASSOC_TEST_JOB_DIR=~/rappor/chrome-assoc-smoke/smoke5-assoc
+
+# Write HTML fragment based on CSV.
+assoc-overview-part-html() {
+ local job_dir=${1:-$ASSOC_TEST_JOB_DIR}
+ local html_path=$job_dir/cooked/assoc-overview.part.html
+
+ # Sort by descending date!
+
+ TOOLS-csv-to-html \
+ --col-format 'metric <a href="assoc-metric.html#metric={metric}">{metric}</a>' \
+ < $job_dir/cooked/assoc-overview.csv \
+ > $html_path
+ echo "Wrote $html_path"
+}
+
+assoc-metric-part-html-one() {
+ local csv_path=$1
+ local html_path=$(echo $csv_path | sed 's/.csv$/.part.html/')
+
+ local metric_dir=$(dirname $csv_path)
+ local metric_name=$(basename $metric_dir) # e.g. interstitial.harmful
+
+ local fmt='days <a href="assoc-pair.html#metric={metric}&var1={var1}&var2={var2}">{days}</a>'
+
+ TOOLS-csv-to-html \
+ --def "metric $metric_name" \
+ --col-format "$fmt" \
+ < $csv_path \
+ > $html_path
+
+ echo "Wrote $html_path"
+}
+
+assoc-metric-part-html() {
+ local job_dir=${1:-$ASSOC_TEST_JOB_DIR}
+ # Testing it out. This should probably be a different dir.
+
+ find $job_dir/cooked -name metric-status.csv \
+ | xargs -n 1 --verbose --no-run-if-empty -- $0 assoc-metric-part-html-one
+}
+
+# TODO:
+# - Construct link in JavaScript instead? It has more information. The
+# pair-metadata.txt file is a hack.
+
+assoc-pair-part-html-one() {
+ local csv_path=$1
+ local html_path=$(echo $csv_path | sed 's/.csv$/.part.html/')
+
+ local pair_dir_path=$(dirname $csv_path)
+ local pair_dir_name=$(basename $pair_dir_path) # e.g. domain_X_flags_IS_REPEAT_VISIT
+
+ # This file is generated by metric_status.R for each pair of variables.
+ local metadata="$pair_dir_path/pair-metadata.txt"
+ # Read one variable per line.
+ { read metric_name; read var1; read var2; } < $metadata
+
+ local fmt1='date <a href="assoc-day.html#jobId={job_id}&metric={metric}&var1={var1}&var2={var2}&date={date}">{date}</a>'
+ local fmt2="status <a href=\"../{job_id}/raw/{metric}/$pair_dir_name/{date}/assoc-log.txt\">{status}</a>"
+
+ TOOLS-csv-to-html \
+ --def "metric $metric_name" \
+ --def "var1 $var1" \
+ --def "var2 $var2" \
+ --col-format "$fmt1" \
+ --col-format "$fmt2" \
+ < $csv_path \
+ > $html_path
+}
+
+assoc-pair-part-html() {
+ local job_dir=${1:-~/rappor/chrome-assoc-smoke/smoke3}
+ # Testing it out. This should probably be a different dir.
+
+ find $job_dir/cooked -name pair-status.csv \
+ | xargs -n 1 --verbose -- $0 assoc-pair-part-html-one
+}
+
+assoc-day-part-html-one() {
+ local csv_in=$1
+ echo "$csv_in -> HTML"
+
+ # .../raw/interstitial.harmful/a_X_b/2015-03-01/assoc-results.csv ->
+ # .../cooked/interstitial.harmful/a_X_b/2015-03-01.part.html
+ # (This saves some directories)
+ local html_out=$(echo $csv_in | sed -e 's|/raw/|/cooked/|; s|/assoc-results.csv|.part.html|')
+
+ TOOLS-csv-to-html --as-percent proportion < $csv_in > $html_out
+}
+
+assoc-day-part-html() {
+ local job_dir=${1:-_tmp/results-10}
+
+ find $job_dir -name assoc-results.csv \
+ | xargs -n 1 --verbose --no-run-if-empty -- $0 assoc-day-part-html-one
+}
+
+lint-html() {
+ set -o xtrace
+ set +o errexit # don't fail fast
+ tidy -errors -quiet ui/metric.html
+ tidy -errors -quiet ui/overview.html
+ tidy -errors -quiet ui/histograms.html
+}
+
+# Directory we should serve from
+readonly WWW_DIR=_tmp
+
+serve() {
+ local port=${1:-7999}
+ cd $WWW_DIR && python -m SimpleHTTPServer $port
+}
+
+"$@"
diff --git a/pipeline/util.py b/pipeline/util.py
new file mode 100755
index 0000000..c517483
--- /dev/null
+++ b/pipeline/util.py
@@ -0,0 +1,9 @@
+"""Common functions."""
+
+import sys
+
+
+def log(msg, *args):
+ if args:
+ msg = msg % args
+ print >>sys.stderr, msg
diff --git a/regtest.sh b/regtest.sh
new file mode 100755
index 0000000..d21edc2
--- /dev/null
+++ b/regtest.sh
@@ -0,0 +1,440 @@
+#!/bin/bash
+usage() {
+echo "
+ Run end-to-end tests in parallel.
+
+ Usage:
+ ./regtest.sh <function name>
+ At the end, it will print an HTML summary.
+
+  The three main functions are:
+    run [<pattern> [<lang>]]     - run tests matching <pattern> in
+                                   parallel; <lang> selects the client
+                                   implementation to use (python or cpp)
+    run-seq [<pattern> [<lang>]] - ditto, except that tests are run
+                                   sequentially
+    run-all                      - run all tests, in parallel
+
+  Examples:
+  $ ./regtest.sh run-seq unif-small-typical  # run the unif-small-typical test
+  $ ./regtest.sh run-seq unif-small-         # run the tests whose names
+                                             # contain 'unif-small-'
+ $ ./regtest.sh run unif- # Parallel run, matches multiple cases
+ $ ./regtest.sh run-all # Run all tests
+
+ The <pattern> argument is a regex in 'grep -E' format. (Detail: Don't
+ use $ in the pattern, since it matches the whole spec line and not just the
+ test case name.) The number of processors used in a parallel run is one less
+ than the number of CPUs on the machine.
+"
+}
+# Future speedups:
+# - Reuse the same input -- come up with naming scheme based on params
+# - Reuse the same maps -- ditto, rappor library can cache it
+#
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+. util.sh
+
+readonly THIS_DIR=$(dirname $0)
+readonly REPO_ROOT=$THIS_DIR
+readonly CLIENT_DIR=$REPO_ROOT/client/python
+# subdirs are in _tmp/$impl, which shouldn't overlap with anything else in _tmp
+readonly REGTEST_BASE_DIR=_tmp
+
+# All the Python tools need this
+export PYTHONPATH=$CLIENT_DIR
+
+print-unique-values() {
+ local num_unique_values=$1
+ seq 1 $num_unique_values | awk '{print "v" $1}'
+}
+
+# Add some more candidates here. We hope these are estimated at 0.
+# e.g. if last_true=50 and num_additional=20, print v51-v70
+more-candidates() {
+ local last_true=$1
+ local num_additional=$2
+
+ local begin
+ local end
+ begin=$(expr $last_true + 1)
+ end=$(expr $last_true + $num_additional)
+
+ seq $begin $end | awk '{print "v" $1}'
+}
+
+# Args:
+# unique_values: File of unique true values
+# last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
+# num_additional: additional candidates to generate (starting at 'last_true')
+# to_remove: Regex of true values to omit from the candidates list, or the
+# string 'NONE' if none should be. (Our values look like 'v1', 'v2', etc. so
+# there isn't any ambiguity.)
+print-candidates() {
+ local unique_values=$1
+ local last_true=$2
+ local num_additional=$3
+ local to_remove=$4
+
+ if test $to_remove = NONE; then
+ cat $unique_values # include all true inputs
+ else
+ egrep -v $to_remove $unique_values # remove some true inputs
+ fi
+ more-candidates $last_true $num_additional
+}
+
+# Generate a single test case, specified by a line of the test spec.
+# This is a helper function for _run_tests().
+_setup-one-case() {
+ local impl=$1
+ shift # impl is not part of the spec; the next 13 params are
+
+ local test_case=$1
+
+ # input params
+ local dist=$2
+ local num_unique_values=$3
+ local num_clients=$4
+ local values_per_client=$5
+
+ # RAPPOR params
+ local num_bits=$6
+ local num_hashes=$7
+ local num_cohorts=$8
+ local p=$9
+ local q=${10} # need curly braces to get the 10th arg
+ local f=${11}
+
+ # map params
+ local num_additional=${12}
+ local to_remove=${13}
+
+ banner 'Setting up parameters and candidate files for '$test_case
+
+ local case_dir=$REGTEST_BASE_DIR/$impl/$test_case
+ mkdir --verbose -p $case_dir
+
+ # Save the "spec"
+ echo "$@" > $case_dir/spec.txt
+
+ local params_path=$case_dir/case_params.csv
+
+ echo 'k,h,m,p,q,f' > $params_path
+ echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path
+
+ print-unique-values $num_unique_values > $case_dir/case_unique_values.txt
+
+ local true_map_path=$case_dir/case_true_map.csv
+
+ bin/hash_candidates.py \
+ $params_path \
+ < $case_dir/case_unique_values.txt \
+ > $true_map_path
+
+ # banner "Constructing candidates"
+
+ print-candidates \
+ $case_dir/case_unique_values.txt $num_unique_values \
+ $num_additional "$to_remove" \
+ > $case_dir/case_candidates.txt
+
+ # banner "Hashing candidates to get 'map'"
+
+ bin/hash_candidates.py \
+ $params_path \
+ < $case_dir/case_candidates.txt \
+ > $case_dir/case_map.csv
+}
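+
+# Example spec line consumed above (hypothetical values; 13 space-separated
+# fields in the order read by _setup-one-case and _run-one-instance):
+#   demo3 exp 100 10000 10 16 2 64 0.5 0.75 0.5 20 NONE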
+
+# Run a single test instance, specified by <test_name, instance_num>.
+# This is a helper function for _run_tests().
+_run-one-instance() {
+ local test_case=$1
+ local test_instance=$2
+ local impl=$3
+
+ local case_dir=$REGTEST_BASE_DIR/$impl/$test_case
+
+ read -r \
+ case_name distr num_unique_values num_clients values_per_client \
+ num_bits num_hashes num_cohorts p q f \
+ num_additional to_remove \
+ < $case_dir/spec.txt
+
+ local instance_dir=$case_dir/$test_instance
+ mkdir --verbose -p $instance_dir
+
+ banner "Generating reports (gen_reports.R)"
+
+ # the TRUE_VALUES_PATH environment variable can be used to avoid
+ # generating new values every time. NOTE: You are responsible for making
+ # sure the params match!
+
+ local true_values=${TRUE_VALUES_PATH:-}
+ if test -z "$true_values"; then
+ true_values=$instance_dir/case_true_values.csv
+ tests/gen_true_values.R $distr $num_unique_values $num_clients \
+ $values_per_client $num_cohorts \
+ $true_values
+ else
+ # TEMP hack: Make it visible to plot.
+ # TODO: Fix compare_dist.R
+ ln -s -f --verbose \
+ $PWD/$true_values \
+ $instance_dir/case_true_values.csv
+ fi
+
+ case $impl in
+ python)
+ banner "Running RAPPOR Python client"
+
+ # Writes encoded "out" file, true histogram, true inputs to
+ # $instance_dir.
+ time tests/rappor_sim.py \
+ --num-bits $num_bits \
+ --num-hashes $num_hashes \
+ --num-cohorts $num_cohorts \
+ -p $p \
+ -q $q \
+ -f $f \
+ < $true_values \
+ > "$instance_dir/case_reports.csv"
+ ;;
+
+ cpp)
+ banner "Running RAPPOR C++ client (see rappor_sim.log for errors)"
+
+ time client/cpp/_tmp/rappor_sim \
+ $num_bits \
+ $num_hashes \
+ $num_cohorts \
+ $p \
+ $q \
+ $f \
+ < $true_values \
+ > "$instance_dir/case_reports.csv" \
+ 2>"$instance_dir/rappor_sim.log"
+ ;;
+
+ *)
+ log "Invalid impl $impl (should be one of python|cpp)"
+ exit 1
+ ;;
+
+ esac
+
+ banner "Summing RAPPOR IRR bits to get 'counts'"
+
+ bin/sum_bits.py \
+ $case_dir/case_params.csv \
+ < $instance_dir/case_reports.csv \
+ > $instance_dir/case_counts.csv
+
+ local out_dir=${instance_dir}_report
+ mkdir --verbose -p $out_dir
+
+ # Currently, the summary file shows and aggregates timing of the inference
+ # engine, which excludes R's loading time and reading of the (possibly
+ # substantial) map file. Timing below is more inclusive.
+ TIMEFORMAT='Running compare_dist.R took %R seconds'
+ time {
+ # Input prefix, output dir
+ tests/compare_dist.R -t "Test case: $test_case (instance $test_instance)" \
+ "$case_dir/case" "$instance_dir/case" $out_dir
+ }
+}
+
+# Like _run-one-instance, but log to a file.
+_run-one-instance-logged() {
+ local test_case=$1
+ local test_instance=$2
+ local impl=$3
+
+ local log_dir=$REGTEST_BASE_DIR/$impl/$test_case/${test_instance}_report
+ mkdir --verbose -p $log_dir
+
+ log "Started '$test_case' (instance $test_instance) -- logging to $log_dir/log.txt"
+ _run-one-instance "$@" >$log_dir/log.txt 2>&1 \
+ && log "Test case $test_case (instance $test_instance) done" \
+ || log "Test case $test_case (instance $test_instance) failed"
+}
+
+make-summary() {
+ local dir=$1
+ local impl=$2
+
+ local filename=results.html
+
+ tests/make_summary.py $dir $dir/rows.html
+
+ pushd $dir >/dev/null
+
+ cat ../../tests/regtest.html \
+ | sed -e '/__TABLE_ROWS__/ r rows.html' -e "s/_IMPL_/$impl/g" \
+ > $filename
+
+ popd >/dev/null
+
+ log "Wrote $dir/$filename"
+ log "URL: file://$PWD/$dir/$filename"
+}
+
+test-error() {
+ local spec_regex=${1:-}
+ log "Some test cases failed"
+ if test -n "$spec_regex"; then
+ log "(Perhaps none matched pattern '$spec_regex')"
+ fi
+ # don't quit just yet
+ # exit 1
+}
+
+# Assuming the spec file, write a list of test case names (first column) with
+# the instance ids (second column), where instance ids run from 1 to $1.
+# Third column is impl.
+_setup-test-instances() {
+ local instances=$1
+ local impl=$2
+
+ while read line; do
+ for i in $(seq 1 $instances); do
+ read case_name _ <<< $line # extract the first token
+ echo $case_name $i $impl
+ done
+ done
+}
+
+# Print the default number of parallel processes, which is max(#CPUs - 1, 1)
+default-processes() {
+ processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # Linux-specific
+ if test $processors -gt 1; then # leave one CPU for the OS
+ processors=$(expr $processors - 1)
+ fi
+ echo $processors
+}
+
+# Args:
+# spec_gen: A program to execute to generate the spec.
+# spec_regex: A pattern selecting the subset of tests to run
+# parallel: Whether the tests are run in parallel (T/F). Sequential
+# runs log to the console; parallel runs log to files.
+#  impl: one of python or cpp
+#  instances: the number of times each test case is run
+
+_run-tests() {
+ local spec_gen=$1
+ local spec_regex="$2" # grep -E format on the spec, can be empty
+ local parallel=$3
+ local impl=${4:-"cpp"}
+ local instances=${5:-1}
+
+ local regtest_dir=$REGTEST_BASE_DIR/$impl
+ rm -r -f --verbose $regtest_dir
+
+ mkdir --verbose -p $regtest_dir
+
+ local func
+ local processors
+
+ if test $parallel = F; then
+ func=_run-one-instance # output to the console
+ processors=1
+ else
+ func=_run-one-instance-logged
+ # Let the user override with MAX_PROC, in case they don't have enough
+ # memory.
+ processors=${MAX_PROC:-$(default-processes)}
+ log "Running $processors parallel processes"
+ fi
+
+ local cases_list=$regtest_dir/test-cases.txt
+ # Need -- for regexes that start with -
+ $spec_gen | grep -E -- "$spec_regex" > $cases_list
+
+ # Generate parameters for all test cases.
+ cat $cases_list \
+ | xargs -l -P $processors -- $0 _setup-one-case $impl \
+ || test-error
+
+ log "Done generating parameters for all test cases"
+
+ local instances_list=$regtest_dir/test-instances.txt
+ _setup-test-instances $instances $impl < $cases_list > $instances_list
+
+ cat $instances_list \
+ | xargs -l -P $processors -- $0 $func || test-error
+
+ log "Done running all test instances"
+
+ make-summary $regtest_dir $impl
+}
+
+# used for most tests
+readonly REGTEST_SPEC=tests/regtest_spec.py
+
+# Run tests sequentially. NOTE: called by demo.sh.
+run-seq() {
+ local spec_regex=${1:-'^r-'} # grep -E format on the spec
+  shift || true  # no-op when no pattern was given
+
+ time _run-tests $REGTEST_SPEC $spec_regex F $@
+}
+
+# Run tests in parallel
+run() {
+ local spec_regex=${1:-'^r-'} # grep -E format on the spec
+  shift || true  # no-op when no pattern was given
+
+ time _run-tests $REGTEST_SPEC $spec_regex T $@
+}
+
+# Run tests in parallel (7+ minutes on 8 cores)
+run-all() {
+ log "Running all tests. Can take a while."
+ time _run-tests $REGTEST_SPEC '^r-' T cpp
+}
+
+run-user() {
+ local spec_regex=${1:-}
+ local parallel=T # too much memory
+ time _run-tests tests/user_spec.py "$spec_regex" $parallel cpp
+}
+
+# Use stable true values
+compare-python-cpp() {
+ local num_unique_values=100
+ local num_clients=10000
+ local values_per_client=10
+ local num_cohorts=64
+
+ local true_values=$REGTEST_BASE_DIR/stable_true_values.csv
+
+ tests/gen_true_values.R \
+ exp $num_unique_values $num_clients $values_per_client $num_cohorts \
+ $true_values
+
+ wc -l $true_values
+
+ # Run Python and C++ simulation on the same input
+
+ ./build.sh cpp-client
+
+ TRUE_VALUES_PATH=$true_values \
+    ./regtest.sh run-seq '^demo3' python 1
+
+ TRUE_VALUES_PATH=$true_values \
+    ./regtest.sh run-seq '^demo3' cpp 1
+
+ head _tmp/{python,cpp}/demo3/1/case_reports.csv
+}
+
+if test $# -eq 0 ; then
+ usage
+else
+ "$@"
+fi
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..6899113
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Setup RAPPOR analysis on Ubuntu Trusty (Google Cloud or otherwise).
+#
+# For the apps/api server, you need 'install-minimal'. For the regtest, and
+# Shiny apps, we need a few more R packages (ggplot2, data.table, etc.). They
+# cause versioning problems, so we keep them separate.
+#
+# Usage:
+# ./setup.sh [function name]
+# If run without specifing any function it will run: install-most
+# which should cover all the packages needed to run the demo.
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+native-packages() {
+ sudo apt-get update
+ # - build-essential for gcc compilers, invoked while installing R packages.
+ # - gfortran Fortran compiler needed for glmnet.
+ # - libblas-dev needed for limSolve.
+ # - python-dev is for building the fastrand extension
+ #
+ # NOTE: we get R 3.0.2 on Trusty.
+ sudo apt-get install build-essential gfortran libblas-dev r-base python-dev graphviz
+}
+
+r-packages() {
+ # Install as root so you can write to /usr/local/lib/R.
+
+ # glmnet, limSolve: solvers for decode.R
+ # RJSONIO, optparse: for decode_dist.R
+ # RUnit: for unit tests
+ # abind: for decode_test only
+ sudo R -e \
+ 'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")'
+}
+
+# R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround.
+install-plyr-with-friends() {
+ mkdir -p _tmp
+ wget --directory _tmp \
+ http://cran.r-project.org/src/contrib/Archive/Rcpp/Rcpp_0.11.4.tar.gz
+ wget --directory _tmp \
+ http://cran.r-project.org/src/contrib/Archive/plyr/plyr_1.8.1.tar.gz
+ sudo R CMD INSTALL _tmp/Rcpp_0.11.4.tar.gz
+ sudo R CMD INSTALL _tmp/plyr_1.8.1.tar.gz
+ sudo R -e \
+ 'install.packages(c("reshape2", "ggplot2", "data.table"), repos="http://cran.rstudio.com/")'
+}
+
+# Keep Shiny separate, since it seems to install a lot of dependencies.
+shiny() {
+ sudo R -e \
+ 'install.packages(c("shiny"), repos="http://cran.rstudio.com/")'
+}
+
+#
+# Batch
+#
+
+install-minimal() {
+ native-packages
+ r-packages
+}
+
+# NOTE: hasn't yet been tested on a clean machine.
+install-most() {
+ install-minimal
+ install-plyr-with-friends
+}
+
+#
+# Shiny Apps / API Server
+#
+
+# After running one of the run_app.sh scripts, see if the app returns a page.
+shiny-smoke-test() {
+ curl http://localhost:6789/
+}
+
+# Then set up a "firewall rule" in console.developers.google.com to open up
+# "tcp:6789". Test it from the outside.
+
+if test $# -eq 0 ; then
+ install-most
+else
+ "$@"
+fi
diff --git a/test.sh b/test.sh
new file mode 100755
index 0000000..974157f
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test automation script.
+#
+# Usage:
+# test.sh [function name]
+#
+# Examples:
+# $ ./test.sh py-unit # run Python unit tests
+# $ ./test.sh all # all tests
+# $ ./test.sh lint # run lint checks
+# If no function is provided all of the unit tests will be run.
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+. util.sh
+
+readonly THIS_DIR=$(dirname $0)
+readonly REPO_ROOT=$THIS_DIR
+readonly CLIENT_DIR=$REPO_ROOT/client/python
+
+#
+# Fully Automated Tests
+#
+
+# Run all Python unit tests.
+#
+# Or pass a particular test to run with the correct PYTHONPATH, e.g.
+#
+# $ ./test.sh py-unit tests/fastrand_test.py
+#
+# TODO: Separate out deterministic tests from statistical tests (which may
+# rarely fail)
+py-unit() {
+ export PYTHONPATH=$CLIENT_DIR # to find client library
+
+ if test $# -gt 0; then
+ "$@"
+ else
+ set +o errexit
+
+ # -e: exit at first failure
+ find $REPO_ROOT -name \*_test.py | sh -x -e
+ fi
+
+ local exit_code=$?
+
+ if test $exit_code -eq 0; then
+ echo 'ALL TESTS PASSED'
+ else
+ echo 'FAIL'
+ exit 1
+ fi
+ set -o errexit
+}
+
+# All tests
+all() {
+ banner "Running Python unit tests"
+ py-unit
+ echo
+
+ banner "Running R unit tests"
+ r-unit
+}
+
+#
+# Lint
+#
+lint() {
+ banner "Linting Python source files"
+ py-lint
+ echo
+
+ banner "Linting Documentation files"
+ doc-lint
+}
+
+python-lint() {
+ # E111: indent not a multiple of 4. We are following the Google/Chrome
+ # style and using 2 space indents.
+ if pep8 --ignore=E111 "$@"; then
+ echo
+ echo 'LINT PASSED'
+ else
+ echo
+ echo 'LINT FAILED'
+ exit 1
+ fi
+}
+
+py-lint() {
+ which pep8 >/dev/null || die "pep8 not installed ('sudo apt-get install pep8' on Ubuntu)"
+
+ # - Skip _tmp dir, because we are downloading cpplint.py there, and it has
+ # pep8 lint errors
+ # - Exclude setup.py, because it's a config file and uses "invalid" 'name =
+ # 1' style (spaces around =).
+ find $REPO_ROOT \
+ \( -name _tmp -a -prune \) -o \
+ \( -name \*.py -a -print \) \
+ | grep -v /setup.py \
+ | xargs --verbose -- $0 python-lint
+}
+
+r-unit() {
+ set -o xtrace # show tests we're running
+
+ # This one needs to be run from the root dir
+ tests/compare_dist_test.R
+
+ tests/gen_counts_test.R
+
+ tests/gen_true_values_test.R
+
+ analysis/R/decode_test.R
+
+ analysis/test/run_tests.R
+}
+
+doc-lint() {
+ which tidy >/dev/null || die "tidy not found"
+ for doc in _tmp/report.html _tmp/doc/*.html; do
+ echo $doc
+ # -e: show only errors and warnings
+ # -q: quiet
+ tidy -e -q $doc || true
+ done
+}
+
+# This isn't a strict check, but can help.
+# TODO: Add words to whitelist.
+spell-all() {
+ which spell >/dev/null || die "spell not found"
+ spell README.md doc/*.md | sort | uniq
+}
+
+#
+# Smoke Tests. These can be manually run.
+#
+
+gen-true-values() {
+ local num_unique_values=10
+ local num_clients=10
+ local values_per_client=2
+ local num_cohorts=4
+ local out=_tmp/reports.csv
+
+ tests/gen_true_values.R \
+ exp $num_unique_values $num_clients $values_per_client $num_cohorts $out
+ wc -l $out
+ cat $out
+}
+
+if test $# -eq 0 ; then
+ all
+else
+ "$@"
+fi
diff --git a/tests/_fastrand.c b/tests/_fastrand.c
new file mode 100644
index 0000000..d1f85ea
--- /dev/null
+++ b/tests/_fastrand.c
@@ -0,0 +1,101 @@
+/*
+Copyright 2014 Google Inc. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+/*
+ * _fastrand.c -- Python extension module to generate random bit vectors
+ * quickly.
+ *
+ * IMPORTANT: This module does not use cryptographically strong randomness.  It
+ * should ONLY be used to speed up the simulation.  Don't use it in
+ * production.
+ *
+ * If an adversary can predict which random bits are flipped, then RAPPOR's
+ * privacy is compromised.
+ *
+ */
+
+#include <stdint.h> // uint64_t
+#include <stdio.h> // printf
+#include <stdlib.h> // srand
+#include <time.h> // time
+
+#include <Python.h>
+
+uint64_t randbits(float p1, int num_bits) {
+ uint64_t result = 0;
+ // RAND_MAX is the maximum int returned by rand().
+ //
+ // When p1 == 1.0, we want to guarantee that all bits are 1. The threshold
+ // will be RAND_MAX + 1. In the rare case that rand() returns RAND_MAX, the
+ // "<" test succeeds, so we get 1.
+ //
+ // When p1 == 0.0, we want to guarantee that all bits are 0. The threshold
+ // will be 0. In the rare case that rand() returns 0, the "<" test fails, so
+ // we get 0.
+
+ // NOTE: cast is necessary to do unsigned arithmetic rather than signed.
+ // RAND_MAX is an int so adding 1 won't overflow a uint64_t.
+ uint64_t max = (uint64_t)RAND_MAX + 1u;
+ uint64_t threshold = p1 * max;
+ int i;
+ for (i = 0; i < num_bits; ++i) {
+    // NOTE: The comparison is strictly < against threshold = p1 * (RAND_MAX
+    // + 1), so p1 = 1.0 implies that the bit is ALWAYS set.
+ uint64_t bit = (rand() < threshold);
+ result |= (bit << i);
+ }
+ return result;
+}
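+
+// Example (hypothetical call from Python): _fastrand.randbits(0.5, 8) returns
+// an integer in [0, 255] where each of the 8 low bits is set independently
+// with probability 0.5.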
+
+static PyObject *
+func_randbits(PyObject *self, PyObject *args) {
+ float p1;
+ int num_bits;
+
+ if (!PyArg_ParseTuple(args, "fi", &p1, &num_bits)) {
+ return NULL;
+ }
+ if (p1 < 0.0 || p1 > 1.0) {
+ printf("p1 must be between 0.0 and 1.0\n");
+ // return None for now; easier than raising ValueError
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+ if (num_bits < 0 || num_bits > 64) {
+    printf("num_bits must be between 0 and 64\n");
+ // return None for now; easier than raising ValueError
+ Py_INCREF(Py_None);
+ return Py_None;
+ }
+
+ //printf("p: %f\n", p);
+ uint64_t r = randbits(p1, num_bits);
+ return PyLong_FromUnsignedLongLong(r);
+}
+
+PyMethodDef methods[] = {
+ {"randbits", func_randbits, METH_VARARGS,
+ "Return a number with N bits, where each bit is 1 with probability p."},
+ {NULL, NULL},
+};
+
+void init_fastrand(void) {
+ Py_InitModule("_fastrand", methods);
+
+ // Just seed it here; we don't give the application any control.
+ int seed = time(NULL);
+ srand(seed);
+}
diff --git a/tests/analyze_assoc.R b/tests/analyze_assoc.R
new file mode 100755
index 0000000..5d78806
--- /dev/null
+++ b/tests/analyze_assoc.R
@@ -0,0 +1,126 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Reads map files, report files, and RAPPOR parameters to run
+# an EM algorithm to estimate joint distribution over two or more variables
+#
+# Usage:
+# $ ./analyze_assoc.R --map1 map_1.csv --map2 map_2.csv \
+#     --reports reports.csv --params params.csv
+# Inputs: map1, map2, reports, params
+# see how options are parsed below for more information
+# Outputs:
+# prints a table with estimated joint probability masses
+# over candidate strings
+# Ex.
+# ssl nossl
+# intel 0.1 0.3
+# google 0.5 0.1
+
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+ option_list <- list(
+ # Flags
+ make_option(c("--map1", "-m1"), default = "map_1.csv",
+ help = "Hashed candidates for 1st variable"),
+ make_option(c("--map2", "-m2"), default = "map_2.csv",
+ help = "Hashed candidates for 2nd variable"),
+ make_option(c("--reports", "-r"), default = "reports.csv",
+ help = "File with raw reports as <cohort, report1, report2>"),
+ make_option(c("--params", "-p"), default = "params.csv",
+ help = "Filename for RAPPOR parameters")
+ )
+ opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+source("../analysis/R/encode.R")
+source("../analysis/R/decode.R")
+source("../analysis/R/simulation.R")
+source("../analysis/R/read_input.R")
+source("../analysis/R/association.R")
+
+# Processes maps loaded with ReadMapFile into the form that association
+# analysis expects: a map field holding the map split into per-cohort
+# matrices, and an rmap field holding all the cohorts combined.
+# Arguments:
+#   map = map object from ReadMapFile, with the cohorts as one combined
+#         sparse matrix in map$map
+#   params = list with RAPPOR parameters
+# TODO(pseudorandom): move this functionality to ReadMapFile
+ProcessMap <- function(map, params) {
+ map$rmap <- map$map
+ split_map <- function(i, map_struct) {
+ numbits <- params$k
+ indices <- which(as.matrix(
+ map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE,
+ arr.ind = TRUE)
+ sparseMatrix(indices[, "row"], indices[, "col"],
+ dims = c(numbits, max(indices[, "col"])))
+ }
+ map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap))
+ map
+}
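+# For example, with params$k = 8 bits and params$m = 2 cohorts, the 16-row
+# combined matrix is kept in map$rmap, and map$map becomes a list of two
+# 8-row sparse matrices, one per cohort.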
+
+main <- function(opts) {
+ ptm <- proc.time()
+
+ params <- ReadParameterFile(opts$params)
+ opts_map <- list(opts$map1, opts$map2)
+ map <- lapply(opts_map, function(o)
+ ProcessMap(ReadMapFile(o, params = params),
+ params = params))
+ # Reports must be of the format
+ # cohort no, rappor bitstring 1, rappor bitstring 2
+ reportsObj <- read.csv(opts$reports,
+ colClasses = c("integer", "character", "character"),
+ header = FALSE)
+
+ # Parsing reportsObj
+ # ComputeDistributionEM allows for different sets of cohorts
+ # for each variable. Here, both sets of cohorts are identical
+ co <- as.list(reportsObj[1])[[1]]
+ cohorts <- list(co, co)
+ # Parse reports from reportObj cols 2 and 3
+ reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1]))
+
+ # Split strings into bit arrays (as required by assoc analysis)
+ reports <- lapply(1:2, function(i) {
+ # apply the following function to each of reports[[1]] and reports[[2]]
+ lapply(reports[[i]][[1]], function(x) {
+ # function splits strings and converts them to numeric values
+ as.numeric(strsplit(x, split = "")[[1]])
+ })
+ })
+
+ joint_dist <- ComputeDistributionEM(reports, cohorts, map,
+ ignore_other = TRUE,
+ params, marginals = NULL,
+ estimate_var = FALSE)
+ # TODO(pseudorandom): Export the results to a file for further analysis
+ print("JOINT_DIST$FIT")
+ print(joint_dist$fit)
+ print("PROC.TIME")
+ print(proc.time() - ptm)
+}
+
+if(!interactive()) {
+ main(opts)
+} \ No newline at end of file
diff --git a/tests/assoc_sim.R b/tests/assoc_sim.R
new file mode 100755
index 0000000..3ff1e5d
--- /dev/null
+++ b/tests/assoc_sim.R
@@ -0,0 +1,172 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simulates inputs on which association analysis can be run.
+# Currently assoc_sim.R only supports 2 variables but can
+# be easily extended to support more.
+#
+# Usage:
+# $ ./assoc_sim.R -n 1000
+# Inputs: uvals, params, reports, map, num, unif
+# see how options are parsed below for more information
+# Outputs:
+# reports.csv file containing reports
+# map_{1, 2, ...}.csv file(s) containing maps of variables
+
+library("optparse")
+
+options(stringsAsFactors = FALSE)
+
+if(!interactive()) {
+ option_list <- list(
+ make_option(c("--uvals", "-v"), default = "uvals.csv",
+ help = "Filename for list of values over which
+ distributions are simulated. The file is a list of
+ comma-separated strings each line of which refers
+ to a new variable."),
+ make_option(c("--params", "-p"), default = "params.csv",
+ help = "Filename for RAPPOR parameters"),
+ make_option(c("--reports", "-r"), default = "reports.csv",
+ help = "Filename for reports"),
+ make_option(c("--map", "-m"), default = "map",
+ help = "Filename *prefix* for map(s)"),
+ make_option(c("--num", "-n"), default = 1e05,
+ help = "Number of reports"),
+ make_option(c("--unif", "-u"), default = FALSE,
+ help = "Run simulation with uniform distribution")
+ )
+ opts <- parse_args(OptionParser(option_list = option_list))
+}
+
+source("../analysis/R/encode.R")
+source("../analysis/R/decode.R")
+source("../analysis/R/simulation.R")
+source("../analysis/R/read_input.R")
+source("../analysis/R/association.R")
+
+# Read unique values of reports from a csv file
+# Inputs: filename. The file is expected to contain two rows of strings
+# (one for each variable):
+# "google.com", "apple.com", ...
+# "ssl", "nossl", ...
+# Returns: a list containing strings
+GetUniqueValsFromFile <- function(filename) {
+ contents <- read.csv(filename, header = FALSE)
+ # Expect 2 rows of unique vals
+ if(nrow(contents) != 2) {
+    stop(paste("Unique vals file", filename,
+               "expected to have two rows of strings."))
+ }
+ # Removes superfluous "" entries if the lists of unique values
+ # differ in length
+ strip_empty <- function(vec) {
+ vec[!vec %in% c("")]
+ }
+ list(var1 = strip_empty(as.vector(t(contents[1,]))),
+ var2 = strip_empty(as.vector(t(contents[2,]))))
+}
+
+# Simulate correlated reports and write into reportsfile
+# Inputs: N = number of reports
+# uvals = list containing a list of unique values
+# params = list with RAPPOR parameters
+# unif = whether to replace poisson with uniform
+# mapfile = file to write maps into (with .csv suffixes)
+# reportsfile = file to write reports into (with .csv suffix)
+SimulateReports <- function(N, uvals, params, unif,
+ mapfile, reportsfile) {
+ # Compute true distribution
+ m <- params$m
+
+ if (unif) {
+ # Draw uniformly from 1 to 10
+    # Draw uniformly from the integers 1 to 9 (runif output is truncated)
+ } else {
+ # Draw from a Poisson random variable
+ v1_samples <- rpois(N, 1) + rep(1, N)
+ }
+
+  # Pr[var2 = k + 1 | var1 = k] = 0.5
+  # Pr[var2 = k     | var1 = k] = 0.5
+ v2_samples <- v1_samples + sample(c(0, 1), N, replace = TRUE)
+
+ tmp_samples <- list(v1_samples, v2_samples)
+
+ # Function to pad strings to uval_vec if sample_vec has
+ # larger support than the number of strings in uval_vec
+ # For e.g., if samples have support {1, 2, 3, 4, ...} and uvals
+ # only have "value1", "value2", and "value3", samples now
+ # over support {"value1", "value2", "value3", "str4", ...}
+ PadStrings <- function(sample_vec, uval_vec) {
+ if (max(sample_vec) > length(uval_vec)) {
+ # Padding uvals to required length
+ len <- length(uval_vec)
+ max_of_samples <- max(sample_vec)
+ uval_vec[(len + 1):max_of_samples] <- apply(
+ as.matrix((len + 1):max_of_samples),
+ 1,
+ function(i) sprintf("str%d", i))
+ }
+ uval_vec
+ }
+
+ # Pad and update uvals
+ uvals <- lapply(1:2, function(i) PadStrings(tmp_samples[[i]],
+ uvals[[i]]))
+
+ # Replace integers in tmp_samples with actual sample strings
+ samples <- lapply(1:2, function(i) uvals[[i]][tmp_samples[[i]]])
+
+ # Randomly assign cohorts in each dimension
+ cohorts <- sample(1:m, N, replace = TRUE)
+
+ # Create and write map into mapfile_1.csv and mapfile_2.csv
+ map <- lapply(uvals, function(u) CreateMap(u, params))
+ write.table(map[[1]]$map_pos, file = paste(mapfile, "_1.csv", sep = ""),
+ sep = ",", col.names = FALSE, na = "", quote = FALSE)
+ write.table(map[[2]]$map_pos, file = paste(mapfile, "_2.csv", sep = ""),
+ sep = ",", col.names = FALSE, na = "", quote = FALSE)
+
+ # Write reports into a csv file
+ # Format:
+ # cohort, bloom filter var1, bloom filter var2
+ reports <- lapply(1:2, function(i)
+ EncodeAll(samples[[i]], cohorts, map[[i]]$map, params))
+ # Organize cohorts and reports into format
+ write_matrix <- cbind(as.matrix(cohorts),
+ as.matrix(lapply(reports[[1]],
+ function(x) paste(x, collapse = ""))),
+ as.matrix(lapply(reports[[2]],
+ function(x) paste(x, collapse = ""))))
+ write.table(write_matrix, file = reportsfile, quote = FALSE,
+ row.names = FALSE, col.names = FALSE, sep = ",")
+}
+
+main <- function(opts) {
+ ptm <- proc.time()
+
+ uvals <- GetUniqueValsFromFile(opts$uvals)
+ params <- ReadParameterFile(opts$params)
+ SimulateReports(opts$num, uvals, params, opts$unif, # inputs
+ opts$map, opts$reports) # outputs
+
+ print("PROC.TIME")
+ print(proc.time() - ptm)
+}
+
+if (!interactive()) {
+ main(opts)
+}
diff --git a/tests/compare_dist.R b/tests/compare_dist.R
new file mode 100755
index 0000000..eb6521d
--- /dev/null
+++ b/tests/compare_dist.R
@@ -0,0 +1,264 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simple tool that wraps the analysis/R library.
+#
+# To run this you need:
+# - ggplot2
+# - optparse
+# - glmnet -- dependency of analysis library
+
+library(optparse)
+
+# For unit tests
+is_main <- (length(sys.frames()) == 0)
+
+# Do command line parsing first to catch errors. Loading libraries in R is
+# slow.
+if (is_main) {
+ option_list <- list(
+ make_option(c("-t", "--title"), help="Plot Title")
+ )
+ parsed <- parse_args(OptionParser(option_list = option_list),
+ positional_arguments = 3) # case prefix, instance prefix, output dir
+}
+
+library(ggplot2)
+
+# Use CairoPNG if available. Useful for headless R.
+if (library(Cairo, quietly = TRUE, logical.return = TRUE)) {
+ png_func <- CairoPNG
+ cat('Using CairoPNG\n')
+} else {
+ png_func <- png
+ cat('Using png\n')
+}
+
+source("analysis/R/read_input.R")
+source("analysis/R/decode.R")
+source("analysis/R/util.R")
+
+source("analysis/R/alternative.R") # temporary
+
+LoadContext <- function(prefix_case) {
+ # Creates the context, filling it with privacy parameters
+ # Arg:
+ # prefix_case: path prefix to the test case, e.g. '_tmp/exp'
+
+ p <- paste0(prefix_case, '_params.csv')
+
+ params <- ReadParameterFile(p)
+
+ ctx <- new.env()
+
+ ctx$params <- params # so we can write it out later
+
+ ctx
+}
+
+RunRappor <- function(prefix_case, prefix_instance, ctx) {
+ # Reads counts, map files, runs RAPPOR analysis engine.
+ # Args:
+ # prefix_case: path prefix to the test case, e.g., '_tmp/exp'
+ # prefix_instance: path prefix to the test instance, e.g., '_tmp/exp/1'
+ # ctx: context file with params field filled in
+
+ c <- paste0(prefix_instance, '_counts.csv')
+ counts <- ReadCountsFile(c, ctx$params)
+
+ m <- paste0(prefix_case, '_map.csv')
+
+ # Switch to LoadMapFile if you want to cache the result
+ map <- ReadMapFile(m, ctx$params)
+
+ # Main decode.R API
+ timing <- system.time({
+ res <- Decode(counts, map$map, ctx$params)
+ })
+
+ # This line is searched for, and the elapsed time extracted, by
+ # make_summary.py. Should the formatting or wording change, make_summary.py
+ # must be updated too.
+ Log("Inference took %.3f seconds", timing[["elapsed"]])
+
+ if (is.null(res)) {
+ stop("RAPPOR analysis failed.")
+ }
+
+ Log("Decoded results:")
+ str(res$fit)
+
+ res$fit
+}
+
+LoadActual <- function(prefix_instance) {
+ hist_path <- paste0(prefix_instance, '_hist.csv') # case.csv
+
+ # gen_counts.R (fast_counts mode) outputs this, since we never have true
+ # client values.
+ if (file.exists(hist_path)) {
+ return(read.csv(hist_path))
+ }
+
+ # Load ground truth into context
+ input_path <- paste0(prefix_instance, '_true_values.csv') # case.csv
+ client_values <- read.csv(input_path)
+
+ # Create a histogram, or R "table". Column 2 is the true value.
+ t <- table(client_values$value)
+
+ d <- as.data.frame(t) # convert it to a data frame with 'string' and 'count' columns
+ colnames(d) <- c('string', 'count')
+
+ d # return this data frame
+}
+
+CompareRapporVsActual <- function(ctx) {
+ # Prepare input data to be plotted
+
+ actual <- ctx$actual # from the ground truth file
+ rappor <- ctx$rappor # from output of AnalyzeRAPPOR
+
+ # "s12" -> 12, for graphing
+ StringToInt <- function(x) as.integer(substring(x, 2))
+
+ actual_values <- StringToInt(actual$string)
+ rappor_values <- StringToInt(rappor$string)
+
+ # False negatives: AnalyzeRAPPOR failed to find this value (e.g. because it
+ # occurs too rarely)
+ actual_only <- setdiff(actual_values, rappor_values)
+
+ # False positives: AnalyzeRAPPOR attributed a proportion to a string in the
+ # map that wasn't in the true input.
+ rappor_only <- setdiff(rappor_values, actual_values)
+
+ total <- sum(actual$count)
+ a <- data.frame(index = actual_values,
+ # Calculate the true proportion
+ proportion = actual$count / total,
+ dist = "actual")
+
+ r <- data.frame(index = rappor_values,
+ proportion = rappor$proportion,
+ dist = rep("rappor", length(rappor_values)))
+
+ # Extend a and r with the values that they are missing.
+ if (length(rappor_only) > 0) {
+ z <- data.frame(index = rappor_only,
+ proportion = 0.0,
+ dist = "actual")
+ a <- rbind(a, z)
+ }
+ if (length(actual_only) > 0) {
+ z <- data.frame(index = actual_only,
+ proportion = 0.0,
+ dist = "rappor")
+ r <- rbind(r, z)
+ }
+
+ # IMPORTANT: Now a and r have the same rows, but in the wrong order. Sort by index.
+ a <- a[order(a$index), ]
+ r <- r[order(r$index), ]
+
+ # L1 distance between actual and rappor distributions
+ l1 <- sum(abs(a$proportion - r$proportion))
+ # The max L1 distance between two distributions is 2; the max total variation
+ # distance is 1.
+ total_variation <- l1 / 2
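+ # For example, actual proportions (0.2, 0.3, 0.5, 0.0) against rappor
+ # proportions (0.0, 0.1, 0.2, 0.3) give l1 = 0.2 + 0.2 + 0.3 + 0.3 = 1.0,
+ # so total_variation = 0.5.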
+
+ # Choose false positive strings and their proportion from rappor estimates
+ false_pos <- r[r$index %in% rappor_only, c('index', 'proportion')]
+ false_neg <- a[a$index %in% actual_only, c('index', 'proportion')]
+
+ Log("False positives:")
+ str(false_pos)
+
+ Log("False negatives:")
+ str(false_neg)
+
+ # NOTE: We should call Decode() directly, and then num_rappor is
+ # metrics$num_detected, and sum_proportion is metrics$allocated_mass.
+ metrics <- list(
+ num_actual = nrow(actual), # data frames
+ num_rappor = nrow(rappor),
+ num_false_pos = nrow(false_pos),
+ num_false_neg = nrow(false_neg),
+ total_variation = total_variation,
+ sum_proportion = sum(rappor$proportion)
+ )
+
+ Log("Metrics:")
+ str(metrics)
+
+ # Return plot data and metrics
+ list(plot_data = rbind(r, a), metrics = metrics)
+}
+
+# Colors selected to be friendly to the color blind:
+# http://www.cookbook-r.com/Graphs/Colors_%28ggplot2%29/
+palette <- c("#E69F00", "#56B4E9")
+
+PlotAll <- function(d, title) {
+ # NOTE: geom_bar makes a histogram by default; need stat = "identity"
+ g <- ggplot(d, aes(x = index, y = proportion, fill = factor(dist)))
+ b <- geom_bar(stat = "identity", width = 0.7,
+ position = position_dodge(width = 0.8))
+ t <- ggtitle(title)
+ g + b + t + scale_fill_manual(values=palette)
+}
+
+WritePlot <- function(p, outdir, width = 800, height = 600) {
+ filename <- file.path(outdir, 'dist.png')
+ png_func(filename, width=width, height=height)
+ plot(p)
+ dev.off()
+ Log('Wrote %s', filename)
+}
+
+WriteSummary <- function(metrics, outdir) {
+ filename <- file.path(outdir, 'metrics.csv')
+ write.csv(metrics, file = filename, row.names = FALSE)
+ Log('Wrote %s', filename)
+}
+
+main <- function(parsed) {
+ args <- parsed$args
+ options <- parsed$options
+
+ input_case_prefix <- args[[1]]
+ input_instance_prefix <- args[[2]]
+ output_dir <- args[[3]]
+
+ # increase ggplot font size globally
+ theme_set(theme_grey(base_size = 16))
+
+ # NOTE: It takes 2000+ ms just to get here, while the analysis itself only
+ # takes 500 ms or so (as measured by system.time).
+
+ ctx <- LoadContext(input_case_prefix)
+ ctx$rappor <- RunRappor(input_case_prefix, input_instance_prefix, ctx)
+ ctx$actual <- LoadActual(input_instance_prefix)
+
+ d <- CompareRapporVsActual(ctx)
+ p <- PlotAll(d$plot_data, options$title)
+
+ WriteSummary(d$metrics, output_dir)
+ WritePlot(p, output_dir)
+}
+
+if (is_main) {
+ main(parsed)
+}
diff --git a/tests/compare_dist_test.R b/tests/compare_dist_test.R
new file mode 100755
index 0000000..e67c95f
--- /dev/null
+++ b/tests/compare_dist_test.R
@@ -0,0 +1,43 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library(RUnit)
+
+source('tests/compare_dist.R')
+
+TestProcessAll <- function() {
+ ctx <- new.env()
+ ctx$actual <- data.frame(string = c('v1', 'v2', 'v3'), proportion = c(0.2, 0.3, 0.5),
+ count = c(2, 3, 5))
+ ctx$rappor <- data.frame(string = c('v2', 'v3', 'v4'), proportion = c(0.1, 0.2, 0.3))
+
+ metrics <- CompareRapporVsActual(ctx)$metrics
+ str(metrics)
+
+ # sum of rappor proportions
+ checkEqualsNumeric(0.6, metrics$sum_proportion)
+
+ # v1 v2 v3 v4
+ # 0.2 0.3 0.5 0.0
+ # 0.0 0.1 0.2 0.3
+
+ # (0.2 + 0.2 + 0.3 + 0.3) / 2
+ checkEqualsNumeric(0.5, metrics$total_variation)
+
+ print(metrics$total_variation)
+}
+
+TestProcessAll()
diff --git a/tests/fastrand.py b/tests/fastrand.py
new file mode 100755
index 0000000..8137427
--- /dev/null
+++ b/tests/fastrand.py
@@ -0,0 +1,35 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""fastrand.py - Python wrapper for _fastrand."""
+
+# NOTE: We could retire this module in favor of the C++ client. One reason to
+# keep it is if it supports a wider range of params (e.g. more than 32 or 64
+# bits).
+
+import _fastrand
+
+
+class FastIrrRand(object):
+ """Fast insecure version of rappor.SecureIrrRand."""
+
+ def __init__(self, params):
+ randbits = _fastrand.randbits # accelerated function
+ num_bits = params.num_bloombits
+
+ # IRR probabilities
+ self.p_gen = lambda: randbits(params.prob_p, num_bits)
+ self.q_gen = lambda: randbits(params.prob_q, num_bits)
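+
+# A minimal usage sketch (assumes a rappor.Params instance with num_bloombits,
+# prob_p and prob_q set, as in tests/rappor_sim.py):
+#
+#   params = rappor.Params()
+#   irr_rand = FastIrrRand(params)
+#   irr = irr_rand.p_gen()  # int whose num_bloombits bits are each 1 w.p. p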
diff --git a/tests/fastrand_test.py b/tests/fastrand_test.py
new file mode 100755
index 0000000..d3bdbf1
--- /dev/null
+++ b/tests/fastrand_test.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python -S
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+fastrand_test.py: Tests for _fastrand extension module.
+"""
+import unittest
+
+import _fastrand # module under test
+
+
+BIT_WIDTHS = [8, 16, 32, 64]
+
+
+class FastRandTest(unittest.TestCase):
+
+ def testRandbits(self):
+ for n in BIT_WIDTHS:
+ #print '== %d' % n
+ for p1 in [0.1, 0.5, 0.9]:
+ #print '-- %f' % p1
+ for i in xrange(5):
+ r = _fastrand.randbits(p1, n)
+ # Rough sanity check
+ self.assertLess(r, 2 ** n)
+
+ # Visual check
+ #b = bin(r)
+ #print b
+ #print b.count('1')
+
+ def testRandbitsEdgeCases(self):
+ for n in BIT_WIDTHS:
+ r = _fastrand.randbits(0.0, n)
+ self.assertEqual(0, r)
+
+ for n in BIT_WIDTHS:
+ r = _fastrand.randbits(1.0, n)
+ self.assertEqual(2 ** n - 1, r)
+
+ def testRandbitsError(self):
+ r = _fastrand.randbits(-1, 64)
+ # TODO: Should probably raise exceptions
+ self.assertEqual(None, r)
+
+ r = _fastrand.randbits(0.0, 65)
+ self.assertEqual(None, r)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/tests/gen_counts.R b/tests/gen_counts.R
new file mode 100755
index 0000000..769677c
--- /dev/null
+++ b/tests/gen_counts.R
@@ -0,0 +1,213 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source('analysis/R/read_input.R')
+
+RandomPartition <- function(total, weights) {
+ # Outputs a random partition according to a specified distribution
+ # Args:
+ # total - number of samples
+ # weights - weights that are proportional to the probability density
+ # function of the target distribution
+ # Returns:
+ # a histogram sampled according to the pdf
+ # Example:
+ # > RandomPartition(100, c(3, 2, 1, 0, 1))
+ # [1] 47 24 15 0 14
+ if (any(weights < 0))
+ stop("Probabilities cannot be negative")
+
+ if (sum(weights) == 0)
+ stop("Probabilities cannot sum up to 0")
+
+ bins <- length(weights)
+ result <- rep(0, bins)
+
+ # idiomatic way:
+ # rnd_list <- sample(strs, total, replace = TRUE, weights)
+ # apply(as.array(strs), 1, function(x) length(rnd_list[rnd_list == x]))
+ #
+ # The following is much faster for large totals. We could replace the loop
+ # with (tail) recursion, but R chokes once the recursion depth exceeds ~850.
+
+ w <- sum(weights)
+
+ for (i in 1:bins)
+ if (total > 0) { # if total == 0, nothing else to do
+ # invariant: w = sum(weights[i:bins])
+ # rather than computing sum every time leading to quadratic time, keep
+ # updating it
+
+ # The probability p is clamped to [0, 1] to avoid under/overflow errors.
+ p <- min(max(weights[i] / w, 0), 1)
+ # draw the number of balls falling into the current bin
+ rnd_draw <- rbinom(n = 1, size = total, prob = p)
+ result[i] <- rnd_draw # push rnd_draw balls from total to result[i]
+ total <- total - rnd_draw
+ w <- w - weights[i]
+ }
+
+ names(result) <- names(weights)
+
+ return(result)
+}
+
+GenerateCounts <- function(params, true_map, partition, reports_per_client) {
+ # Fast simulation of the marginal table for RAPPOR reports
+ # Args:
+ # params - parameters of the RAPPOR reporting process
+ # true_map - hashed true inputs
+ # partition - allocation of clients between true values
+ # reports_per_client - number of reports (IRRs) per client
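+ # Returns:
+ # an m x (k + 1) matrix: column 1 holds the total number of reports in
+ # each cohort; columns 2 .. k + 1 hold the counts of 1s per Bloom filter bit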
+ if (nrow(true_map) != (params$m * params$k)) {
+ stop(paste("Map does not match the params file!",
+ "mk =", params$m * params$k,
+ "nrow(map):", nrow(true_map)))
+ }
+
+ # For each true value, computes its allocation of clients to cohorts.
+ # Output is an m x (number of values) matrix.
+ cohorts <- as.matrix(
+ apply(as.data.frame(partition), 1,
+ function(count) RandomPartition(count, rep(1, params$m))))
+
+ # Expands to (m x k) x strs matrix, where each element (corresponding to the
+ # bit in the aggregate Bloom filter) is repeated k times.
+ expanded <- apply(cohorts, 2, function(vec) rep(vec, each = params$k))
+
+ # For each bit, the number of clients reporting this bit:
+ clients_per_bit <- rep(apply(cohorts, 1, sum), each = params$k)
+
+ # Computes the true number of bits set to one BEFORE PRR.
+ true_ones <- apply(expanded * true_map, 1, sum)
+
+ ones_in_prr <-
+ unlist(lapply(true_ones,
+ function(x) rbinom(n = 1, size = x, prob = 1 - params$f / 2))) +
+ unlist(lapply(clients_per_bit - true_ones, # clients where the bit is 0
+ function(x) rbinom(n = 1, size = x, prob = params$f / 2)))
+
+ # Number of IRRs where each bit is reported (either as 0 or as 1)
+ reports_per_bit <- clients_per_bit * reports_per_client
+
+ ones_before_irr <- ones_in_prr * reports_per_client
+
+ ones_after_irr <-
+ unlist(lapply(ones_before_irr,
+ function(x) rbinom(n = 1, size = x, prob = params$q))) +
+ unlist(lapply(reports_per_bit - ones_before_irr,
+ function(x) rbinom(n = 1, size = x, prob = params$p)))
+
+ counts <- cbind(apply(cohorts, 1, sum) * reports_per_client,
+ matrix(ones_after_irr, nrow = params$m, ncol = params$k, byrow = TRUE))
+
+ if (any(is.na(counts)))
+ stop("Failed to generate bit counts. Likely due to integer overflow.")
+
+ counts
+}
+
+ComputePdf <- function(distr, range) {
+ # Outputs discrete probability density function for a given distribution
+
+ # These are the five distributions in gen_sim_input.py
+ if (distr == 'exp') {
+ pdf <- dexp(1:range, rate = 5 / range)
+ } else if (distr == 'gauss') {
+ half <- range / 2
+ left <- -half + 1
+ pdf <- dnorm(left : half, sd = range / 6)
+ } else if (distr == 'unif') {
+ # e.g. for N = 4, weights are [0.25, 0.25, 0.25, 0.25]
+ pdf <- dunif(1:range, max = range)
+ } else if (distr == 'zipf1') {
+ # Since the distribution is defined over a finite set, we allow the
+ # parameter of the Zipf distribution to be 1.
+ pdf <- sapply(1:range, function(x) 1 / x)
+ } else if (distr == 'zipf1.5') {
+ pdf <- sapply(1:range, function(x) 1 / x^1.5)
+ } else {
+ stop(sprintf("Invalid distribution '%s'", distr))
+ }
+
+ pdf <- pdf / sum(pdf) # normalize
+
+ pdf
+}
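+
+# For example, ComputePdf('zipf1', 4) produces weights proportional to
+# (1, 1/2, 1/3, 1/4), normalized to (0.48, 0.24, 0.16, 0.12).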
+
+# Usage:
+#
+# $ ./gen_counts.R exp 10000 1 foo_params.csv foo_true_map.csv foo
+#
+# Inputs:
+# distribution name
+# number of clients
+# reports per client
+# parameters file
+# map file
+# prefix for output files
+# Outputs:
+# foo_counts.csv
+# foo_hist.csv
+#
+# Warning: the number of reports in any cohort must be less than
+# .Machine$integer.max
+
+main <- function(argv) {
+ distr <- argv[[1]]
+ num_clients <- as.integer(argv[[2]])
+ reports_per_client <- as.integer(argv[[3]])
+ params_file <- argv[[4]]
+ true_map_file <- argv[[5]]
+ out_prefix <- argv[[6]]
+
+ params <- ReadParameterFile(params_file)
+
+ true_map <- ReadMapFile(true_map_file, params)
+
+ num_unique_values <- length(true_map$strs)
+
+ pdf <- ComputePdf(distr, num_unique_values)
+
+ # Computes the number of clients reporting each string
+ # according to the pre-specified distribution.
+ partition <- RandomPartition(num_clients, pdf)
+
+ # Histogram
+ true_hist <- data.frame(string = true_map$strs, count = partition)
+
+ counts <- GenerateCounts(params, true_map$map, partition, reports_per_client)
+
+ # Now create a CSV file
+
+ # Opposite of ReadCountsFile in read_input.R
+ # http://stackoverflow.com/questions/6750546/export-csv-without-col-names
+ counts_path <- paste0(out_prefix, '_counts.csv')
+ write.table(counts, file = counts_path,
+ row.names = FALSE, col.names = FALSE, sep = ',')
+ cat(sprintf('Wrote %s\n', counts_path))
+
+ # TODO: Don't write strings that appear 0 times?
+ hist_path <- paste0(out_prefix, '_hist.csv')
+ write.csv(true_hist, file = hist_path, row.names = FALSE)
+ cat(sprintf('Wrote %s\n', hist_path))
+}
+
+if (length(sys.frames()) == 0) {
+ main(commandArgs(TRUE))
+}
diff --git a/tests/gen_counts_test.R b/tests/gen_counts_test.R
new file mode 100755
index 0000000..87e6e8b
--- /dev/null
+++ b/tests/gen_counts_test.R
@@ -0,0 +1,109 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+library(RUnit)
+library(Matrix) # for sparse matrices
+
+source('tests/gen_counts.R')
+
+TestGenerateCounts <- function() {
+ report_params <- list(k = 4, m = 2) # 2 cohorts, 4 bits each
+ map <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values
+ map[1,] <- c(1, 0, 0)
+ map[2,] <- c(0, 1, 0)
+ map[3,] <- c(0, 0, 1)
+ map[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all
+ map[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3
+
+ colnames(map) <- c('v1', 'v2', 'v3')
+
+ partition <- c(3, 2, 1) * 10000
+ v <- 100 # reports per client
+
+ noise0 <- list(p = 0, q = 1, f = 0) # no noise at all
+ counts0 <- GenerateCounts(c(report_params, noise0), map, partition, v)
+
+ checkEqualsNumeric(sum(counts0[1,2:4]), counts0[1,1])
+ checkEqualsNumeric(counts0[1,5], counts0[1,1])
+ checkEqualsNumeric(partition[3] * v, counts0[1,4] + counts0[2,2])
+ checkEqualsNumeric(sum(partition) * v, counts0[1,1] + counts0[2,1])
+
+ pvalues <- chisq.test(counts0[,1] / v, p = c(.5, .5))$p.value
+ for(i in 2:4)
+ pvalues <- c(pvalues,
+ chisq.test(
+ c(counts0[1,i] / v, partition[i - 1] - counts0[1,i] / v),
+ p = c(.5, .5))$p.value)
+
+ noise1 <- list(p = .5, q = .5, f = 0) # truly random IRRs
+ counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v)
+
+ for(i in 2:5)
+ for(j in 1:2)
+ pvalues <- c(pvalues,
+ chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]),
+ p = c(.5, .5))$p.value)
+
+ noise2 <- list(p = 0, q = 1, f = 1.0) # truly random PRRs
+ counts2 <- GenerateCounts(c(report_params, noise2), map, partition, v)
+
+ checkEqualsNumeric(0, max(counts2 %% v)) # all entries must be divisible by v
+
+ counts2 <- counts2 / v
+
+ for(i in 2:5)
+ for(j in 1:2)
+ pvalues <- c(pvalues,
+ chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]),
+ p = c(.5, .5))$p.value)
+
+ checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed")
+}
+
+TestRandomPartition <- function() {
+
+ p1 <- RandomPartition(total = 100, dgeom(0:999, prob = .1))
+ p2 <- RandomPartition(total = 1000, dnorm(1:1000, mean = 500, sd = 1000 / 6))
+ p3 <- RandomPartition(total = 10000, dunif(1:1000))
+
+ # Totals must check out.
+ checkEqualsNumeric(100, sum(p1))
+ checkEqualsNumeric(1000, sum(p2))
+ checkEqualsNumeric(10000, sum(p3))
+
+ # Initialize the weights vector to 1 0 1 0 1 0 ...
+ weights <- rep(c(1, 0), 100)
+
+ p4 <- RandomPartition(total = 10000, weights)
+
+ # Check that all mass is allocated to non-zero weights.
+ checkEqualsNumeric(10000, sum(p4[weights == 1]))
+ checkTrue(all(p4[weights == 0] == 0))
+
+ p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4))
+ p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value
+
+ # Apply the chi squared test and fail if p.value is too high or too low.
+ # Probability of failure is 2 * 1E-9, which should never happen.
+ checkTrue(p.value > 1E-9)
+}
+
+TestAll <- function(){
+ TestRandomPartition()
+ TestGenerateCounts()
+}
+
+TestAll()
\ No newline at end of file
diff --git a/tests/gen_true_values.R b/tests/gen_true_values.R
new file mode 100755
index 0000000..1ab1b33
--- /dev/null
+++ b/tests/gen_true_values.R
@@ -0,0 +1,82 @@
+#!/usr/bin/env Rscript
+#
+# Copyright 2015 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+source('tests/gen_counts.R')
+
+# Usage:
+#
+# $ ./gen_true_values.R exp 100 10000 1 64 foo.csv
+#
+# Inputs:
+# distribution name
+# size of the distribution's support
+# number of clients
+# reports per client
+# number of cohorts
+# name of the output file
+# Output:
+# csv file of true values sampled according to the specified distribution.
+
+GenerateTrueValues <- function(distr, distr_range, num_clients,
+ reports_per_client, num_cohorts) {
+
+ # Sums to 1.0, e.g. [0.2 0.2 0.2 0.2 0.2] for a uniform distribution over 5 values.
+ pdf <- ComputePdf(distr, distr_range)
+
+ num_reports <- num_clients * reports_per_client
+
+ # Computes the number of clients reporting each value, where the numbers are
+ # sampled according to pdf. (sums to num_reports)
+ partition <- RandomPartition(num_reports, pdf)
+
+ value_ints <- rep(1:distr_range, partition) # expand partition
+
+ stopifnot(length(value_ints) == num_reports)
+
+ # Shuffle values randomly (may take a few sec for > 10^8 inputs)
+ value_ints <- sample(value_ints)
+
+ # Reported values are strings, so prefix the integers with "v". This is
+ # even slower than shuffling.
+ values <- sprintf("v%d", value_ints)
+
+ # e.g. [1 1 2 2 3 3] if num_clients is 3 and reports_per_client is 2
+ client_ints <- rep(1:num_clients, each = reports_per_client)
+
+ # Cohorts are assigned to clients. Cohorts are 0-based.
+ cohorts <- client_ints %% num_cohorts # %% is integer modulus
+
+ clients <- sprintf("c%d", client_ints)
+
+ data.frame(client = clients, cohort = cohorts, value = values)
+}
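+
+# Written as CSV by main() below, the output looks like this (illustrative;
+# values vary from run to run):
+#
+#   client,cohort,value
+#   c1,1,v7
+#   c1,1,v3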
+
+main <- function(argv) {
+ distr <- argv[[1]]
+ distr_range <- as.integer(argv[[2]])
+ num_clients <- as.integer(argv[[3]])
+ reports_per_client <- as.integer(argv[[4]])
+ num_cohorts <- as.integer(argv[[5]])
+ out_file <- argv[[6]]
+
+ reports <- GenerateTrueValues(distr, distr_range, num_clients,
+ reports_per_client, num_cohorts)
+
+ write.csv(reports, file = out_file, row.names = FALSE, quote = FALSE)
+}
+
+if (length(sys.frames()) == 0) {
+ main(commandArgs(TRUE))
+}
diff --git a/tests/gen_true_values_test.R b/tests/gen_true_values_test.R
new file mode 100755
index 0000000..e46d1e2
--- /dev/null
+++ b/tests/gen_true_values_test.R
@@ -0,0 +1,50 @@
+#!/usr/bin/Rscript
+#
+# gen_true_values_test.R
+
+source('analysis/R/util.R') # Log()
+
+source('tests/gen_true_values.R') # module under test
+
+library(RUnit)
+
+TestGenerateTrueValues <- function() {
+ num_clients <- 10
+ reports_per_client <- 2
+ num_cohorts <- 4
+ reports <- GenerateTrueValues('exp', 10, num_clients, reports_per_client,
+ num_cohorts)
+ print(reports)
+
+ # 10 clients, 2 reports per client
+ checkEquals(20, nrow(reports))
+
+ # 10 unique clients
+ checkEquals(10, length(unique(reports$client)))
+
+ # Whether a given client reports different values
+ reports_different_values <- rep(FALSE, num_clients)
+
+ for (c in 1:num_clients) {
+ my_reports <- reports[reports$client == sprintf("c%d", c), ]
+ #Log("CLIENT %d", c)
+ #print(my_reports)
+
+ # If this client's reports are not all the same value, make note of it
+ if (length(unique(my_reports$value)) != 1) {
+ reports_different_values[[c]] <- TRUE
+ }
+ }
+
+ # At least one client should report different values. (Technically this
+ # could fail, but is unlikely with 10 clients).
+ checkTrue(any(reports_different_values))
+
+ checkEquals(num_cohorts, length(unique(reports$cohort)))
+}
+
+TestAll <- function(){
+ TestGenerateTrueValues()
+}
+
+TestAll()
diff --git a/tests/make_summary.py b/tests/make_summary.py
new file mode 100755
index 0000000..b55559c
--- /dev/null
+++ b/tests/make_summary.py
@@ -0,0 +1,401 @@
+#!/usr/bin/python
+"""Given a regtest result tree, prints an HTML summary to a file.
+
+See HTML skeleton in tests/regtest.html.
+"""
+
+import os
+import re
+import sys
+
+
+SUMMARY_ROW = """\
+<tfoot style="font-weight: bold; text-align: right">
+<tr>
+ <td>
+ %(name)s
+ </td>
+
+ <!-- input params -->
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+
+ <!-- RAPPOR params -->
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+ <td></td>
+
+ <!-- MAP params -->
+ <td></td>
+ <td></td>
+
+ <!-- Result metrics -->
+ <td></td>
+ <td></td>
+ <td>%(mean_fpr)s</td>
+ <td>%(mean_fnr)s</td>
+ <td>%(mean_tv)s</td>
+ <td>%(mean_am)s</td>
+ <td>%(mean_time)s</td>
+</tr>
+</tfoot>
+"""
+
+# Navigation and links to plot.
+DETAILS = """\
+<p style="text-align: right">
+ <a href="#top">Up</a>
+</p>
+
+<a id="%(anchor)s"></a>
+
+<p style="text-align: center">
+ <img src="%(instance_dir)s/dist.png"/>
+</p>
+
+<p>
+<a href="%(instance_dir)s">%(name)s files</a>
+</p>
+"""
+
+
+def FormatFloat(x, percent):
+ """Formats a floating-point number."""
+ if percent:
+ return '{:.1f}%'.format(x * 100.0)
+ else:
+ return '{:.3f}'.format(x)
+
+
+def FormatMeanWithSem(m_std_error, percent=False):
+ """Formats an estimate with standard error."""
+ if m_std_error is None:
+ return ''
+ m, std_error = m_std_error
+ if std_error is None:
+ return FormatFloat(m, percent)
+ else:
+ return '{}&plusmn;{}'.format(
+ FormatFloat(m, percent),
+ FormatFloat(std_error, percent))
+
+
+def Mean(l):
+ """Computes the mean (average) for a list of numbers."""
+ if l:
+ return float(sum(l)) / len(l)
+ else:
+ return None
+
+
+def SampleVar(l):
+ """Computes the sample variance for a list of numbers."""
+ if len(l) > 1:
+ mean = Mean(l)
+ var = sum([(x - mean) ** 2 for x in l]) / (len(l) - 1)
+ return var
+ else:
+ return None
+
+
+def StandardErrorEstimate(l):
+ """Returns the standard error estimate for a list of numbers.
+
+ For a singleton the standard error is assumed to be 10% of its value.
+ """
+ if len(l) > 1:
+ return (SampleVar(l) / len(l)) ** .5
+ elif l:
+ return l[0] / 10.0
+ else:
+ return None
+
+
+def MeanOfMeans(dict_of_lists):
+ """Returns the average of averages with the standard error of the estimate.
+ """
+ means = [Mean(dict_of_lists[key]) for key in dict_of_lists
+ if dict_of_lists[key]]
+ if means:
+ # Compute variances of the estimate for each sublist.
+ se = [StandardErrorEstimate(dict_of_lists[key]) ** 2 for key
+ in dict_of_lists if dict_of_lists[key]]
+ return (Mean(means), # Mean over all sublists
+ sum(se) ** .5 / len(se)) # Standard error of the combined mean
+ else:
+ return None
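+
+
+# A worked example: MeanOfMeans({'a': [1, 3], 'b': [5]}) returns
+# (3.5, ~0.559). The per-key means are 2 and 5, whose mean is 3.5; the
+# standard errors are 1.0 for [1, 3] and 0.5 for the singleton [5] (the 10%
+# rule), and sqrt(1.0**2 + 0.5**2) / 2 is about 0.559.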
+
+
+def ParseSpecFile(spec_filename):
+ """Parses the spec (parameters) file.
+
+ Returns:
+ An integer and a string. The integer is the number of bogus candidates
+ and the string is parameters in the HTML format.
+ """
+ with open(spec_filename) as s:
+ spec_row = s.readline().split()
+
+ # Second to last column is 'num_additional' -- the number of bogus
+ # candidates added
+ num_additional = int(spec_row[-2])
+
+ spec_in_html = ' '.join('<td>%s</td>' % cell for cell in spec_row[1:])
+
+ return num_additional, spec_in_html
+
+
+def ExtractTime(log_filename):
+ """Extracts the elapsed time information from the log file.
+
+ Returns:
+ Elapsed time (in seconds) or None in case of failure.
+ """
+ if os.path.isfile(log_filename):
+ with open(log_filename) as log:
+ log_str = log.read()
+ # Matching the timing line logged by compare_dist.R's RunRappor.
+ match = re.search(r'Inference took ([0-9.]+) seconds', log_str)
+ if match:
+ return float(match.group(1))
+ return None
+
+
+def ParseMetrics(metrics_file, log_file, num_additional):
+ """Processes the metrics file.
+
+ Args:
+ metrics_file: name of the metrics file
+ log_file: name of the log.txt file
+ num_additional: A number of bogus candidates added to the candidate list.
+
+ Returns a pair:
+ - A dictionary of metrics (some can be []).
+ - An HTML-formatted portion of the report row.
+ """
+
+ if not os.path.isfile(metrics_file):
+ metrics_row_str = ['', '', '', '', '', '']
+ metrics_row_dict = {}
+ else:
+ with open(metrics_file) as m:
+ m.readline()
+ metrics_row = m.readline().split(',')
+
+ (num_actual, num_rappor, num_false_pos, num_false_neg, total_variation,
+ allocated_mass) = metrics_row
+
+ num_actual = int(num_actual)
+ num_rappor = int(num_rappor)
+
+ num_false_pos = int(num_false_pos)
+ num_false_neg = int(num_false_neg)
+
+ total_variation = float(total_variation)
+ allocated_mass = float(allocated_mass)
+
+ # e.g. if there are 20 additional candidates added, and 1 false positive,
+ # the false positive rate is 5%.
+ fp_rate = float(num_false_pos) / num_additional if num_additional else 0
+ # e.g. if there are 100 strings in the true input, and 80 strings
+ # detected by RAPPOR, then we have 20 false negatives, and a false
+ # negative rate of 20%.
+ fn_rate = float(num_false_neg) / num_actual
+
+ metrics_row_str = [
+ str(num_actual),
+ str(num_rappor),
+ '%.1f%% (%d)' % (fp_rate * 100, num_false_pos) if num_additional
+ else '',
+ '%.1f%% (%d)' % (fn_rate * 100, num_false_neg),
+ '%.3f' % total_variation,
+ '%.3f' % allocated_mass,
+ ]
+
+ metrics_row_dict = {
+ 'tv': [total_variation],
+ 'fpr': [fp_rate] if num_additional else [],
+ 'fnr': [fn_rate],
+ 'am': [allocated_mass],
+ }
+
+ elapsed_time = ExtractTime(log_file)
+ if elapsed_time is not None:
+ metrics_row_str = metrics_row_str + ['%.2f' % elapsed_time]
+ metrics_row_dict['time'] = [elapsed_time]
+
+ # return metrics formatted as HTML table entries
+ return (metrics_row_dict,
+ ' '.join('<td>%s</td>' % cell for cell in metrics_row_str))
+
+
+def FormatCell1(test_case, test_instance, metrics_file, log_file, plot_file,
+ link_to_plots):
+ """Outputs an HTML table entry for the first cell of the row.
+
+ The row is filled in if the metrics file exists. The first cell contains a
+ link that points to an inline plot for short tables, or to an external
+ file for large tables.
+
+ If the metrics file is missing, the link points to the log file (if one
+ exists).
+ """
+ relpath_report = '{}/{}_report'.format(test_case, test_instance)
+ if os.path.isfile(metrics_file):
+ external_file = plot_file
+ if link_to_plots:
+ link = '#{}_{}'.format(test_case, test_instance) # anchor
+ else:
+ link = os.path.join(relpath_report, 'dist.png')
+ else: # no results likely due to an error, puts a link to the log file
+ external_file = log_file
+ link = os.path.join(relpath_report, 'log.txt')
+
+ if os.path.isfile(external_file):
+ return '<td><a href="{}">{}</a></td>'.format(link, test_case)
+ else: # if no file to link to
+ return '<td>{}</td>'.format(test_case)
+
+
+def FormatSummaryRow(metrics_lists):
+ """Outputs an HTML-formatted summary row."""
+ means_with_sem = {} # SEM - standard error of the mean
+
+ for key in metrics_lists:
+ means_with_sem[key] = MeanOfMeans(metrics_lists[key])
+ # If none of the lists is longer than one element, drop the SEM component.
+ if (means_with_sem[key] and
+ max(len(l) for l in metrics_lists[key].values()) < 2):
+ means_with_sem[key] = [means_with_sem[key][0], None]
+
+ summary = {
+ 'name': 'Means',
+ 'mean_fpr': FormatMeanWithSem(means_with_sem['fpr'], percent=True),
+ 'mean_fnr': FormatMeanWithSem(means_with_sem['fnr'], percent=True),
+ 'mean_tv': FormatMeanWithSem(means_with_sem['tv'], percent=True),
+ 'mean_am': FormatMeanWithSem(means_with_sem['am'], percent=True),
+ 'mean_time': FormatMeanWithSem(means_with_sem['time']),
+ }
+ return SUMMARY_ROW % summary
+
+
+def FormatPlots(base_dir, test_instances):
+ """Outputs HTML-formatted plots."""
+ result = ''
+ for instance in test_instances:
+ # A test instance is identified by the test name and the test run.
+ test_case, test_instance, _ = instance.split(' ')
+ instance_dir = test_case + '/' + test_instance + '_report'
+ if os.path.isfile(os.path.join(base_dir, instance_dir, 'dist.png')):
+ result += DETAILS % {'anchor': test_case + '_' + test_instance,
+ 'name': '{} (instance {})'.format(test_case,
+ test_instance),
+ 'instance_dir': instance_dir}
+ return result
+
+
+def main(argv):
+ base_dir = argv[1]
+ output_file = open(argv[2], 'w')
+
+ # This file has the test case names, in the order that they should be
+ # displayed.
+ instances_file = os.path.join(base_dir, 'test-instances.txt')
+ if not os.path.isfile(instances_file):
+ raise RuntimeError('{} is missing'.format(instances_file))
+
+ with open(instances_file) as f:
+ test_instances = [line.strip() for line in f]
+
+ # Metrics are assembled into a dictionary of dictionaries. The top-level
+ # key is the metric name ('tv', 'fpr', etc.), the second level key is
+ # the test case. These keys reference a list of floats, which can be empty.
+ metrics = {
+ 'tv': {}, # total_variation for all test cases
+ 'fpr': {}, # dictionary of false positive rates
+ 'fnr': {}, # dictionary of false negative rates
+ 'am': {}, # dictionary of total allocated masses
+ 'time': {}, # dictionary of total elapsed time measurements
+ }
+
+ # If there are too many tests, the plots are not included in the results
+ # file. Instead, rows' names are links to the corresponding .png files.
+ include_plots = len(test_instances) < 20
+
+ instances_succeeded = 0
+ instances_failed = 0
+ instances_running = 0
+
+ for instance in test_instances:
+ # A test instance is identified by the test name and the test run.
+ test_case, test_instance, _ = instance.split(' ')
+
+ spec_file = os.path.join(base_dir, test_case, 'spec.txt')
+ if not os.path.isfile(spec_file):
+ raise RuntimeError('{} is missing'.format(spec_file))
+
+ num_additional, spec_html = ParseSpecFile(spec_file)
+ metrics_html = '' # will be filled in later on, if metrics exist
+
+ report_dir = os.path.join(base_dir, test_case, test_instance + '_report')
+
+ metrics_file = os.path.join(report_dir, 'metrics.csv')
+ log_file = os.path.join(report_dir, 'log.txt')
+ plot_file = os.path.join(report_dir, 'dist.png')
+
+ cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file,
+ plot_file, include_plots)
+
+ # ParseMetrics outputs an HTML table row and also updates lists
+ metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file,
+ num_additional)
+
+ # Update the metrics structure. Initialize dictionaries if necessary.
+ for m in metrics:
+ if m in metrics_dict:
+ if not test_case in metrics[m]:
+ metrics[m][test_case] = metrics_dict[m]
+ else:
+ metrics[m][test_case] += metrics_dict[m]
+
+ print >>output_file, '<tr>{}{}{}</tr>'.format(cell1_html,
+ spec_html, metrics_html)
+
+ # Update counters
+ if 'tv' in metrics_dict:
+ instances_succeeded += 1
+ elif 'time' in metrics_dict:
+ instances_failed += 1
+ elif os.path.isfile(log_file):
+ instances_running += 1
+
+ print >>output_file, FormatSummaryRow(metrics)
+
+ print >>output_file, '</tbody>'
+ print >>output_file, '</table>'
+ print >>output_file, '<p style="padding-bottom: 3em"></p>' # vertical space
+
+ # Plot links.
+ if include_plots:
+ print >>output_file, FormatPlots(base_dir, test_instances)
+ else:
+ print >>output_file, ('<p>Too many tests to include plots. '
+ 'Click links within rows for details.</p>')
+
+ print ('Instances'
+ ' succeeded: {} failed: {} running: {} total: {}'.
+ format(instances_succeeded, instances_failed, instances_running,
+ len(test_instances)))
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/tests/params.csv b/tests/params.csv
new file mode 100644
index 0000000..a2114c9
--- /dev/null
+++ b/tests/params.csv
@@ -0,0 +1,2 @@
+k, h, m, p, q, f
+16, 2, 4, 0.1, 0.9, 0.2
diff --git a/tests/rappor_sim.py b/tests/rappor_sim.py
new file mode 100755
index 0000000..66c7fc3
--- /dev/null
+++ b/tests/rappor_sim.py
@@ -0,0 +1,242 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Run the RAPPOR Python client on simulated input.
+
+It takes a 3-column CSV file as generated by gen_true_values.R, and outputs a
+5-column CSV of RAPPOR'd data.
+
+Input columns: client,cohort,value
+Output columns: client,cohort,bloom,prr,irr
+
+See http://google.github.io/rappor/doc/data-flow.html for details.
+"""
+
+import csv
+import collections
+import optparse
+import os
+import random
+import sys
+import time
+
+import rappor # client library
+try:
+ import fastrand
+except ImportError:
+ print >>sys.stderr, (
+ "Native fastrand module not imported; see README for speedups")
+ fastrand = None
+
+
+def log(msg, *args):
+ if args:
+ msg = msg % args
+ print >>sys.stderr, msg
+
+
+def CreateOptionsParser():
+ p = optparse.OptionParser()
+
+ p.add_option(
+ '--num-bits', type='int', metavar='INT', dest='num_bits', default=16,
+ help='Number of bloom filter bits.')
+ p.add_option(
+ '--num-hashes', type='int', metavar='INT', dest='num_hashes', default=2,
+ help='Number of hashes.')
+ p.add_option(
+ '--num-cohorts', type='int', metavar='INT', dest='num_cohorts',
+ default=64, help='Number of cohorts.')
+
+ p.add_option(
+ '-p', type='float', metavar='FLOAT', dest='prob_p', default=1,
+ help='Probability p')
+ p.add_option(
+ '-q', type='float', metavar='FLOAT', dest='prob_q', default=1,
+ help='Probability q')
+ p.add_option(
+ '-f', type='float', metavar='FLOAT', dest='prob_f', default=1,
+ help='Probability f')
+ p.add_option(
+ '--assoc-testdata', type='int', dest='assoc_testdata', default=0,
+ help='Generate association testdata from true values on stdin.')
+
+ choices = ['simple', 'fast']
+ p.add_option(
+ '-r', type='choice', metavar='STR',
+ dest='random_mode', default='fast', choices=choices,
+ help='Random algorithm (%s)' % '|'.join(choices))
+
+ return p
+
+
+def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
+ csv_in, csv_out):
+ """Read true values from csv_in and output encoded values on csv_out.
+
+ Replicate assoc_testdata_count times. First value is a string, second is a
+ bool. TODO: Generalize this.
+ """
+ rows = []
+ for i, (true_value1, true_value2) in enumerate(csv_in):
+ if i == 0:
+ v1_name = true_value1
+ v2_name = true_value2
+ continue # skip header row
+
+ rows.append((true_value1, true_value2))
+
+ # Use the same column names
+ header = ('client', 'cohort', v1_name, v2_name)
+ csv_out.writerow(header)
+
+ n = assoc_testdata_count
+ report_index = 0
+ for i in xrange(n):
+ for v1, v2 in rows:
+ client_str = 'c%d' % report_index
+
+ # randint(a, b) gives i such that a <= i <= b
+ cohort = random.randint(0, params1.num_cohorts - 1)
+
+ string_encoder = rappor.Encoder(params1, cohort, client_str, irr_rand)
+ bool_encoder = rappor.Encoder(params2, cohort, client_str, irr_rand)
+
+ # Real users should call e.encode(). For testing purposes, we also want
+ # the PRR.
+ irr1 = string_encoder.encode(v1)
+
+ # TODO: Convert to bool and encode with basic RAPPOR
+ v2_int = int(v2)
+ #print v2_int
+ irr2 = bool_encoder.encode_bits(v2_int)
+
+ irr1_str = rappor.bit_string(irr1, params1.num_bloombits)
+ irr2_str = rappor.bit_string(irr2, params2.num_bloombits)
+
+ csv_out.writerow((client_str, cohort, irr1_str, irr2_str))
+
+ report_index += 1
+
+
+def RapporClientSim(params, irr_rand, csv_in, csv_out):
+ """Read true values from csv_in and output encoded values on csv_out."""
+ header = ('client', 'cohort', 'bloom', 'prr', 'irr')
+ csv_out.writerow(header)
+
+ # TODO: It would be more instructive/efficient to construct an encoder
+ # instance up front per client, rather than one per row below.
+ start_time = time.time()
+
+ for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
+ if i == 0:
+ if client_str != 'client':
+ raise RuntimeError('Expected client header, got %s' % client_str)
+ if cohort_str != 'cohort':
+ raise RuntimeError('Expected cohort header, got %s' % cohort_str)
+ if true_value != 'value':
+ raise RuntimeError('Expected value header, got %s' % true_value)
+ continue # skip header row
+
+ #if i == 30: # EARLY STOP
+ # break
+
+ if i % 10000 == 0:
+ elapsed = time.time() - start_time
+ log('Processed %d inputs in %.2f seconds', i, elapsed)
+
+ cohort = int(cohort_str)
+ secret = client_str
+ e = rappor.Encoder(params, cohort, secret, irr_rand)
+
+ # Real users should call e.encode(). For testing purposes, we also want
+ # the PRR.
+ bloom, prr, irr = e._internal_encode(true_value)
+
+ bloom_str = rappor.bit_string(bloom, params.num_bloombits)
+ prr_str = rappor.bit_string(prr, params.num_bloombits)
+ irr_str = rappor.bit_string(irr, params.num_bloombits)
+
+ out_row = (client_str, cohort_str, bloom_str, prr_str, irr_str)
+ csv_out.writerow(out_row)
+
+
+def main(argv):
+ (opts, argv) = CreateOptionsParser().parse_args(argv)
+
+ # Copy flags into params
+ params = rappor.Params()
+ params.num_bloombits = opts.num_bits
+ params.num_hashes = opts.num_hashes
+ params.num_cohorts = opts.num_cohorts
+ params.prob_p = opts.prob_p
+ params.prob_q = opts.prob_q
+ params.prob_f = opts.prob_f
+
+ if opts.random_mode == 'simple':
+ irr_rand = rappor.SecureIrrRand(params)
+ elif opts.random_mode == 'fast':
+ if fastrand:
+ log('Using fastrand extension')
+ # NOTE: This doesn't take 'rand'. It's seeded in C with srand().
+ irr_rand = fastrand.FastIrrRand(params)
+ else:
+ log('Warning: fastrand module not importable; see README for build '
+ 'instructions. Falling back to simple randomness.')
+ irr_rand = rappor.SecureIrrRand(params)
+ else:
+ raise AssertionError
+ # Other possible implementations:
+ # - random.SystemRandom (probably uses /dev/urandom on Linux)
+ # - HMAC-SHA256 with another secret? This could match C++ byte for byte.
+ # - or srand(0) might do it.
+
+ csv_in = csv.reader(sys.stdin)
+ csv_out = csv.writer(sys.stdout)
+
+ if opts.assoc_testdata:
+ # Copy flags into params
+ params1 = rappor.Params()
+ params1.num_bloombits = opts.num_bits
+ params1.num_hashes = opts.num_hashes
+ params1.num_cohorts = opts.num_cohorts
+ params1.prob_p = opts.prob_p
+ params1.prob_q = opts.prob_q
+ params1.prob_f = opts.prob_f
+
+ # Second one is boolean
+ params2 = rappor.Params()
+ params2.num_bloombits = 1 # 1 bit for boolean
+ params2.num_hashes = opts.num_hashes
+ params2.num_cohorts = opts.num_cohorts
+ params2.prob_p = opts.prob_p
+ params2.prob_q = opts.prob_q
+ params2.prob_f = opts.prob_f
+
+ GenAssocTestdata(
+ params1, params2, irr_rand, opts.assoc_testdata, csv_in, csv_out)
+ else:
+ RapporClientSim(params, irr_rand, csv_in, csv_out)
+
+
+if __name__ == "__main__":
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ log('rappor_sim.py: FATAL: %s', e)
diff --git a/tests/rappor_sim_test.py b/tests/rappor_sim_test.py
new file mode 100755
index 0000000..f483c25
--- /dev/null
+++ b/tests/rappor_sim_test.py
@@ -0,0 +1,33 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+rappor_sim_test.py: Tests for rappor_sim.py
+"""
+
+import unittest
+
+import rappor_sim # module under test
+
+
+class RapporSimTest(unittest.TestCase):
+
+ def testFoo(self):
+ pass
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/tests/regtest.html b/tests/regtest.html
new file mode 100644
index 0000000..8cb50f2
--- /dev/null
+++ b/tests/regtest.html
@@ -0,0 +1,118 @@
+<!DOCTYPE html>
+<html>
+<head>
+ <title>RAPPOR regtest.sh (_IMPL_)</title>
+ <style type="text/css">
+ h2 { text-align: center }
+ p { margin: 0 auto; width: 80%; text-align: center }
+
+ table { width: 100%; border-spacing: 0 }
+ .tophead { text-align: center; font-weight: bold }
+ .explain { text-align: left; font-weight: normal }
+ .subhead { text-align: right; font-weight: bold }
+ .highlight { background-color: #eeeeee }
+ tbody td { text-align: right }
+ #parent { text-align: right }
+ </style>
+</head>
+
+<body>
+ <a id="top"></a>
+
+ <p id="parent"><a href="..">Parent</a></p>
+
+ <h2>RAPPOR regtest.sh (_IMPL_)</h2>
+
+ <!-- These strings will be replaced by a sed script. -->
+
+ <table cellspacing="0" cellpadding="5">
+ <colgroup>
+ <col span="1" class="highlight" />
+ <col span="4" />
+ <col span="6" class="highlight" />
+ <col span="2" />
+ <col span="7" class="highlight" />
+ </colgroup>
+
+ <thead>
+ <tr class="tophead">
+ <td>
+ Test Case
+ </td>
+ <td colspan=4>
+ Input Params
+ </td>
+ <td colspan=6>
+ RAPPOR Params
+ </td>
+ <td colspan=2>
+ Map Params
+ </td>
+ <td colspan=7>
+ Result Metrics
+ </td>
+ </tr>
+
+ <tr class="explain">
+ <td></td>
+ <td colspan=4>
+ d: distribution drawn from<br/>
+ u: total unique values<br/>
+ c: clients<br/>
+ v: values per client<br/>
+ </td>
+ <td colspan=6>
+ k: report bits<br/>
+ h: hashes<br/>
+ m: cohorts<br/>
+ p, q, f: probabilities<br/>
+ </td>
+ <td colspan=2>
+ +: num additional candidates<br/>
+ -: regex for true values removed<br/>
+ </td>
+ <td colspan=7>
+ a: actual values<br/>
+ r: values RAPPOR detected<br/>
+ fp: false positive rate<br/>
+ fn: false negative rate<br/>
+ tv: total variation distance<br/>
+ am: allocated mass<br/>
+ time: time in seconds<br/>
+ </td>
+ </tr>
+
+ <tr class="subhead">
+ <td></td>
+
+ <td>d</td>
+ <td>u</td>
+ <td>c</td>
+ <td>v</td>
+
+ <td>k</td>
+ <td>h</td>
+ <td>m</td>
+ <td>p</td>
+ <td>q</td>
+ <td>f</td>
+
+ <td>+</td>
+ <td>-</td>
+
+ <td>a</td>
+ <td>r</td>
+ <td>fp</td>
+ <td>fn</td>
+ <td>tv</td>
+ <td>am</td>
+ <td>time</td>
+ </tr>
+ </thead>
+
+ <tbody>
+ <!-- __TABLE_ROWS__ -->
+
+</body>
+
+</html>
diff --git a/tests/regtest_spec.py b/tests/regtest_spec.py
new file mode 100755
index 0000000..5a29f39
--- /dev/null
+++ b/tests/regtest_spec.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+"""Print a test spec on stdout.
+
+Each line has parameters for a test case. The regtest.sh shell script reads
+these lines and runs parallel processes.
+
+We use Python data structures so the test cases are easier to read and edit.
+"""
+
+import sys
+
+#
+# TEST CONFIGURATION
+#
+
+DEMO = (
+ # (case_name distr num_unique_values num_clients values_per_client)
+ # (num_bits num_hashes num_cohorts)
+ # (p q f) (num_additional regexp_to_remove)
+ ('demo1 unif 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
+ ('demo2 gauss 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
+ ('demo3 exp 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
+ ('demo4 zipf1 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
+ ('demo5 zipf1.5 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
+)
+
+DISTRIBUTIONS = (
+ 'unif',
+ 'exp',
+ 'gauss',
+ 'zipf1',
+ 'zipf1.5',
+)
+
+DISTRIBUTION_PARAMS = (
+ # name, num unique values, num clients, values per client
+ ('tiny', 100, 1000, 1), # test for insufficient data
+ ('small', 100, 1000000, 1),
+ ('medium', 1000, 10000000, 1),
+ ('large', 10000, 100000000, 1),
+)
+
+# 'k, h, m' as in params file.
+BLOOMFILTER_PARAMS = {
+ '8x16': (8, 2, 16), # 16 cohorts, 8 bits each, 2 bits set in each
+ '8x32': (8, 2, 32), # 32 cohorts, 8 bits each, 2 bits set in each
+ '8x128': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each
+ '128x128': (128, 2, 128), # 128 cohorts, 128 bits each, 2 bits set in each
+}
+
+# 'p, q, f' as in params file.
+PRIVACY_PARAMS = {
+ 'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5
+ 'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf
+}
+
+# For deriving candidates from true inputs.
+MAP_REGEX_MISSING = {
+ 'sharp': 'NONE', # Categorical data
+ '10%': 'v[0-9]*9$', # missing every 10th string
+}
+
+# test configuration ->
+# (name modifier, Bloom filter, privacy params, fraction of extra,
+# regex missing)
+TEST_CONFIGS = [
+ ('typical', '8x128', 'eps_1_1', .2, '10%'),
+ ('sharp', '8x128', 'eps_1_1', .0, 'sharp'), # no extra candidates
+ ('loose', '8x128', 'eps_1_5', .2, '10%'), # loose privacy
+ ('over_x2', '8x128', 'eps_1_1', 2.0, '10%'), # overshoot by x2
+ ('over_x10', '8x128', 'eps_1_1', 10.0, '10%'), # overshoot by x10
+]
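+
+# For instance, the 'unif' x 'tiny' x 'typical' combination above yields the
+# spec line:
+#
+#   r-unif-tiny-typical unif 100 1000 1 8 2 128 0.39 0.61 0.45 20 v[0-9]*9$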
+
+#
+# END TEST CONFIGURATION
+#
+
+
+def main(argv):
+ rows = []
+
+ for (size_name, num_values, num_clients,
+ num_reports_per_client) in DISTRIBUTION_PARAMS:
+ for distribution in DISTRIBUTIONS:
+ for (config_name, bloom_name, privacy_params, fr_extra,
+ regex_missing) in TEST_CONFIGS:
+ test_name = 'r-{}-{}-{}'.format(distribution, size_name,
+ config_name)
+
+ params = (BLOOMFILTER_PARAMS[bloom_name]
+ + PRIVACY_PARAMS[privacy_params]
+ + tuple([int(num_values * fr_extra)])
+ + tuple([MAP_REGEX_MISSING[regex_missing]]))
+
+ test_case = (test_name, distribution, num_values, num_clients,
+ num_reports_per_client) + params
+ row_str = [str(element) for element in test_case]
+ rows.append(row_str)
+
+ for params in DEMO:
+ rows.append(params)
+
+ for row in rows:
+ print ' '.join(row)
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/tests/setup.py b/tests/setup.py
new file mode 100755
index 0000000..f874cbe
--- /dev/null
+++ b/tests/setup.py
@@ -0,0 +1,26 @@
+#!/usr/bin/python
+#
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from distutils.core import setup, Extension
+
+module = Extension('_fastrand',
+ sources = ['_fastrand.c'])
+
+setup(name = '_fastrand',
+ version = '1.0',
+ description = 'Module to speed up RAPPOR simulation',
+ ext_modules = [module])
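+
+# To build the extension in place (assumes _fastrand.c sits next to this
+# file):
+#
+#   $ python setup.py build_ext --inplace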
diff --git a/tests/user_spec.py b/tests/user_spec.py
new file mode 100755
index 0000000..5df5879
--- /dev/null
+++ b/tests/user_spec.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+"""Print a test spec on stdout.
+
+Each line has parameters for a test case. The regtest.sh shell script reads
+these lines and runs parallel processes.
+
+We use Python data structures so the test cases are easier to read and edit.
+"""
+
+import sys
+
+#
+# TEST CONFIGURATION
+#
+
+# For gen_sim_input.py
+INPUT_PARAMS = {
+ # distribution, num unique values, num clients, values per client
+ 'exp-100k': ('exp', 100, 100000, 1),
+ 'exp-1m': ('exp', 100, 1000000, 1),
+}
+
+# For rappor_sim.py
+# 'k, h, m, p, q, f' as in params file.
+RAPPOR_PARAMS = {
+ # Initial chrome params from 2014.
+ # NOTE: fastrand simulation only supports 64 bits! Make sure to use the
+ # 'fast_counts' code path.
+ 'chrome128': (128, 2, 128, 0.25, 0.75, 0.50),
+
+ # Chrome params from early 2015 -- changed to 8 bit reports.
+ 'chrome8': (8, 2, 128, 0.25, 0.75, 0.50),
+
+ # Original demo params
+ 'demo': (16, 2, 64, 0.5, 0.75, 0.5),
+}
+
+# For deriving candidates from true inputs.
+MAP_PARAMS = {
+ # 1. Number of extra candidates to add.
+ # 2. Candidate strings to remove from the map. This FORCES false
+ # negatives, e.g. for common strings, since a string has to be in the map
+ # for RAPPOR to choose it.
+ 'add-100': (100, []),
+ 'add-1000': (1000, []),
+ 'add-2000': (2000, []),
+ # also thrashes on 128 bits
+ 'add-3000': (3000, []),
+ 'add-10000': (10000, []),
+ 'add-15000': (15000, []), # approx number of candidates for eTLD+1
+ 'add-100000': (100000, []),
+ 'remove-top-2': (20, ['v1', 'v2']),
+}
+
+# test case name -> (input params name, RAPPOR params name, map params name)
+TEST_CASES = [
+ ('chrome128-100k-100', 'exp-100k', 'chrome128', 'add-100'),
+ ('chrome128-100k-1000', 'exp-100k', 'chrome128', 'add-1000'),
+ ('chrome128-100k-2000', 'exp-100k', 'chrome128', 'add-2000'),
+ ('chrome128-100k-3000', 'exp-100k', 'chrome128', 'add-3000'),
+ # 128 bits and 15k candidates fails on a machine with 8 GB memory.
+ # Lasso finishes with 7508 non-zero coefficients, and then allocation
+ # fails. TODO: just take the highest ones?
+ #('chrome128-100k-15000', 'exp-100k', 'chrome128', 'add-15000'),
+ #('chrome128-100k-100000', 'exp-100k', 'chrome128', 'add-100000'),
+
+ # NOTE: Adding more candidates exercises LASSO
+ ('chrome8-100k-100', 'exp-100k', 'chrome8', 'add-100'),
+ ('chrome8-100k-1000', 'exp-100k', 'chrome8', 'add-1000'),
+ ('chrome8-100k-2000', 'exp-100k', 'chrome8', 'add-2000'),
+ ('chrome8-100k-3000', 'exp-100k', 'chrome8', 'add-3000'),
+ ('chrome8-100k-15000', 'exp-100k', 'chrome8', 'add-15000'),
+
+  # NOTE: This one takes more than 4 GB of memory, because Lasso gets a
+  # huge matrix (100,000 candidate columns). We got 1564 non-zero coefficients.
+ ('chrome8-100k-100000', 'exp-100k', 'chrome8', 'add-100000'),
+
+  # What happens when the candidates are missing top values?
+ ('chrome8-badcand', 'exp-100k', 'chrome8', 'remove-top-2'),
+
+  # TODO: Use Chrome params with a real map from Alexa 1M?
+]
+
+#
+# END TEST CONFIGURATION
+#
+
+
+def main(argv):
+ rows = []
+ for test_case, input_name, rappor_name, map_name in TEST_CASES:
+ input_params = INPUT_PARAMS[input_name]
+ rappor_params = RAPPOR_PARAMS[rappor_name]
+ map_params = MAP_PARAMS[map_name]
+ row = tuple([test_case]) + input_params + rappor_params + map_params
+ rows.append(row)
+
+ for row in rows:
+ for cell in row:
+ if isinstance(cell, list):
+ if cell:
+ cell_str = '|'.join(cell)
+ else:
+ cell_str = 'NONE' # we don't want an empty string
+ else:
+ cell_str = cell
+ print cell_str, # print it with a space after it
+ print # new line after row
+
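+# Example output line (derived from the tables above, for illustration):
+#
+#   chrome8-100k-100 exp 100 100000 1 8 2 128 0.25 0.75 0.5 100 NONE
+#
+# That is: the test case name, then input, RAPPOR, and map params, with
+# list-valued cells joined by '|', or NONE when the list is empty.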
+
+if __name__ == '__main__':
+ try:
+ main(sys.argv)
+ except RuntimeError, e:
+ print >>sys.stderr, 'FATAL: %s' % e
+ sys.exit(1)
diff --git a/tests/uvals.csv b/tests/uvals.csv
new file mode 100644
index 0000000..cebc17e
--- /dev/null
+++ b/tests/uvals.csv
@@ -0,0 +1,2 @@
+google.com,intel.com,yahoo.com
+ssl,nossl
diff --git a/third_party/dygraph-combined.js b/third_party/dygraph-combined.js
new file mode 100644
index 0000000..7d6121e
--- /dev/null
+++ b/third_party/dygraph-combined.js
@@ -0,0 +1,6 @@
+/*! @license Copyright 2014 Dan Vanderkam (danvdk@gmail.com) MIT-licensed (http://opensource.org/licenses/MIT) */
+!function(t){"use strict";for(var e,a,i={},r=function(){},n="memory".split(","),o="assert,clear,count,debug,dir,dirxml,error,exception,group,groupCollapsed,groupEnd,info,log,markTimeline,profile,profiles,profileEnd,show,table,time,timeEnd,timeline,timelineEnd,timeStamp,trace,warn".split(",");e=n.pop();)t[e]=t[e]||i;for(;a=o.pop();)t[a]=t[a]||r}(this.console=this.console||{}),function(){"use strict";CanvasRenderingContext2D.prototype.installPattern=function(t){if("undefined"!=typeof this.isPatternInstalled)throw"Must un-install old line pattern before installing a new one.";this.isPatternInstalled=!0;var e=[0,0],a=[],i=this.beginPath,r=this.lineTo,n=this.moveTo,o=this.stroke;this.uninstallPattern=function(){this.beginPath=i,this.lineTo=r,this.moveTo=n,this.stroke=o,this.uninstallPattern=void 0,this.isPatternInstalled=void 0},this.beginPath=function(){a=[],i.call(this)},this.moveTo=function(t,e){a.push([[t,e]]),n.call(this,t,e)},this.lineTo=function(t,e){var i=a[a.length-1];i.push([t,e])},this.stroke=function(){if(0===a.length)return void o.call(this);for(var i=0;i<a.length;i++)for(var s=a[i],l=s[0][0],h=s[0][1],p=1;p<s.length;p++){var g=s[p][0],d=s[p][1];this.save();var u=g-l,c=d-h,y=Math.sqrt(u*u+c*c),_=Math.atan2(c,u);this.translate(l,h),n.call(this,0,0),this.rotate(_);for(var v=e[0],f=0;y>f;){var x=t[v];f+=e[1]?e[1]:x,f>y?(e=[v,f-y],f=y):e=[(v+1)%t.length,0],v%2===0?r.call(this,f,0):n.call(this,f,0),v=(v+1)%t.length}this.restore(),l=g,h=d}o.call(this),a=[]}},CanvasRenderingContext2D.prototype.uninstallPattern=function(){throw"Must install a line pattern before uninstalling it."}}();var DygraphOptions=function(){return function(){"use strict";var t=function(t){this.dygraph_=t,this.yAxes_=[],this.xAxis_={},this.series_={},this.global_=this.dygraph_.attrs_,this.user_=this.dygraph_.user_attrs_||{},this.labels_=[],this.highlightSeries_=this.get("highlightSeriesOpts")||{},this.reparseSeries()};t.AXIS_STRING_MAPPINGS_={y:0,Y:0,y1:0,Y1:0,y2:1,Y2:1},t.axisToIndex_=function(e){if("string"==typeof e){if(t.AXIS_STRING_MAPPINGS_.hasOwnProperty(e))return t.AXIS_STRING_MAPPINGS_[e];throw"Unknown axis : "+e}if("number"==typeof e){if(0===e||1===e)return e;throw"Dygraphs only supports two y-axes, indexed from 0-1."}if(e)throw"Unknown axis : "+e;return 0},t.prototype.reparseSeries=function(){var e=this.get("labels");if(e){this.labels_=e.slice(1),this.yAxes_=[{series:[],options:{}}],this.xAxis_={options:{}},this.series_={};var a=!this.user_.series;if(a){for(var i=0,r=0;r<this.labels_.length;r++){var n=this.labels_[r],o=this.user_[n]||{},s=0,l=o.axis;"object"==typeof l&&(s=++i,this.yAxes_[s]={series:[n],options:l}),l||this.yAxes_[0].series.push(n),this.series_[n]={idx:r,yAxis:s,options:o}}for(var r=0;r<this.labels_.length;r++){var n=this.labels_[r],o=this.series_[n].options,l=o.axis;if("string"==typeof l){if(!this.series_.hasOwnProperty(l))return void console.error("Series "+n+" wants to share a y-axis with series "+l+", which does not define its own axis.");var s=this.series_[l].yAxis;this.series_[n].yAxis=s,this.yAxes_[s].series.push(n)}}}else for(var r=0;r<this.labels_.length;r++){var n=this.labels_[r],o=this.user_.series[n]||{},s=t.axisToIndex_(o.axis);this.series_[n]={idx:r,yAxis:s,options:o},this.yAxes_[s]?this.yAxes_[s].series.push(n):this.yAxes_[s]={series:[n],options:{}}}var h=this.user_.axes||{};Dygraph.update(this.yAxes_[0].options,h.y||{}),this.yAxes_.length>1&&Dygraph.update(this.yAxes_[1].options,h.y2||{}),Dygraph.update(this.xAxis_.options,h.x||{})}},t.prototype.get=function(t){var 
e=this.getGlobalUser_(t);return null!==e?e:this.getGlobalDefault_(t)},t.prototype.getGlobalUser_=function(t){return this.user_.hasOwnProperty(t)?this.user_[t]:null},t.prototype.getGlobalDefault_=function(t){return this.global_.hasOwnProperty(t)?this.global_[t]:Dygraph.DEFAULT_ATTRS.hasOwnProperty(t)?Dygraph.DEFAULT_ATTRS[t]:null},t.prototype.getForAxis=function(t,e){var a,i;if("number"==typeof e)a=e,i=0===a?"y":"y2";else{if("y1"==e&&(e="y"),"y"==e)a=0;else if("y2"==e)a=1;else{if("x"!=e)throw"Unknown axis "+e;a=-1}i=e}var r=-1==a?this.xAxis_:this.yAxes_[a];if(r){var n=r.options;if(n.hasOwnProperty(t))return n[t]}if("x"!==e||"logscale"!==t){var o=this.getGlobalUser_(t);if(null!==o)return o}var s=Dygraph.DEFAULT_ATTRS.axes[i];return s.hasOwnProperty(t)?s[t]:this.getGlobalDefault_(t)},t.prototype.getForSeries=function(t,e){if(e===this.dygraph_.getHighlightSeries()&&this.highlightSeries_.hasOwnProperty(t))return this.highlightSeries_[t];if(!this.series_.hasOwnProperty(e))throw"Unknown series: "+e;var a=this.series_[e],i=a.options;return i.hasOwnProperty(t)?i[t]:this.getForAxis(t,a.yAxis)},t.prototype.numAxes=function(){return this.yAxes_.length},t.prototype.axisForSeries=function(t){return this.series_[t].yAxis},t.prototype.axisOptions=function(t){return this.yAxes_[t].options},t.prototype.seriesForAxis=function(t){return this.yAxes_[t].series},t.prototype.seriesNames=function(){return this.labels_};return t}()}(),DygraphLayout=function(){"use strict";var t=function(t){this.dygraph_=t,this.points=[],this.setNames=[],this.annotations=[],this.yAxes_=null,this.xTicks_=null,this.yTicks_=null};return t.prototype.addDataset=function(t,e){this.points.push(e),this.setNames.push(t)},t.prototype.getPlotArea=function(){return this.area_},t.prototype.computePlotArea=function(){var t={x:0,y:0};t.w=this.dygraph_.width_-t.x-this.dygraph_.getOption("rightGap"),t.h=this.dygraph_.height_;var e={chart_div:this.dygraph_.graphDiv,reserveSpaceLeft:function(e){var a={x:t.x,y:t.y,w:e,h:t.h};return t.x+=e,t.w-=e,a},reserveSpaceRight:function(e){var a={x:t.x+t.w-e,y:t.y,w:e,h:t.h};return t.w-=e,a},reserveSpaceTop:function(e){var a={x:t.x,y:t.y,w:t.w,h:e};return t.y+=e,t.h-=e,a},reserveSpaceBottom:function(e){var a={x:t.x,y:t.y+t.h-e,w:t.w,h:e};return t.h-=e,a},chartRect:function(){return{x:t.x,y:t.y,w:t.w,h:t.h}}};this.dygraph_.cascadeEvents_("layout",e),this.area_=t},t.prototype.setAnnotations=function(t){this.annotations=[];for(var e=this.dygraph_.getOption("xValueParser")||function(t){return t},a=0;a<t.length;a++){var i={};if(!t[a].xval&&void 0===t[a].x)return void console.error("Annotations must have an 'x' property");if(t[a].icon&&(!t[a].hasOwnProperty("width")||!t[a].hasOwnProperty("height")))return void console.error("Must set width and height when setting annotation.icon property");Dygraph.update(i,t[a]),i.xval||(i.xval=e(i.x)),this.annotations.push(i)}},t.prototype.setXTicks=function(t){this.xTicks_=t},t.prototype.setYAxes=function(t){this.yAxes_=t},t.prototype.evaluate=function(){this._xAxis={},this._evaluateLimits(),this._evaluateLineCharts(),this._evaluateLineTicks(),this._evaluateAnnotations()},t.prototype._evaluateLimits=function(){var t=this.dygraph_.xAxisRange();this._xAxis.minval=t[0],this._xAxis.maxval=t[1];var e=t[1]-t[0];this._xAxis.scale=0!==e?1/e:1,this.dygraph_.getOptionForAxis("logscale","x")&&(this._xAxis.xlogrange=Dygraph.log10(this._xAxis.maxval)-Dygraph.log10(this._xAxis.minval),this._xAxis.xlogscale=0!==this._xAxis.xlogrange?1/this._xAxis.xlogrange:1);for(var 
a=0;a<this.yAxes_.length;a++){var i=this.yAxes_[a];i.minyval=i.computedValueRange[0],i.maxyval=i.computedValueRange[1],i.yrange=i.maxyval-i.minyval,i.yscale=0!==i.yrange?1/i.yrange:1,this.dygraph_.getOption("logscale")&&(i.ylogrange=Dygraph.log10(i.maxyval)-Dygraph.log10(i.minyval),i.ylogscale=0!==i.ylogrange?1/i.ylogrange:1,(!isFinite(i.ylogrange)||isNaN(i.ylogrange))&&console.error("axis "+a+" of graph at "+i.g+" can't be displayed in log scale for range ["+i.minyval+" - "+i.maxyval+"]"))}},t.calcXNormal_=function(t,e,a){return a?(Dygraph.log10(t)-Dygraph.log10(e.minval))*e.xlogscale:(t-e.minval)*e.scale},t.calcYNormal_=function(t,e,a){if(a){var i=1-(Dygraph.log10(e)-Dygraph.log10(t.minyval))*t.ylogscale;return isFinite(i)?i:0/0}return 1-(e-t.minyval)*t.yscale},t.prototype._evaluateLineCharts=function(){for(var e=this.dygraph_.getOption("stackedGraph"),a=this.dygraph_.getOptionForAxis("logscale","x"),i=0;i<this.points.length;i++){for(var r=this.points[i],n=this.setNames[i],o=this.dygraph_.getOption("connectSeparatedPoints",n),s=this.dygraph_.axisPropertiesForSeries(n),l=this.dygraph_.attributes_.getForSeries("logscale",n),h=0;h<r.length;h++){var p=r[h];p.x=t.calcXNormal_(p.xval,this._xAxis,a);var g=p.yval;e&&(p.y_stacked=t.calcYNormal_(s,p.yval_stacked,l),null===g||isNaN(g)||(g=p.yval_stacked)),null===g&&(g=0/0,o||(p.yval=0/0)),p.y=t.calcYNormal_(s,g,l)}this.dygraph_.dataHandler_.onLineEvaluated(r,s,l)}},t.prototype._evaluateLineTicks=function(){var t,e,a,i;for(this.xticks=[],t=0;t<this.xTicks_.length;t++)e=this.xTicks_[t],a=e.label,i=this.dygraph_.toPercentXCoord(e.v),i>=0&&1>i&&this.xticks.push([i,a]);for(this.yticks=[],t=0;t<this.yAxes_.length;t++)for(var r=this.yAxes_[t],n=0;n<r.ticks.length;n++)e=r.ticks[n],a=e.label,i=this.dygraph_.toPercentYCoord(e.v,t),i>0&&1>=i&&this.yticks.push([t,i,a])},t.prototype._evaluateAnnotations=function(){var t,e={};for(t=0;t<this.annotations.length;t++){var a=this.annotations[t];e[a.xval+","+a.series]=a}if(this.annotated_points=[],this.annotations&&this.annotations.length)for(var i=0;i<this.points.length;i++){var r=this.points[i];for(t=0;t<r.length;t++){var n=r[t],o=n.xval+","+n.name;o in e&&(n.annotation=e[o],this.annotated_points.push(n))}}},t.prototype.removeAllDatasets=function(){delete this.points,delete this.setNames,delete this.setPointsLengths,delete this.setPointsOffsets,this.points=[],this.setNames=[],this.setPointsLengths=[],this.setPointsOffsets=[]},t}(),DygraphCanvasRenderer=function(){"use strict";var t=function(t,e,a,i){if(this.dygraph_=t,this.layout=i,this.element=e,this.elementContext=a,this.height=t.height_,this.width=t.width_,!this.isIE&&!Dygraph.isCanvasSupported(this.element))throw"Canvas is not supported.";if(this.area=i.getPlotArea(),this.dygraph_.isUsingExcanvas_)this._createIEClipArea();else if(!Dygraph.isAndroid()){var r=this.dygraph_.canvas_ctx_;r.beginPath(),r.rect(this.area.x,this.area.y,this.area.w,this.area.h),r.clip(),r=this.dygraph_.hidden_ctx_,r.beginPath(),r.rect(this.area.x,this.area.y,this.area.w,this.area.h),r.clip()}};return t.prototype.clear=function(){var t;if(this.isIE)try{this.clearDelay&&(this.clearDelay.cancel(),this.clearDelay=null),t=this.elementContext}catch(e){return}t=this.elementContext,t.clearRect(0,0,this.width,this.height)},t.prototype.render=function(){this._updatePoints(),this._renderLineChart()},t.prototype._createIEClipArea=function(){function t(t){if(0!==t.w&&0!==t.h){var 
i=document.createElement("div");i.className=e,i.style.backgroundColor=r,i.style.position="absolute",i.style.left=t.x+"px",i.style.top=t.y+"px",i.style.width=t.w+"px",i.style.height=t.h+"px",a.appendChild(i)}}for(var e="dygraph-clip-div",a=this.dygraph_.graphDiv,i=a.childNodes.length-1;i>=0;i--)a.childNodes[i].className==e&&a.removeChild(a.childNodes[i]);for(var r=document.bgColor,n=this.dygraph_.graphDiv;n!=document;){var o=n.currentStyle.backgroundColor;if(o&&"transparent"!=o){r=o;break}n=n.parentNode}var s=this.area;t({x:0,y:0,w:s.x,h:this.height}),t({x:s.x,y:0,w:this.width-s.x,h:s.y}),t({x:s.x+s.w,y:0,w:this.width-s.x-s.w,h:this.height}),t({x:s.x,y:s.y+s.h,w:this.width-s.x,h:this.height-s.h-s.y})},t._getIteratorPredicate=function(e){return e?t._predicateThatSkipsEmptyPoints:null},t._predicateThatSkipsEmptyPoints=function(t,e){return null!==t[e].yval},t._drawStyledLine=function(e,a,i,r,n,o,s){var l=e.dygraph,h=l.getBooleanOption("stepPlot",e.setName);Dygraph.isArrayLike(r)||(r=null);var p=l.getBooleanOption("drawGapEdgePoints",e.setName),g=e.points,d=e.setName,u=Dygraph.createIterator(g,0,g.length,t._getIteratorPredicate(l.getBooleanOption("connectSeparatedPoints",d))),c=r&&r.length>=2,y=e.drawingContext;y.save(),c&&y.installPattern(r);var _=t._drawSeries(e,u,i,s,n,p,h,a);t._drawPointsOnLine(e,_,o,a,s),c&&y.uninstallPattern(),y.restore()},t._drawSeries=function(t,e,a,i,r,n,o,s){var l,h,p=null,g=null,d=null,u=[],c=!0,y=t.drawingContext;y.beginPath(),y.strokeStyle=s,y.lineWidth=a;for(var _=e.array_,v=e.end_,f=e.predicate_,x=e.start_;v>x;x++){if(h=_[x],f){for(;v>x&&!f(_,x);)x++;if(x==v)break;h=_[x]}if(null===h.canvasy||h.canvasy!=h.canvasy)o&&null!==p&&(y.moveTo(p,g),y.lineTo(h.canvasx,g)),p=g=null;else{if(l=!1,n||!p){e.nextIdx_=x,e.next(),d=e.hasNext?e.peek.canvasy:null;var m=null===d||d!=d;l=!p&&m,n&&(!c&&!p||e.hasNext&&m)&&(l=!0)}null!==p?a&&(o&&(y.moveTo(p,g),y.lineTo(h.canvasx,g)),y.lineTo(h.canvasx,h.canvasy)):y.moveTo(h.canvasx,h.canvasy),(r||l)&&u.push([h.canvasx,h.canvasy,h.idx]),p=h.canvasx,g=h.canvasy}c=!1}return y.stroke(),u},t._drawPointsOnLine=function(t,e,a,i,r){for(var n=t.drawingContext,o=0;o<e.length;o++){var s=e[o];n.save(),a.call(t.dygraph,t.dygraph,t.setName,n,s[0],s[1],i,r,s[2]),n.restore()}},t.prototype._updatePoints=function(){for(var t=this.layout.points,e=t.length;e--;)for(var a=t[e],i=a.length;i--;){var r=a[i];r.canvasx=this.area.w*r.x+this.area.x,r.canvasy=this.area.h*r.y+this.area.y}},t.prototype._renderLineChart=function(t,e){var a,i,r=e||this.elementContext,n=this.layout.points,o=this.layout.setNames;this.colors=this.dygraph_.colorsMap_;var s=this.dygraph_.getOption("plotter"),l=s;Dygraph.isArrayLike(l)||(l=[l]);var h={};for(a=0;a<o.length;a++){i=o[a];var p=this.dygraph_.getOption("plotter",i);p!=s&&(h[i]=p)}for(a=0;a<l.length;a++)for(var g=l[a],d=a==l.length-1,u=0;u<n.length;u++)if(i=o[u],!t||i==t){var c=n[u],y=g;if(i in h){if(!d)continue;y=h[i]}var _=this.colors[i],v=this.dygraph_.getOption("strokeWidth",i);r.save(),r.strokeStyle=_,r.lineWidth=v,y({points:c,setName:i,drawingContext:r,color:_,strokeWidth:v,dygraph:this.dygraph_,axis:this.dygraph_.axisPropertiesForSeries(i),plotArea:this.area,seriesIndex:u,seriesCount:n.length,singleSeriesName:t,allSeriesPoints:n}),r.restore()}},t._Plotters={linePlotter:function(e){t._linePlotter(e)},fillPlotter:function(e){t._fillPlotter(e)},errorPlotter:function(e){t._errorPlotter(e)}},t._linePlotter=function(e){var 
a=e.dygraph,i=e.setName,r=e.strokeWidth,n=a.getNumericOption("strokeBorderWidth",i),o=a.getOption("drawPointCallback",i)||Dygraph.Circles.DEFAULT,s=a.getOption("strokePattern",i),l=a.getBooleanOption("drawPoints",i),h=a.getNumericOption("pointSize",i);n&&r&&t._drawStyledLine(e,a.getOption("strokeBorderColor",i),r+2*n,s,l,o,h),t._drawStyledLine(e,e.color,r,s,l,o,h)},t._errorPlotter=function(e){var a=e.dygraph,i=e.setName,r=a.getBooleanOption("errorBars")||a.getBooleanOption("customBars");if(r){var n=a.getBooleanOption("fillGraph",i);n&&console.warn("Can't use fillGraph option with error bars");var o,s=e.drawingContext,l=e.color,h=a.getNumericOption("fillAlpha",i),p=a.getBooleanOption("stepPlot",i),g=e.points,d=Dygraph.createIterator(g,0,g.length,t._getIteratorPredicate(a.getBooleanOption("connectSeparatedPoints",i))),u=0/0,c=0/0,y=[-1,-1],_=Dygraph.toRGB_(l),v="rgba("+_.r+","+_.g+","+_.b+","+h+")";s.fillStyle=v,s.beginPath();for(var f=function(t){return null===t||void 0===t||isNaN(t)};d.hasNext;){var x=d.next();!p&&f(x.y)||p&&!isNaN(c)&&f(c)?u=0/0:(o=[x.y_bottom,x.y_top],p&&(c=x.y),isNaN(o[0])&&(o[0]=x.y),isNaN(o[1])&&(o[1]=x.y),o[0]=e.plotArea.h*o[0]+e.plotArea.y,o[1]=e.plotArea.h*o[1]+e.plotArea.y,isNaN(u)||(p?(s.moveTo(u,y[0]),s.lineTo(x.canvasx,y[0]),s.lineTo(x.canvasx,y[1])):(s.moveTo(u,y[0]),s.lineTo(x.canvasx,o[0]),s.lineTo(x.canvasx,o[1])),s.lineTo(u,y[1]),s.closePath()),y=o,u=x.canvasx)}s.fill()}},t._fastCanvasProxy=function(t){var e=[],a=null,i=null,r=1,n=2,o=0,s=function(t){if(!(e.length<=1)){for(var a=e.length-1;a>0;a--){var i=e[a];if(i[0]==n){var o=e[a-1];o[1]==i[1]&&o[2]==i[2]&&e.splice(a,1)}}for(var a=0;a<e.length-1;){var i=e[a];i[0]==n&&e[a+1][0]==n?e.splice(a,1):a++}if(e.length>2&&!t){var s=0;e[0][0]==n&&s++;for(var l=null,h=null,a=s;a<e.length;a++){var i=e[a];if(i[0]==r)if(null===l&&null===h)l=a,h=a;else{var p=i[2];p<e[l][2]?l=a:p>e[h][2]&&(h=a)}}var g=e[l],d=e[h];e.splice(s,e.length-s),h>l?(e.push(g),e.push(d)):l>h?(e.push(d),e.push(g)):e.push(g)}}},l=function(a){s(a);for(var l=0,h=e.length;h>l;l++){var p=e[l];p[0]==r?t.lineTo(p[1],p[2]):p[0]==n&&t.moveTo(p[1],p[2])}e.length&&(i=e[e.length-1][1]),o+=e.length,e=[]},h=function(t,r,n){var o=Math.round(r);if(null===a||o!=a){var s=a-i>1,h=o-a>1,p=s||h;l(p),a=o}e.push([t,r,n])};return{moveTo:function(t,e){h(n,t,e)},lineTo:function(t,e){h(r,t,e)},stroke:function(){l(!0),t.stroke()},fill:function(){l(!0),t.fill()},beginPath:function(){l(!0),t.beginPath()},closePath:function(){l(!0),t.closePath()},_count:function(){return o}}},t._fillPlotter=function(e){if(!e.singleSeriesName&&0===e.seriesIndex){for(var a=e.dygraph,i=a.getLabels().slice(1),r=i.length;r>=0;r--)a.visibility()[r]||i.splice(r,1);var n=function(){for(var t=0;t<i.length;t++)if(a.getBooleanOption("fillGraph",i[t]))return!0;return!1}();if(n)for(var o,s,l=e.plotArea,h=e.allSeriesPoints,p=h.length,g=a.getNumericOption("fillAlpha"),d=a.getBooleanOption("stackedGraph"),u=a.getColors(),c={},y=function(t,e,a,i){if(t.lineTo(e,a),d)for(var r=i.length-1;r>=0;r--){var n=i[r];t.lineTo(n[0],n[1])}},_=p-1;_>=0;_--){var v=e.drawingContext,f=i[_];if(a.getBooleanOption("fillGraph",f)){var x=a.getBooleanOption("stepPlot",f),m=u[_],D=a.axisPropertiesForSeries(f),w=1+D.minyval*D.yscale;0>w?w=0:w>1&&(w=1),w=l.h*w+l.y;var A,b=h[_],T=Dygraph.createIterator(b,0,b.length,t._getIteratorPredicate(a.getBooleanOption("connectSeparatedPoints",f))),E=0/0,C=[-1,-1],L=Dygraph.toRGB_(m),P="rgba("+L.r+","+L.g+","+L.b+","+g+")";v.fillStyle=P,v.beginPath();var 
S,O=!0;(b.length>2*a.width_||Dygraph.FORCE_FAST_PROXY)&&(v=t._fastCanvasProxy(v));for(var M,R=[];T.hasNext;)if(M=T.next(),Dygraph.isOK(M.y)||x){if(d){if(!O&&S==M.xval)continue;O=!1,S=M.xval,o=c[M.canvasx];var F;F=void 0===o?w:s?o[0]:o,A=[M.canvasy,F],x?-1===C[0]?c[M.canvasx]=[M.canvasy,w]:c[M.canvasx]=[M.canvasy,C[0]]:c[M.canvasx]=M.canvasy}else A=isNaN(M.canvasy)&&x?[l.y+l.h,w]:[M.canvasy,w];isNaN(E)?(v.moveTo(M.canvasx,A[1]),v.lineTo(M.canvasx,A[0])):(x?(v.lineTo(M.canvasx,C[0]),v.lineTo(M.canvasx,A[0])):v.lineTo(M.canvasx,A[0]),d&&(R.push([E,C[1]]),R.push(s&&o?[M.canvasx,o[1]]:[M.canvasx,A[1]]))),C=A,E=M.canvasx}else y(v,E,C[1],R),R=[],E=0/0,null===M.y_stacked||isNaN(M.y_stacked)||(c[M.canvasx]=l.h*M.y_stacked+l.y);s=x,A&&M&&(y(v,M.canvasx,A[1],R),R=[]),v.fill()}}}},t}(),Dygraph=function(){"use strict";var t=function(t,e,a,i){this.is_initial_draw_=!0,this.readyFns_=[],void 0!==i?(console.warn("Using deprecated four-argument dygraph constructor"),this.__old_init__(t,e,a,i)):this.__init__(t,e,a)};return t.NAME="Dygraph",t.VERSION="1.1.1",t.__repr__=function(){return"["+t.NAME+" "+t.VERSION+"]"},t.toString=function(){return t.__repr__()},t.DEFAULT_ROLL_PERIOD=1,t.DEFAULT_WIDTH=480,t.DEFAULT_HEIGHT=320,t.ANIMATION_STEPS=12,t.ANIMATION_DURATION=200,t.KMB_LABELS=["K","M","B","T","Q"],t.KMG2_BIG_LABELS=["k","M","G","T","P","E","Z","Y"],t.KMG2_SMALL_LABELS=["m","u","n","p","f","a","z","y"],t.numberValueFormatter=function(e,a){var i=a("sigFigs");if(null!==i)return t.floatFormat(e,i);var r,n=a("digitsAfterDecimal"),o=a("maxNumberWidth"),s=a("labelsKMB"),l=a("labelsKMG2");if(r=0!==e&&(Math.abs(e)>=Math.pow(10,o)||Math.abs(e)<Math.pow(10,-n))?e.toExponential(n):""+t.round_(e,n),s||l){var h,p=[],g=[];s&&(h=1e3,p=t.KMB_LABELS),l&&(s&&console.warn("Setting both labelsKMB and labelsKMG2. 
Pick one!"),h=1024,p=t.KMG2_BIG_LABELS,g=t.KMG2_SMALL_LABELS);for(var d=Math.abs(e),u=t.pow(h,p.length),c=p.length-1;c>=0;c--,u/=h)if(d>=u){r=t.round_(e/u,n)+p[c];break}if(l){var y=String(e.toExponential()).split("e-");2===y.length&&y[1]>=3&&y[1]<=24&&(r=y[1]%3>0?t.round_(y[0]/t.pow(10,y[1]%3),n):Number(y[0]).toFixed(2),r+=g[Math.floor(y[1]/3)-1])}}return r},t.numberAxisLabelFormatter=function(e,a,i){return t.numberValueFormatter.call(this,e,i)},t.SHORT_MONTH_NAMES_=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],t.dateAxisLabelFormatter=function(e,a,i){var r=i("labelsUTC"),n=r?t.DateAccessorsUTC:t.DateAccessorsLocal,o=n.getFullYear(e),s=n.getMonth(e),l=n.getDate(e),h=n.getHours(e),p=n.getMinutes(e),g=n.getSeconds(e),d=n.getSeconds(e);if(a>=t.DECADAL)return""+o;if(a>=t.MONTHLY)return t.SHORT_MONTH_NAMES_[s]+"&#160;"+o;var u=3600*h+60*p+g+.001*d;return 0===u||a>=t.DAILY?t.zeropad(l)+"&#160;"+t.SHORT_MONTH_NAMES_[s]:t.hmsString_(h,p,g)},t.dateAxisFormatter=t.dateAxisLabelFormatter,t.dateValueFormatter=function(e,a){return t.dateString_(e,a("labelsUTC"))},t.Plotters=DygraphCanvasRenderer._Plotters,t.DEFAULT_ATTRS={highlightCircleSize:3,highlightSeriesOpts:null,highlightSeriesBackgroundAlpha:.5,labelsDivWidth:250,labelsDivStyles:{},labelsSeparateLines:!1,labelsShowZeroValues:!0,labelsKMB:!1,labelsKMG2:!1,showLabelsOnHighlight:!0,digitsAfterDecimal:2,maxNumberWidth:6,sigFigs:null,strokeWidth:1,strokeBorderWidth:0,strokeBorderColor:"white",axisTickSize:3,axisLabelFontSize:14,rightGap:5,showRoller:!1,xValueParser:t.dateParser,delimiter:",",sigma:2,errorBars:!1,fractions:!1,wilsonInterval:!0,customBars:!1,fillGraph:!1,fillAlpha:.15,connectSeparatedPoints:!1,stackedGraph:!1,stackedGraphNaNFill:"all",hideOverlayOnMouseOut:!0,legend:"onmouseover",stepPlot:!1,avoidMinZero:!1,xRangePad:0,yRangePad:null,drawAxesAtZero:!1,titleHeight:28,xLabelHeight:18,yLabelWidth:18,drawXAxis:!0,drawYAxis:!0,axisLineColor:"black",axisLineWidth:.3,gridLineWidth:.3,axisLabelColor:"black",axisLabelWidth:50,drawYGrid:!0,drawXGrid:!0,gridLineColor:"rgb(128,128,128)",interactionModel:null,animatedZooms:!1,showRangeSelector:!1,rangeSelectorHeight:40,rangeSelectorPlotStrokeColor:"#808FAB",rangeSelectorPlotFillColor:"#A7B1C4",showInRangeSelector:null,plotter:[t.Plotters.fillPlotter,t.Plotters.errorPlotter,t.Plotters.linePlotter],plugins:[],axes:{x:{pixelsPerLabel:70,axisLabelWidth:60,axisLabelFormatter:t.dateAxisLabelFormatter,valueFormatter:t.dateValueFormatter,drawGrid:!0,drawAxis:!0,independentTicks:!0,ticker:null},y:{axisLabelWidth:50,pixelsPerLabel:30,valueFormatter:t.numberValueFormatter,axisLabelFormatter:t.numberAxisLabelFormatter,drawGrid:!0,drawAxis:!0,independentTicks:!0,ticker:null},y2:{axisLabelWidth:50,pixelsPerLabel:30,valueFormatter:t.numberValueFormatter,axisLabelFormatter:t.numberAxisLabelFormatter,drawAxis:!0,drawGrid:!1,independentTicks:!1,ticker:null}}},t.HORIZONTAL=1,t.VERTICAL=2,t.PLUGINS=[],t.addedAnnotationCSS=!1,t.prototype.__old_init__=function(e,a,i,r){if(null!==i){for(var n=["Date"],o=0;o<i.length;o++)n.push(i[o]);t.update(r,{labels:n})}this.__init__(e,a,r)},t.prototype.__init__=function(e,a,i){if(/MSIE/.test(navigator.userAgent)&&!window.opera&&"undefined"!=typeof G_vmlCanvasManager&&"complete"!=document.readyState){var r=this;return void setTimeout(function(){r.__init__(e,a,i)},100)}if((null===i||void 0===i)&&(i={}),i=t.mapLegacyOptions_(i),"string"==typeof e&&(e=document.getElementById(e)),!e)return void console.error("Constructing dygraph with a non-existent 
div!");this.isUsingExcanvas_="undefined"!=typeof G_vmlCanvasManager,this.maindiv_=e,this.file_=a,this.rollPeriod_=i.rollPeriod||t.DEFAULT_ROLL_PERIOD,this.previousVerticalX_=-1,this.fractions_=i.fractions||!1,this.dateWindow_=i.dateWindow||null,this.annotations_=[],this.zoomed_x_=!1,this.zoomed_y_=!1,e.innerHTML="",""===e.style.width&&i.width&&(e.style.width=i.width+"px"),""===e.style.height&&i.height&&(e.style.height=i.height+"px"),""===e.style.height&&0===e.clientHeight&&(e.style.height=t.DEFAULT_HEIGHT+"px",""===e.style.width&&(e.style.width=t.DEFAULT_WIDTH+"px")),this.width_=e.clientWidth||i.width||0,this.height_=e.clientHeight||i.height||0,i.stackedGraph&&(i.fillGraph=!0),this.user_attrs_={},t.update(this.user_attrs_,i),this.attrs_={},t.updateDeep(this.attrs_,t.DEFAULT_ATTRS),this.boundaryIds_=[],this.setIndexByName_={},this.datasetIndex_=[],this.registeredEvents_=[],this.eventListeners_={},this.attributes_=new DygraphOptions(this),this.createInterface_(),this.plugins_=[];for(var n=t.PLUGINS.concat(this.getOption("plugins")),o=0;o<n.length;o++){var s,l=n[o];s="undefined"!=typeof l.activate?l:new l;var h={plugin:s,events:{},options:{},pluginOptions:{}},p=s.activate(this);for(var g in p)p.hasOwnProperty(g)&&(h.events[g]=p[g]);this.plugins_.push(h)}for(var o=0;o<this.plugins_.length;o++){var d=this.plugins_[o];for(var g in d.events)if(d.events.hasOwnProperty(g)){var u=d.events[g],c=[d.plugin,u];g in this.eventListeners_?this.eventListeners_[g].push(c):this.eventListeners_[g]=[c]}}this.createDragInterface_(),this.start_()},t.prototype.cascadeEvents_=function(e,a){if(!(e in this.eventListeners_))return!1;var i={dygraph:this,cancelable:!1,defaultPrevented:!1,preventDefault:function(){if(!i.cancelable)throw"Cannot call preventDefault on non-cancelable event.";i.defaultPrevented=!0},propagationStopped:!1,stopPropagation:function(){i.propagationStopped=!0}};t.update(i,a);var r=this.eventListeners_[e];if(r)for(var n=r.length-1;n>=0;n--){var o=r[n][0],s=r[n][1];if(s.call(o,i),i.propagationStopped)break}return i.defaultPrevented},t.prototype.getPluginInstance_=function(t){for(var e=0;e<this.plugins_.length;e++){var a=this.plugins_[e];if(a.plugin instanceof t)return a.plugin}return null},t.prototype.isZoomed=function(t){if(null===t||void 0===t)return this.zoomed_x_||this.zoomed_y_;if("x"===t)return this.zoomed_x_;if("y"===t)return this.zoomed_y_;throw"axis parameter is ["+t+"] must be null, 'x' or 'y'."},t.prototype.toString=function(){var t=this.maindiv_,e=t&&t.id?t.id:t;return"[Dygraph "+e+"]"},t.prototype.attr_=function(t,e){return e?this.attributes_.getForSeries(t,e):this.attributes_.get(t)},t.prototype.getOption=function(t,e){return this.attr_(t,e)},t.prototype.getNumericOption=function(t,e){return this.getOption(t,e)},t.prototype.getStringOption=function(t,e){return this.getOption(t,e)},t.prototype.getBooleanOption=function(t,e){return this.getOption(t,e)},t.prototype.getFunctionOption=function(t,e){return this.getOption(t,e)},t.prototype.getOptionForAxis=function(t,e){return this.attributes_.getForAxis(t,e)},t.prototype.optionsViewForAxis_=function(t){var e=this;return function(a){var i=e.user_attrs_.axes;return i&&i[t]&&i[t].hasOwnProperty(a)?i[t][a]:"x"===t&&"logscale"===a?!1:"undefined"!=typeof e.user_attrs_[a]?e.user_attrs_[a]:(i=e.attrs_.axes,i&&i[t]&&i[t].hasOwnProperty(a)?i[t][a]:"y"==t&&e.axes_[0].hasOwnProperty(a)?e.axes_[0][a]:"y2"==t&&e.axes_[1].hasOwnProperty(a)?e.axes_[1][a]:e.attr_(a))}},t.prototype.rollPeriod=function(){return 
this.rollPeriod_},t.prototype.xAxisRange=function(){return this.dateWindow_?this.dateWindow_:this.xAxisExtremes()},t.prototype.xAxisExtremes=function(){var t=this.getNumericOption("xRangePad")/this.plotter_.area.w;if(0===this.numRows())return[0-t,1+t];var e=this.rawData_[0][0],a=this.rawData_[this.rawData_.length-1][0];if(t){var i=a-e;e-=i*t,a+=i*t}return[e,a]},t.prototype.yAxisRange=function(t){if("undefined"==typeof t&&(t=0),0>t||t>=this.axes_.length)return null;var e=this.axes_[t];return[e.computedValueRange[0],e.computedValueRange[1]]},t.prototype.yAxisRanges=function(){for(var t=[],e=0;e<this.axes_.length;e++)t.push(this.yAxisRange(e));return t},t.prototype.toDomCoords=function(t,e,a){return[this.toDomXCoord(t),this.toDomYCoord(e,a)]},t.prototype.toDomXCoord=function(t){if(null===t)return null;var e=this.plotter_.area,a=this.xAxisRange();return e.x+(t-a[0])/(a[1]-a[0])*e.w},t.prototype.toDomYCoord=function(t,e){var a=this.toPercentYCoord(t,e);if(null===a)return null;var i=this.plotter_.area;return i.y+a*i.h},t.prototype.toDataCoords=function(t,e,a){return[this.toDataXCoord(t),this.toDataYCoord(e,a)]},t.prototype.toDataXCoord=function(e){if(null===e)return null;var a=this.plotter_.area,i=this.xAxisRange();if(this.attributes_.getForAxis("logscale","x")){var r=(e-a.x)/a.w,n=t.log10(i[0]),o=t.log10(i[1]),s=n+r*(o-n),l=Math.pow(t.LOG_SCALE,s);return l}return i[0]+(e-a.x)/a.w*(i[1]-i[0])},t.prototype.toDataYCoord=function(e,a){if(null===e)return null;var i=this.plotter_.area,r=this.yAxisRange(a);if("undefined"==typeof a&&(a=0),this.attributes_.getForAxis("logscale",a)){var n=(e-i.y)/i.h,o=t.log10(r[0]),s=t.log10(r[1]),l=s-n*(s-o),h=Math.pow(t.LOG_SCALE,l);return h}return r[0]+(i.y+i.h-e)/i.h*(r[1]-r[0])},t.prototype.toPercentYCoord=function(e,a){if(null===e)return null;"undefined"==typeof a&&(a=0);var i,r=this.yAxisRange(a),n=this.attributes_.getForAxis("logscale",a);if(n){var o=t.log10(r[0]),s=t.log10(r[1]);i=(s-t.log10(e))/(s-o)}else i=(r[1]-e)/(r[1]-r[0]);return i},t.prototype.toPercentXCoord=function(e){if(null===e)return null;var a,i=this.xAxisRange(),r=this.attributes_.getForAxis("logscale","x");if(r===!0){var n=t.log10(i[0]),o=t.log10(i[1]);a=(t.log10(e)-n)/(o-n)}else a=(e-i[0])/(i[1]-i[0]);return a},t.prototype.numColumns=function(){return this.rawData_?this.rawData_[0]?this.rawData_[0].length:this.attr_("labels").length:0},t.prototype.numRows=function(){return this.rawData_?this.rawData_.length:0},t.prototype.getValue=function(t,e){return 0>t||t>this.rawData_.length?null:0>e||e>this.rawData_[t].length?null:this.rawData_[t][e]},t.prototype.createInterface_=function(){var e=this.maindiv_;this.graphDiv=document.createElement("div"),this.graphDiv.style.textAlign="left",this.graphDiv.style.position="relative",e.appendChild(this.graphDiv),this.canvas_=t.createCanvas(),this.canvas_.style.position="absolute",this.hidden_=this.createPlotKitCanvas_(this.canvas_),this.canvas_ctx_=t.getContext(this.canvas_),this.hidden_ctx_=t.getContext(this.hidden_),this.resizeElements_(),this.graphDiv.appendChild(this.hidden_),this.graphDiv.appendChild(this.canvas_),this.mouseEventElement_=this.createMouseEventElement_(),this.layout_=new DygraphLayout(this);var a=this;this.mouseMoveHandler_=function(t){a.mouseMove_(t)},this.mouseOutHandler_=function(e){var 
i=e.target||e.fromElement,r=e.relatedTarget||e.toElement;t.isNodeContainedBy(i,a.graphDiv)&&!t.isNodeContainedBy(r,a.graphDiv)&&a.mouseOut_(e)},this.addAndTrackEvent(window,"mouseout",this.mouseOutHandler_),this.addAndTrackEvent(this.mouseEventElement_,"mousemove",this.mouseMoveHandler_),this.resizeHandler_||(this.resizeHandler_=function(t){a.resize()},this.addAndTrackEvent(window,"resize",this.resizeHandler_))},t.prototype.resizeElements_=function(){this.graphDiv.style.width=this.width_+"px",this.graphDiv.style.height=this.height_+"px";var e=t.getContextPixelRatio(this.canvas_ctx_);this.canvas_.width=this.width_*e,this.canvas_.height=this.height_*e,this.canvas_.style.width=this.width_+"px",this.canvas_.style.height=this.height_+"px",1!==e&&this.canvas_ctx_.scale(e,e);var a=t.getContextPixelRatio(this.hidden_ctx_);this.hidden_.width=this.width_*a,this.hidden_.height=this.height_*a,this.hidden_.style.width=this.width_+"px",this.hidden_.style.height=this.height_+"px",1!==a&&this.hidden_ctx_.scale(a,a)},t.prototype.destroy=function(){this.canvas_ctx_.restore(),this.hidden_ctx_.restore();for(var e=this.plugins_.length-1;e>=0;e--){var a=this.plugins_.pop();a.plugin.destroy&&a.plugin.destroy()}var i=function(t){for(;t.hasChildNodes();)i(t.firstChild),t.removeChild(t.firstChild)};this.removeTrackedEvents_(),t.removeEvent(window,"mouseout",this.mouseOutHandler_),t.removeEvent(this.mouseEventElement_,"mousemove",this.mouseMoveHandler_),t.removeEvent(window,"resize",this.resizeHandler_),this.resizeHandler_=null,i(this.maindiv_);var r=function(t){for(var e in t)"object"==typeof t[e]&&(t[e]=null)};r(this.layout_),r(this.plotter_),r(this)},t.prototype.createPlotKitCanvas_=function(e){var a=t.createCanvas();return a.style.position="absolute",a.style.top=e.style.top,a.style.left=e.style.left,a.width=this.width_,a.height=this.height_,a.style.width=this.width_+"px",a.style.height=this.height_+"px",a},t.prototype.createMouseEventElement_=function(){if(this.isUsingExcanvas_){var t=document.createElement("div");return t.style.position="absolute",t.style.backgroundColor="white",t.style.filter="alpha(opacity=0)",t.style.width=this.width_+"px",t.style.height=this.height_+"px",this.graphDiv.appendChild(t),t}return this.canvas_},t.prototype.setColors_=function(){var e=this.getLabels(),a=e.length-1;this.colors_=[],this.colorsMap_={};for(var i=this.getNumericOption("colorSaturation")||1,r=this.getNumericOption("colorValue")||.5,n=Math.ceil(a/2),o=this.getOption("colors"),s=this.visibility(),l=0;a>l;l++)if(s[l]){
+var h=e[l+1],p=this.attributes_.getForSeries("color",h);if(!p)if(o)p=o[l%o.length];else{var g=l%2?n+(l+1)/2:Math.ceil((l+1)/2),d=1*g/(1+a);p=t.hsvToRGB(d,i,r)}this.colors_.push(p),this.colorsMap_[h]=p}},t.prototype.getColors=function(){return this.colors_},t.prototype.getPropertiesForSeries=function(t){for(var e=-1,a=this.getLabels(),i=1;i<a.length;i++)if(a[i]==t){e=i;break}return-1==e?null:{name:t,column:e,visible:this.visibility()[e-1],color:this.colorsMap_[t],axis:1+this.attributes_.axisForSeries(t)}},t.prototype.createRollInterface_=function(){this.roller_||(this.roller_=document.createElement("input"),this.roller_.type="text",this.roller_.style.display="none",this.graphDiv.appendChild(this.roller_));var t=this.getBooleanOption("showRoller")?"block":"none",e=this.plotter_.area,a={position:"absolute",zIndex:10,top:e.y+e.h-25+"px",left:e.x+1+"px",display:t};this.roller_.size="2",this.roller_.value=this.rollPeriod_;for(var i in a)a.hasOwnProperty(i)&&(this.roller_.style[i]=a[i]);var r=this;this.roller_.onchange=function(){r.adjustRoll(r.roller_.value)}},t.prototype.createDragInterface_=function(){var e={isZooming:!1,isPanning:!1,is2DPan:!1,dragStartX:null,dragStartY:null,dragEndX:null,dragEndY:null,dragDirection:null,prevEndX:null,prevEndY:null,prevDragDirection:null,cancelNextDblclick:!1,initialLeftmostDate:null,xUnitsPerPixel:null,dateRange:null,px:0,py:0,boundedDates:null,boundedValues:null,tarp:new t.IFrameTarp,initializeMouseDown:function(e,a,i){e.preventDefault?e.preventDefault():(e.returnValue=!1,e.cancelBubble=!0);var r=t.findPos(a.canvas_);i.px=r.x,i.py=r.y,i.dragStartX=t.dragGetX_(e,i),i.dragStartY=t.dragGetY_(e,i),i.cancelNextDblclick=!1,i.tarp.cover()},destroy:function(){var t=this;if((t.isZooming||t.isPanning)&&(t.isZooming=!1,t.dragStartX=null,t.dragStartY=null),t.isPanning){t.isPanning=!1,t.draggingDate=null,t.dateRange=null;for(var e=0;e<i.axes_.length;e++)delete i.axes_[e].draggingValue,delete i.axes_[e].dragValueRange}t.tarp.uncover()}},a=this.getOption("interactionModel"),i=this,r=function(t){return function(a){t(a,i,e)}};for(var n in a)a.hasOwnProperty(n)&&this.addAndTrackEvent(this.mouseEventElement_,n,r(a[n]));if(!a.willDestroyContextMyself){var o=function(t){e.destroy()};this.addAndTrackEvent(document,"mouseup",o)}},t.prototype.drawZoomRect_=function(e,a,i,r,n,o,s,l){var h=this.canvas_ctx_;o==t.HORIZONTAL?h.clearRect(Math.min(a,s),this.layout_.getPlotArea().y,Math.abs(a-s),this.layout_.getPlotArea().h):o==t.VERTICAL&&h.clearRect(this.layout_.getPlotArea().x,Math.min(r,l),this.layout_.getPlotArea().w,Math.abs(r-l)),e==t.HORIZONTAL?i&&a&&(h.fillStyle="rgba(128,128,128,0.33)",h.fillRect(Math.min(a,i),this.layout_.getPlotArea().y,Math.abs(i-a),this.layout_.getPlotArea().h)):e==t.VERTICAL&&n&&r&&(h.fillStyle="rgba(128,128,128,0.33)",h.fillRect(this.layout_.getPlotArea().x,Math.min(r,n),this.layout_.getPlotArea().w,Math.abs(n-r))),this.isUsingExcanvas_&&(this.currentZoomRectArgs_=[e,a,i,r,n,0,0,0])},t.prototype.clearZoomRect_=function(){this.currentZoomRectArgs_=null,this.canvas_ctx_.clearRect(0,0,this.width_,this.height_)},t.prototype.doZoomX_=function(t,e){this.currentZoomRectArgs_=null;var a=this.toDataXCoord(t),i=this.toDataXCoord(e);this.doZoomXDates_(a,i)},t.prototype.doZoomXDates_=function(t,e){var a=this.xAxisRange(),i=[t,e];this.zoomed_x_=!0;var 
r=this;this.doAnimatedZoom(a,i,null,null,function(){r.getFunctionOption("zoomCallback")&&r.getFunctionOption("zoomCallback").call(r,t,e,r.yAxisRanges())})},t.prototype.doZoomY_=function(t,e){this.currentZoomRectArgs_=null;for(var a=this.yAxisRanges(),i=[],r=0;r<this.axes_.length;r++){var n=this.toDataYCoord(t,r),o=this.toDataYCoord(e,r);i.push([o,n])}this.zoomed_y_=!0;var s=this;this.doAnimatedZoom(null,null,a,i,function(){if(s.getFunctionOption("zoomCallback")){var t=s.xAxisRange();s.getFunctionOption("zoomCallback").call(s,t[0],t[1],s.yAxisRanges())}})},t.zoomAnimationFunction=function(t,e){var a=1.5;return(1-Math.pow(a,-t))/(1-Math.pow(a,-e))},t.prototype.resetZoom=function(){var t=!1,e=!1,a=!1;null!==this.dateWindow_&&(t=!0,e=!0);for(var i=0;i<this.axes_.length;i++)"undefined"!=typeof this.axes_[i].valueWindow&&null!==this.axes_[i].valueWindow&&(t=!0,a=!0);if(this.clearSelection(),t){this.zoomed_x_=!1,this.zoomed_y_=!1;var r=this.rawData_[0][0],n=this.rawData_[this.rawData_.length-1][0];if(!this.getBooleanOption("animatedZooms")){for(this.dateWindow_=null,i=0;i<this.axes_.length;i++)null!==this.axes_[i].valueWindow&&delete this.axes_[i].valueWindow;return this.drawGraph_(),void(this.getFunctionOption("zoomCallback")&&this.getFunctionOption("zoomCallback").call(this,r,n,this.yAxisRanges()))}var o=null,s=null,l=null,h=null;if(e&&(o=this.xAxisRange(),s=[r,n]),a){l=this.yAxisRanges();var p=this.gatherDatasets_(this.rolledSeries_,null),g=p.extremes;for(this.computeYAxisRanges_(g),h=[],i=0;i<this.axes_.length;i++){var d=this.axes_[i];h.push(null!==d.valueRange&&void 0!==d.valueRange?d.valueRange:d.extremeRange)}}var u=this;this.doAnimatedZoom(o,s,l,h,function(){u.dateWindow_=null;for(var t=0;t<u.axes_.length;t++)null!==u.axes_[t].valueWindow&&delete u.axes_[t].valueWindow;u.getFunctionOption("zoomCallback")&&u.getFunctionOption("zoomCallback").call(u,r,n,u.yAxisRanges())})}},t.prototype.doAnimatedZoom=function(e,a,i,r,n){var o,s,l=this.getBooleanOption("animatedZooms")?t.ANIMATION_STEPS:1,h=[],p=[];if(null!==e&&null!==a)for(o=1;l>=o;o++)s=t.zoomAnimationFunction(o,l),h[o-1]=[e[0]*(1-s)+s*a[0],e[1]*(1-s)+s*a[1]];if(null!==i&&null!==r)for(o=1;l>=o;o++){s=t.zoomAnimationFunction(o,l);for(var g=[],d=0;d<this.axes_.length;d++)g.push([i[d][0]*(1-s)+s*r[d][0],i[d][1]*(1-s)+s*r[d][1]]);p[o-1]=g}var u=this;t.repeatAndCleanup(function(t){if(p.length)for(var e=0;e<u.axes_.length;e++){var a=p[t][e];u.axes_[e].valueWindow=[a[0],a[1]]}h.length&&(u.dateWindow_=h[t]),u.drawGraph_()},l,t.ANIMATION_DURATION/l,n)},t.prototype.getArea=function(){return this.plotter_.area},t.prototype.eventToDomCoords=function(e){if(e.offsetX&&e.offsetY)return[e.offsetX,e.offsetY];var a=t.findPos(this.mouseEventElement_),i=t.pageX(e)-a.x,r=t.pageY(e)-a.y;return[i,r]},t.prototype.findClosestRow=function(e){for(var a=1/0,i=-1,r=this.layout_.points,n=0;n<r.length;n++)for(var o=r[n],s=o.length,l=0;s>l;l++){var h=o[l];if(t.isValidPoint(h,!0)){var p=Math.abs(h.canvasx-e);a>p&&(a=p,i=h.idx)}}return i},t.prototype.findClosestPoint=function(e,a){for(var i,r,n,o,s,l,h,p=1/0,g=this.layout_.points.length-1;g>=0;--g)for(var d=this.layout_.points[g],u=0;u<d.length;++u)o=d[u],t.isValidPoint(o)&&(r=o.canvasx-e,n=o.canvasy-a,i=r*r+n*n,p>i&&(p=i,s=o,l=g,h=o.idx));var c=this.layout_.setNames[l];return{row:h,seriesName:c,point:s}},t.prototype.findStackedPoint=function(e,a){for(var i,r,n=this.findClosestRow(e),o=0;o<this.layout_.points.length;++o){var s=this.getLeftBoundary_(o),l=n-s,h=this.layout_.points[o];if(!(l>=h.length)){var 
p=h[l];if(t.isValidPoint(p)){var g=p.canvasy;if(e>p.canvasx&&l+1<h.length){var d=h[l+1];if(t.isValidPoint(d)){var u=d.canvasx-p.canvasx;if(u>0){var c=(e-p.canvasx)/u;g+=c*(d.canvasy-p.canvasy)}}}else if(e<p.canvasx&&l>0){var y=h[l-1];if(t.isValidPoint(y)){var u=p.canvasx-y.canvasx;if(u>0){var c=(p.canvasx-e)/u;g+=c*(y.canvasy-p.canvasy)}}}(0===o||a>g)&&(i=p,r=o)}}}var _=this.layout_.setNames[r];return{row:n,seriesName:_,point:i}},t.prototype.mouseMove_=function(t){var e=this.layout_.points;if(void 0!==e&&null!==e){var a=this.eventToDomCoords(t),i=a[0],r=a[1],n=this.getOption("highlightSeriesOpts"),o=!1;if(n&&!this.isSeriesLocked()){var s;s=this.getBooleanOption("stackedGraph")?this.findStackedPoint(i,r):this.findClosestPoint(i,r),o=this.setSelection(s.row,s.seriesName)}else{var l=this.findClosestRow(i);o=this.setSelection(l)}var h=this.getFunctionOption("highlightCallback");h&&o&&h.call(this,t,this.lastx_,this.selPoints_,this.lastRow_,this.highlightSet_)}},t.prototype.getLeftBoundary_=function(t){if(this.boundaryIds_[t])return this.boundaryIds_[t][0];for(var e=0;e<this.boundaryIds_.length;e++)if(void 0!==this.boundaryIds_[e])return this.boundaryIds_[e][0];return 0},t.prototype.animateSelection_=function(e){var a=10,i=30;void 0===this.fadeLevel&&(this.fadeLevel=0),void 0===this.animateId&&(this.animateId=0);var r=this.fadeLevel,n=0>e?r:a-r;if(0>=n)return void(this.fadeLevel&&this.updateSelection_(1));var o=++this.animateId,s=this;t.repeatAndCleanup(function(t){s.animateId==o&&(s.fadeLevel+=e,0===s.fadeLevel?s.clearSelection():s.updateSelection_(s.fadeLevel/a))},n,i,function(){})},t.prototype.updateSelection_=function(e){this.cascadeEvents_("select",{selectedRow:this.lastRow_,selectedX:this.lastx_,selectedPoints:this.selPoints_});var a,i=this.canvas_ctx_;if(this.getOption("highlightSeriesOpts")){i.clearRect(0,0,this.width_,this.height_);var r=1-this.getNumericOption("highlightSeriesBackgroundAlpha");if(r){var n=!0;if(n){if(void 0===e)return void this.animateSelection_(1);r*=e}i.fillStyle="rgba(255,255,255,"+r+")",i.fillRect(0,0,this.width_,this.height_)}this.plotter_._renderLineChart(this.highlightSet_,i)}else if(this.previousVerticalX_>=0){var o=0,s=this.attr_("labels");for(a=1;a<s.length;a++){var l=this.getNumericOption("highlightCircleSize",s[a]);l>o&&(o=l)}var h=this.previousVerticalX_;i.clearRect(h-o-1,0,2*o+2,this.height_)}if(this.isUsingExcanvas_&&this.currentZoomRectArgs_&&t.prototype.drawZoomRect_.apply(this,this.currentZoomRectArgs_),this.selPoints_.length>0){var p=this.selPoints_[0].canvasx;for(i.save(),a=0;a<this.selPoints_.length;a++){var g=this.selPoints_[a];if(t.isOK(g.canvasy)){var d=this.getNumericOption("highlightCircleSize",g.name),u=this.getFunctionOption("drawHighlightPointCallback",g.name),c=this.plotter_.colors[g.name];u||(u=t.Circles.DEFAULT),i.lineWidth=this.getNumericOption("strokeWidth",g.name),i.strokeStyle=c,i.fillStyle=c,u.call(this,this,g.name,i,p,g.canvasy,c,d,g.idx)}}i.restore(),this.previousVerticalX_=p}},t.prototype.setSelection=function(t,e,a){this.selPoints_=[];var i=!1;if(t!==!1&&t>=0){t!=this.lastRow_&&(i=!0),this.lastRow_=t;for(var r=0;r<this.layout_.points.length;++r){var n=this.layout_.points[r],o=t-this.getLeftBoundary_(r);if(o<n.length&&n[o].idx==t){var s=n[o];null!==s.yval&&this.selPoints_.push(s)}else for(var l=0;l<n.length;++l){var s=n[l];if(s.idx==t){null!==s.yval&&this.selPoints_.push(s);break}}}}else this.lastRow_>=0&&(i=!0),this.lastRow_=-1;return this.selPoints_.length?this.lastx_=this.selPoints_[0].xval:this.lastx_=-1,void 
0!==e&&(this.highlightSet_!==e&&(i=!0),this.highlightSet_=e),void 0!==a&&(this.lockedSet_=a),i&&this.updateSelection_(void 0),i},t.prototype.mouseOut_=function(t){this.getFunctionOption("unhighlightCallback")&&this.getFunctionOption("unhighlightCallback").call(this,t),this.getBooleanOption("hideOverlayOnMouseOut")&&!this.lockedSet_&&this.clearSelection()},t.prototype.clearSelection=function(){return this.cascadeEvents_("deselect",{}),this.lockedSet_=!1,this.fadeLevel?void this.animateSelection_(-1):(this.canvas_ctx_.clearRect(0,0,this.width_,this.height_),this.fadeLevel=0,this.selPoints_=[],this.lastx_=-1,this.lastRow_=-1,void(this.highlightSet_=null))},t.prototype.getSelection=function(){if(!this.selPoints_||this.selPoints_.length<1)return-1;for(var t=0;t<this.layout_.points.length;t++)for(var e=this.layout_.points[t],a=0;a<e.length;a++)if(e[a].x==this.selPoints_[0].x)return e[a].idx;return-1},t.prototype.getHighlightSeries=function(){return this.highlightSet_},t.prototype.isSeriesLocked=function(){return this.lockedSet_},t.prototype.loadedEvent_=function(t){this.rawData_=this.parseCSV_(t),this.cascadeDataDidUpdateEvent_(),this.predraw_()},t.prototype.addXTicks_=function(){var t;t=this.dateWindow_?[this.dateWindow_[0],this.dateWindow_[1]]:this.xAxisExtremes();var e=this.optionsViewForAxis_("x"),a=e("ticker")(t[0],t[1],this.plotter_.area.w,e,this);this.layout_.setXTicks(a)},t.prototype.getHandlerClass_=function(){var e;return e=this.attr_("dataHandler")?this.attr_("dataHandler"):this.fractions_?this.getBooleanOption("errorBars")?t.DataHandlers.FractionsBarsHandler:t.DataHandlers.DefaultFractionHandler:this.getBooleanOption("customBars")?t.DataHandlers.CustomBarsHandler:this.getBooleanOption("errorBars")?t.DataHandlers.ErrorBarsHandler:t.DataHandlers.DefaultHandler},t.prototype.predraw_=function(){var t=new Date;this.dataHandler_=new(this.getHandlerClass_()),this.layout_.computePlotArea(),this.computeYAxes_(),this.is_initial_draw_||(this.canvas_ctx_.restore(),this.hidden_ctx_.restore()),this.canvas_ctx_.save(),this.hidden_ctx_.save(),this.plotter_=new DygraphCanvasRenderer(this,this.hidden_,this.hidden_ctx_,this.layout_),this.createRollInterface_(),this.cascadeEvents_("predraw"),this.rolledSeries_=[null];for(var e=1;e<this.numColumns();e++){var a=this.dataHandler_.extractSeries(this.rawData_,e,this.attributes_);this.rollPeriod_>1&&(a=this.dataHandler_.rollingAverage(a,this.rollPeriod_,this.attributes_)),this.rolledSeries_.push(a)}this.drawGraph_();var i=new Date;this.drawingTimeMs_=i-t},t.PointType=void 0,t.stackPoints_=function(t,e,a,i){for(var r=null,n=null,o=null,s=-1,l=function(e){if(!(s>=e))for(var a=e;a<t.length;++a)if(o=null,!isNaN(t[a].yval)&&null!==t[a].yval){s=a,o=t[a];break}},h=0;h<t.length;++h){var p=t[h],g=p.xval;void 0===e[g]&&(e[g]=0);var d=p.yval;isNaN(d)||null===d?"none"==i?d=0:(l(h),d=n&&o&&"none"!=i?n.yval+(o.yval-n.yval)*((g-n.xval)/(o.xval-n.xval)):n&&"all"==i?n.yval:o&&"all"==i?o.yval:0):n=p;var u=e[g];r!=g&&(u+=d,e[g]=u),r=g,p.yval_stacked=u,u>a[1]&&(a[1]=u),u<a[0]&&(a[0]=u)}},t.prototype.gatherDatasets_=function(e,a){var i,r,n,o,s,l,h=[],p=[],g=[],d={},u=e.length-1;for(i=u;i>=1;i--)if(this.visibility()[i-1]){if(a){l=e[i];var c=a[0],y=a[1];for(n=null,o=null,r=0;r<l.length;r++)l[r][0]>=c&&null===n&&(n=r),l[r][0]<=y&&(o=r);null===n&&(n=0);for(var _=n,v=!0;v&&_>0;)_--,v=null===l[_][1];null===o&&(o=l.length-1);var f=o;for(v=!0;v&&f<l.length-1;)f++,v=null===l[f][1];_!==n&&(n=_),f!==o&&(o=f),h[i-1]=[n,o],l=l.slice(n,o+1)}else l=e[i],h[i-1]=[0,l.length-1];var 
x=this.attr_("labels")[i],m=this.dataHandler_.getExtremeYValues(l,a,this.getBooleanOption("stepPlot",x)),D=this.dataHandler_.seriesToPoints(l,x,h[i-1][0]);this.getBooleanOption("stackedGraph")&&(s=this.attributes_.axisForSeries(x),void 0===g[s]&&(g[s]=[]),t.stackPoints_(D,g[s],m,this.getBooleanOption("stackedGraphNaNFill"))),d[x]=m,p[i]=D}return{points:p,extremes:d,boundaryIds:h}},t.prototype.drawGraph_=function(){var t=new Date,e=this.is_initial_draw_;this.is_initial_draw_=!1,this.layout_.removeAllDatasets(),this.setColors_(),this.attrs_.pointSize=.5*this.getNumericOption("highlightCircleSize");var a=this.gatherDatasets_(this.rolledSeries_,this.dateWindow_),i=a.points,r=a.extremes;this.boundaryIds_=a.boundaryIds,this.setIndexByName_={};var n=this.attr_("labels");n.length>0&&(this.setIndexByName_[n[0]]=0);for(var o=0,s=1;s<i.length;s++)this.setIndexByName_[n[s]]=s,this.visibility()[s-1]&&(this.layout_.addDataset(n[s],i[s]),this.datasetIndex_[s]=o++);this.computeYAxisRanges_(r),this.layout_.setYAxes(this.axes_),this.addXTicks_();var l=this.zoomed_x_;if(this.zoomed_x_=l,this.layout_.evaluate(),this.renderGraph_(e),this.getStringOption("timingName")){var h=new Date;console.log(this.getStringOption("timingName")+" - drawGraph: "+(h-t)+"ms")}},t.prototype.renderGraph_=function(t){this.cascadeEvents_("clearChart"),this.plotter_.clear(),this.getFunctionOption("underlayCallback")&&this.getFunctionOption("underlayCallback").call(this,this.hidden_ctx_,this.layout_.getPlotArea(),this,this);var e={canvas:this.hidden_,drawingContext:this.hidden_ctx_};if(this.cascadeEvents_("willDrawChart",e),this.plotter_.render(),this.cascadeEvents_("didDrawChart",e),this.lastRow_=-1,this.canvas_.getContext("2d").clearRect(0,0,this.width_,this.height_),null!==this.getFunctionOption("drawCallback")&&this.getFunctionOption("drawCallback").call(this,this,t),t)for(this.readyFired_=!0;this.readyFns_.length>0;){var a=this.readyFns_.pop();a(this)}},t.prototype.computeYAxes_=function(){var e,a,i,r,n;if(void 0!==this.axes_&&this.user_attrs_.hasOwnProperty("valueRange")===!1)for(e=[],i=0;i<this.axes_.length;i++)e.push(this.axes_[i].valueWindow);for(this.axes_=[],a=0;a<this.attributes_.numAxes();a++)r={g:this},t.update(r,this.attributes_.axisOptions(a)),this.axes_[a]=r;if(n=this.attr_("valueRange"),n&&(this.axes_[0].valueRange=n),void 0!==e){var o=Math.min(e.length,this.axes_.length);for(i=0;o>i;i++)this.axes_[i].valueWindow=e[i]}for(a=0;a<this.axes_.length;a++)if(0===a)r=this.optionsViewForAxis_("y"+(a?"2":"")),n=r("valueRange"),n&&(this.axes_[a].valueRange=n);else{var s=this.user_attrs_.axes;s&&s.y2&&(n=s.y2.valueRange,n&&(this.axes_[a].valueRange=n))}},t.prototype.numAxes=function(){return this.attributes_.numAxes()},t.prototype.axisPropertiesForSeries=function(t){return this.axes_[this.attributes_.axisForSeries(t)]},t.prototype.computeYAxisRanges_=function(t){for(var e,a,i,r,n,o=function(t){return isNaN(parseFloat(t))},s=this.attributes_.numAxes(),l=0;s>l;l++){var h=this.axes_[l],p=this.attributes_.getForAxis("logscale",l),g=this.attributes_.getForAxis("includeZero",l),d=this.attributes_.getForAxis("independentTicks",l);if(i=this.attributes_.seriesForAxis(l),e=!0,r=.1,null!==this.getNumericOption("yRangePad")&&(e=!1,r=this.getNumericOption("yRangePad")/this.plotter_.area.h),0===i.length)h.extremeRange=[0,1];else{for(var 
u,c,y=1/0,_=-(1/0),v=0;v<i.length;v++)t.hasOwnProperty(i[v])&&(u=t[i[v]][0],null!==u&&(y=Math.min(u,y)),c=t[i[v]][1],null!==c&&(_=Math.max(c,_)));g&&!p&&(y>0&&(y=0),0>_&&(_=0)),y==1/0&&(y=0),_==-(1/0)&&(_=1),a=_-y,0===a&&(0!==_?a=Math.abs(_):(_=1,a=1));var f,x;if(p)if(e)f=_+r*a,x=y;else{var m=Math.exp(Math.log(a)*r);f=_*m,x=y/m}else f=_+r*a,x=y-r*a,e&&!this.getBooleanOption("avoidMinZero")&&(0>x&&y>=0&&(x=0),f>0&&0>=_&&(f=0));h.extremeRange=[x,f]}if(h.valueWindow)h.computedValueRange=[h.valueWindow[0],h.valueWindow[1]];else if(h.valueRange){var D=o(h.valueRange[0])?h.extremeRange[0]:h.valueRange[0],w=o(h.valueRange[1])?h.extremeRange[1]:h.valueRange[1];if(!e)if(h.logscale){var m=Math.exp(Math.log(a)*r);D*=m,w/=m}else a=w-D,D-=a*r,w+=a*r;h.computedValueRange=[D,w]}else h.computedValueRange=h.extremeRange;if(d){h.independentTicks=d;var A=this.optionsViewForAxis_("y"+(l?"2":"")),b=A("ticker");h.ticks=b(h.computedValueRange[0],h.computedValueRange[1],this.plotter_.area.h,A,this),n||(n=h)}}if(void 0===n)throw'Configuration Error: At least one axis has to have the "independentTicks" option activated.';for(var l=0;s>l;l++){var h=this.axes_[l];if(!h.independentTicks){for(var A=this.optionsViewForAxis_("y"+(l?"2":"")),b=A("ticker"),T=n.ticks,E=n.computedValueRange[1]-n.computedValueRange[0],C=h.computedValueRange[1]-h.computedValueRange[0],L=[],P=0;P<T.length;P++){var S=(T[P].v-n.computedValueRange[0])/E,O=h.computedValueRange[0]+S*C;L.push(O)}h.ticks=b(h.computedValueRange[0],h.computedValueRange[1],this.plotter_.area.h,A,this,L)}}},t.prototype.detectTypeFromString_=function(t){var e=!1,a=t.indexOf("-");a>0&&"e"!=t[a-1]&&"E"!=t[a-1]||t.indexOf("/")>=0||isNaN(parseFloat(t))?e=!0:8==t.length&&t>"19700101"&&"20371231">t&&(e=!0),this.setXAxisOptions_(e)},t.prototype.setXAxisOptions_=function(e){e?(this.attrs_.xValueParser=t.dateParser,this.attrs_.axes.x.valueFormatter=t.dateValueFormatter,this.attrs_.axes.x.ticker=t.dateTicker,this.attrs_.axes.x.axisLabelFormatter=t.dateAxisLabelFormatter):(this.attrs_.xValueParser=function(t){return parseFloat(t)},this.attrs_.axes.x.valueFormatter=function(t){return t},this.attrs_.axes.x.ticker=t.numericTicks,this.attrs_.axes.x.axisLabelFormatter=this.attrs_.axes.x.valueFormatter)},t.prototype.parseCSV_=function(e){var a,i,r=[],n=t.detectLineDelimiter(e),o=e.split(n||"\n"),s=this.getStringOption("delimiter");-1==o[0].indexOf(s)&&o[0].indexOf(" ")>=0&&(s=" ");var l=0;"labels"in this.user_attrs_||(l=1,this.attrs_.labels=o[0].split(s),this.attributes_.reparseSeries());for(var h,p=0,g=!1,d=this.attr_("labels").length,u=!1,c=l;c<o.length;c++){var y=o[c];if(p=c,0!==y.length&&"#"!=y[0]){var _=y.split(s);if(!(_.length<2)){var v=[];if(g||(this.detectTypeFromString_(_[0]),h=this.getFunctionOption("xValueParser"),g=!0),v[0]=h(_[0],this),this.fractions_)for(i=1;i<_.length;i++)a=_[i].split("/"),2!=a.length?(console.error('Expected fractional "num/den" values in CSV data but found a value \''+_[i]+"' on line "+(1+c)+" ('"+y+"') which is not of this form."),v[i]=[0,0]):v[i]=[t.parseFloat_(a[0],c,y),t.parseFloat_(a[1],c,y)];else if(this.getBooleanOption("errorBars"))for(_.length%2!=1&&console.error("Expected alternating (value, stdev.) 
pairs in CSV data but line "+(1+c)+" has an odd number of values ("+(_.length-1)+"): '"+y+"'"),i=1;i<_.length;i+=2)v[(i+1)/2]=[t.parseFloat_(_[i],c,y),t.parseFloat_(_[i+1],c,y)];else if(this.getBooleanOption("customBars"))for(i=1;i<_.length;i++){var f=_[i];/^ *$/.test(f)?v[i]=[null,null,null]:(a=f.split(";"),3==a.length?v[i]=[t.parseFloat_(a[0],c,y),t.parseFloat_(a[1],c,y),t.parseFloat_(a[2],c,y)]:console.warn('When using customBars, values must be either blank or "low;center;high" tuples (got "'+f+'" on line '+(1+c)))}else for(i=1;i<_.length;i++)v[i]=t.parseFloat_(_[i],c,y);if(r.length>0&&v[0]<r[r.length-1][0]&&(u=!0),v.length!=d&&console.error("Number of columns in line "+c+" ("+v.length+") does not agree with number of labels ("+d+") "+y),0===c&&this.attr_("labels")){var x=!0;for(i=0;x&&i<v.length;i++)v[i]&&(x=!1);if(x){console.warn("The dygraphs 'labels' option is set, but the first row of CSV data ('"+y+"') appears to also contain labels. Will drop the CSV labels and use the option labels.");continue}}r.push(v)}}}return u&&(console.warn("CSV is out of order; order it correctly to speed loading."),r.sort(function(t,e){return t[0]-e[0]})),r},t.prototype.parseArray_=function(e){if(0===e.length)return console.error("Can't plot empty data set"),null;if(0===e[0].length)return console.error("Data set cannot contain an empty row"),null;var a;if(null===this.attr_("labels")){for(console.warn("Using default labels. Set labels explicitly via 'labels' in the options parameter"),this.attrs_.labels=["X"],a=1;a<e[0].length;a++)this.attrs_.labels.push("Y"+a);this.attributes_.reparseSeries()}else{var i=this.attr_("labels");if(i.length!=e[0].length)return console.error("Mismatch between number of labels ("+i+") and number of columns in array ("+e[0].length+")"),null}if(t.isDateLike(e[0][0])){this.attrs_.axes.x.valueFormatter=t.dateValueFormatter,this.attrs_.axes.x.ticker=t.dateTicker,this.attrs_.axes.x.axisLabelFormatter=t.dateAxisLabelFormatter;var r=t.clone(e);for(a=0;a<e.length;a++){if(0===r[a].length)return console.error("Row "+(1+a)+" of data is empty"),null;if(null===r[a][0]||"function"!=typeof r[a][0].getTime||isNaN(r[a][0].getTime()))return console.error("x value in row "+(1+a)+" is not a Date"),null;r[a][0]=r[a][0].getTime()}return r}return this.attrs_.axes.x.valueFormatter=function(t){return t},this.attrs_.axes.x.ticker=t.numericTicks,this.attrs_.axes.x.axisLabelFormatter=t.numberAxisLabelFormatter,e},t.prototype.parseDataTable_=function(e){var a=function(t){var e=String.fromCharCode(65+t%26);for(t=Math.floor(t/26);t>0;)e=String.fromCharCode(65+(t-1)%26)+e.toLowerCase(),t=Math.floor((t-1)/26);return e},i=e.getNumberOfColumns(),r=e.getNumberOfRows(),n=e.getColumnType(0);if("date"==n||"datetime"==n)this.attrs_.xValueParser=t.dateParser,this.attrs_.axes.x.valueFormatter=t.dateValueFormatter,this.attrs_.axes.x.ticker=t.dateTicker,this.attrs_.axes.x.axisLabelFormatter=t.dateAxisLabelFormatter;else{if("number"!=n)return console.error("only 'date', 'datetime' and 'number' types are supported for column 1 of DataTable input (Got '"+n+"')"),null;this.attrs_.xValueParser=function(t){return parseFloat(t)},this.attrs_.axes.x.valueFormatter=function(t){return t},this.attrs_.axes.x.ticker=t.numericTicks,this.attrs_.axes.x.axisLabelFormatter=this.attrs_.axes.x.valueFormatter}var o,s,l=[],h={},p=!1;for(o=1;i>o;o++){var g=e.getColumnType(o);if("number"==g)l.push(o);else if("string"==g&&this.getBooleanOption("displayAnnotations")){var d=l[l.length-1];h.hasOwnProperty(d)?h[d].push(o):h[d]=[o],p=!0}else 
console.error("Only 'number' is supported as a dependent type with Gviz. 'string' is only supported if displayAnnotations is true")}var u=[e.getColumnLabel(0)];for(o=0;o<l.length;o++)u.push(e.getColumnLabel(l[o])),this.getBooleanOption("errorBars")&&(o+=1);this.attrs_.labels=u,i=u.length;var c=[],y=!1,_=[];for(o=0;r>o;o++){var v=[];if("undefined"!=typeof e.getValue(o,0)&&null!==e.getValue(o,0)){if(v.push("date"==n||"datetime"==n?e.getValue(o,0).getTime():e.getValue(o,0)),this.getBooleanOption("errorBars"))for(s=0;i-1>s;s++)v.push([e.getValue(o,1+2*s),e.getValue(o,2+2*s)]);else{for(s=0;s<l.length;s++){var f=l[s];if(v.push(e.getValue(o,f)),p&&h.hasOwnProperty(f)&&null!==e.getValue(o,h[f][0])){var x={};x.series=e.getColumnLabel(f),x.xval=v[0],x.shortText=a(_.length),x.text="";for(var m=0;m<h[f].length;m++)m&&(x.text+="\n"),x.text+=e.getValue(o,h[f][m]);_.push(x)}}for(s=0;s<v.length;s++)isFinite(v[s])||(v[s]=null)}c.length>0&&v[0]<c[c.length-1][0]&&(y=!0),c.push(v)}else console.warn("Ignoring row "+o+" of DataTable because of undefined or null first column.")}y&&(console.warn("DataTable is out of order; order it correctly to speed loading."),c.sort(function(t,e){return t[0]-e[0]})),this.rawData_=c,_.length>0&&this.setAnnotations(_,!0),this.attributes_.reparseSeries()},t.prototype.cascadeDataDidUpdateEvent_=function(){this.cascadeEvents_("dataDidUpdate",{})},t.prototype.start_=function(){var e=this.file_;if("function"==typeof e&&(e=e()),t.isArrayLike(e))this.rawData_=this.parseArray_(e),this.cascadeDataDidUpdateEvent_(),this.predraw_();else if("object"==typeof e&&"function"==typeof e.getColumnRange)this.parseDataTable_(e),this.cascadeDataDidUpdateEvent_(),this.predraw_();else if("string"==typeof e){var a=t.detectLineDelimiter(e);if(a)this.loadedEvent_(e);else{var i;i=window.XMLHttpRequest?new XMLHttpRequest:new ActiveXObject("Microsoft.XMLHTTP");var r=this;i.onreadystatechange=function(){4==i.readyState&&(200===i.status||0===i.status)&&r.loadedEvent_(i.responseText)},i.open("GET",e,!0),i.send(null)}}else console.error("Unknown data format: "+typeof e)},t.prototype.updateOptions=function(e,a){"undefined"==typeof a&&(a=!1);var i=e.file,r=t.mapLegacyOptions_(e);"rollPeriod"in r&&(this.rollPeriod_=r.rollPeriod),"dateWindow"in r&&(this.dateWindow_=r.dateWindow,"isZoomedIgnoreProgrammaticZoom"in r||(this.zoomed_x_=null!==r.dateWindow)),"valueRange"in r&&!("isZoomedIgnoreProgrammaticZoom"in r)&&(this.zoomed_y_=null!==r.valueRange);var n=t.isPixelChangingOptionList(this.attr_("labels"),r);t.updateDeep(this.user_attrs_,r),this.attributes_.reparseSeries(),i?(this.cascadeEvents_("dataWillUpdate",{}),this.file_=i,a||this.start_()):a||(n?this.predraw_():this.renderGraph_(!1))},t.mapLegacyOptions_=function(t){var e={};for(var a in t)t.hasOwnProperty(a)&&"file"!=a&&t.hasOwnProperty(a)&&(e[a]=t[a]);var i=function(t,a,i){e.axes||(e.axes={}),e.axes[t]||(e.axes[t]={}),e.axes[t][a]=i},r=function(a,r,n){"undefined"!=typeof t[a]&&(console.warn("Option "+a+" is deprecated. Use the "+n+" option for the "+r+" axis instead. (e.g. { axes : { "+r+" : { "+n+" : ... 
} } } (see http://dygraphs.com/per-axis.html for more information."),i(r,n,t[a]),delete e[a])};return r("xValueFormatter","x","valueFormatter"),r("pixelsPerXLabel","x","pixelsPerLabel"),r("xAxisLabelFormatter","x","axisLabelFormatter"),r("xTicker","x","ticker"),r("yValueFormatter","y","valueFormatter"),r("pixelsPerYLabel","y","pixelsPerLabel"),r("yAxisLabelFormatter","y","axisLabelFormatter"),r("yTicker","y","ticker"),r("drawXGrid","x","drawGrid"),r("drawXAxis","x","drawAxis"),r("drawYGrid","y","drawGrid"),r("drawYAxis","y","drawAxis"),r("xAxisLabelWidth","x","axisLabelWidth"),r("yAxisLabelWidth","y","axisLabelWidth"),e},t.prototype.resize=function(t,e){if(!this.resize_lock){this.resize_lock=!0,null===t!=(null===e)&&(console.warn("Dygraph.resize() should be called with zero parameters or two non-NULL parameters. Pretending it was zero."),t=e=null);var a=this.width_,i=this.height_;t?(this.maindiv_.style.width=t+"px",this.maindiv_.style.height=e+"px",this.width_=t,this.height_=e):(this.width_=this.maindiv_.clientWidth,this.height_=this.maindiv_.clientHeight),(a!=this.width_||i!=this.height_)&&(this.resizeElements_(),this.predraw_()),this.resize_lock=!1}},t.prototype.adjustRoll=function(t){this.rollPeriod_=t,this.predraw_()},t.prototype.visibility=function(){for(this.getOption("visibility")||(this.attrs_.visibility=[]);this.getOption("visibility").length<this.numColumns()-1;)this.attrs_.visibility.push(!0);return this.getOption("visibility")},t.prototype.setVisibility=function(t,e){var a=this.visibility();0>t||t>=a.length?console.warn("invalid series number in setVisibility: "+t):(a[t]=e,this.predraw_())},t.prototype.size=function(){return{width:this.width_,height:this.height_}},t.prototype.setAnnotations=function(e,a){return t.addAnnotationRule(),this.annotations_=e,this.layout_?(this.layout_.setAnnotations(this.annotations_),void(a||this.predraw_())):void console.warn("Tried to setAnnotations before dygraph was ready. Try setting them in a ready() block. 
See dygraphs.com/tests/annotation.html")},t.prototype.annotations=function(){return this.annotations_},t.prototype.getLabels=function(){var t=this.attr_("labels");return t?t.slice():null},t.prototype.indexFromSetName=function(t){return this.setIndexByName_[t]},t.prototype.ready=function(t){this.is_initial_draw_?this.readyFns_.push(t):t.call(this,this)},t.addAnnotationRule=function(){if(!t.addedAnnotationCSS){var e="border: 1px solid black; background-color: white; text-align: center;",a=document.createElement("style");a.type="text/css",document.getElementsByTagName("head")[0].appendChild(a);for(var i=0;i<document.styleSheets.length;i++)if(!document.styleSheets[i].disabled){var r=document.styleSheets[i];try{if(r.insertRule){var n=r.cssRules?r.cssRules.length:0;r.insertRule(".dygraphDefaultAnnotation { "+e+" }",n)}else r.addRule&&r.addRule(".dygraphDefaultAnnotation",e);return void(t.addedAnnotationCSS=!0)}catch(o){}}console.warn("Unable to add default annotation CSS rule; display may be off.")}},"object"==typeof exports&&"undefined"!=typeof module&&(module.exports=t),t}();!function(){"use strict";function t(t){var e=a.exec(t);if(!e)return null;var i=parseInt(e[1],10),r=parseInt(e[2],10),n=parseInt(e[3],10);return e[4]?{r:i,g:r,b:n,a:parseFloat(e[4])}:{r:i,g:r,b:n}}Dygraph.LOG_SCALE=10,Dygraph.LN_TEN=Math.log(Dygraph.LOG_SCALE),Dygraph.log10=function(t){return Math.log(t)/Dygraph.LN_TEN},Dygraph.DOTTED_LINE=[2,2],Dygraph.DASHED_LINE=[7,3],Dygraph.DOT_DASH_LINE=[7,2,2,2],Dygraph.getContext=function(t){return t.getContext("2d")},Dygraph.addEvent=function(t,e,a){t.addEventListener?t.addEventListener(e,a,!1):(t[e+a]=function(){a(window.event)},t.attachEvent("on"+e,t[e+a]))},Dygraph.prototype.addAndTrackEvent=function(t,e,a){Dygraph.addEvent(t,e,a),this.registeredEvents_.push({elem:t,type:e,fn:a})},Dygraph.removeEvent=function(t,e,a){if(t.removeEventListener)t.removeEventListener(e,a,!1);else{try{t.detachEvent("on"+e,t[e+a])}catch(i){}t[e+a]=null}},Dygraph.prototype.removeTrackedEvents_=function(){if(this.registeredEvents_)for(var t=0;t<this.registeredEvents_.length;t++){var e=this.registeredEvents_[t];Dygraph.removeEvent(e.elem,e.type,e.fn)}this.registeredEvents_=[]},Dygraph.cancelEvent=function(t){return t=t?t:window.event,t.stopPropagation&&t.stopPropagation(),t.preventDefault&&t.preventDefault(),t.cancelBubble=!0,t.cancel=!0,t.returnValue=!1,!1},Dygraph.hsvToRGB=function(t,e,a){var i,r,n;if(0===e)i=a,r=a,n=a;else{var o=Math.floor(6*t),s=6*t-o,l=a*(1-e),h=a*(1-e*s),p=a*(1-e*(1-s));switch(o){case 1:i=h,r=a,n=l;break;case 2:i=l,r=a,n=p;break;case 3:i=l,r=h,n=a;break;case 4:i=p,r=l,n=a;break;case 5:i=a,r=l,n=h;break;case 6:case 0:i=a,r=p,n=l}}return i=Math.floor(255*i+.5),r=Math.floor(255*r+.5),n=Math.floor(255*n+.5),"rgb("+i+","+r+","+n+")"},Dygraph.findPos=function(t){var e=0,a=0;if(t.offsetParent)for(var i=t;;){var r="0",n="0";if(window.getComputedStyle){
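/* The helpers that follow are dygraphs' cross-browser utility layer: findPos accumulates offsetLeft/offsetTop (plus computed border widths, when getComputedStyle is available) up the offsetParent chain and subtracts scroll offsets to yield page coordinates; pageX/pageY normalize mouse-event coordinates; DateAccessorsLocal and DateAccessorsUTC expose identical field accessors for local-time and UTC dates; binarySearch and dateParser support tick placement and CSV date parsing later in the file. */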
var o=window.getComputedStyle(i,null);r=o.borderLeft||"0",n=o.borderTop||"0"}if(e+=parseInt(r,10),a+=parseInt(n,10),e+=i.offsetLeft,a+=i.offsetTop,!i.offsetParent)break;i=i.offsetParent}else t.x&&(e+=t.x),t.y&&(a+=t.y);for(;t&&t!=document.body;)e-=t.scrollLeft,a-=t.scrollTop,t=t.parentNode;return{x:e,y:a}},Dygraph.pageX=function(t){if(t.pageX)return!t.pageX||t.pageX<0?0:t.pageX;var e=document.documentElement,a=document.body;return t.clientX+(e.scrollLeft||a.scrollLeft)-(e.clientLeft||0)},Dygraph.pageY=function(t){if(t.pageY)return!t.pageY||t.pageY<0?0:t.pageY;var e=document.documentElement,a=document.body;return t.clientY+(e.scrollTop||a.scrollTop)-(e.clientTop||0)},Dygraph.dragGetX_=function(t,e){return Dygraph.pageX(t)-e.px},Dygraph.dragGetY_=function(t,e){return Dygraph.pageY(t)-e.py},Dygraph.isOK=function(t){return!!t&&!isNaN(t)},Dygraph.isValidPoint=function(t,e){return t?null===t.yval?!1:null===t.x||void 0===t.x?!1:null===t.y||void 0===t.y?!1:isNaN(t.x)||!e&&isNaN(t.y)?!1:!0:!1},Dygraph.floatFormat=function(t,e){var a=Math.min(Math.max(1,e||2),21);return Math.abs(t)<.001&&0!==t?t.toExponential(a-1):t.toPrecision(a)},Dygraph.zeropad=function(t){return 10>t?"0"+t:""+t},Dygraph.DateAccessorsLocal={getFullYear:function(t){return t.getFullYear()},getMonth:function(t){return t.getMonth()},getDate:function(t){return t.getDate()},getHours:function(t){return t.getHours()},getMinutes:function(t){return t.getMinutes()},getSeconds:function(t){return t.getSeconds()},getMilliseconds:function(t){return t.getMilliseconds()},getDay:function(t){return t.getDay()},makeDate:function(t,e,a,i,r,n,o){return new Date(t,e,a,i,r,n,o)}},Dygraph.DateAccessorsUTC={getFullYear:function(t){return t.getUTCFullYear()},getMonth:function(t){return t.getUTCMonth()},getDate:function(t){return t.getUTCDate()},getHours:function(t){return t.getUTCHours()},getMinutes:function(t){return t.getUTCMinutes()},getSeconds:function(t){return t.getUTCSeconds()},getMilliseconds:function(t){return t.getUTCMilliseconds()},getDay:function(t){return t.getUTCDay()},makeDate:function(t,e,a,i,r,n,o){return new Date(Date.UTC(t,e,a,i,r,n,o))}},Dygraph.hmsString_=function(t,e,a){var i=Dygraph.zeropad,r=i(t)+":"+i(e);return a&&(r+=":"+i(a)),r},Dygraph.dateString_=function(t,e){var a=Dygraph.zeropad,i=e?Dygraph.DateAccessorsUTC:Dygraph.DateAccessorsLocal,r=new Date(t),n=i.getFullYear(r),o=i.getMonth(r),s=i.getDate(r),l=i.getHours(r),h=i.getMinutes(r),p=i.getSeconds(r),g=""+n,d=a(o+1),u=a(s),c=3600*l+60*h+p,y=g+"/"+d+"/"+u;return c&&(y+=" "+Dygraph.hmsString_(l,h,p)),y},Dygraph.round_=function(t,e){var a=Math.pow(10,e);return Math.round(t*a)/a},Dygraph.binarySearch=function(t,e,a,i,r){if((null===i||void 0===i||null===r||void 0===r)&&(i=0,r=e.length-1),i>r)return-1;(null===a||void 0===a)&&(a=0);var n,o=function(t){return t>=0&&t<e.length},s=parseInt((i+r)/2,10),l=e[s];return l==t?s:l>t?a>0&&(n=s-1,o(n)&&e[n]<t)?s:Dygraph.binarySearch(t,e,a,i,s-1):t>l?0>a&&(n=s+1,o(n)&&e[n]>t)?s:Dygraph.binarySearch(t,e,a,s+1,r):-1},Dygraph.dateParser=function(t){var e,a;if((-1==t.search("-")||-1!=t.search("T")||-1!=t.search("Z"))&&(a=Dygraph.dateStrToMillis(t),a&&!isNaN(a)))return a;if(-1!=t.search("-")){for(e=t.replace("-","/","g");-1!=e.search("-");)e=e.replace("-","/");a=Dygraph.dateStrToMillis(e)}else 8==t.length?(e=t.substr(0,4)+"/"+t.substr(4,2)+"/"+t.substr(6,2),a=Dygraph.dateStrToMillis(e)):a=Dygraph.dateStrToMillis(t);return(!a||isNaN(a))&&console.error("Couldn't parse "+t+" as a date"),a},Dygraph.dateStrToMillis=function(t){return new 
Date(t).getTime()},Dygraph.update=function(t,e){if("undefined"!=typeof e&&null!==e)for(var a in e)e.hasOwnProperty(a)&&(t[a]=e[a]);return t},Dygraph.updateDeep=function(t,e){function a(t){return"object"==typeof Node?t instanceof Node:"object"==typeof t&&"number"==typeof t.nodeType&&"string"==typeof t.nodeName}if("undefined"!=typeof e&&null!==e)for(var i in e)e.hasOwnProperty(i)&&(null===e[i]?t[i]=null:Dygraph.isArrayLike(e[i])?t[i]=e[i].slice():a(e[i])?t[i]=e[i]:"object"==typeof e[i]?(("object"!=typeof t[i]||null===t[i])&&(t[i]={}),Dygraph.updateDeep(t[i],e[i])):t[i]=e[i]);return t},Dygraph.isArrayLike=function(t){var e=typeof t;return"object"!=e&&("function"!=e||"function"!=typeof t.item)||null===t||"number"!=typeof t.length||3===t.nodeType?!1:!0},Dygraph.isDateLike=function(t){return"object"!=typeof t||null===t||"function"!=typeof t.getTime?!1:!0},Dygraph.clone=function(t){for(var e=[],a=0;a<t.length;a++)e.push(Dygraph.isArrayLike(t[a])?Dygraph.clone(t[a]):t[a]);return e},Dygraph.createCanvas=function(){var t=document.createElement("canvas"),e=/MSIE/.test(navigator.userAgent)&&!window.opera;return e&&"undefined"!=typeof G_vmlCanvasManager&&(t=G_vmlCanvasManager.initElement(t)),t},Dygraph.getContextPixelRatio=function(t){try{var e=window.devicePixelRatio,a=t.webkitBackingStorePixelRatio||t.mozBackingStorePixelRatio||t.msBackingStorePixelRatio||t.oBackingStorePixelRatio||t.backingStorePixelRatio||1;return void 0!==e?e/a:1}catch(i){return 1}},Dygraph.isAndroid=function(){return/Android/.test(navigator.userAgent)},Dygraph.Iterator=function(t,e,a,i){e=e||0,a=a||t.length,this.hasNext=!0,this.peek=null,this.start_=e,this.array_=t,this.predicate_=i,this.end_=Math.min(t.length,e+a),this.nextIdx_=e-1,this.next()},Dygraph.Iterator.prototype.next=function(){if(!this.hasNext)return null;for(var t=this.peek,e=this.nextIdx_+1,a=!1;e<this.end_;){if(!this.predicate_||this.predicate_(this.array_,e)){this.peek=this.array_[e],a=!0;break}e++}return this.nextIdx_=e,a||(this.hasNext=!1,this.peek=null),t},Dygraph.createIterator=function(t,e,a,i){return new Dygraph.Iterator(t,e,a,i)},Dygraph.requestAnimFrame=function(){return window.requestAnimationFrame||window.webkitRequestAnimationFrame||window.mozRequestAnimationFrame||window.oRequestAnimationFrame||window.msRequestAnimationFrame||function(t){window.setTimeout(t,1e3/60)}}(),Dygraph.repeatAndCleanup=function(t,e,a,i){var r,n=0,o=(new Date).getTime();if(t(n),1==e)return void i();var s=e-1;!function l(){n>=e||Dygraph.requestAnimFrame.call(window,function(){var e=(new Date).getTime(),h=e-o;r=n,n=Math.floor(h/a);var p=n-r,g=n+p>s;g||n>=s?(t(s),i()):(0!==p&&t(n),l())})}()};var e={annotationClickHandler:!0,annotationDblClickHandler:!0,annotationMouseOutHandler:!0,annotationMouseOverHandler:!0,axisLabelColor:!0,axisLineColor:!0,axisLineWidth:!0,clickCallback:!0,drawCallback:!0,drawHighlightPointCallback:!0,drawPoints:!0,drawPointCallback:!0,drawXGrid:!0,drawYGrid:!0,fillAlpha:!0,gridLineColor:!0,gridLineWidth:!0,hideOverlayOnMouseOut:!0,highlightCallback:!0,highlightCircleSize:!0,interactionModel:!0,isZoomedIgnoreProgrammaticZoom:!0,labelsDiv:!0,labelsDivStyles:!0,labelsDivWidth:!0,labelsKMB:!0,labelsKMG2:!0,labelsSeparateLines:!0,labelsShowZeroValues:!0,legend:!0,panEdgeFraction:!0,pixelsPerYLabel:!0,pointClickCallback:!0,pointSize:!0,rangeSelectorPlotFillColor:!0,rangeSelectorPlotStrokeColor:!0,showLabelsOnHighlight:!0,showRoller:!0,strokeWidth:!0,underlayCallback:!0,unhighlightCallback:!0,zoomCallback:!0};Dygraph.isPixelChangingOptionList=function(t,a){var 
i={};if(t)for(var r=1;r<t.length;r++)i[t[r]]=!0;var n=function(t){for(var a in t)if(t.hasOwnProperty(a)&&!e[a])return!0;return!1};for(var o in a)if(a.hasOwnProperty(o))if("highlightSeriesOpts"==o||i[o]&&!a.series){if(n(a[o]))return!0}else if("series"==o||"axes"==o){var s=a[o];for(var l in s)if(s.hasOwnProperty(l)&&n(s[l]))return!0}else if(!e[o])return!0;return!1},Dygraph.Circles={DEFAULT:function(t,e,a,i,r,n,o){a.beginPath(),a.fillStyle=n,a.arc(i,r,o,0,2*Math.PI,!1),a.fill()}},Dygraph.IFrameTarp=function(){this.tarps=[]},Dygraph.IFrameTarp.prototype.cover=function(){for(var t=document.getElementsByTagName("iframe"),e=0;e<t.length;e++){var a=t[e],i=Dygraph.findPos(a),r=i.x,n=i.y,o=a.offsetWidth,s=a.offsetHeight,l=document.createElement("div");l.style.position="absolute",l.style.left=r+"px",l.style.top=n+"px",l.style.width=o+"px",l.style.height=s+"px",l.style.zIndex=999,document.body.appendChild(l),this.tarps.push(l)}},Dygraph.IFrameTarp.prototype.uncover=function(){for(var t=0;t<this.tarps.length;t++)this.tarps[t].parentNode.removeChild(this.tarps[t]);this.tarps=[]},Dygraph.detectLineDelimiter=function(t){for(var e=0;e<t.length;e++){var a=t.charAt(e);if("\r"===a)return e+1<t.length&&"\n"===t.charAt(e+1)?"\r\n":a;if("\n"===a)return e+1<t.length&&"\r"===t.charAt(e+1)?"\n\r":a}return null},Dygraph.isNodeContainedBy=function(t,e){if(null===e||null===t)return!1;for(var a=t;a&&a!==e;)a=a.parentNode;return a===e},Dygraph.pow=function(t,e){return 0>e?1/Math.pow(t,-e):Math.pow(t,e)};var a=/^rgba?\((\d{1,3}),\s*(\d{1,3}),\s*(\d{1,3})(?:,\s*([01](?:\.\d+)?))?\)$/;Dygraph.toRGB_=function(e){var a=t(e);if(a)return a;var i=document.createElement("div");i.style.backgroundColor=e,i.style.visibility="hidden",document.body.appendChild(i);var r;return r=window.getComputedStyle?window.getComputedStyle(i,null).backgroundColor:i.currentStyle.backgroundColor,document.body.removeChild(i),t(r)},Dygraph.isCanvasSupported=function(t){var e;try{e=t||document.createElement("canvas"),e.getContext("2d")}catch(a){var i=navigator.appVersion.match(/MSIE (\d\.\d)/),r=-1!=navigator.userAgent.toLowerCase().indexOf("opera");return!i||i[1]<6||r?!1:!0}return!0},Dygraph.parseFloat_=function(t,e,a){var i=parseFloat(t);if(!isNaN(i))return i;if(/^ *$/.test(t))return null;if(/^ *nan *$/i.test(t))return 0/0;var r="Unable to parse '"+t+"' as a number";return void 0!==a&&void 0!==e&&(r+=" on line "+(1+(e||0))+" ('"+a+"') of CSV."),console.error(r),null}}(),function(){"use strict";Dygraph.GVizChart=function(t){this.container=t},Dygraph.GVizChart.prototype.draw=function(t,e){this.container.innerHTML="","undefined"!=typeof this.date_graph&&this.date_graph.destroy(),this.date_graph=new Dygraph(this.container,t,e)},Dygraph.GVizChart.prototype.setSelection=function(t){var e=!1;t.length&&(e=t[0].row),this.date_graph.setSelection(e)},Dygraph.GVizChart.prototype.getSelection=function(){var t=[],e=this.date_graph.getSelection();if(0>e)return t;for(var a=this.date_graph.layout_.points,i=0;i<a.length;++i)t.push({row:e,column:i+1});return t}}(),function(){"use strict";var t=100;Dygraph.Interaction={},Dygraph.Interaction.maybeTreatMouseOpAsClick=function(t,e,a){a.dragEndX=Dygraph.dragGetX_(t,a),a.dragEndY=Dygraph.dragGetY_(t,a);var i=Math.abs(a.dragEndX-a.dragStartX),r=Math.abs(a.dragEndY-a.dragStartY);2>i&&2>r&&void 0!==e.lastx_&&-1!=e.lastx_&&Dygraph.Interaction.treatMouseOpAsClick(e,t,a),a.regionWidth=i,a.regionHeight=r},Dygraph.Interaction.startPan=function(t,e,a){var i,r;a.isPanning=!0;var 
n=e.xAxisRange();if(e.getOptionForAxis("logscale","x")?(a.initialLeftmostDate=Dygraph.log10(n[0]),a.dateRange=Dygraph.log10(n[1])-Dygraph.log10(n[0])):(a.initialLeftmostDate=n[0],a.dateRange=n[1]-n[0]),a.xUnitsPerPixel=a.dateRange/(e.plotter_.area.w-1),e.getNumericOption("panEdgeFraction")){var o=e.width_*e.getNumericOption("panEdgeFraction"),s=e.xAxisExtremes(),l=e.toDomXCoord(s[0])-o,h=e.toDomXCoord(s[1])+o,p=e.toDataXCoord(l),g=e.toDataXCoord(h);a.boundedDates=[p,g];var d=[],u=e.height_*e.getNumericOption("panEdgeFraction");for(i=0;i<e.axes_.length;i++){r=e.axes_[i];var c=r.extremeRange,y=e.toDomYCoord(c[0],i)+u,_=e.toDomYCoord(c[1],i)-u,v=e.toDataYCoord(y,i),f=e.toDataYCoord(_,i);d[i]=[v,f]}a.boundedValues=d}for(a.is2DPan=!1,a.axes=[],i=0;i<e.axes_.length;i++){r=e.axes_[i];var x={},m=e.yAxisRange(i),D=e.attributes_.getForAxis("logscale",i);D?(x.initialTopValue=Dygraph.log10(m[1]),x.dragValueRange=Dygraph.log10(m[1])-Dygraph.log10(m[0])):(x.initialTopValue=m[1],x.dragValueRange=m[1]-m[0]),x.unitsPerPixel=x.dragValueRange/(e.plotter_.area.h-1),a.axes.push(x),(r.valueWindow||r.valueRange)&&(a.is2DPan=!0)}},Dygraph.Interaction.movePan=function(t,e,a){a.dragEndX=Dygraph.dragGetX_(t,a),a.dragEndY=Dygraph.dragGetY_(t,a);var i=a.initialLeftmostDate-(a.dragEndX-a.dragStartX)*a.xUnitsPerPixel;a.boundedDates&&(i=Math.max(i,a.boundedDates[0]));var r=i+a.dateRange;if(a.boundedDates&&r>a.boundedDates[1]&&(i-=r-a.boundedDates[1],r=i+a.dateRange),e.getOptionForAxis("logscale","x")?e.dateWindow_=[Math.pow(Dygraph.LOG_SCALE,i),Math.pow(Dygraph.LOG_SCALE,r)]:e.dateWindow_=[i,r],a.is2DPan)for(var n=a.dragEndY-a.dragStartY,o=0;o<e.axes_.length;o++){var s=e.axes_[o],l=a.axes[o],h=n*l.unitsPerPixel,p=a.boundedValues?a.boundedValues[o]:null,g=l.initialTopValue+h;p&&(g=Math.min(g,p[1]));var d=g-l.dragValueRange;p&&d<p[0]&&(g-=d-p[0],d=g-l.dragValueRange),e.attributes_.getForAxis("logscale",o)?s.valueWindow=[Math.pow(Dygraph.LOG_SCALE,d),Math.pow(Dygraph.LOG_SCALE,g)]:s.valueWindow=[d,g]}e.drawGraph_(!1)},Dygraph.Interaction.endPan=Dygraph.Interaction.maybeTreatMouseOpAsClick,Dygraph.Interaction.startZoom=function(t,e,a){a.isZooming=!0,a.zoomMoved=!1},Dygraph.Interaction.moveZoom=function(t,e,a){a.zoomMoved=!0,a.dragEndX=Dygraph.dragGetX_(t,a),a.dragEndY=Dygraph.dragGetY_(t,a);var i=Math.abs(a.dragStartX-a.dragEndX),r=Math.abs(a.dragStartY-a.dragEndY);a.dragDirection=r/2>i?Dygraph.VERTICAL:Dygraph.HORIZONTAL,e.drawZoomRect_(a.dragDirection,a.dragStartX,a.dragEndX,a.dragStartY,a.dragEndY,a.prevDragDirection,a.prevEndX,a.prevEndY),a.prevEndX=a.dragEndX,a.prevEndY=a.dragEndY,a.prevDragDirection=a.dragDirection},Dygraph.Interaction.treatMouseOpAsClick=function(t,e,a){for(var i=t.getFunctionOption("clickCallback"),r=t.getFunctionOption("pointClickCallback"),n=null,o=-1,s=Number.MAX_VALUE,l=0;l<t.selPoints_.length;l++){var h=t.selPoints_[l],p=Math.pow(h.canvasx-a.dragEndX,2)+Math.pow(h.canvasy-a.dragEndY,2);!isNaN(p)&&(-1==o||s>p)&&(s=p,o=l)}var g=t.getNumericOption("highlightCircleSize")+2;if(g*g>=s&&(n=t.selPoints_[o]),n){var d={cancelable:!0,point:n,canvasx:a.dragEndX,canvasy:a.dragEndY},u=t.cascadeEvents_("pointClick",d);if(u)return;r&&r.call(t,e,n)}var d={cancelable:!0,xval:t.lastx_,pts:t.selPoints_,canvasx:a.dragEndX,canvasy:a.dragEndY};t.cascadeEvents_("click",d)||i&&i.call(t,e,t.lastx_,t.selPoints_)},Dygraph.Interaction.endZoom=function(t,e,a){e.clearZoomRect_(),a.isZooming=!1,Dygraph.Interaction.maybeTreatMouseOpAsClick(t,e,a);var i=e.getArea();if(a.regionWidth>=10&&a.dragDirection==Dygraph.HORIZONTAL){var 
r=Math.min(a.dragStartX,a.dragEndX),n=Math.max(a.dragStartX,a.dragEndX);r=Math.max(r,i.x),n=Math.min(n,i.x+i.w),n>r&&e.doZoomX_(r,n),a.cancelNextDblclick=!0}else if(a.regionHeight>=10&&a.dragDirection==Dygraph.VERTICAL){var o=Math.min(a.dragStartY,a.dragEndY),s=Math.max(a.dragStartY,a.dragEndY);o=Math.max(o,i.y),s=Math.min(s,i.y+i.h),s>o&&e.doZoomY_(o,s),a.cancelNextDblclick=!0}a.dragStartX=null,a.dragStartY=null},Dygraph.Interaction.startTouch=function(t,e,a){t.preventDefault(),t.touches.length>1&&(a.startTimeForDoubleTapMs=null);for(var i=[],r=0;r<t.touches.length;r++){var n=t.touches[r];i.push({pageX:n.pageX,pageY:n.pageY,dataX:e.toDataXCoord(n.pageX),dataY:e.toDataYCoord(n.pageY)})}if(a.initialTouches=i,1==i.length)a.initialPinchCenter=i[0],a.touchDirections={x:!0,y:!0};else if(i.length>=2){a.initialPinchCenter={pageX:.5*(i[0].pageX+i[1].pageX),pageY:.5*(i[0].pageY+i[1].pageY),dataX:.5*(i[0].dataX+i[1].dataX),dataY:.5*(i[0].dataY+i[1].dataY)};var o=180/Math.PI*Math.atan2(a.initialPinchCenter.pageY-i[0].pageY,i[0].pageX-a.initialPinchCenter.pageX);o=Math.abs(o),o>90&&(o=90-o),a.touchDirections={x:67.5>o,y:o>22.5}}a.initialRange={x:e.xAxisRange(),y:e.yAxisRange()}},Dygraph.Interaction.moveTouch=function(t,e,a){a.startTimeForDoubleTapMs=null;var i,r=[];for(i=0;i<t.touches.length;i++){var n=t.touches[i];r.push({pageX:n.pageX,pageY:n.pageY})}var o,s=a.initialTouches,l=a.initialPinchCenter;o=1==r.length?r[0]:{pageX:.5*(r[0].pageX+r[1].pageX),pageY:.5*(r[0].pageY+r[1].pageY)};var h={pageX:o.pageX-l.pageX,pageY:o.pageY-l.pageY},p=a.initialRange.x[1]-a.initialRange.x[0],g=a.initialRange.y[0]-a.initialRange.y[1];h.dataX=h.pageX/e.plotter_.area.w*p,h.dataY=h.pageY/e.plotter_.area.h*g;var d,u;if(1==r.length)d=1,u=1;else if(r.length>=2){var c=s[1].pageX-l.pageX;d=(r[1].pageX-o.pageX)/c;var y=s[1].pageY-l.pageY;u=(r[1].pageY-o.pageY)/y}d=Math.min(8,Math.max(.125,d)),u=Math.min(8,Math.max(.125,u));var _=!1;if(a.touchDirections.x&&(e.dateWindow_=[l.dataX-h.dataX+(a.initialRange.x[0]-l.dataX)/d,l.dataX-h.dataX+(a.initialRange.x[1]-l.dataX)/d],_=!0),a.touchDirections.y)for(i=0;1>i;i++){var v=e.axes_[i],f=e.attributes_.getForAxis("logscale",i);f||(v.valueWindow=[l.dataY-h.dataY+(a.initialRange.y[0]-l.dataY)/u,l.dataY-h.dataY+(a.initialRange.y[1]-l.dataY)/u],_=!0)}if(e.drawGraph_(!1),_&&r.length>1&&e.getFunctionOption("zoomCallback")){var x=e.xAxisRange();e.getFunctionOption("zoomCallback").call(e,x[0],x[1],e.yAxisRanges())}},Dygraph.Interaction.endTouch=function(t,e,a){if(0!==t.touches.length)Dygraph.Interaction.startTouch(t,e,a);else if(1==t.changedTouches.length){var i=(new Date).getTime(),r=t.changedTouches[0];a.startTimeForDoubleTapMs&&i-a.startTimeForDoubleTapMs<500&&a.doubleTapX&&Math.abs(a.doubleTapX-r.screenX)<50&&a.doubleTapY&&Math.abs(a.doubleTapY-r.screenY)<50?e.resetZoom():(a.startTimeForDoubleTapMs=i,a.doubleTapX=r.screenX,a.doubleTapY=r.screenY)}};var e=function(t,e,a){return e>t?e-t:t>a?t-a:0},a=function(t,a){var i=Dygraph.findPos(a.canvas_),r={left:i.x,right:i.x+a.canvas_.offsetWidth,top:i.y,bottom:i.y+a.canvas_.offsetHeight},n={x:Dygraph.pageX(t),y:Dygraph.pageY(t)},o=e(n.x,r.left,r.right),s=e(n.y,r.top,r.bottom);return Math.max(o,s)};Dygraph.Interaction.defaultModel={mousedown:function(e,i,r){if(!e.button||2!=e.button){r.initializeMouseDown(e,i,r),e.altKey||e.shiftKey?Dygraph.startPan(e,i,r):Dygraph.startZoom(e,i,r);var n=function(e){if(r.isZooming){var n=a(e,i);t>n?Dygraph.moveZoom(e,i,r):null!==r.dragEndX&&(r.dragEndX=null,r.dragEndY=null,i.clearZoomRect_())}else 
r.isPanning&&Dygraph.movePan(e,i,r)},o=function(t){r.isZooming?null!==r.dragEndX?Dygraph.endZoom(t,i,r):Dygraph.Interaction.maybeTreatMouseOpAsClick(t,i,r):r.isPanning&&Dygraph.endPan(t,i,r),Dygraph.removeEvent(document,"mousemove",n),Dygraph.removeEvent(document,"mouseup",o),r.destroy()};i.addAndTrackEvent(document,"mousemove",n),i.addAndTrackEvent(document,"mouseup",o)}},willDestroyContextMyself:!0,touchstart:function(t,e,a){Dygraph.Interaction.startTouch(t,e,a)},touchmove:function(t,e,a){Dygraph.Interaction.moveTouch(t,e,a)},touchend:function(t,e,a){Dygraph.Interaction.endTouch(t,e,a)},dblclick:function(t,e,a){if(a.cancelNextDblclick)return void(a.cancelNextDblclick=!1);var i={canvasx:a.dragEndX,canvasy:a.dragEndY};e.cascadeEvents_("dblclick",i)||t.altKey||t.shiftKey||e.resetZoom()}},Dygraph.DEFAULT_ATTRS.interactionModel=Dygraph.Interaction.defaultModel,Dygraph.defaultInteractionModel=Dygraph.Interaction.defaultModel,Dygraph.endZoom=Dygraph.Interaction.endZoom,Dygraph.moveZoom=Dygraph.Interaction.moveZoom,Dygraph.startZoom=Dygraph.Interaction.startZoom,Dygraph.endPan=Dygraph.Interaction.endPan,Dygraph.movePan=Dygraph.Interaction.movePan,Dygraph.startPan=Dygraph.Interaction.startPan,Dygraph.Interaction.nonInteractiveModel_={mousedown:function(t,e,a){a.initializeMouseDown(t,e,a)},mouseup:Dygraph.Interaction.maybeTreatMouseOpAsClick},Dygraph.Interaction.dragIsPanInteractionModel={mousedown:function(t,e,a){a.initializeMouseDown(t,e,a),Dygraph.startPan(t,e,a)},mousemove:function(t,e,a){a.isPanning&&Dygraph.movePan(t,e,a)},mouseup:function(t,e,a){a.isPanning&&Dygraph.endPan(t,e,a)}}}(),function(){"use strict";Dygraph.TickList=void 0,Dygraph.Ticker=void 0,Dygraph.numericLinearTicks=function(t,e,a,i,r,n){var o=function(t){return"logscale"===t?!1:i(t)};return Dygraph.numericTicks(t,e,a,o,r,n)},Dygraph.numericTicks=function(t,e,a,i,r,n){var o,s,l,h,p=i("pixelsPerLabel"),g=[];if(n)for(o=0;o<n.length;o++)g.push({v:n[o]});else{if(i("logscale")){h=Math.floor(a/p);var d=Dygraph.binarySearch(t,Dygraph.PREFERRED_LOG_TICK_VALUES,1),u=Dygraph.binarySearch(e,Dygraph.PREFERRED_LOG_TICK_VALUES,-1);-1==d&&(d=0),-1==u&&(u=Dygraph.PREFERRED_LOG_TICK_VALUES.length-1);var c=null;if(u-d>=h/4){for(var y=u;y>=d;y--){var _=Dygraph.PREFERRED_LOG_TICK_VALUES[y],v=Math.log(_/t)/Math.log(e/t)*a,f={v:_};null===c?c={tickValue:_,pixel_coord:v}:Math.abs(v-c.pixel_coord)>=p?c={tickValue:_,pixel_coord:v}:f.label="",g.push(f)}g.reverse()}}if(0===g.length){var x,m,D=i("labelsKMG2");D?(x=[1,2,4,8,16,32,64,128,256],m=16):(x=[1,2,5,10,20,50,100],m=10);var w,A,b,T,E=Math.ceil(a/p),C=Math.abs(e-t)/E,L=Math.floor(Math.log(C)/Math.log(m)),P=Math.pow(m,L);for(s=0;s<x.length&&(w=P*x[s],A=Math.floor(t/w)*w,b=Math.ceil(e/w)*w,h=Math.abs(b-A)/w,T=a/h,!(T>p));s++);for(A>b&&(w*=-1),o=0;h>=o;o++)l=A+o*w,g.push({v:l})}}var S=i("axisLabelFormatter");for(o=0;o<g.length;o++)void 0===g[o].label&&(g[o].label=S.call(r,g[o].v,0,i,r));return g},Dygraph.dateTicker=function(t,e,a,i,r,n){var o=Dygraph.pickDateTickGranularity(t,e,a,i);return 
o>=0?Dygraph.getDateAxis(t,e,o,i,r):[]},Dygraph.SECONDLY=0,Dygraph.TWO_SECONDLY=1,Dygraph.FIVE_SECONDLY=2,Dygraph.TEN_SECONDLY=3,Dygraph.THIRTY_SECONDLY=4,Dygraph.MINUTELY=5,Dygraph.TWO_MINUTELY=6,Dygraph.FIVE_MINUTELY=7,Dygraph.TEN_MINUTELY=8,Dygraph.THIRTY_MINUTELY=9,Dygraph.HOURLY=10,Dygraph.TWO_HOURLY=11,Dygraph.SIX_HOURLY=12,Dygraph.DAILY=13,Dygraph.TWO_DAILY=14,Dygraph.WEEKLY=15,Dygraph.MONTHLY=16,Dygraph.QUARTERLY=17,Dygraph.BIANNUAL=18,Dygraph.ANNUAL=19,Dygraph.DECADAL=20,Dygraph.CENTENNIAL=21,Dygraph.NUM_GRANULARITIES=22,Dygraph.DATEFIELD_Y=0,Dygraph.DATEFIELD_M=1,Dygraph.DATEFIELD_D=2,Dygraph.DATEFIELD_HH=3,Dygraph.DATEFIELD_MM=4,Dygraph.DATEFIELD_SS=5,Dygraph.DATEFIELD_MS=6,Dygraph.NUM_DATEFIELDS=7,Dygraph.TICK_PLACEMENT=[],Dygraph.TICK_PLACEMENT[Dygraph.SECONDLY]={datefield:Dygraph.DATEFIELD_SS,step:1,spacing:1e3},Dygraph.TICK_PLACEMENT[Dygraph.TWO_SECONDLY]={datefield:Dygraph.DATEFIELD_SS,step:2,spacing:2e3},Dygraph.TICK_PLACEMENT[Dygraph.FIVE_SECONDLY]={datefield:Dygraph.DATEFIELD_SS,step:5,spacing:5e3},Dygraph.TICK_PLACEMENT[Dygraph.TEN_SECONDLY]={datefield:Dygraph.DATEFIELD_SS,step:10,spacing:1e4},Dygraph.TICK_PLACEMENT[Dygraph.THIRTY_SECONDLY]={datefield:Dygraph.DATEFIELD_SS,step:30,spacing:3e4},Dygraph.TICK_PLACEMENT[Dygraph.MINUTELY]={datefield:Dygraph.DATEFIELD_MM,step:1,spacing:6e4},Dygraph.TICK_PLACEMENT[Dygraph.TWO_MINUTELY]={datefield:Dygraph.DATEFIELD_MM,step:2,spacing:12e4},Dygraph.TICK_PLACEMENT[Dygraph.FIVE_MINUTELY]={datefield:Dygraph.DATEFIELD_MM,step:5,spacing:3e5},Dygraph.TICK_PLACEMENT[Dygraph.TEN_MINUTELY]={datefield:Dygraph.DATEFIELD_MM,step:10,spacing:6e5},Dygraph.TICK_PLACEMENT[Dygraph.THIRTY_MINUTELY]={datefield:Dygraph.DATEFIELD_MM,step:30,spacing:18e5},Dygraph.TICK_PLACEMENT[Dygraph.HOURLY]={datefield:Dygraph.DATEFIELD_HH,step:1,spacing:36e5},Dygraph.TICK_PLACEMENT[Dygraph.TWO_HOURLY]={datefield:Dygraph.DATEFIELD_HH,step:2,spacing:72e5},Dygraph.TICK_PLACEMENT[Dygraph.SIX_HOURLY]={datefield:Dygraph.DATEFIELD_HH,step:6,spacing:216e5},Dygraph.TICK_PLACEMENT[Dygraph.DAILY]={datefield:Dygraph.DATEFIELD_D,step:1,spacing:864e5},Dygraph.TICK_PLACEMENT[Dygraph.TWO_DAILY]={datefield:Dygraph.DATEFIELD_D,step:2,spacing:1728e5},Dygraph.TICK_PLACEMENT[Dygraph.WEEKLY]={datefield:Dygraph.DATEFIELD_D,step:7,spacing:6048e5},Dygraph.TICK_PLACEMENT[Dygraph.MONTHLY]={datefield:Dygraph.DATEFIELD_M,step:1,spacing:2629817280},Dygraph.TICK_PLACEMENT[Dygraph.QUARTERLY]={datefield:Dygraph.DATEFIELD_M,step:3,spacing:216e5*365.2524},Dygraph.TICK_PLACEMENT[Dygraph.BIANNUAL]={datefield:Dygraph.DATEFIELD_M,step:6,spacing:432e5*365.2524},Dygraph.TICK_PLACEMENT[Dygraph.ANNUAL]={datefield:Dygraph.DATEFIELD_Y,step:1,spacing:864e5*365.2524},Dygraph.TICK_PLACEMENT[Dygraph.DECADAL]={datefield:Dygraph.DATEFIELD_Y,step:10,spacing:315578073600},Dygraph.TICK_PLACEMENT[Dygraph.CENTENNIAL]={datefield:Dygraph.DATEFIELD_Y,step:100,spacing:3155780736e3},Dygraph.PREFERRED_LOG_TICK_VALUES=function(){for(var t=[],e=-39;39>=e;e++)for(var a=Math.pow(10,e),i=1;9>=i;i++){var r=a*i;t.push(r)}return t}(),Dygraph.pickDateTickGranularity=function(t,e,a,i){for(var r=i("pixelsPerLabel"),n=0;n<Dygraph.NUM_GRANULARITIES;n++){var o=Dygraph.numDateTicks(t,e,n);if(a/o>=r)return n}return-1},Dygraph.numDateTicks=function(t,e,a){var i=Dygraph.TICK_PLACEMENT[a].spacing;return Math.round(1*(e-t)/i)},Dygraph.getDateAxis=function(t,e,a,i,r){var 
n=i("axisLabelFormatter"),o=i("labelsUTC"),s=o?Dygraph.DateAccessorsUTC:Dygraph.DateAccessorsLocal,l=Dygraph.TICK_PLACEMENT[a].datefield,h=Dygraph.TICK_PLACEMENT[a].step,p=Dygraph.TICK_PLACEMENT[a].spacing,g=new Date(t),d=[];d[Dygraph.DATEFIELD_Y]=s.getFullYear(g),d[Dygraph.DATEFIELD_M]=s.getMonth(g),d[Dygraph.DATEFIELD_D]=s.getDate(g),d[Dygraph.DATEFIELD_HH]=s.getHours(g),d[Dygraph.DATEFIELD_MM]=s.getMinutes(g),d[Dygraph.DATEFIELD_SS]=s.getSeconds(g),d[Dygraph.DATEFIELD_MS]=s.getMilliseconds(g);var u=d[l]%h;a==Dygraph.WEEKLY&&(u=s.getDay(g)),d[l]-=u;for(var c=l+1;c<Dygraph.NUM_DATEFIELDS;c++)d[c]=c===Dygraph.DATEFIELD_D?1:0;var y=[],_=s.makeDate.apply(null,d),v=_.getTime();if(a<=Dygraph.HOURLY)for(t>v&&(v+=p,_=new Date(v));e>=v;)y.push({v:v,label:n.call(r,_,a,i,r)}),v+=p,_=new Date(v);else for(t>v&&(d[l]+=h,_=s.makeDate.apply(null,d),v=_.getTime());e>=v;)(a>=Dygraph.DAILY||s.getHours(_)%h===0)&&y.push({v:v,label:n.call(r,_,a,i,r)}),d[l]+=h,_=s.makeDate.apply(null,d),v=_.getTime();return y},Dygraph&&Dygraph.DEFAULT_ATTRS&&Dygraph.DEFAULT_ATTRS.axes&&Dygraph.DEFAULT_ATTRS.axes.x&&Dygraph.DEFAULT_ATTRS.axes.y&&Dygraph.DEFAULT_ATTRS.axes.y2&&(Dygraph.DEFAULT_ATTRS.axes.x.ticker=Dygraph.dateTicker,Dygraph.DEFAULT_ATTRS.axes.y.ticker=Dygraph.numericTicks,Dygraph.DEFAULT_ATTRS.axes.y2.ticker=Dygraph.numericTicks)}(),Dygraph.Plugins={},Dygraph.Plugins.Annotations=function(){"use strict";var t=function(){this.annotations_=[]};return t.prototype.toString=function(){return"Annotations Plugin"},t.prototype.activate=function(t){return{clearChart:this.clearChart,didDrawChart:this.didDrawChart}},t.prototype.detachLabels=function(){for(var t=0;t<this.annotations_.length;t++){var e=this.annotations_[t];e.parentNode&&e.parentNode.removeChild(e),this.annotations_[t]=null}this.annotations_=[]},t.prototype.clearChart=function(t){this.detachLabels()},t.prototype.didDrawChart=function(t){var e=t.dygraph,a=e.layout_.annotated_points;if(a&&0!==a.length)for(var i=t.canvas.parentNode,r={position:"absolute",fontSize:e.getOption("axisLabelFontSize")+"px",zIndex:10,overflow:"hidden"},n=function(t,a,i){return function(r){var n=i.annotation;n.hasOwnProperty(t)?n[t](n,i,e,r):e.getOption(a)&&e.getOption(a)(n,i,e,r)}},o=t.dygraph.plotter_.area,s={},l=0;l<a.length;l++){var h=a[l];if(!(h.canvasx<o.x||h.canvasx>o.x+o.w||h.canvasy<o.y||h.canvasy>o.y+o.h)){var p=h.annotation,g=6;p.hasOwnProperty("tickHeight")&&(g=p.tickHeight);var d=document.createElement("div");for(var u in r)r.hasOwnProperty(u)&&(d.style[u]=r[u]);p.hasOwnProperty("icon")||(d.className="dygraphDefaultAnnotation"),p.hasOwnProperty("cssClass")&&(d.className+=" "+p.cssClass);var c=p.hasOwnProperty("width")?p.width:16,y=p.hasOwnProperty("height")?p.height:16;if(p.hasOwnProperty("icon")){var _=document.createElement("img");_.src=p.icon,_.width=c,_.height=y,d.appendChild(_)}else h.annotation.hasOwnProperty("shortText")&&d.appendChild(document.createTextNode(h.annotation.shortText));var v=h.canvasx-c/2;d.style.left=v+"px";var f=0;if(p.attachAtBottom){var x=o.y+o.h-y-g;s[v]?x-=s[v]:s[v]=0,s[v]+=g+y,f=x}else 
f=h.canvasy-y-g;d.style.top=f+"px",d.style.width=c+"px",d.style.height=y+"px",d.title=h.annotation.text,d.style.color=e.colorsMap_[h.name],d.style.borderColor=e.colorsMap_[h.name],p.div=d,e.addAndTrackEvent(d,"click",n("clickHandler","annotationClickHandler",h,this)),e.addAndTrackEvent(d,"mouseover",n("mouseOverHandler","annotationMouseOverHandler",h,this)),e.addAndTrackEvent(d,"mouseout",n("mouseOutHandler","annotationMouseOutHandler",h,this)),e.addAndTrackEvent(d,"dblclick",n("dblClickHandler","annotationDblClickHandler",h,this)),i.appendChild(d),this.annotations_.push(d);var m=t.drawingContext;if(m.save(),m.strokeStyle=e.colorsMap_[h.name],m.beginPath(),p.attachAtBottom){var x=f+y;m.moveTo(h.canvasx,x),m.lineTo(h.canvasx,x+g)}else m.moveTo(h.canvasx,h.canvasy),m.lineTo(h.canvasx,h.canvasy-2-g);m.closePath(),m.stroke(),m.restore()}}},t.prototype.destroy=function(){this.detachLabels()},t}(),Dygraph.Plugins.Axes=function(){"use strict";var t=function(){this.xlabels_=[],this.ylabels_=[]};return t.prototype.toString=function(){return"Axes Plugin"},t.prototype.activate=function(t){return{layout:this.layout,clearChart:this.clearChart,willDrawChart:this.willDrawChart}},t.prototype.layout=function(t){var e=t.dygraph;if(e.getOptionForAxis("drawAxis","y")){var a=e.getOptionForAxis("axisLabelWidth","y")+2*e.getOptionForAxis("axisTickSize","y");t.reserveSpaceLeft(a)}if(e.getOptionForAxis("drawAxis","x")){var i;i=e.getOption("xAxisHeight")?e.getOption("xAxisHeight"):e.getOptionForAxis("axisLabelFontSize","x")+2*e.getOptionForAxis("axisTickSize","x"),t.reserveSpaceBottom(i)}if(2==e.numAxes()){if(e.getOptionForAxis("drawAxis","y2")){var a=e.getOptionForAxis("axisLabelWidth","y2")+2*e.getOptionForAxis("axisTickSize","y2");t.reserveSpaceRight(a)}}else e.numAxes()>2&&e.error("Only two y-axes are supported at this time. (Trying to use "+e.numAxes()+")")},t.prototype.detachLabels=function(){function t(t){for(var e=0;e<t.length;e++){var a=t[e];a.parentNode&&a.parentNode.removeChild(a)}}t(this.xlabels_),t(this.ylabels_),this.xlabels_=[],this.ylabels_=[]},t.prototype.clearChart=function(t){this.detachLabels()},t.prototype.willDrawChart=function(t){function e(t){return Math.round(t)+.5}function a(t){return Math.round(t)-.5}var i=t.dygraph;if(i.getOptionForAxis("drawAxis","x")||i.getOptionForAxis("drawAxis","y")||i.getOptionForAxis("drawAxis","y2")){var r,n,o,s,l,h=t.drawingContext,p=t.canvas.parentNode,g=i.width_,d=i.height_,u=function(t){return{position:"absolute",fontSize:i.getOptionForAxis("axisLabelFontSize",t)+"px",zIndex:10,color:i.getOptionForAxis("axisLabelColor",t),width:i.getOptionForAxis("axisLabelWidth",t)+"px",lineHeight:"normal",overflow:"hidden"}},c={x:u("x"),y:u("y"),y2:u("y2")},y=function(t,e,a){var i=document.createElement("div"),r=c["y2"==a?"y2":e];for(var n in r)r.hasOwnProperty(n)&&(i.style[n]=r[n]);var o=document.createElement("div");return o.className="dygraph-axis-label dygraph-axis-label-"+e+(a?" 
dygraph-axis-label-"+a:""),o.innerHTML=t,i.appendChild(o),i};h.save();var _=i.layout_,v=t.dygraph.plotter_.area,f=function(t){return function(e){return i.getOptionForAxis(e,t)}};if(i.getOptionForAxis("drawAxis","y")){if(_.yticks&&_.yticks.length>0){var x=i.numAxes(),m=[f("y"),f("y2")];for(l=0;l<_.yticks.length;l++){if(s=_.yticks[l],"function"==typeof s)return;n=v.x;var D=1,w="y1",A=m[0];1==s[0]&&(n=v.x+v.w,D=-1,w="y2",A=m[1]);var b=A("axisLabelFontSize");o=v.y+s[1]*v.h,r=y(s[2],"y",2==x?w:null);var T=o-b/2;0>T&&(T=0),T+b+3>d?r.style.bottom="0":r.style.top=T+"px",0===s[0]?(r.style.left=v.x-A("axisLabelWidth")-A("axisTickSize")+"px",r.style.textAlign="right"):1==s[0]&&(r.style.left=v.x+v.w+A("axisTickSize")+"px",r.style.textAlign="left"),r.style.width=A("axisLabelWidth")+"px",p.appendChild(r),this.ylabels_.push(r)}var E=this.ylabels_[0],b=i.getOptionForAxis("axisLabelFontSize","y"),C=parseInt(E.style.top,10)+b;C>d-b&&(E.style.top=parseInt(E.style.top,10)-b/2+"px")}var L;if(i.getOption("drawAxesAtZero")){var P=i.toPercentXCoord(0);(P>1||0>P||isNaN(P))&&(P=0),L=e(v.x+P*v.w)}else L=e(v.x);h.strokeStyle=i.getOptionForAxis("axisLineColor","y"),h.lineWidth=i.getOptionForAxis("axisLineWidth","y"),h.beginPath(),h.moveTo(L,a(v.y)),h.lineTo(L,a(v.y+v.h)),h.closePath(),h.stroke(),2==i.numAxes()&&(h.strokeStyle=i.getOptionForAxis("axisLineColor","y2"),h.lineWidth=i.getOptionForAxis("axisLineWidth","y2"),h.beginPath(),h.moveTo(a(v.x+v.w),a(v.y)),h.lineTo(a(v.x+v.w),a(v.y+v.h)),h.closePath(),h.stroke())}if(i.getOptionForAxis("drawAxis","x")){if(_.xticks){var A=f("x");for(l=0;l<_.xticks.length;l++){s=_.xticks[l],n=v.x+s[0]*v.w,o=v.y+v.h,r=y(s[1],"x"),r.style.textAlign="center",r.style.top=o+A("axisTickSize")+"px";var S=n-A("axisLabelWidth")/2;S+A("axisLabelWidth")>g&&(S=g-A("axisLabelWidth"),r.style.textAlign="right"),0>S&&(S=0,r.style.textAlign="left"),r.style.left=S+"px",r.style.width=A("axisLabelWidth")+"px",
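/* Each x-axis tick label div above is centered on its tick, clamped to the chart bounds (right-aligned at the right edge, left-aligned at zero), then appended to the canvas' parent node and recorded in xlabels_ so detachLabels can remove it on the next redraw. */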
p.appendChild(r),this.xlabels_.push(r)}}h.strokeStyle=i.getOptionForAxis("axisLineColor","x"),h.lineWidth=i.getOptionForAxis("axisLineWidth","x"),h.beginPath();var O;if(i.getOption("drawAxesAtZero")){var P=i.toPercentYCoord(0,0);(P>1||0>P)&&(P=1),O=a(v.y+P*v.h)}else O=a(v.y+v.h);h.moveTo(e(v.x),O),h.lineTo(e(v.x+v.w),O),h.closePath(),h.stroke()}h.restore()}},t}(),Dygraph.Plugins.ChartLabels=function(){"use strict";var t=function(){this.title_div_=null,this.xlabel_div_=null,this.ylabel_div_=null,this.y2label_div_=null};t.prototype.toString=function(){return"ChartLabels Plugin"},t.prototype.activate=function(t){return{layout:this.layout,didDrawChart:this.didDrawChart}};var e=function(t){var e=document.createElement("div");return e.style.position="absolute",e.style.left=t.x+"px",e.style.top=t.y+"px",e.style.width=t.w+"px",e.style.height=t.h+"px",e};t.prototype.detachLabels_=function(){for(var t=[this.title_div_,this.xlabel_div_,this.ylabel_div_,this.y2label_div_],e=0;e<t.length;e++){var a=t[e];a&&a.parentNode&&a.parentNode.removeChild(a)}this.title_div_=null,this.xlabel_div_=null,this.ylabel_div_=null,this.y2label_div_=null};var a=function(t,e,a,i,r){var n=document.createElement("div");n.style.position="absolute",1==a?n.style.left="0px":n.style.left=e.x+"px",n.style.top=e.y+"px",n.style.width=e.w+"px",n.style.height=e.h+"px",n.style.fontSize=t.getOption("yLabelWidth")-2+"px";var o=document.createElement("div");o.style.position="absolute",o.style.width=e.h+"px",o.style.height=e.w+"px",o.style.top=e.h/2-e.w/2+"px",o.style.left=e.w/2-e.h/2+"px",o.style.textAlign="center";var s="rotate("+(1==a?"-":"")+"90deg)";o.style.transform=s,o.style.WebkitTransform=s,o.style.MozTransform=s,o.style.OTransform=s,o.style.msTransform=s,"undefined"!=typeof document.documentMode&&document.documentMode<9&&(o.style.filter="progid:DXImageTransform.Microsoft.BasicImage(rotation="+(1==a?"3":"1")+")",o.style.left="0px",o.style.top="0px");var l=document.createElement("div");return l.className=i,l.innerHTML=r,o.appendChild(l),n.appendChild(o),n};return t.prototype.layout=function(t){this.detachLabels_();var i=t.dygraph,r=t.chart_div;if(i.getOption("title")){var n=t.reserveSpaceTop(i.getOption("titleHeight"));this.title_div_=e(n),this.title_div_.style.textAlign="center",this.title_div_.style.fontSize=i.getOption("titleHeight")-8+"px",this.title_div_.style.fontWeight="bold",this.title_div_.style.zIndex=10;var o=document.createElement("div");o.className="dygraph-label dygraph-title",o.innerHTML=i.getOption("title"),this.title_div_.appendChild(o),r.appendChild(this.title_div_)}if(i.getOption("xlabel")){var s=t.reserveSpaceBottom(i.getOption("xLabelHeight"));this.xlabel_div_=e(s),this.xlabel_div_.style.textAlign="center",this.xlabel_div_.style.fontSize=i.getOption("xLabelHeight")-2+"px";var o=document.createElement("div");o.className="dygraph-label dygraph-xlabel",o.innerHTML=i.getOption("xlabel"),this.xlabel_div_.appendChild(o),r.appendChild(this.xlabel_div_)}if(i.getOption("ylabel")){var l=t.reserveSpaceLeft(0);this.ylabel_div_=a(i,l,1,"dygraph-label dygraph-ylabel",i.getOption("ylabel")),r.appendChild(this.ylabel_div_)}if(i.getOption("y2label")&&2==i.numAxes()){var h=t.reserveSpaceRight(0);this.y2label_div_=a(i,h,2,"dygraph-label dygraph-y2label",i.getOption("y2label")),r.appendChild(this.y2label_div_)}},t.prototype.didDrawChart=function(t){var 
e=t.dygraph;this.title_div_&&(this.title_div_.children[0].innerHTML=e.getOption("title")),this.xlabel_div_&&(this.xlabel_div_.children[0].innerHTML=e.getOption("xlabel")),this.ylabel_div_&&(this.ylabel_div_.children[0].children[0].innerHTML=e.getOption("ylabel")),this.y2label_div_&&(this.y2label_div_.children[0].children[0].innerHTML=e.getOption("y2label"))},t.prototype.clearChart=function(){},t.prototype.destroy=function(){this.detachLabels_()},t}(),Dygraph.Plugins.Grid=function(){"use strict";var t=function(){};return t.prototype.toString=function(){return"Gridline Plugin"},t.prototype.activate=function(t){return{willDrawChart:this.willDrawChart}},t.prototype.willDrawChart=function(t){function e(t){return Math.round(t)+.5}function a(t){return Math.round(t)-.5}var i,r,n,o,s=t.dygraph,l=t.drawingContext,h=s.layout_,p=t.dygraph.plotter_.area;if(s.getOptionForAxis("drawGrid","y")){for(var g=["y","y2"],d=[],u=[],c=[],y=[],_=[],n=0;n<g.length;n++)c[n]=s.getOptionForAxis("drawGrid",g[n]),c[n]&&(d[n]=s.getOptionForAxis("gridLineColor",g[n]),u[n]=s.getOptionForAxis("gridLineWidth",g[n]),_[n]=s.getOptionForAxis("gridLinePattern",g[n]),y[n]=_[n]&&_[n].length>=2);for(o=h.yticks,l.save(),n=0;n<o.length;n++){var v=o[n][0];c[v]&&(y[v]&&l.installPattern(_[v]),l.strokeStyle=d[v],l.lineWidth=u[v],i=e(p.x),r=a(p.y+o[n][1]*p.h),l.beginPath(),l.moveTo(i,r),l.lineTo(i+p.w,r),l.closePath(),l.stroke(),y[v]&&l.uninstallPattern())}l.restore()}if(s.getOptionForAxis("drawGrid","x")){o=h.xticks,l.save();var _=s.getOptionForAxis("gridLinePattern","x"),y=_&&_.length>=2;for(y&&l.installPattern(_),l.strokeStyle=s.getOptionForAxis("gridLineColor","x"),l.lineWidth=s.getOptionForAxis("gridLineWidth","x"),n=0;n<o.length;n++)i=e(p.x+o[n][0]*p.w),r=a(p.y+p.h),l.beginPath(),l.moveTo(i,r),l.lineTo(i,p.y),l.closePath(),l.stroke();y&&l.uninstallPattern(),l.restore()}},t.prototype.destroy=function(){},t}(),Dygraph.Plugins.Legend=function(){"use strict";var t=function(){this.legend_div_=null,this.is_generated_div_=!1};t.prototype.toString=function(){return"Legend Plugin"};var e;t.prototype.activate=function(t){var e,a=t.getOption("labelsDivWidth"),i=t.getOption("labelsDiv");if(i&&null!==i)e="string"==typeof i||i instanceof String?document.getElementById(i):i;else{var r={position:"absolute",fontSize:"14px",zIndex:10,width:a+"px",top:"0px",left:t.size().width-a-2+"px",background:"white",lineHeight:"normal",textAlign:"left",overflow:"hidden"};Dygraph.update(r,t.getOption("labelsDivStyles")),e=document.createElement("div"),e.className="dygraph-legend";for(var n in r)if(r.hasOwnProperty(n))try{e.style[n]=r[n]}catch(o){console.warn("You are using unsupported css properties for your browser in labelsDivStyles")}t.graphDiv.appendChild(e),this.is_generated_div_=!0}return this.legend_div_=e,this.one_em_width_=10,{select:this.select,deselect:this.deselect,predraw:this.predraw,didDrawChart:this.didDrawChart}};var a=function(t){var e=document.createElement("span");e.setAttribute("style","margin: 0; padding: 0 0 0 1em; border: 0;"),t.appendChild(e);var a=e.offsetWidth;return t.removeChild(e),a},i=function(t){return t.replace(/&/g,"&amp;").replace(/"/g,"&quot;").replace(/</g,"&lt;").replace(/>/g,"&gt;")};return t.prototype.select=function(e){var a=e.selectedX,i=e.selectedPoints,r=e.selectedRow,n=e.dygraph.getOption("legend");if("never"===n)return void(this.legend_div_.style.display="none");if("follow"===n){var 
o=e.dygraph.plotter_.area,s=e.dygraph.getOption("labelsDivWidth"),l=e.dygraph.getOptionForAxis("axisLabelWidth","y"),h=i[0].x*o.w+20,p=i[0].y*o.h-20;h+s+1>window.scrollX+window.innerWidth&&(h=h-40-s-(l-o.x)),e.dygraph.graphDiv.appendChild(this.legend_div_),this.legend_div_.style.left=l+h+"px",this.legend_div_.style.top=p+"px"}var g=t.generateLegendHTML(e.dygraph,a,i,this.one_em_width_,r);this.legend_div_.innerHTML=g,this.legend_div_.style.display=""},t.prototype.deselect=function(e){var i=e.dygraph.getOption("legend");"always"!==i&&(this.legend_div_.style.display="none");var r=a(this.legend_div_);this.one_em_width_=r;var n=t.generateLegendHTML(e.dygraph,void 0,void 0,r,null);this.legend_div_.innerHTML=n},t.prototype.didDrawChart=function(t){this.deselect(t)},t.prototype.predraw=function(t){if(this.is_generated_div_){t.dygraph.graphDiv.appendChild(this.legend_div_);var e=t.dygraph.plotter_.area,a=t.dygraph.getOption("labelsDivWidth");this.legend_div_.style.left=e.x+e.w-a-1+"px",this.legend_div_.style.top=e.y+"px",this.legend_div_.style.width=a+"px"}},t.prototype.destroy=function(){this.legend_div_=null},t.generateLegendHTML=function(t,a,r,n,o){if(t.getOption("showLabelsOnHighlight")!==!0)return"";var s,l,h,p,g,d=t.getLabels();if("undefined"==typeof a){if("always"!=t.getOption("legend"))return"";for(l=t.getOption("labelsSeparateLines"),s="",h=1;h<d.length;h++){var u=t.getPropertiesForSeries(d[h]);u.visible&&(""!==s&&(s+=l?"<br/>":" "),g=t.getOption("strokePattern",d[h]),p=e(g,u.color,n),s+="<span style='font-weight: bold; color: "+u.color+";'>"+p+" "+i(d[h])+"</span>")}return s}var c=t.optionsViewForAxis_("x"),y=c("valueFormatter");s=y.call(t,a,c,d[0],t,o,0),""!==s&&(s+=":");var _=[],v=t.numAxes();for(h=0;v>h;h++)_[h]=t.optionsViewForAxis_("y"+(h?1+h:""));var f=t.getOption("labelsShowZeroValues");l=t.getOption("labelsSeparateLines");var x=t.getHighlightSeries();for(h=0;h<r.length;h++){var m=r[h];if((0!==m.yval||f)&&Dygraph.isOK(m.canvasy)){l&&(s+="<br/>");var u=t.getPropertiesForSeries(m.name),D=_[u.axis-1],w=D("valueFormatter"),A=w.call(t,m.yval,D,m.name,t,o,d.indexOf(m.name)),b=m.name==x?" 
class='highlight'":"";s+="<span"+b+"> <b><span style='color: "+u.color+";'>"+i(m.name)+"</span></b>:&#160;"+A+"</span>"}}return s},e=function(t,e,a){var i=/MSIE/.test(navigator.userAgent)&&!window.opera;if(i)return"&mdash;";if(!t||t.length<=1)return'<div style="display: inline-block; position: relative; bottom: .5ex; padding-left: 1em; height: 1px; border-bottom: 2px solid '+e+';"></div>';var r,n,o,s,l,h=0,p=0,g=[];for(r=0;r<=t.length;r++)h+=t[r%t.length];if(l=Math.floor(a/(h-t[0])),l>1){for(r=0;r<t.length;r++)g[r]=t[r]/a;p=g.length}else{for(l=1,r=0;r<t.length;r++)g[r]=t[r]/h;p=g.length+1}var d="";for(n=0;l>n;n++)for(r=0;p>r;r+=2)o=g[r%g.length],s=r<t.length?g[(r+1)%g.length]:0,d+='<div style="display: inline-block; position: relative; bottom: .5ex; margin-right: '+s+"em; padding-left: "+o+"em; height: 1px; border-bottom: 2px solid "+e+';"></div>';return d},t}(),Dygraph.Plugins.RangeSelector=function(){"use strict";var t=function(){this.isIE_=/MSIE/.test(navigator.userAgent)&&!window.opera,this.hasTouchInterface_="undefined"!=typeof TouchEvent,this.isMobileDevice_=/mobile|android/gi.test(navigator.appVersion),this.interfaceCreated_=!1};return t.prototype.toString=function(){return"RangeSelector Plugin"},t.prototype.activate=function(t){return this.dygraph_=t,this.isUsingExcanvas_=t.isUsingExcanvas_,this.getOption_("showRangeSelector")&&this.createInterface_(),{layout:this.reserveSpace_,predraw:this.renderStaticLayer_,didDrawChart:this.renderInteractiveLayer_}},t.prototype.destroy=function(){this.bgcanvas_=null,this.fgcanvas_=null,this.leftZoomHandle_=null,this.rightZoomHandle_=null,this.iePanOverlay_=null},t.prototype.getOption_=function(t,e){return this.dygraph_.getOption(t,e)},t.prototype.setDefaultOption_=function(t,e){this.dygraph_.attrs_[t]=e},t.prototype.createInterface_=function(){this.createCanvases_(),this.isUsingExcanvas_&&this.createIEPanOverlay_(),this.createZoomHandles_(),this.initInteraction_(),this.getOption_("animatedZooms")&&(console.warn("Animated zooms and range selector are not compatible; disabling animatedZooms."),this.dygraph_.updateOptions({animatedZooms:!1},!0)),this.interfaceCreated_=!0,this.addToGraph_()},t.prototype.addToGraph_=function(){var t=this.graphDiv_=this.dygraph_.graphDiv;t.appendChild(this.bgcanvas_),t.appendChild(this.fgcanvas_),t.appendChild(this.leftZoomHandle_),t.appendChild(this.rightZoomHandle_)},t.prototype.removeFromGraph_=function(){var t=this.graphDiv_;t.removeChild(this.bgcanvas_),t.removeChild(this.fgcanvas_),t.removeChild(this.leftZoomHandle_),t.removeChild(this.rightZoomHandle_),this.graphDiv_=null},t.prototype.reserveSpace_=function(t){this.getOption_("showRangeSelector")&&t.reserveSpaceBottom(this.getOption_("rangeSelectorHeight")+4)},t.prototype.renderStaticLayer_=function(){this.updateVisibility_()&&(this.resize_(),this.drawStaticLayer_())},t.prototype.renderInteractiveLayer_=function(){this.updateVisibility_()&&!this.isChangingRange_&&(this.placeZoomHandles_(),this.drawInteractiveLayer_())},t.prototype.updateVisibility_=function(){var t=this.getOption_("showRangeSelector");if(t)this.interfaceCreated_?this.graphDiv_&&this.graphDiv_.parentNode||this.addToGraph_():this.createInterface_();else if(this.graphDiv_){this.removeFromGraph_();var e=this.dygraph_;setTimeout(function(){e.width_=0,e.resize()},1)}return t},t.prototype.resize_=function(){function t(t,e,a){var i=Dygraph.getContextPixelRatio(e);t.style.top=a.y+"px",t.style.left=a.x+"px",t.width=a.w*i,t.height=a.h*i,t.style.width=a.w+"px",t.style.height=a.h+"px",1!=i&&e.scale(i,i)}var 
e=this.dygraph_.layout_.getPlotArea(),a=0;this.dygraph_.getOptionForAxis("drawAxis","x")&&(a=this.getOption_("xAxisHeight")||this.getOption_("axisLabelFontSize")+2*this.getOption_("axisTickSize")),this.canvasRect_={x:e.x,y:e.y+e.h+a+4,w:e.w,h:this.getOption_("rangeSelectorHeight")},t(this.bgcanvas_,this.bgcanvas_ctx_,this.canvasRect_),t(this.fgcanvas_,this.fgcanvas_ctx_,this.canvasRect_)},t.prototype.createCanvases_=function(){this.bgcanvas_=Dygraph.createCanvas(),this.bgcanvas_.className="dygraph-rangesel-bgcanvas",this.bgcanvas_.style.position="absolute",this.bgcanvas_.style.zIndex=9,this.bgcanvas_ctx_=Dygraph.getContext(this.bgcanvas_),this.fgcanvas_=Dygraph.createCanvas(),this.fgcanvas_.className="dygraph-rangesel-fgcanvas",this.fgcanvas_.style.position="absolute",this.fgcanvas_.style.zIndex=9,this.fgcanvas_.style.cursor="default",this.fgcanvas_ctx_=Dygraph.getContext(this.fgcanvas_)},t.prototype.createIEPanOverlay_=function(){this.iePanOverlay_=document.createElement("div"),this.iePanOverlay_.style.position="absolute",this.iePanOverlay_.style.backgroundColor="white",this.iePanOverlay_.style.filter="alpha(opacity=0)",this.iePanOverlay_.style.display="none",this.iePanOverlay_.style.cursor="move",this.fgcanvas_.appendChild(this.iePanOverlay_)},t.prototype.createZoomHandles_=function(){var t=new Image;t.className="dygraph-rangesel-zoomhandle",t.style.position="absolute",t.style.zIndex=10,t.style.visibility="hidden",t.style.cursor="col-resize",/MSIE 7/.test(navigator.userAgent)?(t.width=7,t.height=14,t.style.backgroundColor="white",t.style.border="1px solid #333333"):(t.width=9,t.height=16,t.src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAkAAAAQCAYAAADESFVDAAAAAXNSR0IArs4c6QAAAAZiS0dEANAAzwDP4Z7KegAAAAlwSFlzAAAOxAAADsQBlSsOGwAAAAd0SU1FB9sHGw0cMqdt1UwAAAAZdEVYdENvbW1lbnQAQ3JlYXRlZCB3aXRoIEdJTVBXgQ4XAAAAaElEQVQoz+3SsRFAQBCF4Z9WJM8KCDVwownl6YXsTmCUsyKGkZzcl7zkz3YLkypgAnreFmDEpHkIwVOMfpdi9CEEN2nGpFdwD03yEqDtOgCaun7sqSTDH32I1pQA2Pb9sZecAxc5r3IAb21d6878xsAAAAAASUVORK5CYII="),this.isMobileDevice_&&(t.width*=2,t.height*=2),this.leftZoomHandle_=t,this.rightZoomHandle_=t.cloneNode(!1)},t.prototype.initInteraction_=function(){var t,e,a,i,r,n,o,s,l,h,p,g,d,u,c=this,y=document,_=0,v=null,f=!1,x=!1,m=!this.isMobileDevice_&&!this.isUsingExcanvas_,D=new Dygraph.IFrameTarp;t=function(t){var e=c.dygraph_.xAxisExtremes(),a=(e[1]-e[0])/c.canvasRect_.w,i=e[0]+(t.leftHandlePos-c.canvasRect_.x)*a,r=e[0]+(t.rightHandlePos-c.canvasRect_.x)*a;return[i,r]},e=function(t){return Dygraph.cancelEvent(t),f=!0,_=t.clientX,v=t.target?t.target:t.srcElement,("mousedown"===t.type||"dragstart"===t.type)&&(Dygraph.addEvent(y,"mousemove",a),Dygraph.addEvent(y,"mouseup",i)),c.fgcanvas_.style.cursor="col-resize",D.cover(),!0},a=function(t){if(!f)return!1;Dygraph.cancelEvent(t);var e=t.clientX-_;if(Math.abs(e)<4)return!0;_=t.clientX;var a,i=c.getZoomHandleStatus_();v==c.leftZoomHandle_?(a=i.leftHandlePos+e,a=Math.min(a,i.rightHandlePos-v.width-3),a=Math.max(a,c.canvasRect_.x)):(a=i.rightHandlePos+e,a=Math.min(a,c.canvasRect_.x+c.canvasRect_.w),a=Math.max(a,i.leftHandlePos+v.width+3));var n=v.width/2;return v.style.left=a-n+"px",c.drawInteractiveLayer_(),m&&r(),!0},i=function(t){return f?(f=!1,D.uncover(),Dygraph.removeEvent(y,"mousemove",a),Dygraph.removeEvent(y,"mouseup",i),c.fgcanvas_.style.cursor="default",m||r(),!0):!1},r=function(){try{var e=c.getZoomHandleStatus_();if(c.isChangingRange_=!0,e.isZoomed){var a=t(e);c.dygraph_.doZoomXDates_(a[0],a[1])}else 
c.dygraph_.resetZoom()}finally{c.isChangingRange_=!1}},n=function(t){if(c.isUsingExcanvas_)return t.srcElement==c.iePanOverlay_;var e=c.leftZoomHandle_.getBoundingClientRect(),a=e.left+e.width/2;e=c.rightZoomHandle_.getBoundingClientRect();var i=e.left+e.width/2;return t.clientX>a&&t.clientX<i},o=function(t){return!x&&n(t)&&c.getZoomHandleStatus_().isZoomed?(Dygraph.cancelEvent(t),x=!0,_=t.clientX,"mousedown"===t.type&&(Dygraph.addEvent(y,"mousemove",s),Dygraph.addEvent(y,"mouseup",l)),!0):!1},s=function(t){if(!x)return!1;Dygraph.cancelEvent(t);var e=t.clientX-_;if(Math.abs(e)<4)return!0;_=t.clientX;var a=c.getZoomHandleStatus_(),i=a.leftHandlePos,r=a.rightHandlePos,n=r-i;i+e<=c.canvasRect_.x?(i=c.canvasRect_.x,r=i+n):r+e>=c.canvasRect_.x+c.canvasRect_.w?(r=c.canvasRect_.x+c.canvasRect_.w,i=r-n):(i+=e,r+=e);var o=c.leftZoomHandle_.width/2;return c.leftZoomHandle_.style.left=i-o+"px",c.rightZoomHandle_.style.left=r-o+"px",c.drawInteractiveLayer_(),m&&h(),!0},l=function(t){return x?(x=!1,Dygraph.removeEvent(y,"mousemove",s),Dygraph.removeEvent(y,"mouseup",l),m||h(),!0):!1},h=function(){try{c.isChangingRange_=!0,c.dygraph_.dateWindow_=t(c.getZoomHandleStatus_()),c.dygraph_.drawGraph_(!1)}finally{c.isChangingRange_=!1}},p=function(t){if(!f&&!x){var e=n(t)?"move":"default";e!=c.fgcanvas_.style.cursor&&(c.fgcanvas_.style.cursor=e)}},g=function(t){"touchstart"==t.type&&1==t.targetTouches.length?e(t.targetTouches[0])&&Dygraph.cancelEvent(t):"touchmove"==t.type&&1==t.targetTouches.length?a(t.targetTouches[0])&&Dygraph.cancelEvent(t):i(t)},d=function(t){"touchstart"==t.type&&1==t.targetTouches.length?o(t.targetTouches[0])&&Dygraph.cancelEvent(t):"touchmove"==t.type&&1==t.targetTouches.length?s(t.targetTouches[0])&&Dygraph.cancelEvent(t):l(t)},u=function(t,e){for(var a=["touchstart","touchend","touchmove","touchcancel"],i=0;i<a.length;i++)c.dygraph_.addAndTrackEvent(t,a[i],e)},this.setDefaultOption_("interactionModel",Dygraph.Interaction.dragIsPanInteractionModel),this.setDefaultOption_("panEdgeFraction",1e-4);var w=window.opera?"mousedown":"dragstart";this.dygraph_.addAndTrackEvent(this.leftZoomHandle_,w,e),this.dygraph_.addAndTrackEvent(this.rightZoomHandle_,w,e),this.isUsingExcanvas_?this.dygraph_.addAndTrackEvent(this.iePanOverlay_,"mousedown",o):(this.dygraph_.addAndTrackEvent(this.fgcanvas_,"mousedown",o),this.dygraph_.addAndTrackEvent(this.fgcanvas_,"mousemove",p)),this.hasTouchInterface_&&(u(this.leftZoomHandle_,g),u(this.rightZoomHandle_,g),u(this.fgcanvas_,d))},t.prototype.drawStaticLayer_=function(){var t=this.bgcanvas_ctx_;t.clearRect(0,0,this.canvasRect_.w,this.canvasRect_.h);try{this.drawMiniPlot_()}catch(e){console.warn(e)}var a=.5;this.bgcanvas_ctx_.lineWidth=1,t.strokeStyle="gray",t.beginPath(),t.moveTo(a,a),t.lineTo(a,this.canvasRect_.h-a),t.lineTo(this.canvasRect_.w-a,this.canvasRect_.h-a),t.lineTo(this.canvasRect_.w-a,a),t.stroke()},t.prototype.drawMiniPlot_=function(){var t=this.getOption_("rangeSelectorPlotFillColor"),e=this.getOption_("rangeSelectorPlotStrokeColor");if(t||e){var a=this.getOption_("stepPlot"),i=this.computeCombinedSeriesAndLimits_(),r=i.yMax-i.yMin,n=this.bgcanvas_ctx_,o=.5,s=this.dygraph_.xAxisExtremes(),l=Math.max(s[1]-s[0],1e-30),h=(this.canvasRect_.w-o)/l,p=(this.canvasRect_.h-o)/r,g=this.canvasRect_.w-o,d=this.canvasRect_.h-o,u=null,c=null;n.beginPath(),n.moveTo(o,d);for(var y=0;y<i.data.length;y++){var 
_=i.data[y],v=null!==_[0]?(_[0]-s[0])*h:0/0,f=null!==_[1]?d-(_[1]-i.yMin)*p:0/0;(a||null===u||Math.round(v)!=Math.round(u))&&(isFinite(v)&&isFinite(f)?(null===u?n.lineTo(v,d):a&&n.lineTo(v,c),n.lineTo(v,f),u=v,c=f):(null!==u&&(a?(n.lineTo(v,c),n.lineTo(v,d)):n.lineTo(u,d)),u=c=null))}if(n.lineTo(g,d),n.closePath(),t){var x=this.bgcanvas_ctx_.createLinearGradient(0,0,0,d);x.addColorStop(0,"white"),x.addColorStop(1,t),this.bgcanvas_ctx_.fillStyle=x,n.fill()}e&&(this.bgcanvas_ctx_.strokeStyle=e,this.bgcanvas_ctx_.lineWidth=1.5,n.stroke())}},t.prototype.computeCombinedSeriesAndLimits_=function(){var t,e=this.dygraph_,a=this.getOption_("logscale"),i=e.numColumns(),r=e.getLabels(),n=new Array(i),o=!1;for(t=1;i>t;t++){var s=this.getOption_("showInRangeSelector",r[t]);n[t]=s,null!==s&&(o=!0)}if(!o)for(t=0;t<n.length;t++)n[t]=!0;var l=[],h=e.dataHandler_,p=e.attributes_;for(t=1;t<e.numColumns();t++)if(n[t]){var g=h.extractSeries(e.rawData_,t,p);e.rollPeriod()>1&&(g=h.rollingAverage(g,e.rollPeriod(),p)),l.push(g)}var d=[];for(t=0;t<l[0].length;t++){for(var u=0,c=0,y=0;y<l.length;y++){var _=l[y][t][1];null===_||isNaN(_)||(c++,u+=_)}d.push([l[0][t][0],u/c])}var v=Number.MAX_VALUE,f=-Number.MAX_VALUE;for(t=0;t<d.length;t++){var x=d[t][1];null!==x&&isFinite(x)&&(!a||x>0)&&(v=Math.min(v,x),f=Math.max(f,x))}var m=.25;if(a)for(f=Dygraph.log10(f),f+=f*m,v=Dygraph.log10(v),t=0;t<d.length;t++)d[t][1]=Dygraph.log10(d[t][1]);else{var D,w=f-v;D=w<=Number.MIN_VALUE?f*m:w*m,f+=D,v-=D}return{data:d,yMin:v,yMax:f}},t.prototype.placeZoomHandles_=function(){var t=this.dygraph_.xAxisExtremes(),e=this.dygraph_.xAxisRange(),a=t[1]-t[0],i=Math.max(0,(e[0]-t[0])/a),r=Math.max(0,(t[1]-e[1])/a),n=this.canvasRect_.x+this.canvasRect_.w*i,o=this.canvasRect_.x+this.canvasRect_.w*(1-r),s=Math.max(this.canvasRect_.y,this.canvasRect_.y+(this.canvasRect_.h-this.leftZoomHandle_.height)/2),l=this.leftZoomHandle_.width/2;this.leftZoomHandle_.style.left=n-l+"px",this.leftZoomHandle_.style.top=s+"px",this.rightZoomHandle_.style.left=o-l+"px",this.rightZoomHandle_.style.top=this.leftZoomHandle_.style.top,this.leftZoomHandle_.style.visibility="visible",this.rightZoomHandle_.style.visibility="visible"},t.prototype.drawInteractiveLayer_=function(){var t=this.fgcanvas_ctx_;t.clearRect(0,0,this.canvasRect_.w,this.canvasRect_.h);var e=1,a=this.canvasRect_.w-e,i=this.canvasRect_.h-e,r=this.getZoomHandleStatus_();if(t.strokeStyle="black",r.isZoomed){var n=Math.max(e,r.leftHandlePos-this.canvasRect_.x),o=Math.min(a,r.rightHandlePos-this.canvasRect_.x);t.fillStyle="rgba(240, 240, 240, 0.6)",t.fillRect(0,0,n,this.canvasRect_.h),t.fillRect(o,0,this.canvasRect_.w-o,this.canvasRect_.h),t.beginPath(),t.moveTo(e,e),t.lineTo(n,e),t.lineTo(n,i),t.lineTo(o,i),t.lineTo(o,e),t.lineTo(a,e),t.stroke(),this.isUsingExcanvas_&&(this.iePanOverlay_.style.width=o-n+"px",this.iePanOverlay_.style.left=n+"px",this.iePanOverlay_.style.height=i+"px",this.iePanOverlay_.style.display="inline")}else t.beginPath(),t.moveTo(e,e),t.lineTo(e,i),t.lineTo(a,i),t.lineTo(a,e),t.stroke(),this.iePanOverlay_&&(this.iePanOverlay_.style.display="none")},t.prototype.getZoomHandleStatus_=function(){var 
t=this.leftZoomHandle_.width/2,e=parseFloat(this.leftZoomHandle_.style.left)+t,a=parseFloat(this.rightZoomHandle_.style.left)+t;return{leftHandlePos:e,rightHandlePos:a,isZoomed:e-1>this.canvasRect_.x||a+1<this.canvasRect_.x+this.canvasRect_.w}},t}(),Dygraph.PLUGINS.push(Dygraph.Plugins.Legend,Dygraph.Plugins.Axes,Dygraph.Plugins.RangeSelector,Dygraph.Plugins.ChartLabels,Dygraph.Plugins.Annotations,Dygraph.Plugins.Grid),Dygraph.DataHandler=function(){},Dygraph.DataHandlers={},function(){"use strict";var t=Dygraph.DataHandler;t.X=0,t.Y=1,t.EXTRAS=2,t.prototype.extractSeries=function(t,e,a){},t.prototype.seriesToPoints=function(e,a,i){for(var r=[],n=0;n<e.length;++n){var o=e[n],s=o[1],l=null===s?null:t.parseFloat(s),h={x:0/0,y:0/0,xval:t.parseFloat(o[0]),yval:l,name:a,idx:n+i};r.push(h)}return this.onPointsCreated_(e,r),r},t.prototype.onPointsCreated_=function(t,e){},t.prototype.rollingAverage=function(t,e,a){},t.prototype.getExtremeYValues=function(t,e,a){},t.prototype.onLineEvaluated=function(t,e,a){},t.prototype.computeYInterpolation_=function(t,e,a){var i=e[1]-t[1],r=e[0]-t[0],n=i/r,o=(a-t[0])*n;return t[1]+o},t.prototype.getIndexesInWindow_=function(t,e){var a=0,i=t.length-1;if(e){for(var r=0,n=e[0],o=e[1];r<t.length-1&&t[r][0]<n;)a++,r++;for(r=t.length-1;r>0&&t[r][0]>o;)i--,r--}return i>=a?[a,i]:[0,t.length-1]},t.parseFloat=function(t){return null===t?0/0:t}}(),function(){"use strict";Dygraph.DataHandlers.DefaultHandler=function(){};var t=Dygraph.DataHandlers.DefaultHandler;t.prototype=new Dygraph.DataHandler,t.prototype.extractSeries=function(t,e,a){for(var i=[],r=a.get("logscale"),n=0;n<t.length;n++){var o=t[n][0],s=t[n][e];r&&0>=s&&(s=null),i.push([o,s])}return i},t.prototype.rollingAverage=function(t,e,a){e=Math.min(e,t.length);var i,r,n,o,s,l=[];if(1==e)return t;for(i=0;i<t.length;i++){for(o=0,s=0,r=Math.max(0,i-e+1);i+1>r;r++)n=t[r][1],null===n||isNaN(n)||(s++,o+=t[r][1]);s?l[i]=[t[i][0],o/s]:l[i]=[t[i][0],null]}return l},t.prototype.getExtremeYValues=function(t,e,a){for(var i,r=null,n=null,o=0,s=t.length-1,l=o;s>=l;l++)i=t[l][1],null===i||isNaN(i)||((null===n||i>n)&&(n=i),(null===r||r>i)&&(r=i));return[r,n]}}(),function(){"use strict";Dygraph.DataHandlers.DefaultFractionHandler=function(){};var t=Dygraph.DataHandlers.DefaultFractionHandler;t.prototype=new Dygraph.DataHandlers.DefaultHandler,t.prototype.extractSeries=function(t,e,a){for(var i,r,n,o,s,l,h=[],p=100,g=a.get("logscale"),d=0;d<t.length;d++)i=t[d][0],n=t[d][e],g&&null!==n&&(n[0]<=0||n[1]<=0)&&(n=null),null!==n?(o=n[0],s=n[1],null===o||isNaN(o)?h.push([i,o,[o,s]]):(l=s?o/s:0,r=p*l,h.push([i,r,[o,s]]))):h.push([i,null,[null,null]]);return h},t.prototype.rollingAverage=function(t,e,a){e=Math.min(e,t.length);var i,r=[],n=0,o=0,s=100;for(i=0;i<t.length;i++){n+=t[i][2][0],o+=t[i][2][1],i-e>=0&&(n-=t[i-e][2][0],o-=t[i-e][2][1]);var l=t[i][0],h=o?n/o:0;r[i]=[l,s*h]}return r}}(),function(){"use strict";Dygraph.DataHandlers.BarsHandler=function(){Dygraph.DataHandler.call(this)},Dygraph.DataHandlers.BarsHandler.prototype=new Dygraph.DataHandler;var t=Dygraph.DataHandlers.BarsHandler;t.prototype.extractSeries=function(t,e,a){},t.prototype.rollingAverage=function(t,e,a){},t.prototype.onPointsCreated_=function(t,e){for(var a=0;a<t.length;++a){var i=t[a],r=e[a];r.y_top=0/0,r.y_bottom=0/0,r.yval_minus=Dygraph.DataHandler.parseFloat(i[2][0]),r.yval_plus=Dygraph.DataHandler.parseFloat(i[2][1])}},t.prototype.getExtremeYValues=function(t,e,a){for(var i,r=null,n=null,o=0,s=t.length-1,l=o;s>=l;l++)if(i=t[l][1],null!==i&&!isNaN(i)){var 
h=t[l][2][0],p=t[l][2][1];h>i&&(h=i),i>p&&(p=i),(null===n||p>n)&&(n=p),(null===r||r>h)&&(r=h)}return[r,n]},t.prototype.onLineEvaluated=function(t,e,a){for(var i,r=0;r<t.length;r++)i=t[r],i.y_top=DygraphLayout.calcYNormal_(e,i.yval_minus,a),i.y_bottom=DygraphLayout.calcYNormal_(e,i.yval_plus,a)}}(),function(){"use strict";Dygraph.DataHandlers.CustomBarsHandler=function(){};var t=Dygraph.DataHandlers.CustomBarsHandler;t.prototype=new Dygraph.DataHandlers.BarsHandler,t.prototype.extractSeries=function(t,e,a){for(var i,r,n,o=[],s=a.get("logscale"),l=0;l<t.length;l++)i=t[l][0],n=t[l][e],s&&null!==n&&(n[0]<=0||n[1]<=0||n[2]<=0)&&(n=null),null!==n?(r=n[1],o.push(null===r||isNaN(r)?[i,r,[r,r]]:[i,r,[n[0],n[2]]])):o.push([i,null,[null,null]]);return o},t.prototype.rollingAverage=function(t,e,a){e=Math.min(e,t.length);var i,r,n,o,s,l,h,p=[];for(r=0,o=0,n=0,s=0,l=0;l<t.length;l++){if(i=t[l][1],h=t[l][2],p[l]=t[l],null===i||isNaN(i)||(r+=h[0],o+=i,n+=h[1],s+=1),l-e>=0){var g=t[l-e];null===g[1]||isNaN(g[1])||(r-=g[2][0],o-=g[1],n-=g[2][1],s-=1)}s?p[l]=[t[l][0],1*o/s,[1*r/s,1*n/s]]:p[l]=[t[l][0],null,[null,null]]}return p}}(),function(){"use strict";Dygraph.DataHandlers.ErrorBarsHandler=function(){};var t=Dygraph.DataHandlers.ErrorBarsHandler;t.prototype=new Dygraph.DataHandlers.BarsHandler,t.prototype.extractSeries=function(t,e,a){for(var i,r,n,o,s=[],l=a.get("sigma"),h=a.get("logscale"),p=0;p<t.length;p++)i=t[p][0],o=t[p][e],h&&null!==o&&(o[0]<=0||o[0]-l*o[1]<=0)&&(o=null),null!==o?(r=o[0],null===r||isNaN(r)?s.push([i,r,[r,r,r]]):(n=l*o[1],s.push([i,r,[r-n,r+n,o[1]]]))):s.push([i,null,[null,null,null]]);return s},t.prototype.rollingAverage=function(t,e,a){e=Math.min(e,t.length);var i,r,n,o,s,l,h,p,g,d=[],u=a.get("sigma");for(i=0;i<t.length;i++){for(s=0,p=0,l=0,r=Math.max(0,i-e+1);i+1>r;r++)n=t[r][1],null===n||isNaN(n)||(l++,s+=n,p+=Math.pow(t[r][2][2],2));l?(h=Math.sqrt(p)/l,g=s/l,d[i]=[t[i][0],g,[g-u*h,g+u*h]]):(o=1==e?t[i][1]:null,d[i]=[t[i][0],o,[o,o]])}return d}}(),function(){"use strict";Dygraph.DataHandlers.FractionsBarsHandler=function(){};var t=Dygraph.DataHandlers.FractionsBarsHandler;t.prototype=new Dygraph.DataHandlers.BarsHandler,t.prototype.extractSeries=function(t,e,a){for(var i,r,n,o,s,l,h,p,g=[],d=100,u=a.get("sigma"),c=a.get("logscale"),y=0;y<t.length;y++)i=t[y][0],n=t[y][e],c&&null!==n&&(n[0]<=0||n[1]<=0)&&(n=null),null!==n?(o=n[0],s=n[1],null===o||isNaN(o)?g.push([i,o,[o,o,o,s]]):(l=s?o/s:0,h=s?u*Math.sqrt(l*(1-l)/s):1,p=d*h,r=d*l,g.push([i,r,[r-p,r+p,o,s]]))):g.push([i,null,[null,null,null,null]]);return g},t.prototype.rollingAverage=function(t,e,a){e=Math.min(e,t.length);var i,r,n,o,s=[],l=a.get("sigma"),h=a.get("wilsonInterval"),p=0,g=0,d=100;for(n=0;n<t.length;n++){p+=t[n][2][2],g+=t[n][2][3],n-e>=0&&(p-=t[n-e][2][2],g-=t[n-e][2][3]);var u=t[n][0],c=g?p/g:0;if(h)if(g){var y=0>c?0:c,_=g,v=l*Math.sqrt(y*(1-y)/_+l*l/(4*_*_)),f=1+l*l/g;i=(y+l*l/(2*g)-v)/f,r=(y+l*l/(2*g)+v)/f,s[n]=[u,y*d,[i*d,r*d]]}else s[n]=[u,0,[0,0]];else o=g?l*Math.sqrt(c*(1-c)/g):1,s[n]=[u,d*c,[d*(c-o),d*(c+o)]]}return s}}();
+//# sourceMappingURL=dygraph-combined.js.map
\ No newline at end of file
diff --git a/ui/README.md b/ui/README.md
new file mode 100644
index 0000000..1a00c1d
--- /dev/null
+++ b/ui/README.md
@@ -0,0 +1,6 @@
+ui
+==
+
+This directory contains static HTML, CSS, and JavaScript for the RAPPOR
+dashboard. See the `pipeline/` directory for more details.
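+
+The pages are static; each one selects what to display from the URL fragment,
+following the conventions documented in `table-lib.js` and `ui.js`, e.g.
+(hypothetical job and metric names):
+
+    metric.html#metric=Foo.bar
+    day.html#jobId=X&metric=Foo.bar&day=2015-06-01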
+
diff --git a/ui/assoc-day.html b/ui/assoc-day.html
new file mode 100644
index 0000000..2255325
--- /dev/null
+++ b/ui/assoc-day.html
@@ -0,0 +1,44 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>Single Day Association Results</title>
+
+ <link rel="stylesheet" type="text/css" href="static/table-sort.css" />
+ <script type="text/javascript" src="static/table-lib.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body onload="initAssocDay(gUrlHash, gTableStates, kStatusElem);"
+ onhashchange="onHashChange(gUrlHash, gTableStates, kStatusElem);">
+ <p id="status"></p>
+
+ <!-- TODO: up to metric? Nav bar. -->
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <a href="assoc-overview.html">Association Overview</a>
+ </p>
+
+ <!-- NOTE: There is a metric description here. Get it from the XML file.
+ -->
+
+ <h2 id="metricDay"></h2>
+
+ <table id="results_table">
+ </table>
+
+ <p>
+ <!-- link depends on fragment; filled in by JS -->
+ Underlying data: <a id="underlying" href="">assoc-results.csv</a>
+ </p>
+
+ <!-- page globals -->
+ <script type="text/javascript">
+ var gUrlHash = new UrlHash(location.hash);
+ var gTableStates = {};
+ var kStatusElem = document.getElementById('status');
+ </script>
+
+ </body>
+</html>
diff --git a/ui/assoc-metric.html b/ui/assoc-metric.html
new file mode 100644
index 0000000..1ac1dde
--- /dev/null
+++ b/ui/assoc-metric.html
@@ -0,0 +1,45 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title></title> <!-- filled in by JS -->
+
+ <script type="text/javascript" src="static/dygraph-combined.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/table-sort.css" />
+ <script type="text/javascript" src="static/table-lib.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body onload="initAssocMetric(gUrlHash, gTableStates, kStatusElem, globals);"
+ onhashchange="onHashChange(gUrlHash, gTableStates, kStatusElem);">
+ <p id="status"></p>
+
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <a href="assoc-overview.html">Association Overview</a>
+ </p>
+
+ <h1 id="pageTitle"></h1> <!-- filled in by JS -->
+
+ <p id="metricDesc"></p> <!-- filled in by JS -->
+
+ <table id="metric_table">
+ </table>
+
+ <p>
+ <!-- link depends on fragment; filled in by JS -->
+ Underlying data: <a id="underlying-status" href=""></a>
+ </p>
+
+ <!-- page globals -->
+ <script type="text/javascript">
+ var globals = {proportionsDygraph: null};
+ var gUrlHash = new UrlHash(location.hash);
+ var gTableStates = {};
+ var kStatusElem = document.getElementById('status');
+ </script>
+
+ </body>
+</html>
diff --git a/ui/assoc-overview.html b/ui/assoc-overview.html
new file mode 100644
index 0000000..e3f06e1
--- /dev/null
+++ b/ui/assoc-overview.html
@@ -0,0 +1,43 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>RAPPOR Association Analysis Overview</title>
+
+ <link rel="stylesheet" type="text/css" href="static/table-sort.css" />
+ <script type="text/javascript" src="static/table-lib.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body onload="initAssocOverview(gUrlHash, gTableStates, kStatusElem);"
+ onhashchange="onHashChange(gUrlHash, gTableStates, kStatusElem);">
+ <p id="status"></p>
+
+ <p style="text-align: right">
+ <a href="../../live/latest/overview.html">Single variable analysis</a> (latest)
+ </p>
+
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <b>Association Overview</b>
+ </p>
+
+ <h1>RAPPOR Association Analysis Overview</h1>
+
+ <table id="overview">
+ </table>
+
+ <p>
+      Underlying data: <a href="cooked/assoc-overview.csv">assoc-overview.csv</a>
+ </p>
+
+ <!-- page globals -->
+ <script type="text/javascript">
+ var gUrlHash = new UrlHash(location.hash);
+ var gTableStates = {};
+ var kStatusElem = document.getElementById('status');
+ </script>
+
+ </body>
+</html>
diff --git a/ui/assoc-pair.html b/ui/assoc-pair.html
new file mode 100644
index 0000000..7625966
--- /dev/null
+++ b/ui/assoc-pair.html
@@ -0,0 +1,47 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title></title> <!-- filled in by JS -->
+
+ <script type="text/javascript" src="static/dygraph-combined.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/table-sort.css" />
+ <script type="text/javascript" src="static/table-lib.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body onload="initAssocPair(gUrlHash, gTableStates, kStatusElem, globals);"
+ onhashchange="onHashChange(gUrlHash, gTableStates, kStatusElem);">
+ <p id="status"></p>
+
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <a href="assoc-overview.html">Association Overview</a>
+ </p>
+
+ <h1 id="pageTitle"></h1> <!-- filled in by JS -->
+
+ <p id="metricDesc"></p> <!-- filled in by JS -->
+
+ <h2>Task Status</h2>
+
+ <table id="status_table">
+ </table>
+
+ <p>
+ <!-- link depends on fragment; filled in by JS -->
+ Underlying data: <a id="underlying-status" href=""></a>
+ </p>
+
+ <!-- page globals -->
+ <script type="text/javascript">
+ var globals = {proportionsDygraph: null};
+ var gUrlHash = new UrlHash(location.hash);
+ var gTableStates = {};
+ var kStatusElem = document.getElementById('status');
+ </script>
+
+ </body>
+</html>
diff --git a/ui/day.html b/ui/day.html
new file mode 100644
index 0000000..624778c
--- /dev/null
+++ b/ui/day.html
@@ -0,0 +1,49 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>Single Day Results</title>
+
+ <link rel="stylesheet" type="text/css" href="static/table-sort.css" />
+ <script type="text/javascript" src="static/table-lib.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body onload="initDay(gUrlHash, gTableStates, kStatusElem);"
+ onhashchange="onHashChange(gUrlHash, gTableStates, kStatusElem);">
+ <p id="status"></p>
+
+ <!-- TODO: up to metric? Nav bar. -->
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <a href="overview.html">Overview</a> /
+ <a href="histograms.html">Histograms</a>
+ </p>
+
+ <!-- NOTE: There is a metric description here. Get it from the XML file.
+ -->
+
+ <h2 id="metricDay"></h2>
+
+ <table id="results_table">
+ </table>
+
+ <p>
+ <img id="residual" src="" alt="Residuals">
+ </p>
+
+ <p>
+ <!-- link depends on fragment; filled in by JS -->
+ Underlying data: <a id="underlying" href="">results.csv</a>
+ </p>
+
+ <!-- page globals -->
+ <script type="text/javascript">
+ var gUrlHash = new UrlHash(location.hash);
+ var gTableStates = {};
+ var kStatusElem = document.getElementById('status');
+ </script>
+
+ </body>
+</html>
diff --git a/ui/histograms.html b/ui/histograms.html
new file mode 100644
index 0000000..cce5ee2
--- /dev/null
+++ b/ui/histograms.html
@@ -0,0 +1,48 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>RAPPOR Task Histograms</title>
+
+ <!-- TODO: use <base> tag? -->
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body>
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <a href="overview.html">Overview</a> /
+ <b>Histograms</b>
+ </p>
+
+ <h1>RAPPOR Task Histograms</h1>
+
+    <p>Each task's input is a (metric, day) pair, i.e. it runs on the summed
+    reports for a single metric received on a single day.</p>
+
+ <p>
+ <img src="cooked/allocated_mass.png" />
+ </p>
+
+ <p>
+ <img src="cooked/num_rappor.png" />
+ </p>
+
+ <p>
+ <img src="cooked/num_reports.png" />
+ </p>
+
+ <p>
+ <img src="cooked/seconds.png" />
+ </p>
+
+ <p>
+ <img src="cooked/memory.png" />
+ </p>
+
+ <p>
+ <img src="mem-series.png" />
+ </p>
+
+ </body>
+</html>
diff --git a/ui/home.html b/ui/home.html
new file mode 100644
index 0000000..d4f947a
--- /dev/null
+++ b/ui/home.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+ <head>
+    <title>RAPPOR Home</title>
+
+    <!-- This page is a stub that redirects to our GitHub home page.
+    overview.html, etc. link to it. Redirect after 0 seconds. -->
+ <meta http-equiv="refresh" content="0; url=https://github.com/google/rappor" />
+ </head>
+
+ <body>
+ <p>
+ Redirecting to <a href="https://github.com/google/rappor">https://github.com/google/rappor</a>
+ </p>
+ </body>
+</html>
diff --git a/ui/metric.html b/ui/metric.html
new file mode 100644
index 0000000..ac14a88
--- /dev/null
+++ b/ui/metric.html
@@ -0,0 +1,83 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>Metric Results</title>
+
+ <script type="text/javascript" src="static/dygraph-combined.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/table-sort.css" />
+ <script type="text/javascript" src="static/table-lib.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body onload="initMetric(gUrlHash, gTableStates, kStatusElem, globals);"
+ onhashchange="onHashChange(gUrlHash, gTableStates, kStatusElem);">
+ <p id="status"></p>
+
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <a href="overview.html">Overview</a> /
+ <a href="histograms.html">Histograms</a>
+ </p>
+
+ <!-- NOTE: There is a metric description here. Get it from the XML file.
+ -->
+
+ <h1 id="metricName"></h1> <!-- filled in by JS -->
+
+ <p id="metricDesc"></p> <!-- filled in by JS -->
+
+ <h2>Estimated Proportions</h2>
+ <p>NOTE: Only the top 5 values for each day are shown</p>
+
+ <!--
+    NOTE: Setting customBars: false removes the entire line, so this
+    checkbox stays commented out for now.
+ <p>
+ <label>
+ <input type="checkbox" checked="checked"
+ onclick="onMetricCheckboxClick(this, globals.proportionsDygraph);">
+ Show Error Bars
+ </label>
+ </p>
+ -->
+ <p class="dy" id="proportionsDy"></p>
+ <p>
+ Underlying data: <a id="underlying-dist" href="">dist.csv</a>
+ </p>
+
+ <h2>Number of Reports</h2>
+
+ <p class="dy" id="num-reports-dy" align="center"></p>
+ <!-- underlying data here is in status.csv? -->
+
+ <h2>Unallocated Mass</h2>
+
+ <p class="dy" id="mass-dy" align="center"></p>
+
+ <p>
+ Plot Help: Drag horizontally to <b>zoom to selection</b>. Double click
+ to <b>zoom out</b>. Shift + drag to <b>pan</b>.
+ </p>
+
+ <h2>Task Status</h2>
+
+ <table id="status_table">
+ </table>
+
+ <p>
+ <!-- link depends on fragment; filled in by JS -->
+ Underlying data: <a id="underlying-status" href="">status.csv</a>
+ </p>
+
+ <!-- page globals -->
+ <script type="text/javascript">
+ var globals = {proportionsDygraph: null};
+ var gUrlHash = new UrlHash(location.hash);
+ var gTableStates = {};
+ var kStatusElem = document.getElementById('status');
+ </script>
+
+ </body>
+</html>
diff --git a/ui/overview.html b/ui/overview.html
new file mode 100644
index 0000000..464f983
--- /dev/null
+++ b/ui/overview.html
@@ -0,0 +1,59 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>RAPPOR Results Overview</title>
+
+ <link rel="stylesheet" type="text/css" href="static/table-sort.css" />
+ <script type="text/javascript" src="static/table-lib.js"></script>
+
+ <link rel="stylesheet" type="text/css" href="static/ui.css" />
+ <script type="text/javascript" src="static/ui.js"></script>
+ </head>
+
+ <body onload="initOverview(gUrlHash, gTableStates, kStatusElem);"
+ onhashchange="onHashChange(gUrlHash, gTableStates, kStatusElem);">
+ <p id="status"></p>
+
+ <p style="text-align: right">
+ <a href="../../assoc-live/latest/assoc-overview.html">Association analysis</a> (latest)
+ </p>
+
+ <p style="text-align: right">
+ <a href="../home.html">Home</a> /
+ <b>Overview</b> /
+ <a href="histograms.html">Histograms</a>
+ </p>
+
+ <h1>RAPPOR Results Overview</h1>
+
+ <table id="overview">
+ </table>
+
+ <p>
+ Underlying data: <a href="cooked/overview.csv">overview.csv</a>
+ </p>
+
+ <h2>Metric Descriptions</h2>
+
+ <!-- Filled in by JS -->
+ <table id="metricMetadata">
+ <thead>
+ <tr>
+ <td>Metric Name</td>
+ <td>Owners</td>
+ <td>Description</td>
+ </tr>
+ </thead>
+ <tbody>
+ </tbody>
+ </table>
+
+ <!-- page globals -->
+ <script type="text/javascript">
+ var gUrlHash = new UrlHash(location.hash);
+ var gTableStates = {};
+ var kStatusElem = document.getElementById('status');
+ </script>
+
+ </body>
+</html>
diff --git a/ui/table-lib.js b/ui/table-lib.js
new file mode 100644
index 0000000..64913dc
--- /dev/null
+++ b/ui/table-lib.js
@@ -0,0 +1,482 @@
+// Sortable HTML table.
+//
+// Usage:
+//
+// Each page should have gTableStates and gUrlHash variables. This library
+// only provides functions / classes, not instances.
+//
+// Then use these public functions on those variables. They should be hooked
+// up to initialization and onhashchange events.
+//
+// - makeTablesSortable
+// - updateTables
+//
+// Life of a click
+//
+// - query existing TableState object to find the new state
+// - mutate urlHash
+// - location.hash = urlHash.encode()
+// - onhashchange
+// - decode location.hash into urlHash
+// - update DOM
+//
+// HTML generation requirements:
+// - <table id="foo">
+// - need <colgroup> for types.
+// - For numbers, class="num-cell" as well as <col type="number">
+// - single <thead> and <tbody>
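+//
+// For example (a hypothetical table; the <colgroup> types select the sort
+// key functions defined below):
+//
+//   <table id="my_table">
+//     <colgroup>
+//       <col type="case-insensitive" />
+//       <col type="number" />
+//     </colgroup>
+//     <thead>
+//       <tr> <td>Name</td> <td>Count</td> </tr>
+//     </thead>
+//     <tbody>
+//       <tr> <td>foo</td> <td class="num-cell">42</td> </tr>
+//     </tbody>
+//   </table>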
+
+'use strict';
+
+function appendMessage(elem, msg) {
+ // TODO: escape HTML?
+ elem.innerHTML += msg + '<br />';
+}
+
+function userError(errElem, msg) {
+ if (errElem) {
+ appendMessage(errElem, msg);
+ } else {
+ console.log(msg);
+ }
+}
+
+//
+// Key functions for column ordering
+//
+// TODO: better naming convention?
+
+function identity(x) {
+ return x;
+}
+
+function lowerCase(x) {
+ return x.toLowerCase();
+}
+
+// Parse as number.
+function asNumber(x) {
+ var stripped = x.replace(/[ \t\r\n]/g, '');
+ if (stripped === 'NA') {
+ // return lowest value, so NA sorts below everything else.
+ return -Number.MAX_VALUE;
+ }
+  var numClean = stripped.replace(/[$,]/g, ''); // remove dollar signs and commas
+ return parseFloat(numClean);
+}
+
+// Parse as a date. For now dates sort as plain strings, which is correct
+// for ISO-style YYYY-MM-DD values.
+//
+// TODO: Parse into JS date object?
+// http://stackoverflow.com/questions/19430561/how-to-sort-a-javascript-array-of-objects-by-date
+// Uses getTime(). Hm.
+
+function asDate(x) {
+ return x;
+}
+
+//
+// Table Implementation
+//
+
+// Given a column array and a key function, construct a permutation of the
+// indices [0, n).
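+//
+// e.g. makePermutation(['b', 'a', 'c'], identity) returns [1, 0, 2]: reading
+// the rows in that order yields the sorted column ['a', 'b', 'c'].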
+function makePermutation(colArray, keyFunc) {
+ var pairs = []; // (index, result of keyFunc on cell)
+
+ var n = colArray.length;
+ for (var i = 0; i < n; ++i) {
+ var value = colArray[i];
+
+    // NOTE: Cells are extracted with textContent, so a cell that contains a
+    // link sorts by its anchor text.
+ var key = keyFunc(value);
+
+ pairs.push([key, i]);
+ }
+
+ // Sort by computed key
+ pairs.sort(function(a, b) {
+ if (a[0] < b[0]) {
+ return -1;
+ } else if (a[0] > b[0]) {
+ return 1;
+ } else {
+ return 0;
+ }
+ });
+
+ // Extract the permutation as second column
+ var perm = [];
+ for (var i = 0; i < pairs.length; ++i) {
+ perm.push(pairs[i][1]); // append index
+ }
+ return perm;
+}
+
+function extractCol(rows, colIndex) {
+ var colArray = [];
+ for (var i = 0; i < rows.length; ++i) {
+ var row = rows[i];
+ colArray.push(row.cells[colIndex].textContent);
+ }
+ return colArray;
+}
+
+// Given an array of DOM row objects, and a list of sort functions (one per
+// column), return a list of permutations.
+//
+// Right now this is eager. Could be lazy later.
+function makeAllPermutations(rows, keyFuncs) {
+ var numCols = keyFuncs.length;
+ var permutations = [];
+ for (var i = 0; i < numCols; ++i) {
+ var colArray = extractCol(rows, i);
+ var keyFunc = keyFuncs[i];
+ var p = makePermutation(colArray, keyFunc);
+ permutations.push(p);
+ }
+ return permutations;
+}
+
+// Model object for a table. (Mostly) independent of the DOM.
+function TableState(table, keyFuncs) {
+ this.table = table;
+  keyFuncs = keyFuncs || [];  // array of per-column key functions
+
+ // these are mutated
+ this.sortCol = -1; // not sorted by any col
+ this.ascending = false; // if sortCol is sorted in ascending order
+
+ if (table === null) { // hack so we can pass dummy table
+ console.log('TESTING');
+ return;
+ }
+
+ var bodyRows = table.tBodies[0].rows;
+ this.orig = []; // pointers to row objects in their original order
+ for (var i = 0; i < bodyRows.length; ++i) {
+ this.orig.push(bodyRows[i]);
+ }
+
+ this.colElems = [];
+ var colgroup = table.getElementsByTagName('colgroup')[0];
+
+  if (!colgroup) {
+    throw new Error('<colgroup> is required');
+  }
+
+  // Copy the <col> elements into an array, choosing a key function for each.
+ for (var i = 0; i < colgroup.children.length; ++i) {
+ var colElem = colgroup.children[i];
+ var colType = colElem.getAttribute('type');
+ var keyFunc;
+ switch (colType) {
+ case 'case-sensitive':
+ keyFunc = identity;
+ break;
+ case 'case-insensitive':
+ keyFunc = lowerCase;
+ break;
+ case 'number':
+ keyFunc = asNumber;
+ break;
+ case 'date':
+ keyFunc = asDate;
+ break;
+ default:
+ throw new Error('Invalid column type ' + colType);
+ }
+ keyFuncs[i] = keyFunc;
+
+ this.colElems.push(colElem);
+ }
+
+ this.permutations = makeAllPermutations(this.orig, keyFuncs);
+}
+
+// Reset sort state.
+TableState.prototype.resetSort = function() {
+ this.sortCol = -1; // not sorted by any col
+ this.ascending = false; // if sortCol is sorted in ascending order
+};
+
+// Change state for a click on a column.
+TableState.prototype.doClick = function(colIndex) {
+ if (this.sortCol === colIndex) { // same column; invert direction
+ this.ascending = !this.ascending;
+ } else { // different column
+ this.sortCol = colIndex;
+ // first click makes it *descending*. Typically you want to see the
+ // largest values first.
+ this.ascending = false;
+ }
+};
+
+TableState.prototype.decode = function(stateStr, errElem) {
+  var sortCol = parseInt(stateStr, 10); // parse leading integer
+  if (isNaN(sortCol)) {
+    // The user could have entered a bad ID
+    userError(errElem, 'Invalid state string ' + stateStr);
+    return;
+  }
+
+  var lastChar = stateStr[stateStr.length - 1];
+
+  var ascending;
+  if (lastChar === 'a') {
+    ascending = true;
+  } else if (lastChar === 'd') {
+    ascending = false;
+  } else {
+    // The user could have entered a bad ID
+    userError(errElem, 'Invalid state string ' + stateStr);
+    return;
+  }
+
+  this.sortCol = sortCol;
+  this.ascending = ascending;
+};
+
+
+TableState.prototype.encode = function() {
+ if (this.sortCol === -1) {
+ return ''; // default state isn't serialized
+ }
+
+ var s = this.sortCol.toString();
+ s += this.ascending ? 'a' : 'd';
+ return s;
+};
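+
+// e.g. after one click on column 1, encode() returns '1d' (descending); a
+// second click flips it to '1a' (ascending).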
+
+// Update the DOM with using this object's internal state.
+TableState.prototype.updateDom = function() {
+ var tHead = this.table.tHead;
+ setArrows(tHead, this.sortCol, this.ascending);
+
+ // Highlight the column that the table is sorted by.
+ for (var i = 0; i < this.colElems.length; ++i) {
+ // set or clear it. NOTE: This means we can't have other classes on the
+ // <col> tags, which is OK.
+ var className = (i === this.sortCol) ? 'highlight' : '';
+ this.colElems[i].className = className;
+ }
+
+ var n = this.orig.length;
+ var tbody = this.table.tBodies[0];
+
+ if (this.sortCol === -1) { // reset it and return
+ for (var i = 0; i < n; ++i) {
+ tbody.appendChild(this.orig[i]);
+ }
+ return;
+ }
+
+ var perm = this.permutations[this.sortCol];
+ if (this.ascending) {
+ for (var i = 0; i < n; ++i) {
+ var index = perm[i];
+ tbody.appendChild(this.orig[index]);
+ }
+ } else { // descending, apply the permutation in reverse order
+ for (var i = n - 1; i >= 0; --i) {
+ var index = perm[i];
+ tbody.appendChild(this.orig[index]);
+ }
+ }
+};
+
+var kTablePrefix = 't:';
+var kTablePrefixLength = 2;
+
+// Given a UrlHash instance and a list of tables, mutate tableStates.
+function decodeState(urlHash, tableStates, errElem) {
+ var keys = urlHash.getKeysWithPrefix(kTablePrefix); // by convention, t:foo=1a
+ for (var i = 0; i < keys.length; ++i) {
+ var key = keys[i];
+ var tableId = key.substring(kTablePrefixLength);
+
+ if (!tableStates.hasOwnProperty(tableId)) {
+ // The user could have entered a bad ID
+ userError(errElem, 'Invalid table ID [' + tableId + ']');
+ return;
+ }
+
+ var state = tableStates[tableId];
+ var stateStr = urlHash.get(key); // e.g. '1d'
+
+ state.decode(stateStr, errElem);
+ }
+}
+
+// Add <span> element for sort arrows.
+function addArrowSpans(tHead) {
+ var tHeadCells = tHead.rows[0].cells;
+ for (var i = 0; i < tHeadCells.length; ++i) {
+ var colHead = tHeadCells[i];
+ // Put a space in so the width is relatively constant
+ colHead.innerHTML += ' <span class="sortArrow">&nbsp;</span>';
+ }
+}
+
+// Go through all the cells in the header. Clear the arrow if there is one.
+// Set the one on the correct column.
+//
+// Each column heading has a <span> (added by addArrowSpans); set or clear it.
+function setArrows(tHead, sortCol, ascending) {
+ var tHeadCells = tHead.rows[0].cells;
+
+ for (var i = 0; i < tHeadCells.length; ++i) {
+ var colHead = tHeadCells[i];
+ var span = colHead.getElementsByTagName('span')[0];
+
+ if (i === sortCol) {
+ span.innerHTML = ascending ? '&#x25B4;' : '&#x25BE;';
+ } else {
+ span.innerHTML = '&nbsp;'; // clear it
+ }
+ }
+}
+
+// Given the URL hash, table states, tableId, and column index that was
+// clicked, visit a new location.
+function makeClickHandler(urlHash, tableStates, id, colIndex) {
+ return function() { // no args for onclick=
+ var clickedState = tableStates[id];
+
+ clickedState.doClick(colIndex);
+
+ // now urlHash has non-table state, and tableStates is the table state.
+ for (var tableId in tableStates) {
+ var state = tableStates[tableId];
+
+ var stateStr = state.encode();
+ var key = kTablePrefix + tableId;
+
+ if (stateStr === '') {
+ urlHash.del(key);
+ } else {
+ urlHash.set(key, stateStr);
+ }
+ }
+
+ // move to new location
+ location.hash = urlHash.encode();
+ };
+}
+
+// Go through cells and register onClick
+function registerClick(table, urlHash, tableStates) {
+ var id = table.id; // id is required
+
+ var tHeadCells = table.tHead.rows[0].cells;
+ for (var colIndex = 0; colIndex < tHeadCells.length; ++colIndex) {
+ var colHead = tHeadCells[colIndex];
+ // NOTE: in ES5, could use 'bind'.
+ colHead.onclick = makeClickHandler(urlHash, tableStates, id, colIndex);
+ }
+}
+
+//
+// Public Functions (TODO: Make a module?)
+//
+
+// Parse the URL fragment, and update all tables. Errors are printed to a DOM
+// element.
+function updateTables(urlHash, tableStates, statusElem) {
+ // State should come from the hash alone, so reset old state. (We want to
+ // keep the permutations though.)
+ for (var tableId in tableStates) {
+ tableStates[tableId].resetSort();
+ }
+
+ decodeState(urlHash, tableStates, statusElem);
+
+ for (var name in tableStates) {
+ var state = tableStates[name];
+ state.updateDom();
+ }
+}
+
+// Takes an array of table DOM elements (each of which must have an id),
+// registers click handlers on their headers, and fills in tableStates.
+// Returns a dictionary of table states.
+function makeTablesSortable(urlHash, tables, tableStates) {
+ for (var i = 0; i < tables.length; ++i) {
+ var table = tables[i];
+ var tableId = table.id;
+
+ registerClick(table, urlHash, tableStates);
+ tableStates[tableId] = new TableState(table);
+
+ addArrowSpans(table.tHead);
+ }
+ return tableStates;
+}
+
+// table-sort.js can use t:holidays=1d
+//
+// metric.html can use:
+//
+// metric=Foo.bar
+//
+// day.html could use
+//
+// jobId=X&metric=Foo.bar&day=2015-06-01
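+//
+// A round trip through UrlHash looks like this (a sketch, reusing the
+// hypothetical names above):
+//
+//   var h = new UrlHash('#metric=Foo.bar&t:holidays=1d');
+//   h.get('metric');            // 'Foo.bar'
+//   h.getKeysWithPrefix('t:');  // ['t:holidays']
+//   h.set('day', '2015-06-01');
+//   location.hash = h.encode(); // the browser adds the leading '#'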
+
+// helper
+function _decode(s) {
+ var obj = {};
+ var parts = s.split('&');
+ for (var i = 0; i < parts.length; ++i) {
+    if (parts[i].length === 0) {
+      continue; // quirk: ''.split('&') is [''], not a 0-length array, so skip empty parts
+    }
+    var pair = parts[i].split('=');
+    obj[pair[0]] = pair[1]; // for now, assume values contain no '='
+ }
+ return obj;
+}
+
+// UrlHash Constructor.
+// Args:
+// hashStr: location.hash
+function UrlHash(hashStr) {
+ this.reset(hashStr);
+}
+
+UrlHash.prototype.reset = function(hashStr) {
+ var h = hashStr.substring(1); // without leading #
+ // Internal storage is string -> string
+ this.dict = _decode(h);
+};
+
+UrlHash.prototype.set = function(name, value) {
+ this.dict[name] = value;
+};
+
+UrlHash.prototype.del = function(name) {
+ delete this.dict[name];
+};
+
+UrlHash.prototype.get = function(name) {
+ return this.dict[name];
+};
+
+// e.g. Table states have keys which start with 't:'.
+UrlHash.prototype.getKeysWithPrefix = function(prefix) {
+ var keys = [];
+ for (var name in this.dict) {
+ if (name.indexOf(prefix) === 0) {
+ keys.push(name);
+ }
+ }
+ return keys;
+};
+
+// Return a string reflecting internal key-value pairs.
+UrlHash.prototype.encode = function() {
+ var parts = [];
+ for (var name in this.dict) {
+ var s = name;
+ s += '=';
+ var value = this.dict[name];
+ s += encodeURIComponent(value);
+ parts.push(s);
+ }
+ return parts.join('&');
+};
diff --git a/ui/table-sort.css b/ui/table-sort.css
new file mode 100644
index 0000000..1034f4e
--- /dev/null
+++ b/ui/table-sort.css
@@ -0,0 +1,39 @@
+/* sort indicator in column headings */
+.sortArrow {
+ color: grey;
+}
+
+thead {
+ font-weight: bold;
+ text-align: center;
+}
+
+table {
+ padding: 10px; /* Padding makes it look nicer. */
+ margin: 0 auto; /* center table on the page */
+ border-collapse: collapse; /* this is like old cellpadding */
+}
+
+/* like cellspacing? */
+td {
+ padding: 5px;
+}
+
+/* Built-in support for R NA values */
+.na {
+ color: darkred;
+}
+
+/* Numbers aligned on the right, like Excel */
+.num {
+ text-align: right;
+}
+
+.highlight {
+ background-color: #f0f0f0;
+}
+
+tbody tr:hover {
+ background-color: lightcyan;
+}
+
diff --git a/ui/ui.css b/ui/ui.css
new file mode 100644
index 0000000..8431ecf
--- /dev/null
+++ b/ui/ui.css
@@ -0,0 +1,53 @@
+/* Center the plots */
+.dy {
+ margin: 0 auto;
+ width: 50em;
+}
+
+/* main metric */
+#proportionsDy {
+ width: 1000px;
+ height: 600px;
+}
+
+#num-reports-dy {
+ width: 1000px;
+ height: 300px;
+}
+
+#mass-dy {
+ width: 1000px;
+ height: 300px;
+}
+
+#metricDesc {
+ font-style: italic;
+}
+
+body {
+ /*margin: 0 auto;*/
+ /*text-align: left;*/
+}
+
+h1 {
+ text-align: center;
+}
+
+h2 {
+ text-align: center;
+}
+
+p {
+ text-align: center;
+}
+
+/* R NA values */
+.na {
+ color: darkred;
+}
+
+#status {
+ text-align: center;
+ font-size: x-large;
+ color: darkred;
+}
diff --git a/ui/ui.js b/ui/ui.js
new file mode 100644
index 0000000..b74a8e2
--- /dev/null
+++ b/ui/ui.js
@@ -0,0 +1,363 @@
+// Dashboard UI functions.
+//
+// This is shared between all HTML pages.
+
+'use strict';
+
+// Append a message to an element. Used for errors. (Also defined in
+// table-lib.js; the pages load that script first, so this definition wins.)
+function appendMessage(elem, msg) {
+ elem.innerHTML += msg + '<br />';
+}
+
+// jQuery-like AJAX helper, but simpler.
+//
+// Pages typically pass the element with id "status" as errElem.
+//
+// Args:
+//   url: the URL to GET.
+//   errElem: optional element to append error messages to. If null, then
+//     alert() on error.
+//   success: callback that is passed the xhr object.
+function ajaxGet(url, errElem, success) {
+ var xhr = new XMLHttpRequest();
+ xhr.open('GET', url, true /*async*/);
+ xhr.onreadystatechange = function() {
+ if (xhr.readyState != 4 /*DONE*/) {
+ return;
+ }
+
+ if (xhr.status != 200) {
+ var msg = 'ERROR requesting ' + url + ': ' + xhr.status + ' ' +
+ xhr.statusText;
+ if (errElem) {
+ appendMessage(errElem, msg);
+ } else {
+ alert(msg);
+ }
+ return;
+ }
+
+ success(xhr);
+ };
+ xhr.send();
+}
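+
+// Example (a sketch; this mirrors how initOverview below fetches its table):
+//
+//   ajaxGet('cooked/overview.part.html', kStatusElem, function(xhr) {
+//     document.getElementById('overview').innerHTML = xhr.responseText;
+//   });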
+
+// Load metadata about the metrics.
+// metric-metadata.json is just 14 KB, so we load it for every page.
+//
+// callback:
+// on metric page, just pick out the right description.
+// on overview page, populate them ALL with tool tips?
+// Or create another column?
+function loadMetricMetadata(errElem, success) {
+ // TODO: Should we make metric-metadata.json optional? Some may not have it.
+
+ ajaxGet('metric-metadata.json', errElem, function(xhr) {
+ // TODO: handle parse error
+ var m = JSON.parse(xhr.responseText);
+ success(m);
+ });
+}
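+
+// The expected shape of metric-metadata.json, inferred from the callers
+// below (names and values here are hypothetical):
+//
+//   {"metrics": {"Foo.bar": {"owners": "someone", "summary": "What it is"}}}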
+
+// for overview.html.
+function initOverview(urlHash, tableStates, statusElem) {
+
+ ajaxGet('cooked/overview.part.html', statusElem, function(xhr) {
+ var elem = document.getElementById('overview');
+ elem.innerHTML = xhr.responseText;
+ makeTablesSortable(urlHash, [elem], tableStates);
+ updateTables(urlHash, tableStates, statusElem);
+ });
+
+ loadMetricMetadata(statusElem, function(metadata) {
+ var elem = document.getElementById('metricMetadata').tBodies[0];
+ var metrics = metadata.metrics;
+
+ // Sort by the metric name
+ var metricNames = Object.getOwnPropertyNames(metrics);
+ metricNames.sort();
+
+ var tableHtml = '';
+ for (var i = 0; i < metricNames.length; ++i) {
+ var name = metricNames[i];
+ var meta = metrics[name];
+ tableHtml += '<tr>';
+ tableHtml += '<td>' + name + '</td>';
+ tableHtml += '<td>' + meta.owners + '</td>';
+ tableHtml += '<td>' + meta.summary + '</td>';
+ tableHtml += '</tr>';
+ }
+ elem.innerHTML += tableHtml;
+ });
+}
+
+// for metric.html.
+function initMetric(urlHash, tableStates, statusElem, globals) {
+
+ var metricName = urlHash.get('metric');
+ if (metricName === undefined) {
+ appendMessage(statusElem, "Missing metric name in URL hash.");
+ return;
+ }
+
+ loadMetricMetadata(statusElem, function(metadata) {
+ var meta = metadata.metrics[metricName];
+ if (!meta) {
+ appendMessage(statusElem, 'Found no metadata for ' + metricName);
+ return;
+ }
+ var descElem = document.getElementById('metricDesc');
+ descElem.innerHTML = meta.summary;
+
+ // TODO: put owners at the bottom of the page somewhere?
+ });
+
+ // Add title and page element
+ document.title = metricName;
+ var nameElem = document.getElementById('metricName');
+ nameElem.innerHTML = metricName;
+
+ // Add correct links.
+ var u = document.getElementById('underlying-status');
+ u.href = 'cooked/' + metricName + '/status.csv';
+
+ var distUrl = 'cooked/' + metricName + '/dist.csv';
+ var u2 = document.getElementById('underlying-dist');
+ u2.href = distUrl;
+
+ ajaxGet(distUrl, statusElem, function(xhr) {
+ var csvData = xhr.responseText;
+ var elem = document.getElementById('proportionsDy');
+ // Mutate global so we can respond to onclick.
+ globals.proportionsDygraph = new Dygraph(elem, csvData, {customBars: true});
+ });
+
+ var numReportsUrl = 'cooked/' + metricName + '/num_reports.csv';
+ ajaxGet(numReportsUrl, statusElem, function(xhr) {
+ var csvData = xhr.responseText;
+ var elem = document.getElementById('num-reports-dy');
+ var g = new Dygraph(elem, csvData);
+ });
+
+ var massUrl = 'cooked/' + metricName + '/mass.csv';
+ ajaxGet(massUrl, statusElem, function(xhr) {
+ var csvData = xhr.responseText;
+ var elem = document.getElementById('mass-dy');
+ var g = new Dygraph(elem, csvData);
+ });
+
+ var tableUrl = 'cooked/' + metricName + '/status.part.html';
+ ajaxGet(tableUrl, statusElem, function(xhr) {
+ var htmlData = xhr.responseText;
+ var elem = document.getElementById('status_table');
+ elem.innerHTML = htmlData;
+
+ makeTablesSortable(urlHash, [elem], tableStates);
+ updateTables(urlHash, tableStates, statusElem);
+ });
+}
+
+// NOTE: This was for optional Dygraphs error bars, but it's not hooked up yet.
+function onMetricCheckboxClick(checkboxElem, proportionsDygraph) {
+  var checked = checkboxElem.checked;
+  if (proportionsDygraph === null) {
+    console.log('NULL');
+    return; // graph hasn't loaded yet; nothing to update
+  }
+  proportionsDygraph.updateOptions({customBars: checked});
+  console.log('HANDLED');
+}
+
+// for day.html.
+function initDay(urlHash, tableStates, statusElem) {
+ var jobId = urlHash.get('jobId');
+ var metricName = urlHash.get('metric');
+ var date = urlHash.get('date');
+
+ var err = '';
+ if (!jobId) {
+ err = 'jobId missing from hash';
+ }
+ if (!metricName) {
+ err = 'metric missing from hash';
+ }
+ if (!date) {
+ err = 'date missing from hash';
+ }
+  if (err) {
+    appendMessage(statusElem, err);
+    return; // don't build the page with missing parameters
+  }
+
+ // Add title and page element
+ var titleStr = metricName + ' on ' + date;
+ document.title = titleStr;
+ var mElem = document.getElementById('metricDay');
+ mElem.innerHTML = titleStr;
+
+ // Add correct links.
+ var u = document.getElementById('underlying');
+ u.href = '../' + jobId + '/raw/' + metricName + '/' + date +
+ '/results.csv';
+
+ // Add correct links.
+ var u_res = document.getElementById('residual');
+ u_res.src = '../' + jobId + '/raw/' + metricName + '/' + date +
+ '/residual.png';
+
+ var url = '../' + jobId + '/cooked/' + metricName + '/' + date + '.part.html';
+ ajaxGet(url, statusElem, function(xhr) {
+ var htmlData = xhr.responseText;
+ var elem = document.getElementById('results_table');
+ elem.innerHTML = htmlData;
+ makeTablesSortable(urlHash, [elem], tableStates);
+ updateTables(urlHash, tableStates, statusElem);
+ });
+}
+
+// for assoc-overview.html.
+function initAssocOverview(urlHash, tableStates, statusElem) {
+ ajaxGet('cooked/assoc-overview.part.html', statusElem, function(xhr) {
+ var elem = document.getElementById('overview');
+ elem.innerHTML = xhr.responseText;
+ makeTablesSortable(urlHash, [elem], tableStates);
+ updateTables(urlHash, tableStates, statusElem);
+ });
+}
+
+// for assoc-metric.html. (The page also passes a globals object, which is
+// currently unused.)
+function initAssocMetric(urlHash, tableStates, statusElem, globals) {
+ var metricName = urlHash.get('metric');
+ if (metricName === undefined) {
+ appendMessage(statusElem, "Missing metric name in URL hash.");
+ return;
+ }
+
+ // Add title and page element
+ var title = metricName + ': pairs of variables';
+ document.title = title;
+ var pageTitleElem = document.getElementById('pageTitle');
+ pageTitleElem.innerHTML = title;
+
+ // Add correct links.
+ var u = document.getElementById('underlying-status');
+ u.href = 'cooked/' + metricName + '/metric-status.csv';
+
+ var csvPath = 'cooked/' + metricName + '/metric-status.part.html';
+ ajaxGet(csvPath, statusElem, function(xhr) {
+ var elem = document.getElementById('metric_table');
+ elem.innerHTML = xhr.responseText;
+ makeTablesSortable(urlHash, [elem], tableStates);
+ updateTables(urlHash, tableStates, statusElem);
+ });
+}
+
+// Function to help us find the *.part.html files.
+//
+// NOTE: This naming convention matches the one defined in task_spec.py
+// AssocTaskSpec.
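+//
+// e.g. formatAssocRelPath('Foo.bar', 'domain', 'flags..FLAG') returns
+// 'Foo.bar/domain_X_flags_FLAG' (hypothetical metric and variable names).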
+function formatAssocRelPath(metricName, var1, var2) {
+ var varDir = var1 + '_X_' + var2.replace('..', '_');
+ return metricName + '/' + varDir;
+}
+
+// for assoc-pair.html
+function initAssocPair(urlHash, tableStates, statusElem, globals) {
+
+ var metricName = urlHash.get('metric');
+ if (metricName === undefined) {
+ appendMessage(statusElem, "Missing metric name in URL hash.");
+ return;
+ }
+ var var1 = urlHash.get('var1');
+ if (var1 === undefined) {
+ appendMessage(statusElem, "Missing var1 in URL hash.");
+ return;
+ }
+ var var2 = urlHash.get('var2');
+ if (var2 === undefined) {
+ appendMessage(statusElem, "Missing var2 in URL hash.");
+ return;
+ }
+
+ var relPath = formatAssocRelPath(metricName, var1, var2);
+
+ // Add title and page element
+ var title = metricName + ': ' + var1 + ' vs. ' + var2;
+ document.title = title;
+ var pageTitleElem = document.getElementById('pageTitle');
+ pageTitleElem.innerHTML = title;
+
+ // Add correct links.
+ var u = document.getElementById('underlying-status');
+ u.href = 'cooked/' + relPath + '/pair-status.csv';
+
+ /*
+ var distUrl = 'cooked/' + metricName + '/dist.csv';
+ var u2 = document.getElementById('underlying-dist');
+ u2.href = distUrl;
+ */
+
+ var tableUrl = 'cooked/' + relPath + '/pair-status.part.html';
+ ajaxGet(tableUrl, statusElem, function(xhr) {
+ var htmlData = xhr.responseText;
+ var elem = document.getElementById('status_table');
+ elem.innerHTML = htmlData;
+
+ makeTablesSortable(urlHash, [elem], tableStates);
+ updateTables(urlHash, tableStates, statusElem);
+ });
+}
+
+// for assoc-day.html.
+function initAssocDay(urlHash, tableStates, statusElem) {
+ var jobId = urlHash.get('jobId');
+ var metricName = urlHash.get('metric');
+ var var1 = urlHash.get('var1');
+ var var2 = urlHash.get('var2');
+ var date = urlHash.get('date');
+
+ var err = '';
+ if (!jobId) {
+ err = 'jobId missing from hash';
+ }
+ if (!metricName) {
+ err = 'metric missing from hash';
+ }
+ if (!var1) {
+ err = 'var1 missing from hash';
+ }
+ if (!var2) {
+ err = 'var2 missing from hash';
+ }
+ if (!date) {
+ err = 'date missing from hash';
+ }
+  if (err) {
+    appendMessage(statusElem, err);
+    return; // don't build the page with missing parameters
+  }
+
+ // Add title and page element
+ var titleStr = metricName + ': ' + var1 + ' vs. ' + var2 + ' on ' + date;
+ document.title = titleStr;
+ var mElem = document.getElementById('metricDay');
+ mElem.innerHTML = titleStr;
+
+ var relPath = formatAssocRelPath(metricName, var1, var2);
+
+ // Add correct links.
+ var u = document.getElementById('underlying');
+ u.href = '../' + jobId + '/raw/' + relPath + '/' + date +
+ '/assoc-results.csv';
+
+ var url = '../' + jobId + '/cooked/' + relPath + '/' + date + '.part.html';
+ ajaxGet(url, statusElem, function(xhr) {
+ var htmlData = xhr.responseText;
+ var elem = document.getElementById('results_table');
+ elem.innerHTML = htmlData;
+ makeTablesSortable(urlHash, [elem], tableStates);
+ updateTables(urlHash, tableStates, statusElem);
+ });
+}
+
+// This is the onhashchange handler of *all* HTML files.
+function onHashChange(urlHash, tableStates, statusElem) {
+ updateTables(urlHash, tableStates, statusElem);
+}
diff --git a/util.sh b/util.sh
new file mode 100755
index 0000000..7ad42c8
--- /dev/null
+++ b/util.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Utility functions, used by demo.sh and regtest.sh.
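+#
+# Example (a sketch; some_command is hypothetical):
+#
+#   source util.sh
+#   banner "Building everything"
+#   some_command || die "some_command failed"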
+
+banner() {
+ echo
+ echo "----- $@"
+ echo
+}
+
+log() {
+ echo 1>&2 "$@"
+}
+
+die() {
+ log "$0: $@"
+ exit 1
+}
+