Diffstat (limited to 'regtest.sh')
-rwxr-xr-x  regtest.sh  440
1 file changed, 440 insertions, 0 deletions
diff --git a/regtest.sh b/regtest.sh
new file mode 100755
index 0000000..d21edc2
--- /dev/null
+++ b/regtest.sh
@@ -0,0 +1,440 @@
+#!/bin/bash
+usage() {
+echo "
+ Run end-to-end tests in parallel.
+
+ Usage:
+ ./regtest.sh <function name>
+ At the end, it will print an HTML summary.
+
+  The three main functions are:
+    run [<pattern> [<instances> [<lang>]]]     - run tests matching <pattern>
+                                                 in parallel.  <instances> is
+                                                 the number of times to run
+                                                 each test case, and <lang>
+                                                 is the language of the
+                                                 client to use (python|cpp)
+    run-seq [<pattern> [<instances> [<lang>]]] - ditto, except that tests
+                                                 are run sequentially
+    run-all                                    - run all tests, in parallel
+
+ Examples:
+  $ ./regtest.sh run-seq unif-small-typical  # Sequential run of the
+                                             # unif-small-typical test case
+  $ ./regtest.sh run-seq unif-small-         # Sequential run of all tests
+                                             # whose names contain
+                                             # 'unif-small-'
+ $ ./regtest.sh run unif- # Parallel run, matches multiple cases
+ $ ./regtest.sh run-all # Run all tests
+
+  The <pattern> argument is a regex in 'grep -E' format.  (Detail: don't use
+  $ in the pattern, since it is matched against the whole spec line, not just
+  the test case name.)  The number of processes used in a parallel run is one
+  less than the number of CPUs on the machine; it can be overridden with the
+  MAX_PROC environment variable.
+"
+}
+# Future speedups:
+# - Reuse the same input -- come up with naming scheme based on params
+# - Reuse the same maps -- ditto, rappor library can cache it
+#
+
+set -o nounset
+set -o pipefail
+set -o errexit
+
+. util.sh
+
+readonly THIS_DIR=$(dirname $0)
+readonly REPO_ROOT=$THIS_DIR
+readonly CLIENT_DIR=$REPO_ROOT/client/python
+# subdirs are in _tmp/$impl, which shouldn't overlap with anything else in _tmp
+readonly REGTEST_BASE_DIR=_tmp
+
+# All the Python tools need this
+export PYTHONPATH=$CLIENT_DIR
+
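+# Print the true value names v1 .. vN, one per line.
+# Example: 'print-unique-values 3' prints v1, v2, v3.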
+print-unique-values() {
+ local num_unique_values=$1
+ seq 1 $num_unique_values | awk '{print "v" $1}'
+}
+
+# Add some more candidates here.  We hope these are estimated at 0.
+# e.g. if last_true=50 and num_additional=20, print v51 .. v70
+more-candidates() {
+ local last_true=$1
+ local num_additional=$2
+
+ local begin
+ local end
+ begin=$(expr $last_true + 1)
+ end=$(expr $last_true + $num_additional)
+
+ seq $begin $end | awk '{print "v" $1}'
+}
+
+# Args:
+# unique_values: File of unique true values
+# last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
+# num_additional: additional candidates to generate (starting at 'last_true' + 1)
+# to_remove: Regex of true values to omit from the candidates list, or the
+# string 'NONE' if none should be removed. (Our values look like 'v1', 'v2', etc., so
+# there isn't any ambiguity.)
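+#
+# Example (hypothetical file name): 'print-candidates true.txt 50 20 NONE'
+# prints all 50 true values in true.txt, followed by v51 .. v70.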
+print-candidates() {
+ local unique_values=$1
+ local last_true=$2
+ local num_additional=$3
+ local to_remove=$4
+
+ if test $to_remove = NONE; then
+ cat $unique_values # include all true inputs
+ else
+ egrep -v $to_remove $unique_values # remove some true inputs
+ fi
+ more-candidates $last_true $num_additional
+}
+
+# Generate a single test case, specified by a line of the test spec.
+# This is a helper function for _run-tests().
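+#
+# Each spec line has 13 whitespace-separated fields; a hypothetical example:
+#
+#   demo1 exp 100 10000 10 16 2 64 0.25 0.75 0.5 20 NONE
+#
+# i.e. test case name, distribution, num unique values, num clients, values
+# per client, the RAPPOR params k h m p q f, the number of additional
+# candidates, and a regex of true values to remove (or NONE).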
+_setup-one-case() {
+ local impl=$1
+ shift # impl is not part of the spec; the next 13 params are
+
+ local test_case=$1
+
+ # input params
+ local dist=$2
+ local num_unique_values=$3
+ local num_clients=$4
+ local values_per_client=$5
+
+ # RAPPOR params
+ local num_bits=$6
+ local num_hashes=$7
+ local num_cohorts=$8
+ local p=$9
+ local q=${10} # need curly braces to get the 10th arg
+ local f=${11}
+
+ # map params
+ local num_additional=${12}
+ local to_remove=${13}
+
+ banner 'Setting up parameters and candidate files for '$test_case
+
+ local case_dir=$REGTEST_BASE_DIR/$impl/$test_case
+ mkdir --verbose -p $case_dir
+
+ # Save the "spec"
+ echo "$@" > $case_dir/spec.txt
+
+ local params_path=$case_dir/case_params.csv
+
+ echo 'k,h,m,p,q,f' > $params_path
+ echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path
+
+ print-unique-values $num_unique_values > $case_dir/case_unique_values.txt
+
+ local true_map_path=$case_dir/case_true_map.csv
+
+ bin/hash_candidates.py \
+ $params_path \
+ < $case_dir/case_unique_values.txt \
+ > $true_map_path
+
+ # banner "Constructing candidates"
+
+ print-candidates \
+ $case_dir/case_unique_values.txt $num_unique_values \
+ $num_additional "$to_remove" \
+ > $case_dir/case_candidates.txt
+
+ # banner "Hashing candidates to get 'map'"
+
+ bin/hash_candidates.py \
+ $params_path \
+ < $case_dir/case_candidates.txt \
+ > $case_dir/case_map.csv
+}
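+# At this point $case_dir contains spec.txt, case_params.csv,
+# case_unique_values.txt, case_true_map.csv, case_candidates.txt and
+# case_map.csv; the per-instance files are written by _run-one-instance below.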
+
+# Run a single test instance, specified by <test_case, test_instance, impl>.
+# This is a helper function for _run-tests().
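+#
+# Example (hypothetical case name): '_run-one-instance demo1 1 cpp' reads
+# _tmp/cpp/demo1/spec.txt and writes its output under _tmp/cpp/demo1/1 and
+# _tmp/cpp/demo1/1_report.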
+_run-one-instance() {
+ local test_case=$1
+ local test_instance=$2
+ local impl=$3
+
+ local case_dir=$REGTEST_BASE_DIR/$impl/$test_case
+
+ read -r \
+ case_name distr num_unique_values num_clients values_per_client \
+ num_bits num_hashes num_cohorts p q f \
+ num_additional to_remove \
+ < $case_dir/spec.txt
+
+ local instance_dir=$case_dir/$test_instance
+ mkdir --verbose -p $instance_dir
+
+ banner "Generating reports (gen_reports.R)"
+
+ # the TRUE_VALUES_PATH environment variable can be used to avoid
+ # generating new values every time. NOTE: You are responsible for making
+ # sure the params match!
+
+ local true_values=${TRUE_VALUES_PATH:-}
+ if test -z "$true_values"; then
+ true_values=$instance_dir/case_true_values.csv
+ tests/gen_true_values.R $distr $num_unique_values $num_clients \
+ $values_per_client $num_cohorts \
+ $true_values
+ else
+ # TEMP hack: Make it visible to plot.
+ # TODO: Fix compare_dist.R
+ ln -s -f --verbose \
+ $PWD/$true_values \
+ $instance_dir/case_true_values.csv
+ fi
+
+ case $impl in
+ python)
+ banner "Running RAPPOR Python client"
+
+ # Writes encoded "out" file, true histogram, true inputs to
+ # $instance_dir.
+ time tests/rappor_sim.py \
+ --num-bits $num_bits \
+ --num-hashes $num_hashes \
+ --num-cohorts $num_cohorts \
+ -p $p \
+ -q $q \
+ -f $f \
+ < $true_values \
+ > "$instance_dir/case_reports.csv"
+ ;;
+
+ cpp)
+ banner "Running RAPPOR C++ client (see rappor_sim.log for errors)"
+
+ time client/cpp/_tmp/rappor_sim \
+ $num_bits \
+ $num_hashes \
+ $num_cohorts \
+ $p \
+ $q \
+ $f \
+ < $true_values \
+ > "$instance_dir/case_reports.csv" \
+ 2>"$instance_dir/rappor_sim.log"
+ ;;
+
+ *)
+ log "Invalid impl $impl (should be one of python|cpp)"
+ exit 1
+ ;;
+
+ esac
+
+ banner "Summing RAPPOR IRR bits to get 'counts'"
+
+ bin/sum_bits.py \
+ $case_dir/case_params.csv \
+ < $instance_dir/case_reports.csv \
+ > $instance_dir/case_counts.csv
+
+ local out_dir=${instance_dir}_report
+ mkdir --verbose -p $out_dir
+
+ # Currently, the summary file shows and aggregates timing of the inference
+ # engine, which excludes R's loading time and reading of the (possibly
+ # substantial) map file. Timing below is more inclusive.
+ TIMEFORMAT='Running compare_dist.R took %R seconds'
+ time {
+ # Input prefix, output dir
+ tests/compare_dist.R -t "Test case: $test_case (instance $test_instance)" \
+ "$case_dir/case" "$instance_dir/case" $out_dir
+ }
+}
+
+# Like _run-one-instance, but log to a file.
+_run-one-instance-logged() {
+ local test_case=$1
+ local test_instance=$2
+ local impl=$3
+
+ local log_dir=$REGTEST_BASE_DIR/$impl/$test_case/${test_instance}_report
+ mkdir --verbose -p $log_dir
+
+ log "Started '$test_case' (instance $test_instance) -- logging to $log_dir/log.txt"
+ _run-one-instance "$@" >$log_dir/log.txt 2>&1 \
+ && log "Test case $test_case (instance $test_instance) done" \
+ || log "Test case $test_case (instance $test_instance) failed"
+}
+
+make-summary() {
+ local dir=$1
+ local impl=$2
+
+ local filename=results.html
+
+ tests/make_summary.py $dir $dir/rows.html
+
+ pushd $dir >/dev/null
+
+ cat ../../tests/regtest.html \
+ | sed -e '/__TABLE_ROWS__/ r rows.html' -e "s/_IMPL_/$impl/g" \
+ > $filename
+
+ popd >/dev/null
+
+ log "Wrote $dir/$filename"
+ log "URL: file://$PWD/$dir/$filename"
+}
+
+test-error() {
+ local spec_regex=${1:-}
+ log "Some test cases failed"
+ if test -n "$spec_regex"; then
+ log "(Perhaps none matched pattern '$spec_regex')"
+ fi
+ # don't quit just yet
+ # exit 1
+}
+
+# Read the filtered spec (one test case per line) on stdin and write one line
+# per test instance: the test case name (first column), an instance id
+# running from 1 to $1 (second column), and the impl (third column).
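+#
+# Example: with instances=2 and impl=cpp, a hypothetical input line
+# 'demo1 exp 100 10000 10 16 2 64 0.25 0.75 0.5 20 NONE' produces:
+#   demo1 1 cpp
+#   demo1 2 cpp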
+_setup-test-instances() {
+ local instances=$1
+ local impl=$2
+
+ while read line; do
+ for i in $(seq 1 $instances); do
+ read case_name _ <<< $line # extract the first token
+ echo $case_name $i $impl
+ done
+ done
+}
+
+# Print the default number of parallel processes, which is max(#CPUs - 1, 1)
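+# Example: an 8-CPU machine yields 7; a single-CPU machine yields 1; if
+# /proc/cpuinfo is missing (non-Linux), the fallback of 4 yields 3.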
+default-processes() {
+ processors=$(grep -c ^processor /proc/cpuinfo || echo 4) # Linux-specific
+ if test $processors -gt 1; then # leave one CPU for the OS
+ processors=$(expr $processors - 1)
+ fi
+ echo $processors
+}
+
+# Args:
+# spec_gen: A program to execute to generate the spec.
+# spec_regex: A pattern selecting the subset of tests to run
+# parallel: Whether the tests are run in parallel (T/F). Sequential
+# runs log to the console; parallel runs log to files.
+# impl: one of 'python' or 'cpp'
+# instances: The number of times each test case is run
+
+_run-tests() {
+ local spec_gen=$1
+ local spec_regex="$2" # grep -E format on the spec, can be empty
+ local parallel=$3
+ local impl=${4:-"cpp"}
+ local instances=${5:-1}
+
+ local regtest_dir=$REGTEST_BASE_DIR/$impl
+ rm -r -f --verbose $regtest_dir
+
+ mkdir --verbose -p $regtest_dir
+
+ local func
+ local processors
+
+ if test $parallel = F; then
+ func=_run-one-instance # output to the console
+ processors=1
+ else
+ func=_run-one-instance-logged
+ # Let the user override with MAX_PROC, in case they don't have enough
+ # memory.
+ processors=${MAX_PROC:-$(default-processes)}
+ log "Running $processors parallel processes"
+ fi
+
+ local cases_list=$regtest_dir/test-cases.txt
+ # Need -- for regexes that start with -
+  $spec_gen | grep -E -- "$spec_regex" > $cases_list || test-error "$spec_regex"
+
+ # Generate parameters for all test cases.
+ cat $cases_list \
+ | xargs -l -P $processors -- $0 _setup-one-case $impl \
+ || test-error
+
+ log "Done generating parameters for all test cases"
+
+ local instances_list=$regtest_dir/test-instances.txt
+ _setup-test-instances $instances $impl < $cases_list > $instances_list
+
+ cat $instances_list \
+ | xargs -l -P $processors -- $0 $func || test-error
+
+ log "Done running all test instances"
+
+ make-summary $regtest_dir $impl
+}
+
+# used for most tests
+readonly REGTEST_SPEC=tests/regtest_spec.py
+
+# Run tests sequentially. NOTE: called by demo.sh.
+run-seq() {
+  local spec_regex=${1:-'^r-'}  # grep -E format on the spec
+  local instances=${2:-1}
+  local impl=${3:-"cpp"}
+
+  time _run-tests $REGTEST_SPEC $spec_regex F $impl $instances
+}
+
+# Run tests in parallel
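+# Example (hypothetical pattern): 'MAX_PROC=2 ./regtest.sh run ^demo 3 python'
+# runs every test whose spec line matches ^demo, 3 instances each, with the
+# Python client and at most 2 parallel processes.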
+run() {
+  local spec_regex=${1:-'^r-'}  # grep -E format on the spec
+  local instances=${2:-1}
+  local impl=${3:-"cpp"}
+
+  time _run-tests $REGTEST_SPEC $spec_regex T $impl $instances
+}
+
+# Run tests in parallel (7+ minutes on 8 cores)
+run-all() {
+ log "Running all tests. Can take a while."
+ time _run-tests $REGTEST_SPEC '^r-' T cpp
+}
+
+run-user() {
+ local spec_regex=${1:-}
+  local parallel=T  # NOTE: parallel runs can use a lot of memory; lower MAX_PROC if needed
+ time _run-tests tests/user_spec.py "$spec_regex" $parallel cpp
+}
+
+# Compare the Python and C++ clients on the same (stable) set of true values.
+compare-python-cpp() {
+ local num_unique_values=100
+ local num_clients=10000
+ local values_per_client=10
+ local num_cohorts=64
+
+ local true_values=$REGTEST_BASE_DIR/stable_true_values.csv
+
+ tests/gen_true_values.R \
+ exp $num_unique_values $num_clients $values_per_client $num_cohorts \
+ $true_values
+
+ wc -l $true_values
+
+ # Run Python and C++ simulation on the same input
+
+ ./build.sh cpp-client
+
+ TRUE_VALUES_PATH=$true_values \
+ ./regtest.sh run-seq '^demo3' 1 python
+
+ TRUE_VALUES_PATH=$true_values \
+ ./regtest.sh run-seq '^demo3' 1 cpp
+
+ head _tmp/{python,cpp}/demo3/1/case_reports.csv
+}
+
+if test $# -eq 0 ; then
+ usage
+else
+ "$@"
+fi