1 files changed, 271 insertions, 0 deletions
diff --git a/analysis/R/ngrams_simulation.R b/analysis/R/ngrams_simulation.R
new file mode 100755
index 0000000..ca7ce49
--- /dev/null
+++ b/analysis/R/ngrams_simulation.R
@@ -0,0 +1,271 @@
+# Copyright 2014 Google Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Authors: vpihur@google.com (Vasyl Pihur) and fanti@google.com (Giulia Fanti)
+#
+# Tools used to simulate sending partial ngrams to the server for estimating the
+#     dictionary of terms over which we want to learn a distribution. This
+#     mostly contains functions that aid in the generation of synthetic data.
+
+library(RUnit)
+library(parallel)
+
+source("analysis/R/encode.R")
+source("analysis/R/decode.R")
+source("analysis/R/simulation.R")
+source("analysis/R/association.R")
+source("analysis/R/decode_ngrams.R")
+
+# The alphabet is the set of all possible characters that will appear in a
+#     string. Here we use the English alphabet, but one might want to include
+#     numbers or punctuation marks.
+alphabet <- letters
+
+GenerateCandidates <- function(alphabet, ngram_size = 2) {
+  # Draws a random string for each individual in the
+  #     population from distribution.
+  #
+  # Args:
+  #   N: Number of individuals in the population
+  #   num_strs: Number of strings from which to draw strings
+  #   str_len: Length of each string
+  #
+  # Returns:
+  #   Vector of strings for each individual in the population
+
+  cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet))
+  apply(cands, 1, function(x) paste0(x, collapse = ""))
+}
+
+GenerateString <- function(n) {
+  # Generates a string of a given length from the alphabet.
+  #
+  # Args:
+  #   n: Number of characters in the string
+  #
+  # Returns:
+  #   String of length n
+  paste0(sample(alphabet, n, replace = TRUE), collapse = "")
+}
+
+GeneratePopulation <- function(N, num_strs, str_len = 10,
+                               distribution = 1) {
+  # Generates a string for each individual in the population from distribution.
+  #
+  # Args:
+  #   N: Number of individuals in the population
+  #   num_strs: Number of strings from which to draw strings
+  #   str_len: Length of each string
+  #   distribution: which type of distribution to use
+  #     1: Zipfian
+  #     2: Geometric (exponential)
+  #     3: Step function
+  #
+  # Returns:
+  #   Vector of strings for each individual in the population
+
+  strs <- sapply(1:num_strs, function(i) GenerateString(str_len))
+
+  if (distribution == 1) {
+    # Zipfian-ish distribution
+    prob <- (1:num_strs)^20
+    prob <- prob / sum(prob) + 0.001
+    prob <- prob / sum(prob)
+  } else if (distribution == 2) {
+    # Geometric distribution (discrete approximation to exponential)
+    p <- 0.3
+    prob <- p * (1 - p)^(1:num_strs - 1)
+    prob <- prob / sum(prob)
+  } else {
+    # Uniform
+    prob <- rep(1 / num_strs, num_strs)
+  }
+
+  sample(strs, N, replace = TRUE, prob = prob)
+}
+
+SelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) {
+  # Selects which ngrams each user will encode and then submit.
+  #
+  # Args:
+  #   str: String from which ngram is built.
+  #   num_ngrams: Number of ngrams to choose
+  #   size: Number of characters per ngram
+  #   max_str_len: Maximum number of characters in the string
+  #
+  # Returns:
+  #   List of each individual's ngrams and which positions the ngrams
+  #       were drawn from.
+
+  start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams))
+  ngrams <- mapply(function(x, y, str) substr(str, x, y),
+                   start, start + size - 1,
+                   MoreArgs = list(str = str))
+  list(ngrams = ngrams, starts = start)
+}
+
+UpdateMapWithCandidates <- function(str_candidates, sim, params) {
+  # Generates a new map based on the returned candidates.
+  #     Normally this would be created on the spot by having the
+  #     aggregator hash the string candidates. But since we already have
+  #     the map from simulation, we'll just choose the appropriate
+  #     column
+  #
+  # Arguments:
+  #   str_candidates: Vector of string candidates
+  #   sim: Simulation object containing the original map
+  #   params: RAPPOR parameter list
+
+  k <- params$k
+  h <- params$h
+  m <- params$m
+
+  # First add the real candidates to the map
+  valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
+  updated_map <- sim$full_map
+  updated_map$map_by_cohort <- lapply(1:m, function(i) {
+    sim$full_map$map_by_cohort[[i]][, valid_cands]
+  })
+
+  # Now add the false positives (we can just draw random strings for
+  #     these since they didn't appear in the original dataset anyway)
+  new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
+  M <- length(new_cands)
+  if (M > 0) {
+    for (i in 1:m) {
+      ones <- sample(1:k, M * h, replace = TRUE)
+      cols <- rep(1:M, each = h)
+      strs <- c(sort(valid_cands), new_cands)
+      updated_map$map_by_cohort[[i]] <-
+          do.call(cBind, list(updated_map$map_by_cohort[[i]],
+                              sparseMatrix(ones, cols, dims = c(k, M))))
+      colnames(updated_map$map_by_cohort[[i]]) <- strs
+    }
+  }
+  if (class(updated_map$map_by_cohort[[1]]) == "logical") {
+    updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort)
+    updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE)
+    colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands)
+  } else {
+    updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort)
+  }
+  updated_map
+}
+
+SimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10,
+                           alphabet, params, distribution = 1) {
+  # Simulates the creation and encoding of ngrams for each individual.
+  #
+  # Args:
+  #   N: Number of individuals in the population
+  #   ngram_params: Parameters about ngram size, etc.
+  #   str_len: Length of each string
+  #   num_strs: NUmber of strings in the dictionary
+  #   alphabet: Alphabet used to generate strings
+  #   params: RAPPOR parameters, like noise and cohorts
+  #
+  # Returns:
+  #   List containing all the information needed for estimating and
+  #       verifying the results.
+
+  # Get the list of strings for each user
+  strs <- GeneratePopulation(N, num_strs = num_strs,
+                             str_len = str_len,
+                             distribution)
+
+  # Split them into ngrams and encode
+  ngram <- lapply(strs, function(i)
+                  SelectNGrams(i,
+                               num_ngrams = ngram_params$num_ngrams_collected,
+                               size = ngram_params$ngram_size,
+                               max_str_len = str_len))
+
+  cands <- GenerateCandidates(alphabet, ngram_params$ngram_size)
+  map <- CreateMap(cands, params, FALSE)
+  cohorts <- sample(1:params$m, N, replace = TRUE)
+
+  g <- sapply(ngram, function(x) paste(x$starts, sep = "_",
+                                       collapse = "_"))
+  ug <- sort(unique(g))
+  pairings <- t(sapply(ug, function(x)
+                       sapply(strsplit(x, "_"), function(y) as.numeric(y))))
+
+  inds <- lapply(1:length(ug), function(i) ind <- which(g == ug[i]))
+
+  reports <- lapply(1:length(ug), function(k) {
+    # Generate the ngram reports
+    lapply(1:ngram_params$num_ngrams_collected, function(x) {
+      EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]),
+                cohorts[inds[[k]]], map$map_by_cohort, params)})
+  })
+  cat("Encoded the ngrams.\n")
+  # Now generate the full string reports
+  full_map <- CreateMap(sort(unique(strs)), params, FALSE)
+  full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params)
+
+  list(reports = reports, cohorts = cohorts, ngram = ngram, map = map,
+       strs = strs, pairings = pairings, inds = inds, cands = cands,
+       full_reports = full_reports, full_map = full_map)
+
+}
+
+
+EstimateDictionaryTrial <- function(N, str_len, num_strs,
+                                    params, ngram_params,
+                                    distribution = 3) {
+  # Runs a single trial for simulation. Generates simulated reports,
+  #     decodes them, and returns the result.
+  #
+  # Arguments:
+  #   N: Number of users to simulation
+  #   str_len: The length of strings to estimate
+  #   num_strs: The number of strings in the dictionary
+  #   params: RAPPOR parameter list
+  #   ngram_params: Parameters related to the size of ngrams
+  #   distribution: Tells what kind of distribution to use:
+  #       1: Zipfian
+  #       2: Geometric
+  #       3: Uniform (default)
+  #
+  # Returns:
+  #   List with recovered and true marginals.
+
+  # We call the needed libraries here in order to make them available when this
+  #     function gets called by BorgApply. Otherwise, they do not get included.
+  library(glmnet)
+  library(parallel)
+  sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
+                        alphabet, params, distribution)
+
+  res <- EstimateDictionary(sim, N, ngram_params, params)
+  str_candidates <- res$found_candidates
+  pairwise_candidates <- res$pairwise_candidates
+
+  if (length(str_candidates) == 0) {
+    return (NULL)
+  }
+  updated_map <- UpdateMapWithCandidates(str_candidates, sim, params)
+
+  # Compute the marginal for this new set of strings
+  variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params)
+  # Our dictionary estimate
+  marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit
+  # Estimate given full dictionary knowledge
+  marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit
+  # The true (sampled) data distribution
+  truth <- sort(table(sim$strs)) / N
+
+  list(marginal = marginal, marginal_full = marginal_full,
+       truth = truth, pairwise_candidates = pairwise_candidates)
+}