diff options
Diffstat (limited to 'analysis/R/ngrams_simulation.R')
-rwxr-xr-x | analysis/R/ngrams_simulation.R | 271 |
1 files changed, 271 insertions, 0 deletions
diff --git a/analysis/R/ngrams_simulation.R b/analysis/R/ngrams_simulation.R new file mode 100755 index 0000000..ca7ce49 --- /dev/null +++ b/analysis/R/ngrams_simulation.R @@ -0,0 +1,271 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Authors: vpihur@google.com (Vasyl Pihur) and fanti@google.com (Giulia Fanti) +# +# Tools used to simulate sending partial ngrams to the server for estimating the +# dictionary of terms over which we want to learn a distribution. This +# mostly contains functions that aid in the generation of synthetic data. + +library(RUnit) +library(parallel) + +source("analysis/R/encode.R") +source("analysis/R/decode.R") +source("analysis/R/simulation.R") +source("analysis/R/association.R") +source("analysis/R/decode_ngrams.R") + +# The alphabet is the set of all possible characters that will appear in a +# string. Here we use the English alphabet, but one might want to include +# numbers or punctuation marks. +alphabet <- letters + +GenerateCandidates <- function(alphabet, ngram_size = 2) { + # Draws a random string for each individual in the + # population from distribution. + # + # Args: + # N: Number of individuals in the population + # num_strs: Number of strings from which to draw strings + # str_len: Length of each string + # + # Returns: + # Vector of strings for each individual in the population + + cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet)) + apply(cands, 1, function(x) paste0(x, collapse = "")) +} + +GenerateString <- function(n) { + # Generates a string of a given length from the alphabet. + # + # Args: + # n: Number of characters in the string + # + # Returns: + # String of length n + paste0(sample(alphabet, n, replace = TRUE), collapse = "") +} + +GeneratePopulation <- function(N, num_strs, str_len = 10, + distribution = 1) { + # Generates a string for each individual in the population from distribution. + # + # Args: + # N: Number of individuals in the population + # num_strs: Number of strings from which to draw strings + # str_len: Length of each string + # distribution: which type of distribution to use + # 1: Zipfian + # 2: Geometric (exponential) + # 3: Step function + # + # Returns: + # Vector of strings for each individual in the population + + strs <- sapply(1:num_strs, function(i) GenerateString(str_len)) + + if (distribution == 1) { + # Zipfian-ish distribution + prob <- (1:num_strs)^20 + prob <- prob / sum(prob) + 0.001 + prob <- prob / sum(prob) + } else if (distribution == 2) { + # Geometric distribution (discrete approximation to exponential) + p <- 0.3 + prob <- p * (1 - p)^(1:num_strs - 1) + prob <- prob / sum(prob) + } else { + # Uniform + prob <- rep(1 / num_strs, num_strs) + } + + sample(strs, N, replace = TRUE, prob = prob) +} + +SelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) { + # Selects which ngrams each user will encode and then submit. + # + # Args: + # str: String from which ngram is built. + # num_ngrams: Number of ngrams to choose + # size: Number of characters per ngram + # max_str_len: Maximum number of characters in the string + # + # Returns: + # List of each individual's ngrams and which positions the ngrams + # were drawn from. + + start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams)) + ngrams <- mapply(function(x, y, str) substr(str, x, y), + start, start + size - 1, + MoreArgs = list(str = str)) + list(ngrams = ngrams, starts = start) +} + +UpdateMapWithCandidates <- function(str_candidates, sim, params) { + # Generates a new map based on the returned candidates. + # Normally this would be created on the spot by having the + # aggregator hash the string candidates. But since we already have + # the map from simulation, we'll just choose the appropriate + # column + # + # Arguments: + # str_candidates: Vector of string candidates + # sim: Simulation object containing the original map + # params: RAPPOR parameter list + + k <- params$k + h <- params$h + m <- params$m + + # First add the real candidates to the map + valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) + updated_map <- sim$full_map + updated_map$map_by_cohort <- lapply(1:m, function(i) { + sim$full_map$map_by_cohort[[i]][, valid_cands] + }) + + # Now add the false positives (we can just draw random strings for + # these since they didn't appear in the original dataset anyway) + new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) + M <- length(new_cands) + if (M > 0) { + for (i in 1:m) { + ones <- sample(1:k, M * h, replace = TRUE) + cols <- rep(1:M, each = h) + strs <- c(sort(valid_cands), new_cands) + updated_map$map_by_cohort[[i]] <- + do.call(cBind, list(updated_map$map_by_cohort[[i]], + sparseMatrix(ones, cols, dims = c(k, M)))) + colnames(updated_map$map_by_cohort[[i]]) <- strs + } + } + if (class(updated_map$map_by_cohort[[1]]) == "logical") { + updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort) + updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE) + colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands) + } else { + updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort) + } + updated_map +} + +SimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10, + alphabet, params, distribution = 1) { + # Simulates the creation and encoding of ngrams for each individual. + # + # Args: + # N: Number of individuals in the population + # ngram_params: Parameters about ngram size, etc. + # str_len: Length of each string + # num_strs: NUmber of strings in the dictionary + # alphabet: Alphabet used to generate strings + # params: RAPPOR parameters, like noise and cohorts + # + # Returns: + # List containing all the information needed for estimating and + # verifying the results. + + # Get the list of strings for each user + strs <- GeneratePopulation(N, num_strs = num_strs, + str_len = str_len, + distribution) + + # Split them into ngrams and encode + ngram <- lapply(strs, function(i) + SelectNGrams(i, + num_ngrams = ngram_params$num_ngrams_collected, + size = ngram_params$ngram_size, + max_str_len = str_len)) + + cands <- GenerateCandidates(alphabet, ngram_params$ngram_size) + map <- CreateMap(cands, params, FALSE) + cohorts <- sample(1:params$m, N, replace = TRUE) + + g <- sapply(ngram, function(x) paste(x$starts, sep = "_", + collapse = "_")) + ug <- sort(unique(g)) + pairings <- t(sapply(ug, function(x) + sapply(strsplit(x, "_"), function(y) as.numeric(y)))) + + inds <- lapply(1:length(ug), function(i) ind <- which(g == ug[i])) + + reports <- lapply(1:length(ug), function(k) { + # Generate the ngram reports + lapply(1:ngram_params$num_ngrams_collected, function(x) { + EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]), + cohorts[inds[[k]]], map$map_by_cohort, params)}) + }) + cat("Encoded the ngrams.\n") + # Now generate the full string reports + full_map <- CreateMap(sort(unique(strs)), params, FALSE) + full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params) + + list(reports = reports, cohorts = cohorts, ngram = ngram, map = map, + strs = strs, pairings = pairings, inds = inds, cands = cands, + full_reports = full_reports, full_map = full_map) + +} + + +EstimateDictionaryTrial <- function(N, str_len, num_strs, + params, ngram_params, + distribution = 3) { + # Runs a single trial for simulation. Generates simulated reports, + # decodes them, and returns the result. + # + # Arguments: + # N: Number of users to simulation + # str_len: The length of strings to estimate + # num_strs: The number of strings in the dictionary + # params: RAPPOR parameter list + # ngram_params: Parameters related to the size of ngrams + # distribution: Tells what kind of distribution to use: + # 1: Zipfian + # 2: Geometric + # 3: Uniform (default) + # + # Returns: + # List with recovered and true marginals. + + # We call the needed libraries here in order to make them available when this + # function gets called by BorgApply. Otherwise, they do not get included. + library(glmnet) + library(parallel) + sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, + alphabet, params, distribution) + + res <- EstimateDictionary(sim, N, ngram_params, params) + str_candidates <- res$found_candidates + pairwise_candidates <- res$pairwise_candidates + + if (length(str_candidates) == 0) { + return (NULL) + } + updated_map <- UpdateMapWithCandidates(str_candidates, sim, params) + + # Compute the marginal for this new set of strings + variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params) + # Our dictionary estimate + marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit + # Estimate given full dictionary knowledge + marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit + # The true (sampled) data distribution + truth <- sort(table(sim$strs)) / N + + list(marginal = marginal, marginal_full = marginal_full, + truth = truth, pairwise_candidates = pairwise_candidates) +} |