diff options
Diffstat (limited to 'analysis/R/read_input.R')
-rwxr-xr-x | analysis/R/read_input.R | 154 |
1 files changed, 154 insertions, 0 deletions
diff --git a/analysis/R/read_input.R b/analysis/R/read_input.R new file mode 100755 index 0000000..47f8be5 --- /dev/null +++ b/analysis/R/read_input.R @@ -0,0 +1,154 @@ +# Copyright 2014 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Read parameter, counts and map files. + +library(Matrix) + +source.rappor <- function(rel_path) { + abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) + source(abs_path) +} + +source.rappor("analysis/R/util.R") # for Log + + +ReadParameterFile <- function(params_file) { + # Read parameter file. Format: + # k, h, m, p, q, f + # 128, 2, 8, 0.5, 0.75, 0.75 + + params <- as.list(read.csv(params_file)) + if (length(params) != 6) { + stop("There should be exactly 6 columns in the parameter file.") + } + if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) { + stop("Parameter names must be k,h,m,p,q,f.") + } + params +} + +# Handle the case of redundant cohorts, i.e. the counts file needs to be +# further aggregated to obtain counts for the number of cohorts specified in +# the params file. +# +# NOTE: Why is this happening? +AdjustCounts <- function(counts, params) { + apply(counts, 2, function(x) { + tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) + }) +} + +ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) { + # Read in the counts file. + if (!file.exists(counts_file)) { + return(NULL) + } + counts <- read.csv(counts_file, header = FALSE) + + if (adjust_counts) { + counts <- AdjustCounts(counts, params) + } + + if (nrow(counts) != params$m) { + stop(sprintf("Got %d rows in the counts file, expected m = %d", + nrow(counts), params$m)) + } + + if ((ncol(counts) - 1) != params$k) { + stop(paste0("Counts file: number of columns should equal to k + 1: ", + ncol(counts))) + } + + if (any(counts < 0)) { + stop("Counts file: all counts must be positive.") + } + + # Turn counts from a data frame into a matrix. (In R a data frame and matrix + # are sometimes interchangeable, but sometimes we need it to be matrix.) + as.matrix(counts) +} + +ReadMapFile <- function(map_file, params) { + # Read in the map file which is in the following format (two hash functions): + # str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ... + # str2, ... + # Output: + # map: a sparse representation of set bits for each candidate string. + # strs: a vector of all candidate strings. + + Log("Parsing %s", map_file) + + map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE) + strs <- map_pos[, 1] + strs[strs == ""] <- "Empty" + + # Remove duplicated strings. + ind <- which(!duplicated(strs)) + strs <- strs[ind] + map_pos <- map_pos[ind, ] + + n <- ncol(map_pos) - 1 + if (n != (params$h * params$m)) { + stop(paste0("Map file: number of columns should equal hm + 1:", + n, "_", params$h * params$m)) + } + + row_pos <- unlist(map_pos[, -1], use.names = FALSE) + col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1) + + # TODO: When would this ever happen? + removed <- which(is.na(row_pos)) + if (length(removed) > 0) { + Log("Removed %d entries", length(removed)) + row_pos <- row_pos[-removed] + col_pos <- col_pos[-removed] + } + + map <- sparseMatrix(row_pos, col_pos, + dims = c(params$m * params$k, length(strs))) + + colnames(map) <- strs + list(map = map, strs = strs, map_pos = map_pos) +} + +LoadMapFile <- function(map_file, params) { + # Reads the map file, caching an .rda (R binary data) version of it to speed + # up future loads. + + rda_path <- sub(".csv", ".rda", map_file, fixed = TRUE) + # This must be unique per process, so concurrent processes don't try to + # write the same file. + tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid()) + + # First save to a temp file, and then atomically rename to the destination. + if (file.exists(rda_path)) { + Log("Loading %s", rda_path) + load(rda_path, .GlobalEnv) # creates the 'map' variable in the global env + } else { + map <- ReadMapFile(map_file, params) + + Log("Saving %s as an rda file for faster access", map_file) + tryCatch({ + save(map, file = tmp_path) + file.rename(tmp_path, rda_path) + }, warning = function(w) { + Log("WARNING: %s", w) + }, error = function(e) { + Log("ERROR: %s", e) + }) + } + return(map) +} |