1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
/*
* Copyright 2019 Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UTIL_PROCESS_RECORD_FILE_UTIL_H_
#define UTIL_PROCESS_RECORD_FILE_UTIL_H_
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "absl/strings/string_view.h"
#include "private_join_and_compute/util/process_record_file_parameters.h"
#include "private_join_and_compute/util/proto_util.h"
#include "private_join_and_compute/util/recordio.h"
#include "private_join_and_compute/util/status.inc"
namespace private_join_and_compute::util::process_file_util {
// Applies record_transformer() to every record in input_file and writes the
// resulting records to output_file, sorted by the key returned by
// get_sorting_key_function. By default, records are sorted by their string
// representation.
//
// input_file must contain records of type InputType; output_file will contain
// records of type OutputType.
//
// The input is processed in chunks of at most params.data_chunk_size records:
// a chunk is read, split into contiguous slices of equal maximum size, each
// slice is transformed by its own std::async task (at most
// params.thread_count tasks per chunk), and the per-task results are handed to
// the sharding writer in slice order. This repeats until the reader reports
// that there are no more records. Finally the shard files are merged into
// output_file and then deleted.
//
// A params.thread_count or params.data_chunk_size of 0 is treated as 1; a zero
// thread count would otherwise divide by zero and a zero chunk size would
// never consume any input.
//
// Any error reported by the reader, record_transformer, the writer or the
// merger is propagated to the caller through RETURN_IF_ERROR /
// ASSIGN_OR_RETURN; OkStatus() is returned otherwise.
template <typename InputType, typename OutputType>
Status ProcessRecordFile(
    const std::function<StatusOr<OutputType>(InputType)>& record_transformer,
    const ProcessRecordFileParameters& params, absl::string_view input_file,
    absl::string_view output_file,
    const std::function<std::string(absl::string_view)>&
        get_sorting_key_function = [](absl::string_view raw_record) {
          return std::string(raw_record);
        }) {
  auto reader = std::unique_ptr<RecordReader>(RecordReader::GetRecordReader());
  RETURN_IF_ERROR(reader->Open(input_file));

  auto writer = ShardingWriter<std::string>::Get(get_sorting_key_function);
  writer->SetShardPrefix(output_file);

  // Clamp both knobs to at least 1 so the slicing arithmetic below is always
  // well defined and every outer iteration reads at least one record.
  const size_t thread_count = std::max<size_t>(1, params.thread_count);
  const size_t data_chunk_size = std::max<size_t>(1, params.data_chunk_size);

  std::string raw_record;
  ASSIGN_OR_RETURN(bool has_more, reader->HasMore());
  while (has_more) {
    // Read the next chunk to process in parallel.
    std::vector<InputType> chunk;
    while (chunk.size() < data_chunk_size && has_more) {
      RETURN_IF_ERROR(reader->Read(&raw_record));
      chunk.push_back(ProtoUtils::FromString<InputType>(raw_record));
      ASSIGN_OR_RETURN(has_more, reader->HasMore());
    }

    // The max number of records each task will process (ceiling division).
    // The chunk is non-empty here, so this is at least 1.
    const size_t per_thread_size =
        (chunk.size() + thread_count - 1) / thread_count;

    // One future per non-empty slice of the chunk. `futures` is declared after
    // `chunk`, so on an early return the futures are destroyed first and the
    // tasks never outlive the data they reference.
    std::vector<std::future<StatusOr<std::vector<OutputType>>>> futures;
    futures.reserve((chunk.size() + per_thread_size - 1) / per_thread_size);
    for (size_t start = 0; start < chunk.size(); start += per_thread_size) {
      const size_t end = std::min(start + per_thread_size, chunk.size());
      // std::launch::async forces each slice onto its own thread.
      futures.push_back(std::async(
          std::launch::async,
          [&chunk, start, end,
           record_transformer]() -> StatusOr<std::vector<OutputType>> {
            std::vector<OutputType> processed_slice;
            processed_slice.reserve(end - start);
            for (size_t i = start; i < end; i++) {
              ASSIGN_OR_RETURN(auto processed_record,
                               record_transformer(chunk[i]));
              processed_slice.push_back(std::move(processed_record));
            }
            return processed_slice;
          }));
    }

    // Write the processed values returned by each task, in slice order.
    // NOTE(review): the shard prefix was already set before the loop; this
    // per-chunk call is kept from the original because the writer's behavior
    // on a repeated SetShardPrefix() is not visible here -- confirm whether it
    // is needed.
    writer->SetShardPrefix(output_file);
    for (auto& future : futures) {
      ASSIGN_OR_RETURN(auto records, future.get());
      for (const auto& record : records) {
        RETURN_IF_ERROR(writer->Write(ProtoUtils::ToString(record)));
      }
    }
  }
  RETURN_IF_ERROR(reader->Close());

  // Merge all the processed chunks into one output file and delete the
  // intermediate shard files.
  ASSIGN_OR_RETURN(auto shard_files, writer->Close());
  ShardMerger<std::string> merger;
  RETURN_IF_ERROR(
      merger.Merge(get_sorting_key_function, shard_files, output_file));
  RETURN_IF_ERROR(merger.Delete(shard_files));

  return OkStatus();
}
} // namespace private_join_and_compute::util::process_file_util
#endif // UTIL_PROCESS_RECORD_FILE_UTIL_H_
|