diff options
Diffstat (limited to 'afdo_redaction/redact_profile.py')
-rwxr-xr-x | afdo_redaction/redact_profile.py | 306 |
1 files changed, 160 insertions, 146 deletions
diff --git a/afdo_redaction/redact_profile.py b/afdo_redaction/redact_profile.py index 02bae928..0779d2ac 100755 --- a/afdo_redaction/redact_profile.py +++ b/afdo_redaction/redact_profile.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2018 The Chromium OS Authors. All rights reserved. +# Copyright 2018 The ChromiumOS Authors # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. @@ -24,7 +24,6 @@ It reads a textual AFDO profile from stdin, and prints a 'fixed' version of it to stdout. A summary of what the script actually did is printed to stderr. """ -from __future__ import division, print_function import collections import re @@ -32,23 +31,23 @@ import sys def _count_samples(samples): - """Count the total number of samples in a function.""" - line_re = re.compile(r'^(\s*)\d+(?:\.\d+)?: (\d+)\s*$') + """Count the total number of samples in a function.""" + line_re = re.compile(r"^(\s*)\d+(?:\.\d+)?: (\d+)\s*$") - top_level_samples = 0 - all_samples = 0 - for line in samples: - m = line_re.match(line) - if not m: - continue + top_level_samples = 0 + all_samples = 0 + for line in samples: + m = line_re.match(line) + if not m: + continue - spaces, n = m.groups() - n = int(n) - all_samples += n - if len(spaces) == 1: - top_level_samples += n + spaces, n = m.groups() + n = int(n) + all_samples += n + if len(spaces) == 1: + top_level_samples += n - return top_level_samples, all_samples + return top_level_samples, all_samples # A ProfileRecord is a set of samples for a top-level symbol in a textual AFDO @@ -80,70 +79,75 @@ def _count_samples(samples): # And samples look like one of: # arbitrary_number: sample_count # arbitrary_number: inlined_function_symbol:inlined_entry_count -ProfileRecord = collections.namedtuple('ProfileRecord', - ['function_line', 'samples']) +ProfileRecord = collections.namedtuple( + "ProfileRecord", ["function_line", "samples"] +) def _normalize_samples(samples): - """Normalizes the samples in the given function body. - - Normalization just means that we redact inlined function names. This is - done so that a bit of templating doesn't make two function bodies look - distinct. Namely: - - template <typename T> - __attribute__((noinline)) - int getNumber() { return 1; } - - template <typename T> - __attribute__((noinline)) - int getNumberIndirectly() { return getNumber<T>(); } - - int main() { - return getNumber<int>() + getNumber<float>(); - } - - If the profile has the mangled name for getNumber<float> in - getNumberIndirectly<float> (and similar for <int>), we'll consider them to - be distinct when they're not. - """ - - # I'm not actually sure if this ends up being an issue in practice, but it's - # simple enough to guard against. - inlined_re = re.compile(r'(^\s*\d+): [^:]+:(\s*\d+)\s*$') - result = [] - for s in samples: - m = inlined_re.match(s) - if m: - result.append('%s: __REDACTED__:%s' % m.groups()) - else: - result.append(s) - return tuple(result) + """Normalizes the samples in the given function body. + + Normalization just means that we redact inlined function names. This is + done so that a bit of templating doesn't make two function bodies look + distinct. Namely: + + template <typename T> + __attribute__((noinline)) + int getNumber() { return 1; } + + template <typename T> + __attribute__((noinline)) + int getNumberIndirectly() { return getNumber<T>(); } + + int main() { + return getNumber<int>() + getNumber<float>(); + } + + If the profile has the mangled name for getNumber<float> in + getNumberIndirectly<float> (and similar for <int>), we'll consider them to + be distinct when they're not. + """ + + # I'm not actually sure if this ends up being an issue in practice, but it's + # simple enough to guard against. + inlined_re = re.compile(r"(^\s*\d+): [^:]+:(\s*\d+)\s*$") + result = [] + for s in samples: + m = inlined_re.match(s) + if m: + result.append("%s: __REDACTED__:%s" % m.groups()) + else: + result.append(s) + return tuple(result) def _read_textual_afdo_profile(stream): - """Parses an AFDO profile from a line stream into ProfileRecords.""" - # ProfileRecords are actually nested, due to inlining. For the purpose of - # this script, that doesn't matter. - lines = (line.rstrip() for line in stream) - function_line = None - samples = [] - for line in lines: - if not line: - continue - - if line[0].isspace(): - assert function_line is not None, 'sample exists outside of a function?' - samples.append(line) - continue - - if function_line is not None: - yield ProfileRecord(function_line=function_line, samples=tuple(samples)) - function_line = line + """Parses an AFDO profile from a line stream into ProfileRecords.""" + # ProfileRecords are actually nested, due to inlining. For the purpose of + # this script, that doesn't matter. + lines = (line.rstrip() for line in stream) + function_line = None samples = [] + for line in lines: + if not line: + continue + + if line[0].isspace(): + assert ( + function_line is not None + ), "sample exists outside of a function?" + samples.append(line) + continue + + if function_line is not None: + yield ProfileRecord( + function_line=function_line, samples=tuple(samples) + ) + function_line = line + samples = [] - if function_line is not None: - yield ProfileRecord(function_line=function_line, samples=tuple(samples)) + if function_line is not None: + yield ProfileRecord(function_line=function_line, samples=tuple(samples)) # The default of 100 is arbitrarily selected, but it does make the overwhelming @@ -157,86 +161,96 @@ def _read_textual_afdo_profile(stream): # Non-nm based approaches are superior because they don't require any prior # build artifacts; just an AFDO profile. def dedup_records(profile_records, summary_file, max_repeats=100): - """Removes heavily duplicated records from profile_records. - - profile_records is expected to be an iterable of ProfileRecord. - max_repeats ia how many functions must share identical bodies for us to - consider it 'heavily duplicated' and remove the results. - """ - - # Build a mapping of function structure -> list of functions with identical - # structure and sample counts - counts = collections.defaultdict(list) - for record in profile_records: - counts[_normalize_samples(record.samples)].append(record) - - # Be sure that we didn't see any duplicate functions, since that's bad... - total_functions_recorded = sum(len(records) for records in counts.values()) - - unique_function_names = { - record.function_line.split(':')[0] - for records in counts.values() - for record in records - } - - assert len(unique_function_names) == total_functions_recorded, \ - 'duplicate function names?' - - num_kept = 0 - num_samples_kept = 0 - num_top_samples_kept = 0 - num_total = 0 - num_samples_total = 0 - num_top_samples_total = 0 - - for normalized_samples, records in counts.items(): - top_sample_count, all_sample_count = _count_samples(normalized_samples) - top_sample_count *= len(records) - all_sample_count *= len(records) - - num_total += len(records) - num_samples_total += all_sample_count - num_top_samples_total += top_sample_count - - if len(records) >= max_repeats: - continue - - num_kept += len(records) - num_samples_kept += all_sample_count - num_top_samples_kept += top_sample_count - for record in records: - yield record - - print( - 'Retained {:,}/{:,} functions'.format(num_kept, num_total), - file=summary_file) - print( - 'Retained {:,}/{:,} samples, total'.format(num_samples_kept, - num_samples_total), - file=summary_file) - print('Retained {:,}/{:,} top-level samples' \ - .format(num_top_samples_kept, num_top_samples_total), - file=summary_file) + """Removes heavily duplicated records from profile_records. + + profile_records is expected to be an iterable of ProfileRecord. + max_repeats ia how many functions must share identical bodies for us to + consider it 'heavily duplicated' and remove the results. + """ + + # Build a mapping of function structure -> list of functions with identical + # structure and sample counts + counts = collections.defaultdict(list) + for record in profile_records: + counts[_normalize_samples(record.samples)].append(record) + + # Be sure that we didn't see any duplicate functions, since that's bad... + total_functions_recorded = sum(len(records) for records in counts.values()) + + unique_function_names = { + record.function_line.split(":")[0] + for records in counts.values() + for record in records + } + + assert ( + len(unique_function_names) == total_functions_recorded + ), "duplicate function names?" + + num_kept = 0 + num_samples_kept = 0 + num_top_samples_kept = 0 + num_total = 0 + num_samples_total = 0 + num_top_samples_total = 0 + + for normalized_samples, records in counts.items(): + top_sample_count, all_sample_count = _count_samples(normalized_samples) + top_sample_count *= len(records) + all_sample_count *= len(records) + + num_total += len(records) + num_samples_total += all_sample_count + num_top_samples_total += top_sample_count + + if len(records) >= max_repeats: + continue + + num_kept += len(records) + num_samples_kept += all_sample_count + num_top_samples_kept += top_sample_count + for record in records: + yield record + + print( + "Retained {:,}/{:,} functions".format(num_kept, num_total), + file=summary_file, + ) + print( + "Retained {:,}/{:,} samples, total".format( + num_samples_kept, num_samples_total + ), + file=summary_file, + ) + print( + "Retained {:,}/{:,} top-level samples".format( + num_top_samples_kept, num_top_samples_total + ), + file=summary_file, + ) def run(profile_input_file, summary_output_file, profile_output_file): - profile_records = _read_textual_afdo_profile(profile_input_file) + profile_records = _read_textual_afdo_profile(profile_input_file) - # Sort this so we get deterministic output. AFDO doesn't care what order it's - # in. - deduped = sorted( - dedup_records(profile_records, summary_output_file), - key=lambda r: r.function_line) - for function_line, samples in deduped: - print(function_line, file=profile_output_file) - print('\n'.join(samples), file=profile_output_file) + # Sort this so we get deterministic output. AFDO doesn't care what order it's + # in. + deduped = sorted( + dedup_records(profile_records, summary_output_file), + key=lambda r: r.function_line, + ) + for function_line, samples in deduped: + print(function_line, file=profile_output_file) + print("\n".join(samples), file=profile_output_file) def _main(): - run(profile_input_file=sys.stdin, - summary_output_file=sys.stderr, - profile_output_file=sys.stdout) + run( + profile_input_file=sys.stdin, + summary_output_file=sys.stderr, + profile_output_file=sys.stdout, + ) -if __name__ == '__main__': - _main() +if __name__ == "__main__": + _main() |