diff options
Diffstat (limited to 'llvm_tools/upload_lexan_crashes_to_forcey.py')
-rwxr-xr-x | llvm_tools/upload_lexan_crashes_to_forcey.py | 412 |
1 files changed, 213 insertions, 199 deletions
diff --git a/llvm_tools/upload_lexan_crashes_to_forcey.py b/llvm_tools/upload_lexan_crashes_to_forcey.py index 61bf6b7d..885a88f6 100755 --- a/llvm_tools/upload_lexan_crashes_to_forcey.py +++ b/llvm_tools/upload_lexan_crashes_to_forcey.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2020 The Chromium OS Authors. All rights reserved. +# Copyright 2020 The ChromiumOS Authors # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. @@ -16,142 +16,149 @@ import shutil import subprocess import sys import tempfile -from typing import Generator, List, Iterable +from typing import Generator, Iterable, List -gsurl_base = 'gs://chrome-clang-crash-reports/v1' + +gsurl_base = "gs://chrome-clang-crash-reports/v1" def gsutil_ls(loc: str) -> List[str]: - results = subprocess.run(['gsutil.py', 'ls', loc], - stdout=subprocess.PIPE, - check=True, - encoding='utf-8') - return [l.strip() for l in results.stdout.splitlines()] + results = subprocess.run( + ["gsutil.py", "ls", loc], + stdout=subprocess.PIPE, + check=True, + encoding="utf-8", + ) + return [l.strip() for l in results.stdout.splitlines()] def gsurl_ls_last_numbers(url: str) -> List[int]: - return sorted(int(x.rstrip('/').split('/')[-1]) for x in gsutil_ls(url)) + return sorted(int(x.rstrip("/").split("/")[-1]) for x in gsutil_ls(url)) def get_available_year_numbers() -> List[int]: - return gsurl_ls_last_numbers(gsurl_base) + return gsurl_ls_last_numbers(gsurl_base) def get_available_month_numbers(year: int) -> List[int]: - return gsurl_ls_last_numbers(f'{gsurl_base}/{year}') + return gsurl_ls_last_numbers(f"{gsurl_base}/{year}") def get_available_day_numbers(year: int, month: int) -> List[int]: - return gsurl_ls_last_numbers(f'{gsurl_base}/{year}/{month:02d}') + return gsurl_ls_last_numbers(f"{gsurl_base}/{year}/{month:02d}") def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]: - return gsutil_ls(f'{gsurl_base}/{year}/{month:02d}/{day:02d}') + return gsutil_ls(f"{gsurl_base}/{year}/{month:02d}/{day:02d}") -def test_cases_on_or_after(date: datetime.datetime - ) -> Generator[str, None, None]: - """Yields all test-cases submitted on or after the given date.""" - for year in get_available_year_numbers(): - if year < date.year: - continue +def test_cases_on_or_after( + date: datetime.datetime, +) -> Generator[str, None, None]: + """Yields all test-cases submitted on or after the given date.""" + for year in get_available_year_numbers(): + if year < date.year: + continue - for month in get_available_month_numbers(year): - if year == date.year and month < date.month: - continue + for month in get_available_month_numbers(year): + if year == date.year and month < date.month: + continue - for day in get_available_day_numbers(year, month): - when = datetime.date(year, month, day) - if when < date: - continue + for day in get_available_day_numbers(year, month): + when = datetime.date(year, month, day) + if when < date: + continue - yield when, get_available_test_case_urls(year, month, day) + yield when, get_available_test_case_urls(year, month, day) def to_ymd(date: datetime.date) -> str: - return date.strftime('%Y-%m-%d') + return date.strftime("%Y-%m-%d") def from_ymd(date_str: str) -> datetime.date: - return datetime.datetime.strptime(date_str, '%Y-%m-%d').date() - - -def persist_state(seen_urls: Iterable[str], state_file: str, - current_date: datetime.date): - tmp_state_file = state_file + '.tmp' - with open(tmp_state_file, 'w', encoding='utf-8') as f: - json.dump( - { - 'already_seen': sorted(seen_urls), - 'most_recent_date': to_ymd(current_date), - }, - f, - ) - os.rename(tmp_state_file, state_file) + return datetime.datetime.strptime(date_str, "%Y-%m-%d").date() + + +def persist_state( + seen_urls: Iterable[str], state_file: str, current_date: datetime.date +): + tmp_state_file = state_file + ".tmp" + with open(tmp_state_file, "w", encoding="utf-8") as f: + json.dump( + { + "already_seen": sorted(seen_urls), + "most_recent_date": to_ymd(current_date), + }, + f, + ) + os.rename(tmp_state_file, state_file) @contextlib.contextmanager def temp_dir() -> Generator[str, None, None]: - loc = tempfile.mkdtemp('lexan-autosubmit') - try: - yield loc - finally: - shutil.rmtree(loc) + loc = tempfile.mkdtemp("lexan-autosubmit") + try: + yield loc + finally: + shutil.rmtree(loc) def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None: - suffix = os.path.splitext(gs_url)[1] - target_name = 'test_case' + suffix - target = os.path.join(tempdir, target_name) - subprocess.run(['gsutil.py', 'cp', gs_url, target], check=True) - subprocess.run(['tar', 'xaf', target_name], check=True, cwd=tempdir) - os.unlink(target) + suffix = os.path.splitext(gs_url)[1] + target_name = "test_case" + suffix + target = os.path.join(tempdir, target_name) + subprocess.run(["gsutil.py", "cp", gs_url, target], check=True) + subprocess.run(["tar", "xaf", target_name], check=True, cwd=tempdir) + os.unlink(target) def submit_test_case(gs_url: str, cr_tool: str) -> None: - logging.info('Submitting %s', gs_url) - with temp_dir() as tempdir: - download_and_unpack_test_case(gs_url, tempdir) - - # Sometimes (e.g., in - # gs://chrome-clang-crash-reports/v1/2020/03/27/ - # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz) - # we'll get `.crash` files. Unclear why, but let's filter them out anyway. - repro_files = [ - os.path.join(tempdir, x) - for x in os.listdir(tempdir) - if not x.endswith('.crash') - ] - assert len(repro_files) == 2, repro_files - if repro_files[0].endswith('.sh'): - sh_file, src_file = repro_files - assert not src_file.endswith('.sh'), repro_files - else: - src_file, sh_file = repro_files - assert sh_file.endswith('.sh'), repro_files - - # Peephole: lexan got a crash upload with a way old clang. Ignore it. - with open(sh_file, encoding='utf-8') as f: - if 'Crash reproducer for clang version 9.0.0' in f.read(): - logging.warning('Skipping upload for %s; seems to be with an old clang', - gs_url) - return - - subprocess.run( - [ - cr_tool, - 'reduce', - '-stream=false', - '-wait=false', - '-note', - gs_url, - '-sh_file', - os.path.join(tempdir, sh_file), - '-src_file', - os.path.join(tempdir, src_file), - ], - check=True, - ) + logging.info("Submitting %s", gs_url) + with temp_dir() as tempdir: + download_and_unpack_test_case(gs_url, tempdir) + + # Sometimes (e.g., in + # gs://chrome-clang-crash-reports/v1/2020/03/27/ + # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz) + # we'll get `.crash` files. Unclear why, but let's filter them out anyway. + repro_files = [ + os.path.join(tempdir, x) + for x in os.listdir(tempdir) + if not x.endswith(".crash") + ] + assert len(repro_files) == 2, repro_files + if repro_files[0].endswith(".sh"): + sh_file, src_file = repro_files + assert not src_file.endswith(".sh"), repro_files + else: + src_file, sh_file = repro_files + assert sh_file.endswith(".sh"), repro_files + + # Peephole: lexan got a crash upload with a way old clang. Ignore it. + with open(sh_file, encoding="utf-8") as f: + if "Crash reproducer for clang version 9.0.0" in f.read(): + logging.warning( + "Skipping upload for %s; seems to be with an old clang", + gs_url, + ) + return + + subprocess.run( + [ + cr_tool, + "reduce", + "-stream=false", + "-wait=false", + "-note", + gs_url, + "-sh_file", + os.path.join(tempdir, sh_file), + "-src_file", + os.path.join(tempdir, src_file), + ], + check=True, + ) def submit_new_test_cases( @@ -160,112 +167,119 @@ def submit_new_test_cases( forcey: str, state_file_path: str, ) -> None: - """Submits new test-cases to forcey. - - This will persist state after each test-case is submitted. - - Args: - last_seen_test_cases: test-cases which have been submitted already, and - should be skipped if seen again. - earliest_date_to_check: the earliest date we should consider test-cases - from. - forcey: path to the forcey binary. - state_file_path: path to our state file. - """ - # `all_test_cases_seen` is the union of all test-cases seen on this and prior - # invocations. It guarantees, in all cases we care about, that we won't - # submit the same test-case twice. `test_cases_seen_this_invocation` is - # persisted as "all of the test-cases we've seen on this and prior - # invocations" if we successfully submit _all_ test-cases. - # - # Since you can visualize the test-cases this script considers as a sliding - # window that only moves forward, if we saw a test-case on a prior iteration - # but no longer see it, we'll never see it again (since it fell out of our - # sliding window by being too old). Hence, keeping it around is - # pointless. - # - # We only persist this minimized set of test-cases if _everything_ succeeds, - # since if something fails below, there's a chance that we haven't revisited - # test-cases that we've already seen. - all_test_cases_seen = set(last_seen_test_cases) - test_cases_seen_this_invocation = [] - most_recent_date = earliest_date_to_check - for date, candidates in test_cases_on_or_after(earliest_date_to_check): - most_recent_date = max(most_recent_date, date) - - for url in candidates: - test_cases_seen_this_invocation.append(url) - if url in all_test_cases_seen: - continue - - all_test_cases_seen.add(url) - submit_test_case(url, forcey) - - # Persisting on each iteration of this loop isn't free, but it's the - # easiest way to not resubmit test-cases, and it's good to keep in mind - # that: - # - the state file will be small (<12KB, since it only keeps a few days - # worth of test-cases after the first run) - # - in addition to this, we're downloading+unzipping+reuploading multiple - # MB of test-case bytes. - # - # So comparatively, the overhead here probably isn't an issue. - persist_state(all_test_cases_seen, state_file_path, most_recent_date) - - persist_state(test_cases_seen_this_invocation, state_file_path, - most_recent_date) + """Submits new test-cases to forcey. + + This will persist state after each test-case is submitted. + + Args: + last_seen_test_cases: test-cases which have been submitted already, and + should be skipped if seen again. + earliest_date_to_check: the earliest date we should consider test-cases + from. + forcey: path to the forcey binary. + state_file_path: path to our state file. + """ + # `all_test_cases_seen` is the union of all test-cases seen on this and prior + # invocations. It guarantees, in all cases we care about, that we won't + # submit the same test-case twice. `test_cases_seen_this_invocation` is + # persisted as "all of the test-cases we've seen on this and prior + # invocations" if we successfully submit _all_ test-cases. + # + # Since you can visualize the test-cases this script considers as a sliding + # window that only moves forward, if we saw a test-case on a prior iteration + # but no longer see it, we'll never see it again (since it fell out of our + # sliding window by being too old). Hence, keeping it around is + # pointless. + # + # We only persist this minimized set of test-cases if _everything_ succeeds, + # since if something fails below, there's a chance that we haven't revisited + # test-cases that we've already seen. + all_test_cases_seen = set(last_seen_test_cases) + test_cases_seen_this_invocation = [] + most_recent_date = earliest_date_to_check + for date, candidates in test_cases_on_or_after(earliest_date_to_check): + most_recent_date = max(most_recent_date, date) + + for url in candidates: + test_cases_seen_this_invocation.append(url) + if url in all_test_cases_seen: + continue + + all_test_cases_seen.add(url) + submit_test_case(url, forcey) + + # Persisting on each iteration of this loop isn't free, but it's the + # easiest way to not resubmit test-cases, and it's good to keep in mind + # that: + # - the state file will be small (<12KB, since it only keeps a few days + # worth of test-cases after the first run) + # - in addition to this, we're downloading+unzipping+reuploading multiple + # MB of test-case bytes. + # + # So comparatively, the overhead here probably isn't an issue. + persist_state( + all_test_cases_seen, state_file_path, most_recent_date + ) + + persist_state( + test_cases_seen_this_invocation, state_file_path, most_recent_date + ) def main(argv: List[str]): - logging.basicConfig( - format='>> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: ' - '%(message)s', - level=logging.INFO, - ) - - my_dir = os.path.dirname(os.path.abspath(__file__)) - - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - '--state_file', default=os.path.join(my_dir, 'lexan-state.json')) - parser.add_argument( - '--last_date', - help='The earliest date that we care about. All test cases from here ' - 'on will be picked up. Format is YYYY-MM-DD.') - parser.add_argument( - '--4c', dest='forcey', required=True, help='Path to a 4c client binary') - opts = parser.parse_args(argv) - - forcey = opts.forcey - state_file = opts.state_file - last_date_str = opts.last_date - - os.makedirs(os.path.dirname(state_file), 0o755, exist_ok=True) - - if last_date_str is None: - with open(state_file, encoding='utf-8') as f: - data = json.load(f) - most_recent_date = from_ymd(data['most_recent_date']) - submit_new_test_cases( - last_seen_test_cases=data['already_seen'], - # Note that we always subtract one day from this to avoid a race: - # uploads may appear slightly out-of-order (or builders may lag, or - # ...), so the last test-case uploaded for 2020/01/01 might appear - # _after_ the first test-case for 2020/01/02. Assuming that builders - # won't lag behind for over a day, the easiest way to handle this is to - # always check the previous and current days. - earliest_date_to_check=most_recent_date - datetime.timedelta(days=1), - forcey=forcey, - state_file_path=state_file, + logging.basicConfig( + format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: " + "%(message)s", + level=logging.INFO, ) - else: - submit_new_test_cases( - last_seen_test_cases=(), - earliest_date_to_check=from_ymd(last_date_str), - forcey=forcey, - state_file_path=state_file, + + my_dir = os.path.dirname(os.path.abspath(__file__)) + + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--state_file", default=os.path.join(my_dir, "lexan-state.json") ) + parser.add_argument( + "--last_date", + help="The earliest date that we care about. All test cases from here " + "on will be picked up. Format is YYYY-MM-DD.", + ) + parser.add_argument( + "--4c", dest="forcey", required=True, help="Path to a 4c client binary" + ) + opts = parser.parse_args(argv) + + forcey = opts.forcey + state_file = opts.state_file + last_date_str = opts.last_date + + os.makedirs(os.path.dirname(state_file), 0o755, exist_ok=True) + + if last_date_str is None: + with open(state_file, encoding="utf-8") as f: + data = json.load(f) + most_recent_date = from_ymd(data["most_recent_date"]) + submit_new_test_cases( + last_seen_test_cases=data["already_seen"], + # Note that we always subtract one day from this to avoid a race: + # uploads may appear slightly out-of-order (or builders may lag, or + # ...), so the last test-case uploaded for 2020/01/01 might appear + # _after_ the first test-case for 2020/01/02. Assuming that builders + # won't lag behind for over a day, the easiest way to handle this is to + # always check the previous and current days. + earliest_date_to_check=most_recent_date + - datetime.timedelta(days=1), + forcey=forcey, + state_file_path=state_file, + ) + else: + submit_new_test_cases( + last_seen_test_cases=(), + earliest_date_to_check=from_ymd(last_date_str), + forcey=forcey, + state_file_path=state_file, + ) -if __name__ == '__main__': - sys.exit(main(sys.argv[1:])) +if __name__ == "__main__": + sys.exit(main(sys.argv[1:])) |