1 files changed, 213 insertions, 199 deletions
diff --git a/llvm_tools/upload_lexan_crashes_to_forcey.py b/llvm_tools/upload_lexan_crashes_to_forcey.py
index 61bf6b7d..885a88f6 100755
--- a/llvm_tools/upload_lexan_crashes_to_forcey.py
+++ b/llvm_tools/upload_lexan_crashes_to_forcey.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# Copyright 2020 The Chromium OS Authors. All rights reserved.
+# Copyright 2020 The ChromiumOS Authors
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
@@ -16,142 +16,149 @@ import shutil
 import subprocess
 import sys
 import tempfile
-from typing import Generator, List, Iterable
+from typing import Generator, Iterable, List
 
-gsurl_base = 'gs://chrome-clang-crash-reports/v1'
+
+gsurl_base = "gs://chrome-clang-crash-reports/v1"
 
 
 def gsutil_ls(loc: str) -> List[str]:
-  results = subprocess.run(['gsutil.py', 'ls', loc],
-                           stdout=subprocess.PIPE,
-                           check=True,
-                           encoding='utf-8')
-  return [l.strip() for l in results.stdout.splitlines()]
+    results = subprocess.run(
+        ["gsutil.py", "ls", loc],
+        stdout=subprocess.PIPE,
+        check=True,
+        encoding="utf-8",
+    )
+    return [l.strip() for l in results.stdout.splitlines()]
 
 
 def gsurl_ls_last_numbers(url: str) -> List[int]:
-  return sorted(int(x.rstrip('/').split('/')[-1]) for x in gsutil_ls(url))
+    return sorted(int(x.rstrip("/").split("/")[-1]) for x in gsutil_ls(url))
 
 
 def get_available_year_numbers() -> List[int]:
-  return gsurl_ls_last_numbers(gsurl_base)
+    return gsurl_ls_last_numbers(gsurl_base)
 
 
 def get_available_month_numbers(year: int) -> List[int]:
-  return gsurl_ls_last_numbers(f'{gsurl_base}/{year}')
+    return gsurl_ls_last_numbers(f"{gsurl_base}/{year}")
 
 
 def get_available_day_numbers(year: int, month: int) -> List[int]:
-  return gsurl_ls_last_numbers(f'{gsurl_base}/{year}/{month:02d}')
+    return gsurl_ls_last_numbers(f"{gsurl_base}/{year}/{month:02d}")
 
 
 def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
-  return gsutil_ls(f'{gsurl_base}/{year}/{month:02d}/{day:02d}')
+    return gsutil_ls(f"{gsurl_base}/{year}/{month:02d}/{day:02d}")
 
 
-def test_cases_on_or_after(date: datetime.datetime
-                          ) -> Generator[str, None, None]:
-  """Yields all test-cases submitted on or after the given date."""
-  for year in get_available_year_numbers():
-    if year < date.year:
-      continue
+def test_cases_on_or_after(
+    date: datetime.datetime,
+) -> Generator[str, None, None]:
+    """Yields all test-cases submitted on or after the given date."""
+    for year in get_available_year_numbers():
+        if year < date.year:
+            continue
 
-    for month in get_available_month_numbers(year):
-      if year == date.year and month < date.month:
-        continue
+        for month in get_available_month_numbers(year):
+            if year == date.year and month < date.month:
+                continue
 
-      for day in get_available_day_numbers(year, month):
-        when = datetime.date(year, month, day)
-        if when < date:
-          continue
+            for day in get_available_day_numbers(year, month):
+                when = datetime.date(year, month, day)
+                if when < date:
+                    continue
 
-        yield when, get_available_test_case_urls(year, month, day)
+                yield when, get_available_test_case_urls(year, month, day)
 
 
 def to_ymd(date: datetime.date) -> str:
-  return date.strftime('%Y-%m-%d')
+    return date.strftime("%Y-%m-%d")
 
 
 def from_ymd(date_str: str) -> datetime.date:
-  return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
-
-
-def persist_state(seen_urls: Iterable[str], state_file: str,
-                  current_date: datetime.date):
-  tmp_state_file = state_file + '.tmp'
-  with open(tmp_state_file, 'w', encoding='utf-8') as f:
-    json.dump(
-        {
-            'already_seen': sorted(seen_urls),
-            'most_recent_date': to_ymd(current_date),
-        },
-        f,
-    )
-  os.rename(tmp_state_file, state_file)
+    return datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
+
+
+def persist_state(
+    seen_urls: Iterable[str], state_file: str, current_date: datetime.date
+):
+    tmp_state_file = state_file + ".tmp"
+    with open(tmp_state_file, "w", encoding="utf-8") as f:
+        json.dump(
+            {
+                "already_seen": sorted(seen_urls),
+                "most_recent_date": to_ymd(current_date),
+            },
+            f,
+        )
+    os.rename(tmp_state_file, state_file)
 
 
 @contextlib.contextmanager
 def temp_dir() -> Generator[str, None, None]:
-  loc = tempfile.mkdtemp('lexan-autosubmit')
-  try:
-    yield loc
-  finally:
-    shutil.rmtree(loc)
+    loc = tempfile.mkdtemp("lexan-autosubmit")
+    try:
+        yield loc
+    finally:
+        shutil.rmtree(loc)
 
 
 def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None:
-  suffix = os.path.splitext(gs_url)[1]
-  target_name = 'test_case' + suffix
-  target = os.path.join(tempdir, target_name)
-  subprocess.run(['gsutil.py', 'cp', gs_url, target], check=True)
-  subprocess.run(['tar', 'xaf', target_name], check=True, cwd=tempdir)
-  os.unlink(target)
+    suffix = os.path.splitext(gs_url)[1]
+    target_name = "test_case" + suffix
+    target = os.path.join(tempdir, target_name)
+    subprocess.run(["gsutil.py", "cp", gs_url, target], check=True)
+    subprocess.run(["tar", "xaf", target_name], check=True, cwd=tempdir)
+    os.unlink(target)
 
 
 def submit_test_case(gs_url: str, cr_tool: str) -> None:
-  logging.info('Submitting %s', gs_url)
-  with temp_dir() as tempdir:
-    download_and_unpack_test_case(gs_url, tempdir)
-
-    # Sometimes (e.g., in
-    # gs://chrome-clang-crash-reports/v1/2020/03/27/
-    # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
-    # we'll get `.crash` files. Unclear why, but let's filter them out anyway.
-    repro_files = [
-        os.path.join(tempdir, x)
-        for x in os.listdir(tempdir)
-        if not x.endswith('.crash')
-    ]
-    assert len(repro_files) == 2, repro_files
-    if repro_files[0].endswith('.sh'):
-      sh_file, src_file = repro_files
-      assert not src_file.endswith('.sh'), repro_files
-    else:
-      src_file, sh_file = repro_files
-      assert sh_file.endswith('.sh'), repro_files
-
-    # Peephole: lexan got a crash upload with a way old clang. Ignore it.
-    with open(sh_file, encoding='utf-8') as f:
-      if 'Crash reproducer for clang version 9.0.0' in f.read():
-        logging.warning('Skipping upload for %s; seems to be with an old clang',
-                        gs_url)
-        return
-
-    subprocess.run(
-        [
-            cr_tool,
-            'reduce',
-            '-stream=false',
-            '-wait=false',
-            '-note',
-            gs_url,
-            '-sh_file',
-            os.path.join(tempdir, sh_file),
-            '-src_file',
-            os.path.join(tempdir, src_file),
-        ],
-        check=True,
-    )
+    logging.info("Submitting %s", gs_url)
+    with temp_dir() as tempdir:
+        download_and_unpack_test_case(gs_url, tempdir)
+
+        # Sometimes (e.g., in
+        # gs://chrome-clang-crash-reports/v1/2020/03/27/
+        # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
+        # we'll get `.crash` files. Unclear why, but let's filter them out anyway.
+        repro_files = [
+            os.path.join(tempdir, x)
+            for x in os.listdir(tempdir)
+            if not x.endswith(".crash")
+        ]
+        assert len(repro_files) == 2, repro_files
+        if repro_files[0].endswith(".sh"):
+            sh_file, src_file = repro_files
+            assert not src_file.endswith(".sh"), repro_files
+        else:
+            src_file, sh_file = repro_files
+            assert sh_file.endswith(".sh"), repro_files
+
+        # Peephole: lexan got a crash upload with a way old clang. Ignore it.
+        with open(sh_file, encoding="utf-8") as f:
+            if "Crash reproducer for clang version 9.0.0" in f.read():
+                logging.warning(
+                    "Skipping upload for %s; seems to be with an old clang",
+                    gs_url,
+                )
+                return
+
+        subprocess.run(
+            [
+                cr_tool,
+                "reduce",
+                "-stream=false",
+                "-wait=false",
+                "-note",
+                gs_url,
+                "-sh_file",
+                os.path.join(tempdir, sh_file),
+                "-src_file",
+                os.path.join(tempdir, src_file),
+            ],
+            check=True,
+        )
 
 
 def submit_new_test_cases(
@@ -160,112 +167,119 @@ def submit_new_test_cases(
     forcey: str,
     state_file_path: str,
 ) -> None:
-  """Submits new test-cases to forcey.
-
-  This will persist state after each test-case is submitted.
-
-  Args:
-    last_seen_test_cases: test-cases which have been submitted already, and
-      should be skipped if seen again.
-    earliest_date_to_check: the earliest date we should consider test-cases
-      from.
-    forcey: path to the forcey binary.
-    state_file_path: path to our state file.
-  """
-  # `all_test_cases_seen` is the union of all test-cases seen on this and prior
-  # invocations. It guarantees, in all cases we care about, that we won't
-  # submit the same test-case twice. `test_cases_seen_this_invocation` is
-  # persisted as "all of the test-cases we've seen on this and prior
-  # invocations" if we successfully submit _all_ test-cases.
-  #
-  # Since you can visualize the test-cases this script considers as a sliding
-  # window that only moves forward, if we saw a test-case on a prior iteration
-  # but no longer see it, we'll never see it again (since it fell out of our
-  # sliding window by being too old). Hence, keeping it around is
-  # pointless.
-  #
-  # We only persist this minimized set of test-cases if _everything_ succeeds,
-  # since if something fails below, there's a chance that we haven't revisited
-  # test-cases that we've already seen.
-  all_test_cases_seen = set(last_seen_test_cases)
-  test_cases_seen_this_invocation = []
-  most_recent_date = earliest_date_to_check
-  for date, candidates in test_cases_on_or_after(earliest_date_to_check):
-    most_recent_date = max(most_recent_date, date)
-
-    for url in candidates:
-      test_cases_seen_this_invocation.append(url)
-      if url in all_test_cases_seen:
-        continue
-
-      all_test_cases_seen.add(url)
-      submit_test_case(url, forcey)
-
-      # Persisting on each iteration of this loop isn't free, but it's the
-      # easiest way to not resubmit test-cases, and it's good to keep in mind
-      # that:
-      # - the state file will be small (<12KB, since it only keeps a few days
-      #   worth of test-cases after the first run)
-      # - in addition to this, we're downloading+unzipping+reuploading multiple
-      #   MB of test-case bytes.
-      #
-      # So comparatively, the overhead here probably isn't an issue.
-      persist_state(all_test_cases_seen, state_file_path, most_recent_date)
-
-  persist_state(test_cases_seen_this_invocation, state_file_path,
-                most_recent_date)
+    """Submits new test-cases to forcey.
+
+    This will persist state after each test-case is submitted.
+
+    Args:
+      last_seen_test_cases: test-cases which have been submitted already, and
+        should be skipped if seen again.
+      earliest_date_to_check: the earliest date we should consider test-cases
+        from.
+      forcey: path to the forcey binary.
+      state_file_path: path to our state file.
+    """
+    # `all_test_cases_seen` is the union of all test-cases seen on this and prior
+    # invocations. It guarantees, in all cases we care about, that we won't
+    # submit the same test-case twice. `test_cases_seen_this_invocation` is
+    # persisted as "all of the test-cases we've seen on this and prior
+    # invocations" if we successfully submit _all_ test-cases.
+    #
+    # Since you can visualize the test-cases this script considers as a sliding
+    # window that only moves forward, if we saw a test-case on a prior iteration
+    # but no longer see it, we'll never see it again (since it fell out of our
+    # sliding window by being too old). Hence, keeping it around is
+    # pointless.
+    #
+    # We only persist this minimized set of test-cases if _everything_ succeeds,
+    # since if something fails below, there's a chance that we haven't revisited
+    # test-cases that we've already seen.
+    all_test_cases_seen = set(last_seen_test_cases)
+    test_cases_seen_this_invocation = []
+    most_recent_date = earliest_date_to_check
+    for date, candidates in test_cases_on_or_after(earliest_date_to_check):
+        most_recent_date = max(most_recent_date, date)
+
+        for url in candidates:
+            test_cases_seen_this_invocation.append(url)
+            if url in all_test_cases_seen:
+                continue
+
+            all_test_cases_seen.add(url)
+            submit_test_case(url, forcey)
+
+            # Persisting on each iteration of this loop isn't free, but it's the
+            # easiest way to not resubmit test-cases, and it's good to keep in mind
+            # that:
+            # - the state file will be small (<12KB, since it only keeps a few days
+            #   worth of test-cases after the first run)
+            # - in addition to this, we're downloading+unzipping+reuploading multiple
+            #   MB of test-case bytes.
+            #
+            # So comparatively, the overhead here probably isn't an issue.
+            persist_state(
+                all_test_cases_seen, state_file_path, most_recent_date
+            )
+
+    persist_state(
+        test_cases_seen_this_invocation, state_file_path, most_recent_date
+    )
 
 
 def main(argv: List[str]):
-  logging.basicConfig(
-      format='>> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: '
-      '%(message)s',
-      level=logging.INFO,
-  )
-
-  my_dir = os.path.dirname(os.path.abspath(__file__))
-
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument(
-      '--state_file', default=os.path.join(my_dir, 'lexan-state.json'))
-  parser.add_argument(
-      '--last_date',
-      help='The earliest date that we care about. All test cases from here '
-      'on will be picked up. Format is YYYY-MM-DD.')
-  parser.add_argument(
-      '--4c', dest='forcey', required=True, help='Path to a 4c client binary')
-  opts = parser.parse_args(argv)
-
-  forcey = opts.forcey
-  state_file = opts.state_file
-  last_date_str = opts.last_date
-
-  os.makedirs(os.path.dirname(state_file), 0o755, exist_ok=True)
-
-  if last_date_str is None:
-    with open(state_file, encoding='utf-8') as f:
-      data = json.load(f)
-    most_recent_date = from_ymd(data['most_recent_date'])
-    submit_new_test_cases(
-        last_seen_test_cases=data['already_seen'],
-        # Note that we always subtract one day from this to avoid a race:
-        # uploads may appear slightly out-of-order (or builders may lag, or
-        # ...), so the last test-case uploaded for 2020/01/01 might appear
-        # _after_ the first test-case for 2020/01/02. Assuming that builders
-        # won't lag behind for over a day, the easiest way to handle this is to
-        # always check the previous and current days.
-        earliest_date_to_check=most_recent_date - datetime.timedelta(days=1),
-        forcey=forcey,
-        state_file_path=state_file,
+    logging.basicConfig(
+        format=">> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: "
+        "%(message)s",
+        level=logging.INFO,
     )
-  else:
-    submit_new_test_cases(
-        last_seen_test_cases=(),
-        earliest_date_to_check=from_ymd(last_date_str),
-        forcey=forcey,
-        state_file_path=state_file,
+
+    my_dir = os.path.dirname(os.path.abspath(__file__))
+
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--state_file", default=os.path.join(my_dir, "lexan-state.json")
     )
+    parser.add_argument(
+        "--last_date",
+        help="The earliest date that we care about. All test cases from here "
+        "on will be picked up. Format is YYYY-MM-DD.",
+    )
+    parser.add_argument(
+        "--4c", dest="forcey", required=True, help="Path to a 4c client binary"
+    )
+    opts = parser.parse_args(argv)
+
+    forcey = opts.forcey
+    state_file = opts.state_file
+    last_date_str = opts.last_date
+
+    os.makedirs(os.path.dirname(state_file), 0o755, exist_ok=True)
+
+    if last_date_str is None:
+        with open(state_file, encoding="utf-8") as f:
+            data = json.load(f)
+        most_recent_date = from_ymd(data["most_recent_date"])
+        submit_new_test_cases(
+            last_seen_test_cases=data["already_seen"],
+            # Note that we always subtract one day from this to avoid a race:
+            # uploads may appear slightly out-of-order (or builders may lag, or
+            # ...), so the last test-case uploaded for 2020/01/01 might appear
+            # _after_ the first test-case for 2020/01/02. Assuming that builders
+            # won't lag behind for over a day, the easiest way to handle this is to
+            # always check the previous and current days.
+            earliest_date_to_check=most_recent_date
+            - datetime.timedelta(days=1),
+            forcey=forcey,
+            state_file_path=state_file,
+        )
+    else:
+        submit_new_test_cases(
+            last_seen_test_cases=(),
+            earliest_date_to_check=from_ymd(last_date_str),
+            forcey=forcey,
+            state_file_path=state_file,
+        )
 
 
-if __name__ == '__main__':
-  sys.exit(main(sys.argv[1:]))
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))