path: root/llvm_tools/upload_lexan_crashes_to_forcey.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2020 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Fetches and submits the latest test-cases from Lexan's crash bucket."""

import argparse
import contextlib
import datetime
import json
import logging
import os
import shutil
import subprocess
import sys
import tempfile
from typing import Generator, Iterable, List, Tuple

gsurl_base = 'gs://chrome-clang-crash-reports/v1'
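# Test-cases are laid out by date under the bucket:
#   gs://chrome-clang-crash-reports/v1/<year>/<month>/<day>/<test-case>.tgz
# (month and day are zero-padded; see the helpers below).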


def gsutil_ls(loc: str) -> List[str]:
  results = subprocess.run(['gsutil.py', 'ls', loc],
                           stdout=subprocess.PIPE,
                           check=True,
                           encoding='utf-8')
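  # Each stdout line is one URL; "sub-directories" come back with a trailing
  # slash, e.g. (hypothetical listing):
  #   gs://chrome-clang-crash-reports/v1/2020/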
  return [line.strip() for line in results.stdout.splitlines()]


def gsurl_ls_last_numbers(url: str) -> List[int]:
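  # e.g., 'gs://chrome-clang-crash-reports/v1/2020/' -> 2020: drop the
  # trailing slash, take the last path component, and parse it as an int.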
  return sorted(int(x.rstrip('/').split('/')[-1]) for x in gsutil_ls(url))


def get_available_year_numbers() -> List[int]:
  return gsurl_ls_last_numbers(gsurl_base)


def get_available_month_numbers(year: int) -> List[int]:
  return gsurl_ls_last_numbers(f'{gsurl_base}/{year}')


def get_available_day_numbers(year: int, month: int) -> List[int]:
  return gsurl_ls_last_numbers(f'{gsurl_base}/{year}/{month:02d}')


def get_available_test_case_urls(year: int, month: int, day: int) -> List[str]:
  return gsutil_ls(f'{gsurl_base}/{year}/{month:02d}/{day:02d}')


def test_cases_on_or_after(
    date: datetime.date
) -> Generator[Tuple[datetime.date, List[str]], None, None]:
  """Yields (date, test-case URLs) for each day on or after the given date."""
  for year in get_available_year_numbers():
    if year < date.year:
      continue

    for month in get_available_month_numbers(year):
      if year == date.year and month < date.month:
        continue

      for day in get_available_day_numbers(year, month):
        when = datetime.date(year, month, day)
        if when < date:
          continue

        yield when, get_available_test_case_urls(year, month, day)
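

# A minimal usage sketch (the date is illustrative):
#
#   for when, urls in test_cases_on_or_after(datetime.date(2020, 3, 26)):
#     ...  # `when` is a datetime.date; `urls` is a list of gs:// URLs.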


def to_ymd(date: datetime.date) -> str:
  return date.strftime('%Y-%m-%d')


def from_ymd(date_str: str) -> datetime.date:
  return datetime.datetime.strptime(date_str, '%Y-%m-%d').date()


def persist_state(seen_urls: Iterable[str], state_file: str,
                  current_date: datetime.date):
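  # Write the new state to a scratch file, then rename it over the old one:
  # os.rename is atomic on POSIX (within one filesystem), so readers never
  # observe a partially-written state file.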
  tmp_state_file = state_file + '.tmp'
  with open(tmp_state_file, 'w', encoding='utf-8') as f:
    json.dump(
        {
            'already_seen': sorted(seen_urls),
            'most_recent_date': to_ymd(current_date),
        },
        f,
    )
  os.rename(tmp_state_file, state_file)
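
# The state file is a small JSON object, e.g. (values illustrative):
#   {"already_seen": ["gs://chrome-clang-crash-reports/v1/2020/03/27/foo.tgz"],
#    "most_recent_date": "2020-03-27"}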


@contextlib.contextmanager
def temp_dir() -> Generator[str, None, None]:
  loc = tempfile.mkdtemp(prefix='lexan-autosubmit')
  try:
    yield loc
  finally:
    shutil.rmtree(loc)


def download_and_unpack_test_case(gs_url: str, tempdir: str) -> None:
  suffix = os.path.splitext(gs_url)[1]
  target_name = 'test_case' + suffix
  target = os.path.join(tempdir, target_name)
  subprocess.run(['gsutil.py', 'cp', gs_url, target], check=True)
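  # `tar xaf` extracts the archive; the `a` flag lets tar infer the
  # decompressor from the file suffix (test-cases arrive as .tgz).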
  subprocess.run(['tar', 'xaf', target_name], check=True, cwd=tempdir)
  os.unlink(target)


def submit_test_case(gs_url: str, cr_tool: str) -> None:
  logging.info('Submitting %s', gs_url)
  with temp_dir() as tempdir:
    download_and_unpack_test_case(gs_url, tempdir)

    # Sometimes (e.g., in
    # gs://chrome-clang-crash-reports/v1/2020/03/27/
    # chromium.clang-ToTiOS-12754-GTXToolKit-2bfcde.tgz)
    # we'll get `.crash` files. Unclear why, but let's filter them out anyway.
    repro_files = [
        os.path.join(tempdir, x)
        for x in os.listdir(tempdir)
        if not x.endswith('.crash')
    ]
    assert len(repro_files) == 2, repro_files
    if repro_files[0].endswith('.sh'):
      sh_file, src_file = repro_files
      assert not src_file.endswith('.sh'), repro_files
    else:
      src_file, sh_file = repro_files
      assert sh_file.endswith('.sh'), repro_files

    # Peephole: lexan got a crash upload with a way old clang. Ignore it.
    with open(sh_file, encoding='utf-8') as f:
      if 'Crash reproducer for clang version 9.0.0' in f.read():
        logging.warning(
            'Skipping upload for %s; it appears to be from an old clang',
            gs_url)
        return

    subprocess.run(
        [
            cr_tool,
            'reduce',
            '-stream=false',
            '-wait=false',
            '-note',
            gs_url,
            '-sh_file',
            sh_file,  # already absolute; joined with tempdir above
            '-src_file',
            src_file,
        ],
        check=True,
    )


def submit_new_test_cases(
    last_seen_test_cases: Iterable[str],
    earliest_date_to_check: datetime.date,
    forcey: str,
    state_file_path: str,
) -> None:
  """Submits new test-cases to forcey.

  This will persist state after each test-case is submitted.

  Args:
    last_seen_test_cases: test-cases which have been submitted already, and
      should be skipped if seen again.
    earliest_date_to_check: the earliest date we should consider test-cases
      from.
    forcey: path to the forcey binary.
    state_file_path: path to our state file.
  """
  # `all_test_cases_seen` is the union of all test-cases seen on this and prior
  # invocations. It guarantees, in all cases we care about, that we won't
  # submit the same test-case twice. `test_cases_seen_this_invocation` is
  # persisted as "all of the test-cases we've seen on this and prior
  # invocations" if we successfully submit _all_ test-cases.
  #
  # Since you can visualize the test-cases this script considers as a sliding
  # window that only moves forward, if we saw a test-case on a prior iteration
  # but no longer see it, we'll never see it again (since it fell out of our
  # sliding window by being too old). Hence, keeping it around is
  # pointless.
  #
  # We only persist this minimized set of test-cases if _everything_ succeeds,
  # since if something fails below, there's a chance that we haven't revisited
  # test-cases that we've already seen.
  all_test_cases_seen = set(last_seen_test_cases)
  test_cases_seen_this_invocation = []
  most_recent_date = earliest_date_to_check
  for date, candidates in test_cases_on_or_after(earliest_date_to_check):
    most_recent_date = max(most_recent_date, date)

    for url in candidates:
      test_cases_seen_this_invocation.append(url)
      if url in all_test_cases_seen:
        continue

      all_test_cases_seen.add(url)
      submit_test_case(url, forcey)

      # Persisting on each iteration of this loop isn't free, but it's the
      # easiest way to not resubmit test-cases, and it's good to keep in mind
      # that:
      # - the state file will be small (<12KB, since it only keeps a few days
      #   worth of test-cases after the first run)
      # - in addition to this, we're downloading+unzipping+reuploading multiple
      #   MB of test-case bytes.
      #
      # So comparatively, the overhead here probably isn't an issue.
      persist_state(all_test_cases_seen, state_file_path, most_recent_date)

  persist_state(test_cases_seen_this_invocation, state_file_path,
                most_recent_date)


def main(argv: List[str]):
  logging.basicConfig(
      format='>> %(asctime)s: %(levelname)s: %(filename)s:%(lineno)d: '
      '%(message)s',
      level=logging.INFO,
  )

  my_dir = os.path.dirname(os.path.abspath(__file__))

  parser = argparse.ArgumentParser(description=__doc__)
  parser.add_argument(
      '--state_file', default=os.path.join(my_dir, 'lexan-state.json'))
  parser.add_argument(
      '--last_date',
      help='The earliest date that we care about. All test cases from here '
      'on will be picked up. Format is YYYY-MM-DD.')
  parser.add_argument(
      '--4c', dest='forcey', required=True, help='Path to a 4c client binary')
  opts = parser.parse_args(argv)

  forcey = opts.forcey
  state_file = opts.state_file
  last_date_str = opts.last_date

  # `--state_file` may name a file in the current directory, in which case
  # there's no parent directory to create.
  state_dir = os.path.dirname(state_file)
  if state_dir:
    os.makedirs(state_dir, 0o755, exist_ok=True)

  if last_date_str is None:
    with open(state_file, encoding='utf-8') as f:
      data = json.load(f)
    most_recent_date = from_ymd(data['most_recent_date'])
    submit_new_test_cases(
        last_seen_test_cases=data['already_seen'],
        # Note that we always subtract one day from this to avoid a race:
        # uploads may appear slightly out-of-order (or builders may lag, or
        # ...), so the last test-case uploaded for 2020/01/01 might appear
        # _after_ the first test-case for 2020/01/02. Assuming that builders
        # won't lag behind for over a day, the easiest way to handle this is to
        # always check the previous and current days.
        earliest_date_to_check=most_recent_date - datetime.timedelta(days=1),
        forcey=forcey,
        state_file_path=state_file,
    )
  else:
    submit_new_test_cases(
        last_seen_test_cases=(),
        earliest_date_to_check=from_ymd(last_date_str),
        forcey=forcey,
        state_file_path=state_file,
    )
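

# Typical invocations (paths and dates are illustrative):
#   ./upload_lexan_crashes_to_forcey.py --4c ./4c --last_date 2020-03-26
#   ./upload_lexan_crashes_to_forcey.py --4c ./4c  # resume from state file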


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))