diff options
author | cmtice <cmtice@google.com> | 2014-05-12 13:56:42 -0700 |
---|---|---|
committer | chrome-internal-fetch <chrome-internal-fetch@google.com> | 2014-05-21 04:07:47 +0000 |
commit | 798a8fa986db930786b2a6777db0f3b06db995f6 (patch) | |
tree | feacec28548e4a53f6961cc443915937105a35cf /crosperf | |
parent | df8053336feea245a6bb43fe16dbc16a7b3d71cd (diff) | |
download | toolchain-utils-798a8fa986db930786b2a6777db0f3b06db995f6.tar.gz |
Better handling of "not identical machines" failure.
Nightly tests, especially on the x86-generic boxes, fail frequently
because the DUTs have different board images on them, so Crosperf
decides the machines are not identical and refuses to run the tests.
With this CL, if the machine_manager finds that the machines fail the
"identical" test, it will try to push the same image onto all the machines,
and then check them again to see if they are the same. It only tries this
once; if they fail the check the second time around, it is still a fatal
failure. This should eliminate many of the unnecessary failures in our
nightly tests.
This CL also fixes a small bug in the auto-delete script (this fix has
been running for a while on mobiletc-prebuild, but never got committed).
BUG=None
TEST=Ran several iterations where I forced the first "identical" check to
fail. The changes worked.
Change-Id: Ied2a55e5d3e2789e58a503aef03269888954b579
Reviewed-on: https://chrome-internal-review.googlesource.com/163334
Reviewed-by: Luis Lozano <llozano@chromium.org>
Commit-Queue: Caroline Tice <cmtice@google.com>
Tested-by: Caroline Tice <cmtice@google.com>
Diffstat (limited to 'crosperf')
-rw-r--r-- | crosperf/benchmark_run.py | 14 |
-rw-r--r-- | crosperf/machine_manager.py | 23 |
2 files changed, 33 insertions, 4 deletions
```diff
diff --git a/crosperf/benchmark_run.py b/crosperf/benchmark_run.py
index ec39b403..e4fe693d 100644
--- a/crosperf/benchmark_run.py
+++ b/crosperf/benchmark_run.py
@@ -13,6 +13,7 @@ import traceback
 from utils import command_executer
 from utils import timeline
 
+from machine_manager import NonMatchingMachines
 from suite_runner import SuiteRunner
 from results_cache import MockResult
 from results_cache import MockResultsCache
@@ -141,10 +142,19 @@ class BenchmarkRun(threading.Thread):
 
   def AcquireMachine(self):
     while True:
+      machine = None
       if self.terminated:
         raise Exception("Thread terminated while trying to acquire machine.")
-      machine = self.machine_manager.AcquireMachine(self.label.chromeos_image,
-                                                    self.label)
+      try:
+        machine = self.machine_manager.AcquireMachine(self.label.chromeos_image,
+                                                      self.label,
+                                                      throw=True)
+
+      except NonMatchingMachines:
+        self.machine_manager.ForceSameImageToAllMachines(self.label)
+        machine = self.machine_manager.AcquireMachine(self.label.chromeos_image,
+                                                      self.label,
+                                                      throw=False)
 
       if machine:
         self._logger.LogOutput("%s: Machine %s acquired at %s" %
diff --git a/crosperf/machine_manager.py b/crosperf/machine_manager.py
index 52c3d818..04a4eec7 100644
--- a/crosperf/machine_manager.py
+++ b/crosperf/machine_manager.py
@@ -22,6 +22,8 @@ from image_checksummer import ImageChecksummer
 
 CHECKSUM_FILE = "/usr/local/osimage_checksum_file"
 
+class NonMatchingMachines(Exception):
+  pass
 
 class CrosMachine(object):
   def __init__(self, name, chromeos_root, log_level):
@@ -33,6 +35,9 @@ class CrosMachine(object):
     self.test_run = None
     self.chromeos_root = chromeos_root
     self.log_level = log_level
+    self.SetUpChecksumInfo()
+
+  def SetUpChecksumInfo(self):
     if not self.IsReachable():
       self.machine_checksum = None
       return
@@ -288,6 +293,7 @@ class MachineManager(object):
     checksums = [m.machine_checksum for m in self.GetMachines(label)]
     return len(set(checksums)) == 1
 
+
   def RemoveMachine(self, machine_name):
     with self._lock:
       self._machines = [m for m in self._machines
@@ -297,7 +303,14 @@ class MachineManager(object):
         logger.GetLogger().LogError("Could not unlock machine: '%s'."
                                     % m.name)
 
-  def AcquireMachine(self, chromeos_image, label):
+  def ForceSameImageToAllMachines(self, label):
+    machines = self.GetMachines(label)
+    chromeos_image = label.chromeos_image
+    for m in machines:
+      self.ImageMachine(m, label)
+      m.SetUpChecksumInfo()
+
+  def AcquireMachine(self, chromeos_image, label, throw=False):
     if label.image_type == "local":
       image_checksum = ImageChecksummer().Checksum(label, self.log_level)
     elif label.image_type == "trybot":
@@ -315,7 +328,13 @@ class MachineManager(object):
         if new_machine:
           m.released_time = time.time()
       if not self.AreAllMachineSame(label):
-        logger.GetLogger().LogFatal("-- not all the machine are identical")
+        if not throw:
+          # Log fatal message, which calls sys.exit. Default behavior.
+          logger.GetLogger().LogFatal("-- not all the machines are identical")
+        else:
+          # Raise an exception, which can be caught and handled by calling
+          # function.
+          raise NonMatchingMachines("Not all the machines are identical")
       if self.GetAvailableMachines(label):
         break
       else:
```