about | summary | refs | log | tree | commit | diff
path: root/crosperf
diff options
context:
space:
mode:
author: cmtice <cmtice@google.com> 2014-05-12 13:56:42 -0700
committer: chrome-internal-fetch <chrome-internal-fetch@google.com> 2014-05-21 04:07:47 +0000
commit: 798a8fa986db930786b2a6777db0f3b06db995f6 (patch)
tree: feacec28548e4a53f6961cc443915937105a35cf /crosperf
parent: df8053336feea245a6bb43fe16dbc16a7b3d71cd (diff)
download: toolchain-utils-798a8fa986db930786b2a6777db0f3b06db995f6.tar.gz
Better handling of "not identical machines" failure.
Nightly tests, especially on the x86-generic boxes, fail frequently because the DUTs have different board images on them, so Crosperf decides the machines are not identical and refuses to run the tests. With this CL, if the machine_manager finds that the machines fail the "identical" test, it will try to push the same image onto all the machines, and then check them again to see if they are the same. It only tries this once; if they fail the check the second time around, it is still a fatal failure. This should eliminate many of the unnecessary failures in our nightly tests.

This CL also fixes a small bug in the auto-delete script (this fix has been running for a while on mobiletc-prebuild, but never got committed).

BUG=None
TEST=Ran several iterations where I forced the first "identical" check to fail. The changes worked.

Change-Id: Ied2a55e5d3e2789e58a503aef03269888954b579
Reviewed-on: https://chrome-internal-review.googlesource.com/163334
Reviewed-by: Luis Lozano <llozano@chromium.org>
Commit-Queue: Caroline Tice <cmtice@google.com>
Tested-by: Caroline Tice <cmtice@google.com>
Diffstat (limited to 'crosperf')
-rw-r--r--  crosperf/benchmark_run.py    14
-rw-r--r--  crosperf/machine_manager.py  23
2 files changed, 33 insertions, 4 deletions
diff --git a/crosperf/benchmark_run.py b/crosperf/benchmark_run.py
index ec39b403..e4fe693d 100644
--- a/crosperf/benchmark_run.py
+++ b/crosperf/benchmark_run.py
@@ -13,6 +13,7 @@ import traceback
from utils import command_executer
from utils import timeline
+from machine_manager import NonMatchingMachines
from suite_runner import SuiteRunner
from results_cache import MockResult
from results_cache import MockResultsCache
@@ -141,10 +142,19 @@ class BenchmarkRun(threading.Thread):
def AcquireMachine(self):
while True:
+ machine = None
if self.terminated:
raise Exception("Thread terminated while trying to acquire machine.")
- machine = self.machine_manager.AcquireMachine(self.label.chromeos_image,
- self.label)
+ try:
+ machine = self.machine_manager.AcquireMachine(self.label.chromeos_image,
+ self.label,
+ throw=True)
+
+ except NonMatchingMachines:
+ self.machine_manager.ForceSameImageToAllMachines(self.label)
+ machine = self.machine_manager.AcquireMachine(self.label.chromeos_image,
+ self.label,
+ throw=False)
if machine:
self._logger.LogOutput("%s: Machine %s acquired at %s" %
diff --git a/crosperf/machine_manager.py b/crosperf/machine_manager.py
index 52c3d818..04a4eec7 100644
--- a/crosperf/machine_manager.py
+++ b/crosperf/machine_manager.py
@@ -22,6 +22,8 @@ from image_checksummer import ImageChecksummer
CHECKSUM_FILE = "/usr/local/osimage_checksum_file"
+class NonMatchingMachines(Exception):
+ pass
class CrosMachine(object):
def __init__(self, name, chromeos_root, log_level):
@@ -33,6 +35,9 @@ class CrosMachine(object):
self.test_run = None
self.chromeos_root = chromeos_root
self.log_level = log_level
+ self.SetUpChecksumInfo()
+
+ def SetUpChecksumInfo(self):
if not self.IsReachable():
self.machine_checksum = None
return
@@ -288,6 +293,7 @@ class MachineManager(object):
checksums = [m.machine_checksum for m in self.GetMachines(label)]
return len(set(checksums)) == 1
+
def RemoveMachine(self, machine_name):
with self._lock:
self._machines = [m for m in self._machines
@@ -297,7 +303,14 @@ class MachineManager(object):
logger.GetLogger().LogError("Could not unlock machine: '%s'."
% m.name)
- def AcquireMachine(self, chromeos_image, label):
+ def ForceSameImageToAllMachines(self, label):
+ machines = self.GetMachines(label)
+ chromeos_image = label.chromeos_image
+ for m in machines:
+ self.ImageMachine(m, label)
+ m.SetUpChecksumInfo()
+
+ def AcquireMachine(self, chromeos_image, label, throw=False):
if label.image_type == "local":
image_checksum = ImageChecksummer().Checksum(label, self.log_level)
elif label.image_type == "trybot":
@@ -315,7 +328,13 @@ class MachineManager(object):
if new_machine:
m.released_time = time.time()
if not self.AreAllMachineSame(label):
- logger.GetLogger().LogFatal("-- not all the machine are identical")
+ if not throw:
+ # Log fatal message, which calls sys.exit. Default behavior.
+ logger.GetLogger().LogFatal("-- not all the machines are identical")
+ else:
+ # Raise an exception, which can be caught and handled by calling
+ # function.
+ raise NonMatchingMachines("Not all the machines are identical")
if self.GetAvailableMachines(label):
break
else: