aboutsummaryrefslogtreecommitdiff
path: root/crosperf/experiment.py
diff options
context:
space:
mode:
Diffstat (limited to 'crosperf/experiment.py')
-rw-r--r--crosperf/experiment.py460
1 files changed, 249 insertions, 211 deletions
diff --git a/crosperf/experiment.py b/crosperf/experiment.py
index e919f6ee..9973f7e9 100644
--- a/crosperf/experiment.py
+++ b/crosperf/experiment.py
@@ -1,21 +1,18 @@
# -*- coding: utf-8 -*-
-# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
+# Copyright 2013 The ChromiumOS Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""The experiment setting module."""
-from __future__ import print_function
import os
-import time
-
from threading import Lock
+import time
+import benchmark_run
from cros_utils import logger
from cros_utils import misc
-
-import benchmark_run
from machine_manager import BadChecksum
from machine_manager import MachineManager
from machine_manager import MockMachineManager
@@ -23,208 +20,249 @@ import test_flag
class Experiment(object):
- """Class representing an Experiment to be run."""
-
- def __init__(self, name, remote, working_directory, chromeos_root,
- cache_conditions, labels, benchmarks, experiment_file, email_to,
- acquire_timeout, log_dir, log_level, share_cache,
- results_directory, compress_results, locks_directory, cwp_dso,
- ignore_min_max, crosfleet, dut_config, no_lock: bool):
- self.name = name
- self.working_directory = working_directory
- self.remote = remote
- self.chromeos_root = chromeos_root
- self.cache_conditions = cache_conditions
- self.experiment_file = experiment_file
- self.email_to = email_to
- if not results_directory:
- self.results_directory = os.path.join(self.working_directory,
- self.name + '_results')
- else:
- self.results_directory = misc.CanonicalizePath(results_directory)
- self.compress_results = compress_results
- self.log_dir = log_dir
- self.log_level = log_level
- self.labels = labels
- self.benchmarks = benchmarks
- self.num_complete = 0
- self.num_run_complete = 0
- self.share_cache = share_cache
- self.active_threads = []
- self.locks_dir = locks_directory
- self.locked_machines = []
- self.lock_mgr = None
- self.cwp_dso = cwp_dso
- self.ignore_min_max = ignore_min_max
- self.crosfleet = crosfleet
- self.no_lock = no_lock
- self.l = logger.GetLogger(log_dir)
-
- if not self.benchmarks:
- raise RuntimeError('No benchmarks specified')
- if not self.labels:
- raise RuntimeError('No labels specified')
- if not remote and not self.crosfleet:
- raise RuntimeError('No remote hosts specified')
-
- # We need one chromeos_root to run the benchmarks in, but it doesn't
- # matter where it is, unless the ABIs are different.
- if not chromeos_root:
- for label in self.labels:
- if label.chromeos_root:
- chromeos_root = label.chromeos_root
- break
- if not chromeos_root:
- raise RuntimeError('No chromeos_root given and could not determine '
- 'one from the image path.')
-
- machine_manager_fn = MachineManager
- if test_flag.GetTestMode():
- machine_manager_fn = MockMachineManager
- self.machine_manager = machine_manager_fn(chromeos_root, acquire_timeout,
- log_level, locks_directory)
- self.l = logger.GetLogger(log_dir)
-
- for machine in self.remote:
- # machine_manager.AddMachine only adds reachable machines.
- self.machine_manager.AddMachine(machine)
- # Now machine_manager._all_machines contains a list of reachable
- # machines. This is a subset of self.remote. We make both lists the same.
- self.remote = [m.name for m in self.machine_manager.GetAllMachines()]
- if not self.remote:
- raise RuntimeError('No machine available for running experiment.')
-
- # Initialize checksums for all machines, ignore errors at this time.
- # The checksum will be double checked, and image will be flashed after
- # duts are locked/leased.
- self.SetCheckSums()
-
- self.start_time = None
- self.benchmark_runs = self._GenerateBenchmarkRuns(dut_config)
-
- self._schedv2 = None
- self._internal_counter_lock = Lock()
-
- def set_schedv2(self, schedv2):
- self._schedv2 = schedv2
-
- def schedv2(self):
- return self._schedv2
-
- def _GenerateBenchmarkRuns(self, dut_config):
- """Generate benchmark runs from labels and benchmark defintions."""
- benchmark_runs = []
- for label in self.labels:
- for benchmark in self.benchmarks:
- for iteration in range(1, benchmark.iterations + 1):
-
- benchmark_run_name = '%s: %s (%s)' % (label.name, benchmark.name,
- iteration)
- full_name = '%s_%s_%s' % (label.name, benchmark.name, iteration)
- logger_to_use = logger.Logger(self.log_dir, 'run.%s' % (full_name),
- True)
- benchmark_runs.append(
- benchmark_run.BenchmarkRun(benchmark_run_name, benchmark, label,
- iteration, self.cache_conditions,
- self.machine_manager, logger_to_use,
- self.log_level, self.share_cache,
- dut_config))
-
- return benchmark_runs
-
- def SetCheckSums(self, forceSameImage=False):
- for label in self.labels:
- # We filter out label remotes that are not reachable (not in
- # self.remote). So each label.remote is a sublist of experiment.remote.
- label.remote = [r for r in label.remote if r in self.remote]
- try:
- self.machine_manager.ComputeCommonCheckSum(label)
- except BadChecksum:
- # Force same image on all machines, then we do checksum again. No
- # bailout if checksums still do not match.
- # TODO (zhizhouy): Need to figure out how flashing image will influence
- # the new checksum.
- if forceSameImage:
- self.machine_manager.ForceSameImageToAllMachines(label)
- self.machine_manager.ComputeCommonCheckSum(label)
-
- self.machine_manager.ComputeCommonCheckSumString(label)
-
- def Build(self):
- pass
-
- def Terminate(self):
- if self._schedv2 is not None:
- self._schedv2.terminate()
- else:
- for t in self.benchmark_runs:
- if t.isAlive():
- self.l.LogError("Terminating run: '%s'." % t.name)
- t.Terminate()
-
- def IsComplete(self):
- if self._schedv2:
- return self._schedv2.is_complete()
- if self.active_threads:
- for t in self.active_threads:
- if t.isAlive():
- t.join(0)
- if not t.isAlive():
- self.num_complete += 1
- if not t.cache_hit:
- self.num_run_complete += 1
- self.active_threads.remove(t)
- return False
- return True
-
- def BenchmarkRunFinished(self, br):
- """Update internal counters after br finishes.
-
- Note this is only used by schedv2 and is called by multiple threads.
- Never throw any exception here.
- """
-
- assert self._schedv2 is not None
- with self._internal_counter_lock:
- self.num_complete += 1
- if not br.cache_hit:
- self.num_run_complete += 1
-
- def Run(self):
- self.start_time = time.time()
- if self._schedv2 is not None:
- self._schedv2.run_sched()
- else:
- self.active_threads = []
- for run in self.benchmark_runs:
- # Set threads to daemon so program exits when ctrl-c is pressed.
- run.daemon = True
- run.start()
- self.active_threads.append(run)
-
- def SetCacheConditions(self, cache_conditions):
- for run in self.benchmark_runs:
- run.SetCacheConditions(cache_conditions)
-
- def Cleanup(self):
- """Make sure all machines are unlocked."""
- if self.locks_dir:
- # We are using the file locks mechanism, so call machine_manager.Cleanup
- # to unlock everything.
- self.machine_manager.Cleanup()
-
- if test_flag.GetTestMode() or not self.locked_machines:
- return
-
- # If we locked any machines earlier, make sure we unlock them now.
- if self.lock_mgr:
- machine_states = self.lock_mgr.GetMachineStates('unlock')
- self.lock_mgr.CheckMachineLocks(machine_states, 'unlock')
- unlocked_machines = self.lock_mgr.UpdateMachines(False)
- failed_machines = [
- m for m in self.locked_machines if m not in unlocked_machines
- ]
- if failed_machines:
- raise RuntimeError('These machines are not unlocked correctly: %s' %
- failed_machines)
- self.lock_mgr = None
+ """Class representing an Experiment to be run."""
+
+ def __init__(
+ self,
+ name,
+ remote,
+ working_directory,
+ chromeos_root,
+ cache_conditions,
+ labels,
+ benchmarks,
+ experiment_file,
+ email_to,
+ acquire_timeout,
+ log_dir,
+ log_level,
+ share_cache,
+ results_directory,
+ compress_results,
+ locks_directory,
+ cwp_dso,
+ ignore_min_max,
+ crosfleet,
+ dut_config,
+ no_lock: bool,
+ ):
+ self.name = name
+ self.working_directory = working_directory
+ self.remote = remote
+ self.chromeos_root = chromeos_root
+ self.cache_conditions = cache_conditions
+ self.experiment_file = experiment_file
+ self.email_to = email_to
+ if not results_directory:
+ self.results_directory = os.path.join(
+ self.working_directory, self.name + "_results"
+ )
+ else:
+ self.results_directory = misc.CanonicalizePath(results_directory)
+ self.compress_results = compress_results
+ self.log_dir = log_dir
+ self.log_level = log_level
+ self.labels = labels
+ self.benchmarks = benchmarks
+ self.num_complete = 0
+ self.num_run_complete = 0
+ self.share_cache = share_cache
+ self.active_threads = []
+ self.locks_dir = locks_directory
+ self.locked_machines = []
+ self.lock_mgr = None
+ self.cwp_dso = cwp_dso
+ self.ignore_min_max = ignore_min_max
+ self.crosfleet = crosfleet
+ self.no_lock = no_lock
+ self.l = logger.GetLogger(log_dir)
+
+ if not self.benchmarks:
+ raise RuntimeError("No benchmarks specified")
+ if not self.labels:
+ raise RuntimeError("No labels specified")
+ if not remote and not self.crosfleet:
+ raise RuntimeError("No remote hosts specified")
+
+ # We need one chromeos_root to run the benchmarks in, but it doesn't
+ # matter where it is, unless the ABIs are different.
+ if not chromeos_root:
+ for label in self.labels:
+ if label.chromeos_root:
+ chromeos_root = label.chromeos_root
+ break
+ if not chromeos_root:
+ raise RuntimeError(
+ "No chromeos_root given and could not determine "
+ "one from the image path."
+ )
+
+ machine_manager_fn = MachineManager
+ if test_flag.GetTestMode():
+ machine_manager_fn = MockMachineManager
+ self.machine_manager = machine_manager_fn(
+ chromeos_root, acquire_timeout, log_level, locks_directory
+ )
+ self.l = logger.GetLogger(log_dir)
+
+ for machine in self.remote:
+ # machine_manager.AddMachine only adds reachable machines.
+ self.machine_manager.AddMachine(machine)
+ # Now machine_manager._all_machines contains a list of reachable
+ # machines. This is a subset of self.remote. We make both lists the same.
+ self.remote = [m.name for m in self.machine_manager.GetAllMachines()]
+ if not self.remote:
+ raise RuntimeError("No machine available for running experiment.")
+
+ # Initialize checksums for all machines, ignore errors at this time.
+ # The checksum will be double checked, and image will be flashed after
+ # duts are locked/leased.
+ self.SetCheckSums()
+
+ self.start_time = None
+ self.benchmark_runs = self._GenerateBenchmarkRuns(dut_config)
+
+ self._schedv2 = None
+ self._internal_counter_lock = Lock()
+
+ def set_schedv2(self, schedv2):
+ self._schedv2 = schedv2
+
+ def schedv2(self):
+ return self._schedv2
+
+ def _GenerateBenchmarkRuns(self, dut_config):
+ """Generate benchmark runs from labels and benchmark defintions."""
+ benchmark_runs = []
+ for label in self.labels:
+ for benchmark in self.benchmarks:
+ for iteration in range(1, benchmark.iterations + 1):
+
+ benchmark_run_name = "%s: %s (%s)" % (
+ label.name,
+ benchmark.name,
+ iteration,
+ )
+ full_name = "%s_%s_%s" % (
+ label.name,
+ benchmark.name,
+ iteration,
+ )
+ logger_to_use = logger.Logger(
+ self.log_dir, "run.%s" % (full_name), True
+ )
+ benchmark_runs.append(
+ benchmark_run.BenchmarkRun(
+ benchmark_run_name,
+ benchmark,
+ label,
+ iteration,
+ self.cache_conditions,
+ self.machine_manager,
+ logger_to_use,
+ self.log_level,
+ self.share_cache,
+ dut_config,
+ )
+ )
+
+ return benchmark_runs
+
+ def SetCheckSums(self, forceSameImage=False):
+ for label in self.labels:
+ # We filter out label remotes that are not reachable (not in
+ # self.remote). So each label.remote is a sublist of experiment.remote.
+ label.remote = [r for r in label.remote if r in self.remote]
+ try:
+ self.machine_manager.ComputeCommonCheckSum(label)
+ except BadChecksum:
+ # Force same image on all machines, then we do checksum again. No
+ # bailout if checksums still do not match.
+ # TODO (zhizhouy): Need to figure out how flashing image will influence
+ # the new checksum.
+ if forceSameImage:
+ self.machine_manager.ForceSameImageToAllMachines(label)
+ self.machine_manager.ComputeCommonCheckSum(label)
+
+ self.machine_manager.ComputeCommonCheckSumString(label)
+
+ def Build(self):
+ pass
+
+ def Terminate(self):
+ if self._schedv2 is not None:
+ self._schedv2.terminate()
+ else:
+ for t in self.benchmark_runs:
+ if t.isAlive():
+ self.l.LogError("Terminating run: '%s'." % t.name)
+ t.Terminate()
+
+ def IsComplete(self):
+ if self._schedv2:
+ return self._schedv2.is_complete()
+ if self.active_threads:
+ for t in self.active_threads:
+ if t.isAlive():
+ t.join(0)
+ if not t.isAlive():
+ self.num_complete += 1
+ if not t.cache_hit:
+ self.num_run_complete += 1
+ self.active_threads.remove(t)
+ return False
+ return True
+
+ def BenchmarkRunFinished(self, br):
+ """Update internal counters after br finishes.
+
+ Note this is only used by schedv2 and is called by multiple threads.
+ Never throw any exception here.
+ """
+
+ assert self._schedv2 is not None
+ with self._internal_counter_lock:
+ self.num_complete += 1
+ if not br.cache_hit:
+ self.num_run_complete += 1
+
+ def Run(self):
+ self.start_time = time.time()
+ if self._schedv2 is not None:
+ self._schedv2.run_sched()
+ else:
+ self.active_threads = []
+ for run in self.benchmark_runs:
+ # Set threads to daemon so program exits when ctrl-c is pressed.
+ run.daemon = True
+ run.start()
+ self.active_threads.append(run)
+
+ def SetCacheConditions(self, cache_conditions):
+ for run in self.benchmark_runs:
+ run.SetCacheConditions(cache_conditions)
+
+ def Cleanup(self):
+ """Make sure all machines are unlocked."""
+ if self.locks_dir:
+ # We are using the file locks mechanism, so call machine_manager.Cleanup
+ # to unlock everything.
+ self.machine_manager.Cleanup()
+
+ if test_flag.GetTestMode() or not self.locked_machines:
+ return
+
+ # If we locked any machines earlier, make sure we unlock them now.
+ if self.lock_mgr:
+ machine_states = self.lock_mgr.GetMachineStates("unlock")
+ self.lock_mgr.CheckMachineLocks(machine_states, "unlock")
+ unlocked_machines = self.lock_mgr.UpdateMachines(False)
+ failed_machines = [
+ m for m in self.locked_machines if m not in unlocked_machines
+ ]
+ if failed_machines:
+ raise RuntimeError(
+ "These machines are not unlocked correctly: %s"
+ % failed_machines
+ )
+ self.lock_mgr = None