diff options
author | Zhizhou Yang <zhizhouy@google.com> | 2019-10-14 17:36:09 -0700 |
---|---|---|
committer | Zhizhou Yang <zhizhouy@google.com> | 2019-10-15 22:48:08 +0000 |
commit | bf7ee87429d2c9730349ebc22f904b5a3fac7107 (patch) | |
tree | afe85d6bc0535e2d376f6d9be615beffdaed223c /lock_machine.py | |
parent | ea5fbd9e6c99d18e002cec6e86bfb889d0820370 (diff) | |
download | toolchain-utils-bf7ee87429d2c9730349ebc22f904b5a3fac7107.tar.gz |
toolchain-utils: change naming related to AFE lock
After introducing the new locking mechanism, the behavior of
AFELockManager totally changed. This file changed the naming of all
related code in toochain-utils.
This patch also changed the behavior of locks_dir option, deprecated the
use_file_lock option explicitly.
TEST=Tested with different DUT types.
BUG=chromium:1006434
Change-Id: Ib15efce54ec4d4c5c2a18fecca3f350248462035
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/toolchain-utils/+/1863530
Reviewed-by: Caroline Tice <cmtice@chromium.org>
Reviewed-by: Denis Nikitin <denik@chromium.org>
Commit-Queue: Zhizhou Yang <zhizhouy@google.com>
Tested-by: Zhizhou Yang <zhizhouy@google.com>
Diffstat (limited to 'lock_machine.py')
-rwxr-xr-x | lock_machine.py | 618 |
1 files changed, 618 insertions, 0 deletions
diff --git a/lock_machine.py b/lock_machine.py new file mode 100755 index 00000000..40c7d8fd --- /dev/null +++ b/lock_machine.py @@ -0,0 +1,618 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +# +# Copyright 2019 The Chromium OS Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +"""This module controls locking and unlocking of test machines.""" + +from __future__ import print_function + +import argparse +import enum +import getpass +import os +import sys + +import file_lock_machine + +from cros_utils import command_executer +from cros_utils import logger +from cros_utils import machines + + +class LockException(Exception): + """Base class for exceptions in this module.""" + + +class MachineNotPingable(LockException): + """Raised when machine does not respond to ping.""" + + +class LockingError(LockException): + """Raised when server fails to lock/unlock machine as requested.""" + + +class DontOwnLock(LockException): + """Raised when user attmepts to unlock machine locked by someone else.""" + # This should not be raised if the user specified '--force' + + +class NoAFEServer(LockException): + """Raised when cannot find/access the autotest server.""" + + +class AFEAccessError(LockException): + """Raised when cannot get information about lab machine from lab server.""" + + +class MachineType(enum.Enum): + """Enum class to hold machine type.""" + AFE = 'afe' + LOCAL = 'local' + SKYLAB = 'skylab' + + +class LockManager(object): + """Class for locking/unlocking machines vie three different modes. + + This class contains methods for checking the locked status of machines, + and for changing the locked status. It handles HW lab machines (both AFE + and Skylab), and local machines, using appropriate locking mechanisms for + each. + + !!!IMPORTANT NOTE!!! The AFE server can only be called from the main + thread/process of a program. If you launch threads and try to call it + from a thread, you will get an error. This has to do with restrictions + in the Python virtual machine (and signal handling) and cannot be changed. + """ + + SKYLAB_PATH = '/usr/local/bin/skylab' + LEASE_MINS = 600 + SKYLAB_CREDENTIAL = '/usr/local/google/home/mobiletc-prebuild' \ + '/sheriff_utils/skylab_credential' \ + '/chromeos-swarming-credential.json' + SWARMING = 'chromite/third_party/swarming.client/swarming.py' + SUCCESS = 0 + + def __init__(self, + remotes, + force_option, + chromeos_root, + locks_dir='', + log=None): + """Initializes an LockManager object. + + Args: + remotes: A list of machine names or ip addresses to be managed. Names + and ip addresses should be represented as strings. If the list is + empty, the lock manager will get all known machines. + force_option: A Boolean indicating whether or not to force an unlock of + a machine that was locked by someone else. + chromeos_root: The ChromeOS chroot to use for the autotest scripts. + locks_dir: A directory used for file locking local devices. + log: If not None, this is the logger object to be used for writing out + informational output messages. It is expected to be an instance of + Logger class from cros_utils/logger.py. + """ + self.chromeos_root = chromeos_root + self.user = getpass.getuser() + self.logger = log or logger.GetLogger() + self.ce = command_executer.GetCommandExecuter(self.logger) + autotest_path = os.path.join(chromeos_root, + 'src/third_party/autotest/files') + + sys.path.append(chromeos_root) + sys.path.append(autotest_path) + sys.path.append(os.path.join(autotest_path, 'server', 'cros')) + + self.locks_dir = locks_dir + + # We have to wait to do these imports until the paths above have + # been fixed. + # pylint: disable=import-error + from client import setup_modules + setup_modules.setup( + base_path=autotest_path, root_module_name='autotest_lib') + + from dynamic_suite import frontend_wrappers + + self.afe = frontend_wrappers.RetryingAFE( + timeout_min=30, delay_sec=10, debug=False, server='cautotest') + + self.machines = list(set(remotes)) or [] + self.toolchain_lab_machines = self.GetAllToolchainLabMachines() + + if not self.machines: + self.machines = self.toolchain_lab_machines + self.force = force_option + + self.local_machines = [] + self.skylab_machines = [] + + def CheckMachine(self, machine, error_msg): + """Verifies that machine is responding to ping. + + Args: + machine: String containing the name or ip address of machine to check. + error_msg: Message to print if ping fails. + + Raises: + MachineNotPingable: If machine is not responding to 'ping' + """ + if not machines.MachineIsPingable(machine, logging_level='none'): + cros_machine = machine + '.cros' + if not machines.MachineIsPingable(cros_machine, logging_level='none'): + raise MachineNotPingable(error_msg) + + def GetAllToolchainLabMachines(self): + """Gets a list of all the toolchain machines in the ChromeOS HW lab. + + Returns: + A list of names of the toolchain machines in the ChromeOS HW lab. + """ + machines_file = os.path.join( + os.path.dirname(__file__), 'crosperf', 'default_remotes') + machine_list = [] + with open(machines_file, 'r') as input_file: + lines = input_file.readlines() + for line in lines: + _, remotes = line.split(':') + remotes = remotes.strip() + for r in remotes.split(): + machine_list.append(r.strip()) + return machine_list + + def GetMachineType(self, m): + """Get where the machine is located. + + Args: + m: String containing the name or ip address of machine. + + Returns: + Value of the type in MachineType Enum. + """ + if m in self.local_machines: + return MachineType.LOCAL + if m in self.skylab_machines: + return MachineType.SKYLAB + return MachineType.AFE + + def PrintStatusHeader(self): + """Prints the status header lines for machines.""" + print('\nMachine (Board)\t\t\t\t\tStatus') + print('---------------\t\t\t\t\t------') + + def PrintStatus(self, m, state, machine_type): + """Prints status for a single machine. + + Args: + m: String containing the name or ip address of machine. + state: A dictionary of the current state of the machine. + machine_type: MachineType to determine where the machine is located. + """ + if machine_type == MachineType.AFE and not m.endswith('.cros'): + m += '.cros' + if state['locked']: + print('%s (%s)\t\t%slocked by %s since %s' % + (m, state['board'], '\t\t' if machine_type == MachineType.LOCAL else + '', state['locked_by'], state['lock_time'])) + else: + print( + '%s (%s)\t\t%sunlocked' % (m, state['board'], '\t\t' if + machine_type == MachineType.LOCAL else '')) + + def AddMachineToLocal(self, machine): + """Adds a machine to local machine list. + + Args: + machine: The machine to be added. + """ + if machine not in self.local_machines: + self.local_machines.append(machine) + + def AddMachineToSkylab(self, machine): + """Adds a machine to skylab machine list. + + Args: + machine: The machine to be added. + """ + if machine not in self.skylab_machines: + self.skylab_machines.append(machine) + + def ListMachineStates(self, machine_states): + """Gets and prints the current status for a list of machines. + + Prints out the current status for all of the machines in the current + LockManager's list of machines (set when the object is initialized). + + Args: + machine_states: A dictionary of the current state of every machine in + the current LockManager's list of machines. Normally obtained by + calling LockManager::GetMachineStates. + """ + self.PrintStatusHeader() + for m in machine_states: + machine_type = self.GetMachineType(m) + state = machine_states[m] + self.PrintStatus(m, state, machine_type) + + def UpdateLockInAFE(self, should_lock_machine, machine): + """Calls an AFE server to lock/unlock a machine. + + Args: + should_lock_machine: Boolean indicating whether to lock the machine (True) + or unlock the machine (False). + machine: The machine to update. + + Returns: + True if requested action succeeded, else False. + """ + kwargs = {'locked': should_lock_machine} + if should_lock_machine: + kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user + + m = machine.split('.')[0] + afe_server = self.afe + + try: + afe_server.run( + 'modify_hosts', + host_filter_data={'hostname__in': [m]}, + update_data=kwargs) + except Exception: + return False + return True + + def UpdateLockInSkylab(self, should_lock_machine, machine): + """Ask skylab to lease/release a machine. + + Args: + should_lock_machine: Boolean indicating whether to lock the machine (True) + or unlock the machine (False). + machine: The machine to update. + + Returns: + True if requested action succeeded, else False. + """ + try: + if should_lock_machine: + ret = self.LeaseSkylabMachine(machine) + else: + ret = self.ReleaseSkylabMachine(machine) + except Exception: + return False + return ret + + def UpdateFileLock(self, should_lock_machine, machine): + """Use file lock for local machines, + + Args: + should_lock_machine: Boolean indicating whether to lock the machine (True) + or unlock the machine (False). + machine: The machine to update. + + Returns: + True if requested action succeeded, else False. + """ + try: + if should_lock_machine: + ret = file_lock_machine.Machine(machine, self.locks_dir).Lock( + True, sys.argv[0]) + else: + ret = file_lock_machine.Machine(machine, self.locks_dir).Unlock(True) + except Exception: + return False + return ret + + def UpdateMachines(self, lock_machines): + """Sets the locked state of the machines to the requested value. + + The machines updated are the ones in self.machines (specified when the + class object was intialized). + + Args: + lock_machines: Boolean indicating whether to lock the machines (True) or + unlock the machines (False). + + Returns: + A list of the machines whose state was successfully updated. + """ + updated_machines = [] + action = 'Locking' if lock_machines else 'Unlocking' + for m in self.machines: + # TODO(zhizhouy): Handling exceptions with more details when locking + # doesn't succeed. + machine_type = self.GetMachineType(m) + if machine_type == MachineType.SKYLAB: + ret = self.UpdateLockInSkylab(lock_machines, m) + elif machine_type == MachineType.LOCAL: + ret = self.UpdateFileLock(lock_machines, m) + else: + ret = self.UpdateLockInAFE(lock_machines, m) + + if ret: + self.logger.LogOutput( + '%s %s machine succeeded: %s.' % (action, machine_type.value, m)) + updated_machines.append(m) + else: + self.logger.LogOutput( + '%s %s machine failed: %s.' % (action, machine_type.value, m)) + + self.machines = updated_machines + return updated_machines + + def _InternalRemoveMachine(self, machine): + """Remove machine from internal list of machines. + + Args: + machine: Name of machine to be removed from internal list. + """ + # Check to see if machine is lab machine and if so, make sure it has + # ".cros" on the end. + cros_machine = machine + if machine.find('rack') > 0 and machine.find('row') > 0: + if machine.find('.cros') == -1: + cros_machine = cros_machine + '.cros' + + self.machines = [ + m for m in self.machines if m != cros_machine and m != machine + ] + + def CheckMachineLocks(self, machine_states, cmd): + """Check that every machine in requested list is in the proper state. + + If the cmd is 'unlock' verify that every machine is locked by requestor. + If the cmd is 'lock' verify that every machine is currently unlocked. + + Args: + machine_states: A dictionary of the current state of every machine in + the current LockManager's list of machines. Normally obtained by + calling LockManager::GetMachineStates. + cmd: The user-requested action for the machines: 'lock' or 'unlock'. + + Raises: + DontOwnLock: The lock on a requested machine is owned by someone else. + """ + for k, state in machine_states.iteritems(): + if cmd == 'unlock': + if not state['locked']: + self.logger.LogWarning('Attempt to unlock already unlocked machine ' + '(%s).' % k) + self._InternalRemoveMachine(k) + + # TODO(zhizhouy): Skylab doesn't support host info such as locked_by. + # Need to update this when skylab supports it. + if (state['locked'] and state['locked_by'] and + state['locked_by'] != self.user): + raise DontOwnLock('Attempt to unlock machine (%s) locked by someone ' + 'else (%s).' % (k, state['locked_by'])) + elif cmd == 'lock': + if state['locked']: + self.logger.LogWarning( + 'Attempt to lock already locked machine (%s)' % k) + self._InternalRemoveMachine(k) + + def GetMachineStates(self, cmd=''): + """Gets the current state of all the requested machines. + + Gets the current state of all the requested machines. Stores the data in a + dictionary keyed by machine name. + + Args: + cmd: The command for which we are getting the machine states. This is + important because if one of the requested machines is missing we raise + an exception, unless the requested command is 'add'. + + Returns: + A dictionary of machine states for all the machines in the LockManager + object. + + Raises: + NoAFEServer: Cannot find the HW Lab AFE server. + AFEAccessError: An error occurred when querying the server about a + machine. + """ + if not self.afe: + raise NoAFEServer('Error: Cannot connect to main AFE server.') + + machine_list = {} + for m in self.machines: + # For local or skylab machines, we simply set {'locked': status} for them + # TODO(zhizhouy): This is a quick fix since skylab cannot return host info + # as afe does. We need to get more info such as locked_by when skylab + # supports that. + if m in self.local_machines or m in self.skylab_machines: + values = { + 'locked': 0 if cmd == 'lock' else 1, + 'board': '??', + 'locked_by': '', + 'lock_time': '' + } + machine_list[m] = values + else: + # For autotest machines, we use afe APIs to get locking info. + mod_host = m.split('.')[0] + host_info = self.afe.get_hosts(hostname=mod_host) + if not host_info: + raise AFEAccessError('Unable to get information about %s from main' + ' autotest server.' % m) + host_info = host_info[0] + name = host_info.hostname + values = {} + values['board'] = host_info.platform if host_info.platform else '??' + values['locked'] = host_info.locked + if host_info.locked: + values['locked_by'] = host_info.locked_by + values['lock_time'] = host_info.lock_time + else: + values['locked_by'] = '' + values['lock_time'] = '' + machine_list[name] = values + + self.ListMachineStates(machine_list) + + return machine_list + + def CheckMachineInSkylab(self, machine): + """Run command to check if machine is in Skylab or not. + + Returns: + True if machine in skylab, else False + """ + credential = '' + if os.path.exists(self.SKYLAB_CREDENTIAL): + credential = '--auth-service-account-json %s' % self.SKYLAB_CREDENTIAL + swarming = os.path.join(self.chromeos_root, self.SWARMING) + cmd = (('%s query --swarming https://chromeos-swarming.appspot.com ' \ + "%s 'bots/list?is_dead=FALSE&dimensions=dut_name:%s'") % \ + (swarming, + credential, + machine.rstrip('.cros'))) + ret_tup = self.ce.RunCommandWOutput(cmd) + # The command will return a json output as stdout. If machine not in skylab + # stdout will look like this: + # { + # "death_timeout": "600", + # "now": "TIMESTAMP" + # } + # Otherwise there will be a tuple starting with 'items', we simply detect + # this keyword for result. + if 'items' not in ret_tup[1]: + return False + else: + return True + + def LeaseSkylabMachine(self, machine): + """Run command to lease dut from skylab. + + Returns: + True if succeeded, False if failed. + """ + credential = '' + if os.path.exists(self.SKYLAB_CREDENTIAL): + credential = '-service-account-json %s' % self.SKYLAB_CREDENTIAL + cmd = (('%s lease-dut -minutes %s %s %s') % \ + (self.SKYLAB_PATH, + self.LEASE_MINS, + credential, + machine.rstrip('.cros'))) + # Wait 120 seconds for server to start the lease task, if not started, + # we will treat it as unavailable. + check_interval_time = 120 + retval = self.ce.RunCommand(cmd, command_timeout=check_interval_time) + return retval == self.SUCCESS + + def ReleaseSkylabMachine(self, machine): + """Run command to release dut from skylab. + + Returns: + True if succeeded, False if failed. + """ + credential = '' + if os.path.exists(self.SKYLAB_CREDENTIAL): + credential = '-service-account-json %s' % self.SKYLAB_CREDENTIAL + cmd = (('%s release-dut %s %s') % \ + (self.SKYLAB_PATH, + credential, + machine.rstrip('.cros'))) + retval = self.ce.RunCommand(cmd) + return retval == self.SUCCESS + + +def Main(argv): + """Parse the options, initialize lock manager and dispatch proper method. + + Args: + argv: The options with which this script was invoked. + + Returns: + 0 unless an exception is raised. + """ + parser = argparse.ArgumentParser() + + parser.add_argument( + '--list', + dest='cmd', + action='store_const', + const='status', + help='List current status of all known machines.') + parser.add_argument( + '--lock', + dest='cmd', + action='store_const', + const='lock', + help='Lock given machine(s).') + parser.add_argument( + '--unlock', + dest='cmd', + action='store_const', + const='unlock', + help='Unlock given machine(s).') + parser.add_argument( + '--status', + dest='cmd', + action='store_const', + const='status', + help='List current status of given machine(s).') + parser.add_argument( + '--remote', dest='remote', help='machines on which to operate') + parser.add_argument( + '--chromeos_root', + dest='chromeos_root', + required=True, + help='ChromeOS root to use for autotest scripts.') + parser.add_argument( + '--force', + dest='force', + action='store_true', + default=False, + help='Force lock/unlock of machines, even if not' + ' current lock owner.') + + options = parser.parse_args(argv) + + if not options.remote and options.cmd != 'status': + parser.error('No machines specified for operation.') + + if not os.path.isdir(options.chromeos_root): + parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root) + + if not options.cmd: + parser.error('No operation selected (--list, --status, --lock, --unlock,' + ' --add_machine, --remove_machine).') + + machine_list = [] + if options.remote: + machine_list = options.remote.split() + + lock_manager = LockManager(machine_list, options.force, options.chromeos_root) + + machine_states = lock_manager.GetMachineStates(cmd=options.cmd) + cmd = options.cmd + + if cmd == 'status': + lock_manager.ListMachineStates(machine_states) + + elif cmd == 'lock': + if not lock_manager.force: + lock_manager.CheckMachineLocks(machine_states, cmd) + lock_manager.UpdateMachines(True) + + elif cmd == 'unlock': + if not lock_manager.force: + lock_manager.CheckMachineLocks(machine_states, cmd) + lock_manager.UpdateMachines(False) + + elif cmd == 'add': + lock_manager.AddMachinesToLocalServer() + + elif cmd == 'remove': + lock_manager.RemoveMachinesFromLocalServer() + + return 0 + + +if __name__ == '__main__': + sys.exit(Main(sys.argv[1:])) |