diff options
Diffstat (limited to 'afe_lock_machine.py')
-rwxr-xr-x | afe_lock_machine.py | 658 |
1 files changed, 658 insertions, 0 deletions
diff --git a/afe_lock_machine.py b/afe_lock_machine.py new file mode 100755 index 00000000..125ac971 --- /dev/null +++ b/afe_lock_machine.py @@ -0,0 +1,658 @@ +#!/usr/bin/python2 +# +# Copyright 2015 Google INc. All Rights Reserved. +"""This module controls locking and unlocking of test machines.""" + +from __future__ import print_function + +import argparse +import getpass +import os +import sys +import traceback + +from cros_utils import logger +from cros_utils import machines + + +class AFELockException(Exception): + """Base class for exceptions in this module.""" + + +class MachineNotPingable(AFELockException): + """Raised when machine does not respond to ping.""" + + +class MissingHostInfo(AFELockException): + """Raised when cannot find info about machine on machine servers.""" + + +class UpdateNonLocalMachine(AFELockException): + """Raised when user requests to add/remove a ChromeOS HW Lab machine..""" + + +class DuplicateAdd(AFELockException): + """Raised when user requests to add a machine that's already on the server.""" + + +class UpdateServerError(AFELockException): + """Raised when attempt to add/remove a machine from local server fails.""" + + +class LockingError(AFELockException): + """Raised when server fails to lock/unlock machine as requested.""" + + +class DontOwnLock(AFELockException): + """Raised when user attmepts to unlock machine locked by someone else.""" + # This should not be raised if the user specified '--force' + + +class NoAFEServer(AFELockException): + """Raised when cannot find/access the autotest server.""" + + +class AFEAccessError(AFELockException): + """Raised when cannot get information about lab machine from lab server.""" + + +class AFELockManager(object): + """Class for locking/unlocking machines vie Autotest Front End servers. + + This class contains methods for checking the locked status of machines + on both the ChromeOS HW Lab AFE server and a local AFE server. It also + has methods for adding/removing machines from the local server, and for + changing the lock status of machines on either server. For the ChromeOS + HW Lab, it only allows access to the toolchain team lab machines, as + defined in toolchain-utils/crosperf/default_remotes. By default it will + look for a local server on chrotomation2.mtv.corp.google.com, but an + alternative local AFE server can be supplied, if desired. + + !!!IMPORTANT NOTE!!! The AFE server can only be called from the main + thread/process of a program. If you launch threads and try to call it + from a thread, you will get an error. This has to do with restrictions + in the Python virtual machine (and signal handling) and cannot be changed. + """ + + LOCAL_SERVER = 'chrotomation2.mtv.corp.google.com' + + def __init__(self, + remotes, + force_option, + chromeos_root, + local_server, + use_local=True, + log=None): + """Initializes an AFELockManager object. + + Args: + remotes: A list of machine names or ip addresses to be managed. Names + and ip addresses should be represented as strings. If the list is + empty, the lock manager will get all known machines. + force_option: A Boolean indicating whether or not to force an unlock of + a machine that was locked by someone else. + chromeos_root: The ChromeOS chroot to use for the autotest scripts. + local_server: A string containing the name or ip address of the machine + that is running an AFE server, which is to be used for managing + machines that are not in the ChromeOS HW lab. + local: A Boolean indicating whether or not to use/allow a local AFE + server to be used (see local_server argument). + log: If not None, this is the logger object to be used for writing out + informational output messages. It is expected to be an instance of + Logger class from cros_utils/logger.py. + """ + self.chromeos_root = chromeos_root + self.user = getpass.getuser() + self.logger = log or logger.GetLogger() + autotest_path = os.path.join(chromeos_root, + 'src/third_party/autotest/files') + + sys.path.append(chromeos_root) + sys.path.append(autotest_path) + sys.path.append(os.path.join(autotest_path, 'server', 'cros')) + + # We have to wait to do these imports until the paths above have + # been fixed. + # pylint: disable=import-error + from client import setup_modules + setup_modules.setup( + base_path=autotest_path, root_module_name='autotest_lib') + + from dynamic_suite import frontend_wrappers + + self.afe = frontend_wrappers.RetryingAFE( + timeout_min=30, delay_sec=10, debug=False, server='cautotest') + + self.local = use_local + self.machines = list(set(remotes)) or [] + self.toolchain_lab_machines = self.GetAllToolchainLabMachines() + if self.machines and self.AllLabMachines(): + self.local = False + + if not self.local: + self.local_afe = None + else: + dargs = {} + dargs['server'] = local_server or AFELockManager.LOCAL_SERVER + # Make sure local server is pingable. + error_msg = ('Local autotest server machine %s not responding to ping.' % + dargs['server']) + self.CheckMachine(dargs['server'], error_msg) + self.local_afe = frontend_wrappers.RetryingAFE( + timeout_min=30, delay_sec=10, debug=False, **dargs) + if not self.machines: + self.machines = self.toolchain_lab_machines + self.GetAllNonlabMachines() + self.force = force_option + + def AllLabMachines(self): + """Check to see if all machines being used are HW Lab machines.""" + all_lab = True + for m in self.machines: + if m not in self.toolchain_lab_machines: + all_lab = False + break + return all_lab + + def CheckMachine(self, machine, error_msg): + """Verifies that machine is responding to ping. + + Args: + machine: String containing the name or ip address of machine to check. + error_msg: Message to print if ping fails. + + Raises: + MachineNotPingable: If machine is not responding to 'ping' + """ + if not machines.MachineIsPingable(machine, logging_level='none'): + cros_machine = machine + '.cros' + if not machines.MachineIsPingable(cros_machine, logging_level='none'): + raise MachineNotPingable(error_msg) + + def MachineIsKnown(self, machine): + """Checks to see if either AFE server knows the given machine. + + Args: + machine: String containing name or ip address of machine to check. + + Returns: + Boolean indicating if the machine is in the list of known machines for + either AFE server. + """ + if machine in self.toolchain_lab_machines: + return True + elif self.local_afe and machine in self.GetAllNonlabMachines(): + return True + + return False + + def GetAllToolchainLabMachines(self): + """Gets a list of all the toolchain machines in the ChromeOS HW lab. + + Returns: + A list of names of the toolchain machines in the ChromeOS HW lab. + """ + machines_file = os.path.join( + os.path.dirname(__file__), 'crosperf', 'default_remotes') + machine_list = [] + with open(machines_file, 'r') as input_file: + lines = input_file.readlines() + for line in lines: + _, remotes = line.split(':') + remotes = remotes.strip() + for r in remotes.split(): + machine_list.append(r.strip()) + return machine_list + + def GetAllNonlabMachines(self): + """Gets a list of all known machines on the local AFE server. + + Returns: + A list of the names of the machines on the local AFE server. + """ + non_lab_machines = [] + if self.local_afe: + non_lab_machines = self.local_afe.get_hostnames() + return non_lab_machines + + def PrintStatusHeader(self, is_lab_machine): + """Prints the status header lines for machines. + + Args: + is_lab_machine: Boolean indicating whether to print HW Lab header or + local machine header (different spacing). + """ + if is_lab_machine: + print('\nMachine (Board)\t\t\t\t\tStatus') + print('---------------\t\t\t\t\t------\n') + else: + print('\nMachine (Board)\t\tStatus') + print('---------------\t\t------\n') + + def RemoveLocalMachine(self, m): + """Removes a machine from the local AFE server. + + Args: + m: The machine to remove. + + Raises: + MissingHostInfo: Can't find machine to be removed. + """ + if self.local_afe: + host_info = self.local_afe.get_hosts(hostname=m) + if host_info: + host_info = host_info[0] + host_info.delete() + else: + raise MissingHostInfo('Cannot find/delete machine %s.' % m) + + def AddLocalMachine(self, m): + """Adds a machine to the local AFE server. + + Args: + m: The machine to be added. + """ + if self.local_afe: + error_msg = 'Machine %s is not responding to ping.' % m + self.CheckMachine(m, error_msg) + self.local_afe.create_host(m) + + def AddMachinesToLocalServer(self): + """Adds one or more machines to the local AFE server. + + Verify that the requested machines are legal to add to the local server, + i.e. that they are not ChromeOS HW lab machines, and they are not already + on the local server. Call AddLocalMachine for each valid machine. + + Raises: + DuplicateAdd: Attempt to add a machine that is already on the server. + UpdateNonLocalMachine: Attempt to add a ChromeOS HW lab machine. + UpdateServerError: Something went wrong while attempting to add a + machine. + """ + for m in self.machines: + for cros_name in [m, m + '.cros']: + if cros_name in self.toolchain_lab_machines: + raise UpdateNonLocalMachine('Machine %s is already in the ChromeOS HW' + 'Lab. Cannot add it to local server.' % + cros_name) + host_info = self.local_afe.get_hosts(hostname=m) + if host_info: + raise DuplicateAdd('Machine %s is already on the local server.' % m) + try: + self.AddLocalMachine(m) + self.logger.LogOutput('Successfully added %s to local server.' % m) + except Exception as e: + traceback.print_exc() + raise UpdateServerError( + 'Error occurred while attempting to add %s. %s' % (m, str(e))) + + def RemoveMachinesFromLocalServer(self): + """Removes one or more machines from the local AFE server. + + Verify that the requested machines are legal to remove from the local + server, i.e. that they are not ChromeOS HW lab machines. Call + RemoveLocalMachine for each valid machine. + + Raises: + UpdateServerError: Something went wrong while attempting to remove a + machine. + """ + for m in self.machines: + for cros_name in [m, m + '.cros']: + if cros_name in self.toolchain_lab_machines: + raise UpdateNonLocalMachine( + 'Machine %s is in the ChromeOS HW Lab. ' + 'This script cannot remove lab machines.' % cros_name) + try: + self.RemoveLocalMachine(m) + self.logger.LogOutput('Successfully removed %s from local server.' % m) + except Exception as e: + traceback.print_exc() + raise UpdateServerError('Error occurred while attempting to remove %s ' + '(%s).' % (m, str(e))) + + def ListMachineStates(self, machine_states): + """Gets and prints the current status for a list of machines. + + Prints out the current status for all of the machines in the current + AFELockManager's list of machines (set when the object is initialized). + + Args: + machine_states: A dictionary of the current state of every machine in + the current AFELockManager's list of machines. Normally obtained by + calling AFELockManager::GetMachineStates. + """ + local_machines = [] + printed_hdr = False + for m in machine_states: + cros_name = m + '.cros' + if (m in self.toolchain_lab_machines or + cros_name in self.toolchain_lab_machines): + name = m if m in self.toolchain_lab_machines else cros_name + if not printed_hdr: + self.PrintStatusHeader(True) + printed_hdr = True + state = machine_states[m] + if state['locked']: + print('%s (%s)\tlocked by %s since %s' % + (name, state['board'], state['locked_by'], state['lock_time'])) + else: + print('%s (%s)\tunlocked' % (name, state['board'])) + else: + local_machines.append(m) + + if local_machines: + self.PrintStatusHeader(False) + for m in local_machines: + state = machine_states[m] + if state['locked']: + print('%s (%s)\tlocked by %s since %s' % + (m, state['board'], state['locked_by'], state['lock_time'])) + else: + print('%s (%s)\tunlocked' % (m, state['board'])) + + def UpdateLockInAFE(self, should_lock_machine, machine): + """Calls an AFE server to lock/unlock a machine. + + Args: + should_lock_machine: Boolean indicating whether to lock the machine (True) + or unlock the machine (False). + machine: The machine to update. + + Raises: + LockingError: An error occurred while attempting to update the machine + state. + """ + action = 'lock' + if not should_lock_machine: + action = 'unlock' + kwargs = {'locked': should_lock_machine} + kwargs['lock_reason'] = 'toolchain user request (%s)' % self.user + + cros_name = machine + '.cros' + if cros_name in self.toolchain_lab_machines: + machine = cros_name + if machine in self.toolchain_lab_machines: + m = machine.split('.')[0] + afe_server = self.afe + else: + m = machine + afe_server = self.local_afe + + try: + afe_server.run('modify_hosts', + host_filter_data={'hostname__in': [m]}, + update_data=kwargs) + except Exception as e: + traceback.print_exc() + raise LockingError('Unable to %s machine %s. %s' % (action, m, str(e))) + + def UpdateMachines(self, lock_machines): + """Sets the locked state of the machines to the requested value. + + The machines updated are the ones in self.machines (specified when the + class object was intialized). + + Args: + lock_machines: Boolean indicating whether to lock the machines (True) or + unlock the machines (False). + + Returns: + A list of the machines whose state was successfully updated. + """ + updated_machines = [] + for m in self.machines: + self.UpdateLockInAFE(lock_machines, m) + # Since we returned from self.UpdateLockInAFE we assume the request + # succeeded. + if lock_machines: + self.logger.LogOutput('Locked machine(s) %s.' % m) + else: + self.logger.LogOutput('Unlocked machine(s) %s.' % m) + updated_machines.append(m) + + return updated_machines + + def _InternalRemoveMachine(self, machine): + """Remove machine from internal list of machines. + + Args: + machine: Name of machine to be removed from internal list. + """ + # Check to see if machine is lab machine and if so, make sure it has + # ".cros" on the end. + cros_machine = machine + if machine.find('rack') > 0 and machine.find('row') > 0: + if machine.find('.cros') == -1: + cros_machine = cros_machine + '.cros' + + self.machines = [m for m in self.machines + if m != cros_machine and m != machine] + + def CheckMachineLocks(self, machine_states, cmd): + """Check that every machine in requested list is in the proper state. + + If the cmd is 'unlock' verify that every machine is locked by requestor. + If the cmd is 'lock' verify that every machine is currently unlocked. + + Args: + machine_states: A dictionary of the current state of every machine in + the current AFELockManager's list of machines. Normally obtained by + calling AFELockManager::GetMachineStates. + cmd: The user-requested action for the machines: 'lock' or 'unlock'. + + Raises: + DontOwnLock: The lock on a requested machine is owned by someone else. + """ + for k, state in machine_states.iteritems(): + if cmd == 'unlock': + if not state['locked']: + self.logger.LogWarning('Attempt to unlock already unlocked machine ' + '(%s).' % k) + self._InternalRemoveMachine(k) + + if state['locked'] and state['locked_by'] != self.user: + raise DontOwnLock('Attempt to unlock machine (%s) locked by someone ' + 'else (%s).' % (k, state['locked_by'])) + elif cmd == 'lock': + if state['locked']: + self.logger.LogWarning('Attempt to lock already locked machine (%s)' % + k) + self._InternalRemoveMachine(k) + + def HasAFEServer(self, local): + """Verifies that the AFELockManager has appropriate AFE server. + + Args: + local: Boolean indicating whether we are checking for the local server + (True) or for the global server (False). + + Returns: + A boolean indicating if the AFELockManager has the requested AFE server. + """ + if local: + return self.local_afe is not None + else: + return self.afe is not None + + def GetMachineStates(self, cmd=''): + """Gets the current state of all the requested machines. + + Gets the current state of all the requested machines, both from the HW lab + sever and from the local server. Stores the data in a dictionary keyed + by machine name. + + Args: + cmd: The command for which we are getting the machine states. This is + important because if one of the requested machines is missing we raise + an exception, unless the requested command is 'add'. + + Returns: + A dictionary of machine states for all the machines in the AFELockManager + object. + + Raises: + NoAFEServer: Cannot find the HW Lab or local AFE server. + AFEAccessError: An error occurred when querying the server about a + machine. + """ + if not self.HasAFEServer(False): + raise NoAFEServer('Error: Cannot connect to main AFE server.') + + if self.local and not self.HasAFEServer(True): + raise NoAFEServer('Error: Cannot connect to local AFE server.') + + machine_list = {} + for m in self.machines: + host_info = None + cros_name = m + '.cros' + if (m in self.toolchain_lab_machines or + cros_name in self.toolchain_lab_machines): + mod_host = m.split('.')[0] + host_info = self.afe.get_hosts(hostname=mod_host) + if not host_info: + raise AFEAccessError('Unable to get information about %s from main' + ' autotest server.' % m) + else: + host_info = self.local_afe.get_hosts(hostname=m) + if not host_info and cmd != 'add': + raise AFEAccessError('Unable to get information about %s from ' + 'local autotest server.' % m) + if host_info: + host_info = host_info[0] + name = host_info.hostname + values = {} + values['board'] = host_info.platform if host_info.platform else '??' + values['locked'] = host_info.locked + if host_info.locked: + values['locked_by'] = host_info.locked_by + values['lock_time'] = host_info.lock_time + else: + values['locked_by'] = '' + values['lock_time'] = '' + machine_list[name] = values + else: + machine_list[m] = {} + return machine_list + + +def Main(argv): + """Parse the options, initialize lock manager and dispatch proper method. + + Args: + argv: The options with which this script was invoked. + + Returns: + 0 unless an exception is raised. + """ + parser = argparse.ArgumentParser() + + parser.add_argument( + '--list', + dest='cmd', + action='store_const', + const='status', + help='List current status of all known machines.') + parser.add_argument( + '--lock', + dest='cmd', + action='store_const', + const='lock', + help='Lock given machine(s).') + parser.add_argument( + '--unlock', + dest='cmd', + action='store_const', + const='unlock', + help='Unlock given machine(s).') + parser.add_argument( + '--status', + dest='cmd', + action='store_const', + const='status', + help='List current status of given machine(s).') + parser.add_argument( + '--add_machine', + dest='cmd', + action='store_const', + const='add', + help='Add machine to local machine server.') + parser.add_argument( + '--remove_machine', + dest='cmd', + action='store_const', + const='remove', + help='Remove machine from the local machine server.') + parser.add_argument( + '--nolocal', + dest='local', + action='store_false', + default=True, + help='Do not try to use local machine server.') + parser.add_argument( + '--remote', dest='remote', help='machines on which to operate') + parser.add_argument( + '--chromeos_root', + dest='chromeos_root', + required=True, + help='ChromeOS root to use for autotest scripts.') + parser.add_argument( + '--local_server', + dest='local_server', + default=None, + help='Alternate local autotest server to use.') + parser.add_argument( + '--force', + dest='force', + action='store_true', + default=False, + help='Force lock/unlock of machines, even if not' + ' current lock owner.') + + options = parser.parse_args(argv) + + if not options.remote and options.cmd != 'status': + parser.error('No machines specified for operation.') + + if not os.path.isdir(options.chromeos_root): + parser.error('Cannot find chromeos_root: %s.' % options.chromeos_root) + + if not options.cmd: + parser.error('No operation selected (--list, --status, --lock, --unlock,' + ' --add_machine, --remove_machine).') + + machine_list = [] + if options.remote: + machine_list = options.remote.split() + + lock_manager = AFELockManager(machine_list, options.force, + options.chromeos_root, options.local_server, + options.local) + + machine_states = lock_manager.GetMachineStates(cmd=options.cmd) + cmd = options.cmd + + if cmd == 'status': + lock_manager.ListMachineStates(machine_states) + + elif cmd == 'lock': + if not lock_manager.force: + lock_manager.CheckMachineLocks(machine_states, cmd) + lock_manager.UpdateMachines(True) + + elif cmd == 'unlock': + if not lock_manager.force: + lock_manager.CheckMachineLocks(machine_states, cmd) + lock_manager.UpdateMachines(False) + + elif cmd == 'add': + lock_manager.AddMachinesToLocalServer() + + elif cmd == 'remove': + lock_manager.RemoveMachinesFromLocalServer() + + return 0 + + +if __name__ == '__main__': + sys.exit(Main(sys.argv[1:])) |