# Copyright 2014 The Chromium OS Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Script to discover dependencies and other file information from a build. Some files in the image are installed to provide some functionality, such as chrome, shill or bluetoothd provide different functionality that can be present or not on a given build. Many other files are dependencies from these files that need to be present in the image for them to work. These dependencies come from needed shared libraries, executed files and other configuration files read. This script currently discovers dependencies between ELF files for libraries required at load time (libraries loaded by the dynamic linker) but not libraries loaded at runtime with dlopen(). It also computes size and file type in several cases to help understand the contents of the built image. """ from __future__ import print_function import itertools import json import multiprocessing import os import stat from chromite.lib import commandline from chromite.lib import cros_logging as logging from chromite.lib import filetype from chromite.lib import parseelf from chromite.lib import portage_util from chromite.scripts import lddtree # Regex to parse Gentoo atoms. This should match the following ebuild names, # splitting the package name from the version. # without version: # chromeos-base/tty # chromeos-base/libchrome-271506 # sys-kernel/chromeos-kernel-3_8 # with version: # chromeos-base/tty-0.0.1-r4 # chromeos-base/libchrome-271506-r5 # sys-kernel/chromeos-kernel-3_8-3.8.11-r35 RE_EBUILD_WITHOUT_VERSION = r'^([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)$' RE_EBUILD_WITH_VERSION = ( r'^=?([a-z0-9\-]+/[a-zA-Z0-9\_\+\-]+)\-([^\-]+(\-r\d+)?)$') def ParseELFWithArgs(args): """Wrapper to parseelf.ParseELF accepting a single arg. This wrapper is required to use multiprocessing.Pool.map function. Returns: A 2-tuple with the passed relative path and the result of ParseELF(). On error, when ParseELF() returns None, this function returns None. """ elf = parseelf.ParseELF(*args) if elf is None: return return args[1], elf class DepTracker(object): """Tracks dependencies and file information in a root directory. This class computes dependencies and other information related to the files in the root image. """ def __init__(self, root, jobs=1): root_st = os.lstat(root) if not stat.S_ISDIR(root_st.st_mode): raise Exception('root (%s) must be a directory' % root) self._root = root.rstrip('/') + '/' self._file_type_decoder = filetype.FileTypeDecoder(root) # A wrapper to the multiprocess map function. We avoid launching a pool # of processes when jobs is 1 so python exceptions kill the main process, # useful for debugging. if jobs > 1: self._pool = multiprocessing.Pool(jobs) self._imap = self._pool.map else: self._imap = itertools.imap self._files = {} self._ebuilds = {} # Mapping of rel_paths for symlinks and hardlinks. Hardlinks are assumed # to point to the lowest lexicographically file with the same inode. self._symlinks = {} self._hardlinks = {} def Init(self): """Generates the initial list of files.""" # First iteration over all the files in root searching for symlinks and # non-regular files. seen_inodes = {} for basepath, _, filenames in sorted(os.walk(self._root)): for filename in sorted(filenames): full_path = os.path.join(basepath, filename) rel_path = full_path[len(self._root):] st = os.lstat(full_path) file_data = { 'size': st.st_size, } self._files[rel_path] = file_data # Track symlinks. if stat.S_ISLNK(st.st_mode): link_path = os.readlink(full_path) # lddtree's normpath handles a little more cases than the os.path # version. In particular, it handles the '//' case. self._symlinks[rel_path] = ( link_path.lstrip('/') if link_path and link_path[0] == '/' else lddtree.normpath(os.path.join(os.path.dirname(rel_path), link_path))) file_data['deps'] = { 'symlink': [self._symlinks[rel_path]] } # Track hardlinks. if st.st_ino in seen_inodes: self._hardlinks[rel_path] = seen_inodes[st.st_ino] continue seen_inodes[st.st_ino] = rel_path def SaveJSON(self, filename): """Save the computed information to a JSON file. Args: filename: The destination JSON file. """ data = { 'files': self._files, 'ebuilds': self._ebuilds, } json.dump(data, open(filename, 'w')) def ComputeEbuildDeps(self, sysroot): """Compute the dependencies between ebuilds and files. Iterates over the list of ebuilds in the database and annotates the files with the ebuilds they are in. For each ebuild installing a file in the root, also compute the direct dependencies. Stores the information internally. Args: sysroot: The path to the sysroot, for example "/build/link". """ portage_db = portage_util.PortageDB(sysroot) if not os.path.exists(portage_db.db_path): logging.warning('PortageDB directory not found: %s', portage_db.db_path) return for pkg in portage_db.InstalledPackages(): pkg_files = [] pkg_size = 0 cpf = '%s/%s' % (pkg.category, pkg.pf) for typ, rel_path in pkg.ListContents(): # We ignore other entries like for example "dir". if not typ in (pkg.OBJ, pkg.SYM): continue # We ignore files installed in the SYSROOT that weren't copied to the # image. if not rel_path in self._files: continue pkg_files.append(rel_path) file_data = self._files[rel_path] if 'ebuild' in file_data: logging.warning('Duplicated entry for %s: %s and %', rel_path, file_data['ebuild'], cpf) file_data['ebuild'] = cpf pkg_size += file_data['size'] # Ignore packages that don't install any file. if not pkg_files: continue self._ebuilds[cpf] = { 'size': pkg_size, 'files': len(pkg_files), 'atom': '%s/%s' % (pkg.category, pkg.package), 'version': pkg.version, } # TODO(deymo): Parse dependencies between ebuilds. def ComputeELFFileDeps(self): """Computes the dependencies between files. Computes the dependencies between the files in the root directory passed during construction. The dependencies are inferred for ELF files. The list of dependencies for each file in the passed rootfs as a dict(). The result's keys are the relative path of the files and the value of each file is a list of dependencies. A dependency is a tuple (dep_path, dep_type) where the dep_path is relative path from the passed root to the dependent file and dep_type is one the following strings stating how the dependency was discovered: 'ldd': The dependent ELF file is listed as needed in the dynamic section. 'symlink': The dependent file is a symlink to the depending. If there are dependencies of a given type whose target file wasn't determined, a tuple (None, dep_type) is included. This is the case for example is a program uses library that wasn't found. """ ldpaths = lddtree.LoadLdpaths(self._root) # First iteration over all the files in root searching for symlinks and # non-regular files. parseelf_args = [] for rel_path, file_data in self._files.iteritems(): if rel_path in self._symlinks or rel_path in self._hardlinks: continue full_path = os.path.join(self._root, rel_path) st = os.lstat(full_path) if not stat.S_ISREG(st.st_mode): continue parseelf_args.append((self._root, rel_path, ldpaths)) # Parallelize the ELF lookup step since it is quite expensive. elfs = dict(x for x in self._imap(ParseELFWithArgs, parseelf_args) if not x is None) for rel_path, elf in elfs.iteritems(): file_data = self._files[rel_path] # Fill in the ftype if not set yet. We complete this value at this point # to avoid re-parsing the ELF file later. if not 'ftype' in file_data: ftype = self._file_type_decoder.GetType(rel_path, elf=elf) if ftype: file_data['ftype'] = ftype file_deps = file_data.get('deps', {}) # Dependencies based on the result of ldd. for lib in elf.get('needed', []): lib_path = elf['libs'][lib]['path'] if not 'ldd' in file_deps: file_deps['ldd'] = [] file_deps['ldd'].append(lib_path) if file_deps: file_data['deps'] = file_deps def ComputeFileTypes(self): """Computes all the missing file type for the files in the root.""" for rel_path, file_data in self._files.iteritems(): if 'ftype' in file_data: continue ftype = self._file_type_decoder.GetType(rel_path) if ftype: file_data['ftype'] = ftype def ParseArgs(argv): """Return parsed commandline arguments.""" parser = commandline.ArgumentParser() parser.add_argument( '-j', '--jobs', type=int, default=multiprocessing.cpu_count(), help='number of simultaneous jobs.') parser.add_argument( '--sysroot', type='path', metavar='SYSROOT', help='parse portage DB for ebuild information from the provided sysroot.') parser.add_argument( '--json', type='path', help='store information in JSON file.') parser.add_argument( 'root', type='path', help='path to the directory where the rootfs is mounted.') opts = parser.parse_args(argv) opts.Freeze() return opts def main(argv): """Main function to start the script.""" opts = ParseArgs(argv) logging.debug('Options are %s', opts) dt = DepTracker(opts.root, jobs=opts.jobs) dt.Init() dt.ComputeELFFileDeps() dt.ComputeFileTypes() if opts.sysroot: dt.ComputeEbuildDeps(opts.sysroot) if opts.json: dt.SaveJSON(opts.json)