From 59316699f5407c52d6d8846ab882715c42bbc567 Mon Sep 17 00:00:00 2001 From: Ramon Pantin Date: Thu, 5 Dec 2019 00:25:03 -0800 Subject: add kmi_defines.py program This is the first half of the reimplementation of the LKA tools into a single Python 3 script: kmi_defines.py. kmi_defines is supposed to extract #define compile time constants from a Linux build. This is work in progress. Bug: 139885116 Test: manually, all code passes pylint3, ptype, and formatted with yapf Change-Id: I0cdda58aa8471b8894a0d9272f8c08d3b6cd6836 Signed-off-by: Ramon Pantin Signed-off-by: Matthias Maennich --- abi/kmi_defines.py | 644 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 644 insertions(+) create mode 100755 abi/kmi_defines.py diff --git a/abi/kmi_defines.py b/abi/kmi_defines.py new file mode 100755 index 00000000..024ff3e5 --- /dev/null +++ b/abi/kmi_defines.py @@ -0,0 +1,644 @@ +#!/usr/bin/env python3 +"""kmi_defines extract #define compile time constants from a Linux build. + +The kmi_defines tool is used to examine the output of a Linux build +and extract from it C #define statements that define compile time +constant expressions for the purpose of tracking them as part of the +KMI (Kernel Module Interface) so that changes to their values can be +prevented so as to ensure a constant KMI for kernel modules for the +AOSP GKI Linux kernel project. + +This code is python3 only, it does not require any from __future__ +imports. This is a standalone program, it is not meant to be used as +a module by other programs. + +This program runs under the multiprocessing module. Work done within +a multiprocessing.Pool does not perform error logging or affects any +state other than the value that it computes and returns via the function +mapped through the pool's map() function. The reason that no external +state is affected (for example error loggiing) is to avoid to have to +even think about what concurrent updates would cause to shuch a facility. +""" + +# TODO(pantin): per Matthias review feedback: "drop the .py from the +# filename after(!) the review has completed. As last action. Until +# then we can have the syntax highlighting here in Gerrit." + +import argparse +import logging +import multiprocessing +import os +import pathlib +import re +import subprocess +import sys +from typing import List, Tuple, Iterable +from typing import Set # pytype needs this, pylint: disable=unused-import + +COMPILER = "clang" # TODO(pantin): should be determined at run-time +DEBUG = True # TODO(pantin): should be a program argument +INDENT = 4 # number of spaces to indent for each depth level +PROGRAM = os.path.basename(sys.argv[0]) + + +class StopError(Exception): + """Exception raised to stop work when an unexpected error occurs.""" + + +def dump(this) -> None: + """Dump the data in this. + + This is for debugging purposes, it does not handle every type, only + the types used by the underlying code are handled. This will not be + part of the final code, or if it is, it will be significantly enhanced + or replaced by some other introspection mechanism to serialize data. + """ + def dump_this(this, name: str, depth: int) -> None: + """Dump the data in this.""" + if name: + name += " = " + if isinstance(this, str): + indent = " " * (depth * INDENT) + print(indent + name + this) + elif isinstance(this, bool): + indent = " " * (depth * INDENT) + print(indent + name + str(this)) + elif isinstance(this, List): + dump_list(this, name, depth) + elif isinstance(this, Set): + dump_set(this, name, depth) + else: + dump_object(this, name, depth) + + def dump_list(lst: List[str], name: str, depth: int) -> None: + """Dump the data in lst.""" + indent = " " * (depth * INDENT) + print(indent + name + "{") + index = 0 + for entry in lst: + dump_this(entry, f"[{index}]", depth + 1) + index += 1 + print(indent + "}") + + def dump_set(aset: Set[str], name: str, depth: int) -> None: + """Dump the data in aset.""" + lst = list(aset) + lst.sort() + dump_list(lst, name, depth) + + def dump_object(this, name: str, depth: int) -> None: + """Dump the data in this.""" + indent = " " * (depth * INDENT) + print(indent + name + + re.sub(r"(^$)", "", str(type(this))) + " {") + for key, val in this.__dict__.items(): + dump_this(val, key, depth + 1) + print(indent + "}") + + dump_this(this, "", 0) + + +def readfile(name: str) -> str: + """Open a file and return its contents in a string as its value.""" + try: + with open(name) as file: + return file.read() + except OSError as os_error: + raise StopError("readfile() failed for: " + name + "\n" + "original OSError: " + str(os_error.args)) + + +def file_must_exist(file: str) -> None: + """If file is invalid print raise a StopError.""" + if not os.path.exists(file): + raise StopError("file does not exist: " + file) + if not os.path.isfile(file): + raise StopError("file is not a regular file: " + file) + + +def makefile_depends_get_dependencies(depends: str) -> List[str]: + """Return list with the dependencies of a makefile target. + + Split the makefile depends specification, the name of the dependent is + followed by ":" its dependencies follow the ":". There could be spaces + around the ":". Line continuation characters, i.e. "\" are consumed by + the regular expression that splits the specification. + + This results in a list with the dependent first, and its dependencies + in the remainder of the list, return everything in the list other than + the first element. + """ + return re.split(r"[:\s\\]+", re.sub(r"[\s\\]*\Z", "", depends))[1:] + + +def makefile_assignment_split(assignment: str) -> Tuple[str, str]: + """Split left:=right into a tuple with the left and right parts. + + Spaces around the := are also removed. + """ + result = re.split(r"\s*:=\s*", assignment, maxsplit=1) + if len(result) != 2: + raise StopError( + "expected: 'left:=right' in: " + + assignment) + return result[0], result[1] # left, right + + +def extract_c_src(obj: str, dependencies: List[str]) -> Tuple[str, List[str]]: + """Separate the C source file from the other dependencies. + + Returns a tuple with the C source code file and a list with the remaining + dependencies. If the source file is not a C source file: + return "", [] + an ealier implementation used: + returne None, [] + but that caused type errors from ptype. Instead of obfuscating the type + of the value of this function, returning "", [] is appropriate. + """ + if not dependencies: + raise StopError("empty dependencies for: " + obj) + src = dependencies[0] + if not src.endswith(".c"): + return "", [] + return src, dependencies[1:] + + +def lines_to_list(lines: str) -> List[str]: + """Split a string into a list of non-empty lines.""" + return [line for line in lines.strip().splitlines() if line] + + +def lines_get_first_line(lines: str) -> str: + """Return the first non-empty line in lines.""" + return lines.strip().splitlines()[0] + + +def shell_line_to_o_files_list(line: str) -> List[str]: + """Return a list of .o files in the files list.""" + return [entry for entry in line.split() if entry.endswith(".o")] + + +class KernelModule: + """A kernel module, i.e. a *.ko file.""" + def __init__(self, kofile: str) -> None: + """Construct a KernelModule object.""" + # An example argument is used below, assuming kofile is: + # possibly/empty/dirs/modname.ko + # + # Meant to refer to this module, shown here relative to the top of + # the build directory: + # drivers/usb/gadget/udc/modname.ko + # the values assigned to the members are shown in the comments below. + + self._file = os.path.realpath(kofile) # /abs/dirs/modname.ko + self._base = os.path.basename(self._file) # modname.ko + self._directory = os.path.dirname(self._file) # /abs/dirs + self._cmd_file = os.path.join(self._directory, + "." + self._base + ".cmd") + self._cmd_text = readfile(self._cmd_file) + + # Some builds append a '; true' to the .modname.ko.cmd, remove it + + self._cmd_text = re.sub(r";\s*true\s*$", "", self._cmd_text) + + # The modules .modname.ko.cmd file contains a makefile snippet, + # for example: + # cmd_drivers/usb/gadget/udc/dummy_hcd.ko := ld.lld -r ... + # + # Split the string prior to the spaces followed by ":=", and get + # the first element of the resulting list. If the string was not + # split (because it did not contain a ":=" then the input string + # is returned, by the re.sub() below, as the only element of the list. + + left, _ = makefile_assignment_split(self._cmd_text) + self._rel_file = re.sub(r"^cmd_", "", left) + if self._rel_file == left: + raise StopError("expected: 'cmd_' at start of content of: " + + self._cmd_file) + + base = os.path.basename(self._rel_file) + if base != self._base: + raise StopError("module name mismatch: " + base + " vs " + + self._base) + + self._rel_dir = os.path.dirname(self._rel_file) + + # The final step in the build of kernel modules is based on two .o + # files, one with the module name followed by .o and another followed + # by .mod.o + # + # The following test verifies that assumption, in case a module is + # built differently in the future. + # + # Even when there are multiple source files, the .o files that result + # from compiling them are all linked into a single .o file through an + # intermediate link step, that .o files is named: + # os.path.join(self._rel_dir, kofile_name + ".o") + + kofile_name, _ = os.path.splitext(self._base) + objs = shell_line_to_o_files_list(self._cmd_text) + objs.sort() + expected = [ # sorted, i.e.: .mod.o < .o + os.path.join(self._rel_dir, kofile_name + ".mod.o"), + os.path.join(self._rel_dir, kofile_name + ".o") + ] + if objs != expected: + raise StopError("unexpected .o files in: " + self._cmd_file) + + def get_build_dir(self) -> str: + """Return the top level build directory. + + I.e. the directory where the output of the Linux build is stored. + + Note that this, like pretty much all the code, can raise an exception, + by construction, if an exception is raised while an object is being + constructed, or after it is constructed, the object will not be used + thereafter (at least not any object explicitly created by this + program). Many other places, for example the ones that call readfile() + can raise exceptions, the code is located where it belongs. + + In this specific case, the computation of index, and the derived + invariant that it be >= 0, is predicated by the condition checked + below, if the exception is not raised, then index is >= 0. + """ + if not self._file.endswith(self._rel_file): + raise StopError("could not find: " + self._rel_file + + " at end of: " + self._file) + index = len(self._file) - len(self._rel_file) + if index > 0 and self._file[index - 1] == os.sep: + index -= 1 + build_dir = self._file[0:index] + return build_dir + + def get_object_files(self, build_dir: str) -> List[str]: + """Return a list object files that used to link the kernel module. + + The ocmd_file is the file with extension ".o.cmd" (see below). + If the ocmd_file has a more than one line in it, its because the + module is made of a single source file and the ocmd_file has the + compilation rule and dependencies to build it. If it has a single + line single line it is because it builds the .o file by linking + multiple .o files. + """ + + kofile_name, _ = os.path.splitext(self._base) + ocmd_file = os.path.join(build_dir, self._rel_dir, + "." + kofile_name + ".o.cmd") + ocmd_content = readfile(ocmd_file) + + olines = lines_to_list(ocmd_content) + if len(olines) > 1: # module made from a single .o file + return [os.path.join(build_dir, self._rel_dir, kofile_name + ".o")] + + # Multiple .o files in the module + + _, ldline = makefile_assignment_split(olines[0]) + return [ + os.path.realpath(os.path.join(build_dir, obj)) + for obj in shell_line_to_o_files_list(ldline) + ] + + +class Kernel: + """The Linux kernel component itself, i.e. vmlinux.o.""" + def __init__(self, kernel: str) -> None: + """Construct a Kernel object.""" + self._kernel = os.path.realpath(kernel) + self._build_dir = os.path.dirname(self._kernel) + libs = os.path.join(self._build_dir, "vmlinux.libs") + objs = os.path.join(self._build_dir, "vmlinux.objs") + file_must_exist(libs) + file_must_exist(objs) + contents = readfile(libs) + archives_and_objects = contents.split() + contents = readfile(objs) + archives_and_objects += contents.split() + self._archives_and_objects = [(os.path.join(self._build_dir, file) + if not os.path.isabs(file) else file) + for file in archives_and_objects] + + def get_build_dir(self) -> str: + """Return the top level build directory. + + I.e. the directory where the output of the Linux build is stored. + """ + return self._build_dir + + def get_object_files(self, build_dir: str) -> List[str]: + """Return a list object files that where used to link the kernel.""" + olist = [] + for file in self._archives_and_objects: + if file.endswith(".o"): + if not os.path.isabs(file): + file = os.path.join(build_dir, file) + olist.append(os.path.realpath(file)) + continue + + if not file.endswith(".a"): + raise StopError("unknown file type: " + file) + + try: + # This argument does not always work: check=False + # neither that nor: check=True prevents an exception from + # being raised if "ar" can not be found + completion = subprocess.run(["ar", "t", file], + capture_output=True, + text=True) + if completion.returncode != 0: + raise StopError("ar failed for: ar t " + file) + objs = lines_to_list(completion.stdout) + except OSError as os_error: + raise StopError("failure executing: ar t", file, "\n" + "original OSError: " + str(os_error.args)) + + for obj in objs: + if not os.path.isabs(obj): + obj = os.path.join(build_dir, obj) + olist.append(os.path.realpath(obj)) + + return olist + + +class Target: # pylint: disable=too-few-public-methods + """Target of build and the information used to build it.""" + def __init__(self, obj: str, src: str, cc_line: str, + deps: List[str]) -> None: + self._obj = obj + self._src = src + self._deps = deps + + # The cc_line, eventually slightly modified, will be used to run + # the compiler in various ways. The cc_line could be fed through + # the shell to deal with the single-quotes in the cc_line that are + # there to quote the double-quotes meant to be part of a C string + # literal. Specifically, this occurs in to pass KBUILD_MODNAME and + # KBUILD_BASENAME, for example: + # -DKBUILD_MODNAME='"aes_ce_cipher"' + # -DKBUILD_BASENAME='"aes_cipher_glue"' + # + # Causing an extra execve(2) of the shell, just to deal with a few + # quotes is wasteful, so instead, here the quotes, in this specific + # case are removed. This can be done, easiest just by removing the + # single quotes with: + # cc_cmd = re.sub(r"'", "", cc_line) + # + # But this could mess up other quote usage in the future, for example + # using double quotes or backslash to quote a single quote meant to + # actually be seen by the compiler. + # + # As an alternative, and for this to be more robust, the specific + # cases that are known, i.e. the two -D shown above, are dealt with + # individually and if there are any single or double quotes, or + # backslashes the underlying work is stopped. + # + # Note that the cc_line comes from the .foo.o.cmd file which is a + # makefile snippet, so the actual syntax there is also subject to + # whatever other things make would want to do with them. Instead + # of doing the absolutely correct thing, which would actually be + # to run this through make to have make run then through the shell + # this program already has knowledge about these .cmd files and how + # they are formed. This compromise, or coupling of knowledge, is a + # source of fragility, but not expected to cause much trouble in the + # future as the Linux build evolves. + + cc_cmd = re.sub( + r"""-D(KBUILD_BASENAME|KBUILD_MODNAME)='("[a-zA-Z0-9_.:]*")'""", + r"-D\1=\2", cc_line) + cc_list = cc_cmd.split() + + # At least: cc -c -o file.o file.c (last four must be those shown) + if (len(cc_list) < 5 or cc_list[-4] != "-c" or cc_list[-3] != "-o" + or not obj.endswith(cc_list[-2]) + or not src.endswith(cc_list[-1])): + raise StopError("unexpected or missing arguments for " + obj + + " cc_line: " + cc_line) + self._cc_list = cc_list + + +class KernelComponentBase: # pylint: disable=too-few-public-methods + """Base class for KernelComponentCreationError and KernelComponent. + + There is not much purpose for this class other than to satisfy the strong + typing checks of pytype, with looser typing, this could be removed but at + the risk of invoking member functions at run-time on objects that do not + provide them. Having this class makes the code more reliable. + """ + def get_error(self) -> str: # pylint: disable=no-self-use + """Return an empty error string, which means there was no error.""" + return "" + + def get_deps_set(self) -> Set[str]: # pylint: disable=no-self-use + """Return the set of dependencies for the kernel component.""" + return set() + + def is_kernel(self) -> bool: # pylint: disable=no-self-use + """Is this the kernel?""" + return False + + +class KernelComponentCreationError(KernelComponentBase): # pylint: disable=too-few-public-methods + """A KernelComponent creation error. + + When a KernelComponent creation fails, or the creation of its subordinate + Kernel or KernelModule creation fails, a KernelComponentCreationError + object is created to store the information relevant to the failure. + """ + def __init__(self, filename: str, error: str) -> None: + """Construct a KernelComponentCreationError object.""" + self._error = error + self._filename = filename + + def get_error(self) -> str: + """Return the error.""" + return self._filename + ": " + self._error + + +class KernelComponent(KernelComponentBase): + """A kernel component, either vmlinux.o or a *.ko file. + + Inspect a Linux kernel module (a *.ko file) or the Linux kernel to + determine what was used to build it: object filess, source files, header + files, and other information that is produced as a by-product of its build. + """ + def __init__(self, filename: str) -> None: + """Construct a KernelComponent object.""" + if filename.endswith("vmlinux.o"): + self._kernel = True + self._kind = Kernel(filename) + else: + self._kernel = False + self._kind = KernelModule(filename) + self._build_dir = self._kind.get_build_dir() + self._source_dir = self._get_source_dir() + self._files_o = self._kind.get_object_files(self._build_dir) + self._files_o.sort() + + # using a set because there is no unique flag to list.sort() + deps_set = set() + + self._targets = [] + for obj in self._files_o: + file_must_exist(obj) + dot_obj = os.path.join(os.path.dirname(obj), + "." + os.path.basename(obj)) + cmd = dot_obj + ".cmd" + content = readfile(cmd) + + line = lines_get_first_line(content) + _, cc_line = makefile_assignment_split(line) + if cc_line.split(maxsplit=1)[0] != COMPILER: + # The object file was made by strip, symbol renames, etc. + # i.e. it was not the result of running the compiler, thus + # it can not contribute to #define compile time constants. + continue + + odkeep = dot_obj + ".d.keep" + file_must_exist(odkeep) + + content = readfile(odkeep) + src, dependendencies = extract_c_src( + obj, makefile_depends_get_dependencies(content)) + if not src: + continue + file_must_exist(src) + + depends = [] + for dep in dependendencies: + if not os.path.isabs(dep): + dep = os.path.join(self._build_dir, dep) + dep = os.path.realpath(dep) + depends.append(dep) + deps_set.add(dep) + + if not os.path.isabs(src): + src = os.path.join(self._build_dir, src) + src = os.path.realpath(src) + self._targets.append(Target(obj, src, cc_line, depends)) + + for dep in [dep for dep in list(deps_set) if not dep.endswith(".h")]: + deps_set.remove(dep) + self._deps_set = deps_set + + def _get_source_dir(self) -> str: + """Return the top level Linux kernel source directory.""" + source = os.path.join(self._build_dir, "source") + if not os.path.islink(source): + raise StopError("could not find source symlink: " + source) + + if not os.path.isdir(source): + raise StopError("source symlink not a directory: " + source) + + source_dir = os.path.realpath(source) + if not os.path.isdir(source_dir): + raise StopError("source directory not a directory: " + source_dir) + + return source_dir + + def get_deps_set(self) -> Set[str]: + """Return the set of dependencies for the kernel component.""" + return self._deps_set + + def is_kernel(self) -> bool: + """Is this the kernel?""" + return self._kernel + + +def kernel_component_factory(filename: str) -> KernelComponentBase: + """Make an InfoKmod or an InfoKernel object for file and return it.""" + try: + return KernelComponent(filename) + except StopError as stop_error: + return KernelComponentCreationError(filename, + " ".join([*stop_error.args])) + + +def all_ko_and_vmlinux() -> Iterable[str]: + """Generator that yields vmlinux.o and all the *.ko files.""" + + # TODO(pantin): remove if no measurable slowdown when using (below): + # + # components = pool.map(kernel_component_factory, ["vmlinux.o"] + + # [str(ko) for ko in pathlib.Path().rglob("*.ko")]) + # + # instead of: + # + # components = pool.map( + # kernel_component_factory, all_ko_and_vmlinux()) + # + yield "vmlinux.o" # yield vmlinux.o so its worked on first + for kofile in pathlib.Path().rglob("*.ko"): + yield str(kofile) + + +def work_on_all_components() -> List[KernelComponentBase]: + """Return a list of KernelComponentBase objects.""" + + # TODO(pantin): Matthias suggested: "make it a command line option + # to run on the main thread only" ... "for debugging purposes that + # can be very helpful" + # + with multiprocessing.Pool(os.cpu_count()) as pool: + components = pool.map(kernel_component_factory, ["vmlinux.o"] + + [str(ko) for ko in pathlib.Path().rglob("*.ko")]) + return components + + +def work_on_whole_build() -> int: + """Work on the whole build to extract the #define constants.""" + components = work_on_all_components() + all_kmod_h_set = set() + kernel_h_set = set() + failed = False + for comp in components: + error = comp.get_error() + if error != "": + logging.error(error) + failed = True + continue + if comp.is_kernel(): + kernel_h_set = comp.get_deps_set() + else: + all_kmod_h_set |= comp.get_deps_set() + if failed: + return 1 + if DEBUG: + dump(components) + headers = kernel_h_set & all_kmod_h_set + hlist = list(headers) + hlist.sort() + for dep in hlist: + print(dep) + return 0 + + +def main() -> int: + """Extract #define compile time constants from a Linux build.""" + def existing_file(file): + if not os.path.isfile(file): + raise argparse.ArgumentTypeError( + "{0} is not a valid file".format(file)) + return file + + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument("file", + nargs='?', + default="vmlinux.o", + type=existing_file) + args = arg_parser.parse_args() + + if len(sys.argv) == 1: + return work_on_whole_build() + + comp = kernel_component_factory(args.file) + + error = comp.get_error() + if error != "": + logging.error(error) + return 1 + if DEBUG: + dump([comp]) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) -- cgit v1.2.3