author    Ramon Pantin <pantin@google.com>    2019-12-05 00:25:03 -0800
committer Ramon Pantin <pantin@google.com>    2020-02-19 17:55:44 -0800
commit    59316699f5407c52d6d8846ab882715c42bbc567 (patch)
tree      c461e0242fbed7b81fcd702ff671ef41b3e3765d
parent    0c0380c70e5f959b1a149d957011fc5d185b6535 (diff)
download  build-59316699f5407c52d6d8846ab882715c42bbc567.tar.gz
add kmi_defines.py program
This is the first half of the reimplementation of the LKA tools into a
single Python 3 script: kmi_defines.py. kmi_defines is supposed to extract
#define compile time constants from a Linux build. This is work in progress.

Bug: 139885116
Test: manually, all code passes pylint3, pytype, and is formatted with yapf
Change-Id: I0cdda58aa8471b8894a0d9272f8c08d3b6cd6836
Signed-off-by: Ramon Pantin <pantin@google.com>
Signed-off-by: Matthias Maennich <maennich@google.com>
-rwxr-xr-x  abi/kmi_defines.py  644
1 files changed, 644 insertions, 0 deletions
diff --git a/abi/kmi_defines.py b/abi/kmi_defines.py
new file mode 100755
index 00000000..024ff3e5
--- /dev/null
+++ b/abi/kmi_defines.py
@@ -0,0 +1,644 @@
+#!/usr/bin/env python3
+"""kmi_defines extract #define compile time constants from a Linux build.
+
+The kmi_defines tool is used to examine the output of a Linux build
+and extract from it the C #define statements that define compile time
+constant expressions, so that they can be tracked as part of the
+KMI (Kernel Module Interface) and changes to their values prevented,
+ensuring a stable KMI for kernel modules in the AOSP GKI Linux kernel
+project.
+
+This code is Python 3 only; it does not require any from __future__
+imports. It is a standalone program and is not meant to be used as
+a module by other programs.
+
+This program uses the multiprocessing module. Work done within a
+multiprocessing.Pool does not perform error logging or affect any
+state other than the value that it computes and returns via the
+function mapped through the pool's map() function. No external state
+(for example an error log) is touched, to avoid having to reason about
+what concurrent updates would do to such a facility.
+"""
+
+# TODO(pantin): per Matthias review feedback: "drop the .py from the
+# filename after(!) the review has completed. As last action. Until
+# then we can have the syntax highlighting here in Gerrit."
+
+import argparse
+import logging
+import multiprocessing
+import os
+import pathlib
+import re
+import subprocess
+import sys
+from typing import List, Tuple, Iterable
+from typing import Set # pytype needs this, pylint: disable=unused-import
+
+COMPILER = "clang" # TODO(pantin): should be determined at run-time
+DEBUG = True # TODO(pantin): should be a program argument
+INDENT = 4 # number of spaces to indent for each depth level
+PROGRAM = os.path.basename(sys.argv[0])
+
+
+class StopError(Exception):
+ """Exception raised to stop work when an unexpected error occurs."""
+
+
+def dump(this) -> None:
+ """Dump the data in this.
+
+    This is for debugging purposes; it does not handle every type, only
+    the types used by the underlying code. This will not be part of the
+    final code, or if it is, it will be significantly enhanced or
+    replaced by some other introspection mechanism to serialize data.
+ """
+ def dump_this(this, name: str, depth: int) -> None:
+ """Dump the data in this."""
+ if name:
+ name += " = "
+ if isinstance(this, str):
+ indent = " " * (depth * INDENT)
+ print(indent + name + this)
+ elif isinstance(this, bool):
+ indent = " " * (depth * INDENT)
+ print(indent + name + str(this))
+ elif isinstance(this, List):
+ dump_list(this, name, depth)
+ elif isinstance(this, Set):
+ dump_set(this, name, depth)
+ else:
+ dump_object(this, name, depth)
+
+ def dump_list(lst: List[str], name: str, depth: int) -> None:
+ """Dump the data in lst."""
+ indent = " " * (depth * INDENT)
+ print(indent + name + "{")
+ index = 0
+ for entry in lst:
+ dump_this(entry, f"[{index}]", depth + 1)
+ index += 1
+ print(indent + "}")
+
+ def dump_set(aset: Set[str], name: str, depth: int) -> None:
+ """Dump the data in aset."""
+ lst = list(aset)
+ lst.sort()
+ dump_list(lst, name, depth)
+
+ def dump_object(this, name: str, depth: int) -> None:
+ """Dump the data in this."""
+ indent = " " * (depth * INDENT)
+ print(indent + name +
+ re.sub(r"(^<class '__main__\.|'>$)", "", str(type(this))) + " {")
+ for key, val in this.__dict__.items():
+ dump_this(val, key, depth + 1)
+ print(indent + "}")
+
+ dump_this(this, "", 0)
+
+
+def readfile(name: str) -> str:
+ """Open a file and return its contents in a string as its value."""
+ try:
+ with open(name) as file:
+ return file.read()
+ except OSError as os_error:
+ raise StopError("readfile() failed for: " + name + "\n"
+ "original OSError: " + str(os_error.args))
+
+
+def file_must_exist(file: str) -> None:
+    """If file is invalid, raise a StopError."""
+ if not os.path.exists(file):
+ raise StopError("file does not exist: " + file)
+ if not os.path.isfile(file):
+ raise StopError("file is not a regular file: " + file)
+
+
+def makefile_depends_get_dependencies(depends: str) -> List[str]:
+ """Return list with the dependencies of a makefile target.
+
+    Split the makefile depends specification: the name of the dependent
+    is followed by ":" and its dependencies follow the ":". There could
+    be spaces around the ":". Line continuation characters, i.e. "\",
+    are consumed by the regular expression that splits the specification.
+
+    This results in a list with the dependent first and its dependencies
+    in the remainder of the list; everything in the list other than the
+    first element is returned.
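+
+    For example (illustrative): the input
+        "foo.o: foo.c include/linux/compiler.h"
+    yields ["foo.c", "include/linux/compiler.h"].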
+ """
+ return re.split(r"[:\s\\]+", re.sub(r"[\s\\]*\Z", "", depends))[1:]
+
+
+def makefile_assignment_split(assignment: str) -> Tuple[str, str]:
+ """Split left:=right into a tuple with the left and right parts.
+
+ Spaces around the := are also removed.
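+
+    For example (illustrative):
+        "cmd_foo.o := clang -c -o foo.o foo.c"
+    is split into ("cmd_foo.o", "clang -c -o foo.o foo.c").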
+ """
+ result = re.split(r"\s*:=\s*", assignment, maxsplit=1)
+ if len(result) != 2:
+ raise StopError(
+ "expected: 'left<optional_spaces>:=<optional_spaces>right' in: " +
+ assignment)
+ return result[0], result[1] # left, right
+
+
+def extract_c_src(obj: str, dependencies: List[str]) -> Tuple[str, List[str]]:
+ """Separate the C source file from the other dependencies.
+
+    Returns a tuple with the C source code file and a list with the
+    remaining dependencies. If the source file is not a C source file:
+        return "", []
+    an earlier implementation used:
+        return None, []
+    but that caused type errors from pytype. Instead of obfuscating the
+    type of the value of this function, returning "", [] is appropriate.
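+
+    For example (illustrative): given obj "foo.o" and dependencies
+    ["foo.c", "foo.h"], the return value is ("foo.c", ["foo.h"]); given
+    dependencies ["foo.S", "foo.h"], the return value is ("", []).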
+ """
+ if not dependencies:
+ raise StopError("empty dependencies for: " + obj)
+ src = dependencies[0]
+ if not src.endswith(".c"):
+ return "", []
+ return src, dependencies[1:]
+
+
+def lines_to_list(lines: str) -> List[str]:
+ """Split a string into a list of non-empty lines."""
+ return [line for line in lines.strip().splitlines() if line]
+
+
+def lines_get_first_line(lines: str) -> str:
+ """Return the first non-empty line in lines."""
+ return lines.strip().splitlines()[0]
+
+
+def shell_line_to_o_files_list(line: str) -> List[str]:
+ """Return a list of .o files in the files list."""
+ return [entry for entry in line.split() if entry.endswith(".o")]
+
+
+class KernelModule:
+ """A kernel module, i.e. a *.ko file."""
+ def __init__(self, kofile: str) -> None:
+ """Construct a KernelModule object."""
+ # An example argument is used below, assuming kofile is:
+ # possibly/empty/dirs/modname.ko
+ #
+ # Meant to refer to this module, shown here relative to the top of
+ # the build directory:
+ # drivers/usb/gadget/udc/modname.ko
+ # the values assigned to the members are shown in the comments below.
+
+ self._file = os.path.realpath(kofile) # /abs/dirs/modname.ko
+ self._base = os.path.basename(self._file) # modname.ko
+ self._directory = os.path.dirname(self._file) # /abs/dirs
+ self._cmd_file = os.path.join(self._directory,
+ "." + self._base + ".cmd")
+ self._cmd_text = readfile(self._cmd_file)
+
+ # Some builds append a '; true' to the .modname.ko.cmd, remove it
+
+ self._cmd_text = re.sub(r";\s*true\s*$", "", self._cmd_text)
+
+ # The modules .modname.ko.cmd file contains a makefile snippet,
+ # for example:
+ # cmd_drivers/usb/gadget/udc/dummy_hcd.ko := ld.lld -r ...
+ #
+        # Split the string at the ":=" (and the spaces around it) and get
+        # the left-hand side of the assignment. If the string could not be
+        # split (because it did not contain a ":="), then
+        # makefile_assignment_split() below raises a StopError.
+
+ left, _ = makefile_assignment_split(self._cmd_text)
+ self._rel_file = re.sub(r"^cmd_", "", left)
+ if self._rel_file == left:
+ raise StopError("expected: 'cmd_' at start of content of: " +
+ self._cmd_file)
+
+ base = os.path.basename(self._rel_file)
+ if base != self._base:
+ raise StopError("module name mismatch: " + base + " vs " +
+ self._base)
+
+ self._rel_dir = os.path.dirname(self._rel_file)
+
+ # The final step in the build of kernel modules is based on two .o
+ # files, one with the module name followed by .o and another followed
+ # by .mod.o
+ #
+ # The following test verifies that assumption, in case a module is
+ # built differently in the future.
+ #
+ # Even when there are multiple source files, the .o files that result
+ # from compiling them are all linked into a single .o file through an
+        # intermediate link step; that .o file is named:
+ # os.path.join(self._rel_dir, kofile_name + ".o")
+
+ kofile_name, _ = os.path.splitext(self._base)
+ objs = shell_line_to_o_files_list(self._cmd_text)
+ objs.sort()
+ expected = [ # sorted, i.e.: .mod.o < .o
+ os.path.join(self._rel_dir, kofile_name + ".mod.o"),
+ os.path.join(self._rel_dir, kofile_name + ".o")
+ ]
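+        # For example (illustrative), for dummy_hcd.ko the expected list is:
+        #     ["drivers/usb/gadget/udc/dummy_hcd.mod.o",
+        #      "drivers/usb/gadget/udc/dummy_hcd.o"]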
+ if objs != expected:
+ raise StopError("unexpected .o files in: " + self._cmd_file)
+
+ def get_build_dir(self) -> str:
+ """Return the top level build directory.
+
+ I.e. the directory where the output of the Linux build is stored.
+
+        Note that this, like pretty much all of the code, can raise an
+        exception. By construction, if an exception is raised while an
+        object is being constructed, or after it is constructed, the
+        object will not be used thereafter (at least not any object
+        explicitly created by this program). Many other places, for
+        example the ones that call readfile(), can raise exceptions; the
+        code is located where it belongs.
+
+        In this specific case, the computation of index, and the derived
+        invariant that it be >= 0, is predicated on the condition checked
+        below: if the exception is not raised, then index is >= 0.
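+
+        For example (illustrative): if self._file is
+        "/abs/build/drivers/usb/gadget/udc/modname.ko" and self._rel_file
+        is "drivers/usb/gadget/udc/modname.ko", the returned build
+        directory is "/abs/build".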
+ """
+ if not self._file.endswith(self._rel_file):
+ raise StopError("could not find: " + self._rel_file +
+ " at end of: " + self._file)
+ index = len(self._file) - len(self._rel_file)
+ if index > 0 and self._file[index - 1] == os.sep:
+ index -= 1
+ build_dir = self._file[0:index]
+ return build_dir
+
+ def get_object_files(self, build_dir: str) -> List[str]:
+        """Return a list of the object files used to link the kernel module.
+
+        The ocmd_file is the file with extension ".o.cmd" (see below).
+        If the ocmd_file has more than one line in it, it is because the
+        module is made of a single source file and the ocmd_file has the
+        compilation rule and dependencies to build it. If it has a single
+        line it is because it builds the .o file by linking multiple .o
+        files.
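+
+        For example (illustrative): a multi-source modname.ko has a
+        single-line .modname.o.cmd containing a link command such as
+        "cmd_dirs/modname.o := ld.lld -r -o modname.o a.o b.o"; the .o
+        files named on that line are returned as absolute paths.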
+ """
+
+ kofile_name, _ = os.path.splitext(self._base)
+ ocmd_file = os.path.join(build_dir, self._rel_dir,
+ "." + kofile_name + ".o.cmd")
+ ocmd_content = readfile(ocmd_file)
+
+ olines = lines_to_list(ocmd_content)
+ if len(olines) > 1: # module made from a single .o file
+ return [os.path.join(build_dir, self._rel_dir, kofile_name + ".o")]
+
+ # Multiple .o files in the module
+
+ _, ldline = makefile_assignment_split(olines[0])
+ return [
+ os.path.realpath(os.path.join(build_dir, obj))
+ for obj in shell_line_to_o_files_list(ldline)
+ ]
+
+
+class Kernel:
+ """The Linux kernel component itself, i.e. vmlinux.o."""
+ def __init__(self, kernel: str) -> None:
+ """Construct a Kernel object."""
+ self._kernel = os.path.realpath(kernel)
+ self._build_dir = os.path.dirname(self._kernel)
+ libs = os.path.join(self._build_dir, "vmlinux.libs")
+ objs = os.path.join(self._build_dir, "vmlinux.objs")
+ file_must_exist(libs)
+ file_must_exist(objs)
+ contents = readfile(libs)
+ archives_and_objects = contents.split()
+ contents = readfile(objs)
+ archives_and_objects += contents.split()
+ self._archives_and_objects = [(os.path.join(self._build_dir, file)
+ if not os.path.isabs(file) else file)
+ for file in archives_and_objects]
+
+ def get_build_dir(self) -> str:
+ """Return the top level build directory.
+
+ I.e. the directory where the output of the Linux build is stored.
+ """
+ return self._build_dir
+
+ def get_object_files(self, build_dir: str) -> List[str]:
+        """Return a list of the object files used to link the kernel."""
+ olist = []
+ for file in self._archives_and_objects:
+ if file.endswith(".o"):
+ if not os.path.isabs(file):
+ file = os.path.join(build_dir, file)
+ olist.append(os.path.realpath(file))
+ continue
+
+ if not file.endswith(".a"):
+ raise StopError("unknown file type: " + file)
+
+ try:
+                # Neither check=False nor check=True prevents an exception
+                # from being raised if "ar" cannot be found.
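+                # "ar t <archive>" lists the archive's member object files,
+                # one per line; those member names are resolved against
+                # build_dir below.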
+ completion = subprocess.run(["ar", "t", file],
+ capture_output=True,
+ text=True)
+ if completion.returncode != 0:
+ raise StopError("ar failed for: ar t " + file)
+ objs = lines_to_list(completion.stdout)
+ except OSError as os_error:
+                raise StopError("failure executing: ar t " + file + "\n"
+                                "original OSError: " + str(os_error.args))
+
+ for obj in objs:
+ if not os.path.isabs(obj):
+ obj = os.path.join(build_dir, obj)
+ olist.append(os.path.realpath(obj))
+
+ return olist
+
+
+class Target: # pylint: disable=too-few-public-methods
+ """Target of build and the information used to build it."""
+ def __init__(self, obj: str, src: str, cc_line: str,
+ deps: List[str]) -> None:
+ self._obj = obj
+ self._src = src
+ self._deps = deps
+
+ # The cc_line, eventually slightly modified, will be used to run
+ # the compiler in various ways. The cc_line could be fed through
+ # the shell to deal with the single-quotes in the cc_line that are
+ # there to quote the double-quotes meant to be part of a C string
+        # literal. Specifically, this occurs when passing KBUILD_MODNAME and
+ # KBUILD_BASENAME, for example:
+ # -DKBUILD_MODNAME='"aes_ce_cipher"'
+ # -DKBUILD_BASENAME='"aes_cipher_glue"'
+ #
+        # Causing an extra execve(2) of the shell just to deal with a few
+        # quotes is wasteful, so instead the quotes are removed here for
+        # this specific case. The easiest way to do that would be to
+        # remove the single quotes with:
+ # cc_cmd = re.sub(r"'", "", cc_line)
+ #
+ # But this could mess up other quote usage in the future, for example
+ # using double quotes or backslash to quote a single quote meant to
+ # actually be seen by the compiler.
+ #
+ # As an alternative, and for this to be more robust, the specific
+ # cases that are known, i.e. the two -D shown above, are dealt with
+ # individually and if there are any single or double quotes, or
+ # backslashes the underlying work is stopped.
+ #
+        # Note that the cc_line comes from the .foo.o.cmd file, which is a
+        # makefile snippet, so the actual syntax there is also subject to
+        # whatever other things make would want to do with it. Instead of
+        # doing the absolutely correct thing, which would be to run this
+        # through make and have make run them through the shell, this
+        # program already has knowledge about these .cmd files and how
+        # they are formed. This compromise, or coupling of knowledge, is a
+        # source of fragility, but it is not expected to cause much
+        # trouble as the Linux build evolves.
+
+ cc_cmd = re.sub(
+ r"""-D(KBUILD_BASENAME|KBUILD_MODNAME)='("[a-zA-Z0-9_.:]*")'""",
+ r"-D\1=\2", cc_line)
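+        # For example (illustrative), the substitution above turns:
+        #     -DKBUILD_MODNAME='"aes_ce_cipher"'
+        # into:
+        #     -DKBUILD_MODNAME="aes_ce_cipher"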
+ cc_list = cc_cmd.split()
+
+ # At least: cc -c -o file.o file.c (last four must be those shown)
+ if (len(cc_list) < 5 or cc_list[-4] != "-c" or cc_list[-3] != "-o"
+ or not obj.endswith(cc_list[-2])
+ or not src.endswith(cc_list[-1])):
+ raise StopError("unexpected or missing arguments for " + obj +
+ " cc_line: " + cc_line)
+ self._cc_list = cc_list
+
+
+class KernelComponentBase: # pylint: disable=too-few-public-methods
+ """Base class for KernelComponentCreationError and KernelComponent.
+
+    There is not much purpose for this class other than to satisfy the
+    strong typing checks of pytype. With looser typing this class could be
+    removed, but at the risk of invoking member functions at run-time on
+    objects that do not provide them. Having this class makes the code
+    more reliable.
+ """
+ def get_error(self) -> str: # pylint: disable=no-self-use
+ """Return an empty error string, which means there was no error."""
+ return ""
+
+ def get_deps_set(self) -> Set[str]: # pylint: disable=no-self-use
+ """Return the set of dependencies for the kernel component."""
+ return set()
+
+ def is_kernel(self) -> bool: # pylint: disable=no-self-use
+ """Is this the kernel?"""
+ return False
+
+
+class KernelComponentCreationError(KernelComponentBase): # pylint: disable=too-few-public-methods
+ """A KernelComponent creation error.
+
+ When a KernelComponent creation fails, or the creation of its subordinate
+ Kernel or KernelModule creation fails, a KernelComponentCreationError
+ object is created to store the information relevant to the failure.
+ """
+ def __init__(self, filename: str, error: str) -> None:
+ """Construct a KernelComponentCreationError object."""
+ self._error = error
+ self._filename = filename
+
+ def get_error(self) -> str:
+ """Return the error."""
+ return self._filename + ": " + self._error
+
+
+class KernelComponent(KernelComponentBase):
+ """A kernel component, either vmlinux.o or a *.ko file.
+
+ Inspect a Linux kernel module (a *.ko file) or the Linux kernel to
+    determine what was used to build it: object files, source files, header
+ files, and other information that is produced as a by-product of its build.
+ """
+ def __init__(self, filename: str) -> None:
+ """Construct a KernelComponent object."""
+ if filename.endswith("vmlinux.o"):
+ self._kernel = True
+ self._kind = Kernel(filename)
+ else:
+ self._kernel = False
+ self._kind = KernelModule(filename)
+ self._build_dir = self._kind.get_build_dir()
+ self._source_dir = self._get_source_dir()
+ self._files_o = self._kind.get_object_files(self._build_dir)
+ self._files_o.sort()
+
+ # using a set because there is no unique flag to list.sort()
+ deps_set = set()
+
+ self._targets = []
+ for obj in self._files_o:
+ file_must_exist(obj)
+ dot_obj = os.path.join(os.path.dirname(obj),
+ "." + os.path.basename(obj))
+ cmd = dot_obj + ".cmd"
+ content = readfile(cmd)
+
+ line = lines_get_first_line(content)
+ _, cc_line = makefile_assignment_split(line)
+ if cc_line.split(maxsplit=1)[0] != COMPILER:
+ # The object file was made by strip, symbol renames, etc.
+ # i.e. it was not the result of running the compiler, thus
+ # it can not contribute to #define compile time constants.
+ continue
+
+ odkeep = dot_obj + ".d.keep"
+ file_must_exist(odkeep)
+
+ content = readfile(odkeep)
+            src, dependencies = extract_c_src(
+                obj, makefile_depends_get_dependencies(content))
+ if not src:
+ continue
+ file_must_exist(src)
+
+ depends = []
+            for dep in dependencies:
+ if not os.path.isabs(dep):
+ dep = os.path.join(self._build_dir, dep)
+ dep = os.path.realpath(dep)
+ depends.append(dep)
+ deps_set.add(dep)
+
+ if not os.path.isabs(src):
+ src = os.path.join(self._build_dir, src)
+ src = os.path.realpath(src)
+ self._targets.append(Target(obj, src, cc_line, depends))
+
+ for dep in [dep for dep in list(deps_set) if not dep.endswith(".h")]:
+ deps_set.remove(dep)
+ self._deps_set = deps_set
+
+ def _get_source_dir(self) -> str:
+ """Return the top level Linux kernel source directory."""
+ source = os.path.join(self._build_dir, "source")
+ if not os.path.islink(source):
+ raise StopError("could not find source symlink: " + source)
+
+ if not os.path.isdir(source):
+ raise StopError("source symlink not a directory: " + source)
+
+ source_dir = os.path.realpath(source)
+ if not os.path.isdir(source_dir):
+ raise StopError("source directory not a directory: " + source_dir)
+
+ return source_dir
+
+ def get_deps_set(self) -> Set[str]:
+ """Return the set of dependencies for the kernel component."""
+ return self._deps_set
+
+ def is_kernel(self) -> bool:
+ """Is this the kernel?"""
+ return self._kernel
+
+
+def kernel_component_factory(filename: str) -> KernelComponentBase:
+    """Make a KernelComponent or KernelComponentCreationError for filename."""
+ try:
+ return KernelComponent(filename)
+ except StopError as stop_error:
+ return KernelComponentCreationError(filename,
+ " ".join([*stop_error.args]))
+
+
+def all_ko_and_vmlinux() -> Iterable[str]:
+ """Generator that yields vmlinux.o and all the *.ko files."""
+
+ # TODO(pantin): remove if no measurable slowdown when using (below):
+ #
+ # components = pool.map(kernel_component_factory, ["vmlinux.o"] +
+ # [str(ko) for ko in pathlib.Path().rglob("*.ko")])
+ #
+ # instead of:
+ #
+ # components = pool.map(
+ # kernel_component_factory, all_ko_and_vmlinux())
+ #
+ yield "vmlinux.o" # yield vmlinux.o so its worked on first
+ for kofile in pathlib.Path().rglob("*.ko"):
+ yield str(kofile)
+
+
+def work_on_all_components() -> List[KernelComponentBase]:
+ """Return a list of KernelComponentBase objects."""
+
+ # TODO(pantin): Matthias suggested: "make it a command line option
+ # to run on the main thread only" ... "for debugging purposes that
+ # can be very helpful"
+ #
+ with multiprocessing.Pool(os.cpu_count()) as pool:
+ components = pool.map(kernel_component_factory, ["vmlinux.o"] +
+ [str(ko) for ko in pathlib.Path().rglob("*.ko")])
+ return components
+
+
+def work_on_whole_build() -> int:
+ """Work on the whole build to extract the #define constants."""
+ components = work_on_all_components()
+ all_kmod_h_set = set()
+ kernel_h_set = set()
+ failed = False
+ for comp in components:
+ error = comp.get_error()
+ if error != "":
+ logging.error(error)
+ failed = True
+ continue
+ if comp.is_kernel():
+ kernel_h_set = comp.get_deps_set()
+ else:
+ all_kmod_h_set |= comp.get_deps_set()
+ if failed:
+ return 1
+ if DEBUG:
+ dump(components)
+ headers = kernel_h_set & all_kmod_h_set
+ hlist = list(headers)
+ hlist.sort()
+ for dep in hlist:
+ print(dep)
+ return 0
+
+
+def main() -> int:
+ """Extract #define compile time constants from a Linux build."""
+ def existing_file(file):
+ if not os.path.isfile(file):
+ raise argparse.ArgumentTypeError(
+ "{0} is not a valid file".format(file))
+ return file
+
+ arg_parser = argparse.ArgumentParser()
+ arg_parser.add_argument("file",
+ nargs='?',
+ default="vmlinux.o",
+ type=existing_file)
+ args = arg_parser.parse_args()
+
+ if len(sys.argv) == 1:
+ return work_on_whole_build()
+
+ comp = kernel_component_factory(args.file)
+
+ error = comp.get_error()
+ if error != "":
+ logging.error(error)
+ return 1
+ if DEBUG:
+ dump([comp])
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())