summaryrefslogtreecommitdiff
path: root/licensing/licenses_lib.py
diff options
context:
space:
mode:
Diffstat (limited to 'licensing/licenses_lib.py')
-rw-r--r--licensing/licenses_lib.py1241
1 files changed, 1241 insertions, 0 deletions
diff --git a/licensing/licenses_lib.py b/licensing/licenses_lib.py
new file mode 100644
index 000000000..c116814ef
--- /dev/null
+++ b/licensing/licenses_lib.py
@@ -0,0 +1,1241 @@
+#!/usr/bin/python
+# Copyright 2012 The Chromium OS Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Library for validating ebuild license information, and generating credits.
+
+Documentation on this script is also available here:
+ http://www.chromium.org/chromium-os/licensing
+"""
+
+import cgi
+import codecs
+import logging
+import os
+import re
+import tempfile
+
+from chromite.cbuildbot import constants
+from chromite.cbuildbot import portage_utilities
+from chromite.lib import cros_build_lib
+from chromite.lib import osutils
+
+# We are imported by src/repohooks/pre-upload.py in a non chroot environment
+# where yaml may not be there, so we don't error on that since it's not needed
+# in that case.
+try:
+ import yaml
+except ImportError:
+ yaml = None
+
# Passed as the print_cmd argument to most cros_build_lib.RunCommand calls in
# this module.
debug = True

# Directory (relative to the board sysroot) under which each package's
# license.yaml dump lives; see PackageInfo.license_dump_path.
# See http://crbug.com/207004 for discussion.
PER_PKG_LICENSE_DIR = '/var/db/pkg'

# Directories holding the stock (upstream Gentoo) license texts.
STOCK_LICENSE_DIRS = [
    os.path.join(constants.SOURCE_ROOT,
                 'src/third_party/portage-stable/licenses'),
]

# These are licenses for custom software that isn't part of upstream Gentoo.
CUSTOM_LICENSE_DIRS = [
    os.path.join(constants.SOURCE_ROOT,
                 'src/third_party/chromiumos-overlay/licenses'),
]

# Per-package override files providing a license with copyright attribution;
# see PackageInfo._GetOverrideLicense for the lookup order.
COPYRIGHT_ATTRIBUTION_DIR = (
    os.path.join(
        constants.SOURCE_ROOT,
        'src/third_party/chromiumos-overlay/licenses/copyright-attribution'))

# Virtual packages don't need to have a license and often don't, so we skip
# them.
# NOTE(review): the original comment also said chromeos-base (Google platform
# packages covered by the general license at top of tree) is skipped, but only
# 'virtual' is listed here -- confirm whether that remark is stale.
SKIPPED_CATEGORIES = [
    'virtual',
]

SKIPPED_PACKAGES = [
    # Fix these packages by adding a real license in the code.
    # You should not skip packages just because the license scraping doesn't
    # work. Stick those special cases into PACKAGE_LICENSES.
    # Packages should only be here because they are sub/split packages already
    # covered by the license of the main package.

    # These are Chrome-OS-specific packages, copyright BSD-Google
    'sys-kernel/chromeos-kernel',  # already manually credit Linux
]

# A package whose ebuild lists only licenses from this list is skipped
# entirely (see PackageInfo.GetLicenses).
SKIPPED_LICENSES = [
    # Some of our packages contain binary blobs for which we have special
    # negotiated licenses, and no need to display anything publicly. Strongly
    # consider using Google-TOS instead, if possible.
    'Proprietary-Binary',

    # If you have an early repo for which license terms have yet to be decided
    # use this. It will cause licensing for the package to be mostly ignored.
    # Official should error for any package with this license.
    'TAINTED',  # TODO(dgarrett): Error on official builds with this license.
]

# Case-insensitive patterns matched against the basename of every file in the
# unpacked source to spot license files (see PackageInfo._ExtractLicenses).
LICENSE_NAMES_REGEX = [
    r'^copyright$',
    r'^copyright[.]txt$',
    r'^copyright[.]regex$',                        # llvm
    r'^copying.*$',
    r'^licen[cs]e.*$',
    r'^licensing.*$',                              # libatomic_ops
    r'^ipa_font_license_agreement_v1[.]0[.]txt$',  # ja-ipafonts
    r'^PKG-INFO$',                                 # copyright assignment for
                                                   # some python packages
                                                   # (netifaces, unittest2)
]

# These are _temporary_ license mappings for packages that do not have a valid
# shared/custom license, or LICENSE file we can use.
# Once this script runs earlier (during the package build process), it will
# block new source without a LICENSE file if the ebuild contains a license
# that requires copyright assignment (BSD and friends).
# At that point, new packages will get fixed to include LICENSE instead of
# adding workaround mappings like those below.
# The way to fix copyright attribution cases now is to create a custom file
# with the right license directly in COPYRIGHT_ATTRIBUTION_DIR.
PACKAGE_LICENSES = {
    # TODO: replace the naive license parsing code in this script with a hook
    # into portage's license parsing. See http://crbug.com/348779

    # Chrome (the browser) is complicated, it has a morphing license that is
    # either BSD-Google, or BSD-Google,Google-TOS depending on how it was
    # built. We bypass this problem for now by hardcoding the Google-TOS bit as
    # per ChromeOS with non free bits
    'chromeos-base/chromeos-chrome': ['BSD-Google', 'Google-TOS'],

    # Currently the code cannot parse LGPL-3 || ( LGPL-2.1 MPL-1.1 )
    'dev-python/pycairo': ['LGPL-3', 'LGPL-2.1'],
}

# Any license listed here found in the ebuild will make the code look for
# license files inside the package source code in order to get copyright
# attribution from them.
COPYRIGHT_ATTRIBUTION_LICENSES = [
    'BSD',    # requires distribution of copyright notice
    'BSD-2',  # so does BSD-2 http://opensource.org/licenses/BSD-2-Clause
    'BSD-3',  # and BSD-3? http://opensource.org/licenses/BSD-3-Clause
    'BSD-4',  # and 4?
    'BSD-with-attribution',
    'MIT',
    'MIT-with-advertising',
    'Old-MIT',
]

# The following licenses are valid and can be shown in their stock form, but
# the stock text is less helpful, so it's better to look in the source code for
# a more specific license if there is one. It is not an error if no better one
# is found.
# Note that you don't want to set just anything here since any license here
# will be included once in stock form and a second time in custom form if
# found (there is no good way to know that a license we found on disk is the
# better version of the stock version, so we show both).
LOOK_IN_SOURCE_LICENSES = [
    'as-is',  # The stock license is very vague, source always has more details.
    'PSF-2',  # The custom license in python is more complete than the template.

    # As far as I know, we have no requirement to do copyright attribution for
    # these licenses, but the license included in the code has slightly better
    # information than the stock Gentoo one (including copyright attribution).
    'BZIP2',     # Single use license, do copyright attribution.
    'OFL',       # Almost single use license, do copyright attribution.
    'OFL-1.1',   # Almost single use license, do copyright attribution.
    'UoI-NCSA',  # Only used by NCSA, might as well show their custom copyright.
]

# Per-package homepage overrides, keyed by 'category/name'.
# This used to provide overrides. I can't find a valid reason to add any more
# here, though.
PACKAGE_HOMEPAGES = {
    # Example:
    # 'x11-proto/glproto': ['http://www.x.org/'],
}

# These are tokens found in LICENSE= in an ebuild that aren't licenses we
# can actually read from disk.
# You should not use this to blacklist real licenses.
LICENCES_IGNORE = [
    ')',  # Ignore OR tokens from LICENSE="|| ( LGPL-2.1 MPL-1.1 )"
    '(',
    '||',
]

# Template file names; presumably used when generating the HTML credits output
# (their use is outside this part of the file -- confirm).
TMPL = 'about_credits.tmpl'
ENTRY_TMPL = 'about_credits_entry.tmpl'
SHARED_LICENSE_TMPL = 'about_credits_shared_license_entry.tmpl'
+
+
+# This is called directly by src/repohooks/pre-upload.py
def GetLicenseTypesFromEbuild(ebuild_path):
  """Return the LICENSE tokens declared by an ebuild, without using portage.

  The ebuild is sourced by a small generated shell script that provides
  minimal has()/inherit() implementations, and the resulting LICENSE variable
  is split on whitespace. The result is not always the correct list, but this
  is faster than portageq because no chroot access is needed; it is meant for
  tasks such as presubmission checks.

  Args:
    ebuild_path: path of the ebuild to read.

  Returns:
    List of whitespace-separated tokens from the ebuild's LICENSE value.

  Raises:
    ValueError: LICENSE is missing/empty, or contains ',' or ';'.
  """
  # TODO: the overlay list hard-coded here should be changed to look
  # at the current overlay, and then the master overlays. E.g. for an
  # ebuild file in overlay-parrot, we will look at parrot overlay
  # first, and then look at portage-stable and chromiumos, which are
  # listed as masters in overlay-parrot/metadata/layout.conf.
  overlays = [
      os.path.join(constants.SOURCE_ROOT,
                   'src/third_party/chromiumos-overlay'),
      os.path.join(constants.SOURCE_ROOT,
                   'src/third_party/portage-stable'),
  ]
  substitutions = {
      'ebuild': ebuild_path,
      'overlay_list': ' '.join(overlays),
  }

  # Shell wrapper: inherit() sources each requested eclass once, taking it
  # from the first overlay that has it, then the ebuild itself is sourced.
  script_template = """
has() { [[ " ${*:2} " == *" $1 "* ]]; }
inherit() {
  local overlay_list="%(overlay_list)s"
  local eclass overlay f
  for eclass; do
    has ${eclass} ${_INHERITED_} && continue
    _INHERITED_+=" ${eclass}"
    for overlay in %(overlay_list)s; do
      f="${overlay}/eclass/${eclass}.eclass"
      if [[ -e ${f} ]]; then
        source "${f}"
        break
      fi
    done
  done
}
source %(ebuild)s"""

  with tempfile.NamedTemporaryFile(bufsize=0) as script:
    osutils.WriteFile(script.name, script_template % substitutions)
    sourced = osutils.SourceEnvironment(
        script.name, whitelist=['LICENSE'], ifs=' ', multiline=True)

  license_value = sourced.get('LICENSE')
  if not license_value:
    raise ValueError('No LICENSE found in the ebuild.')
  if re.search(r'[,;]', license_value):
    raise ValueError(
        'LICENSE field in the ebuild should be whitespace-limited.')

  return license_value.split()
+
+
class PackageLicenseError(Exception):
  """Raised when license information cannot be obtained for a package.

  Raising this makes the overall processing end in an error.
  """
+
+
class PackageInfo(object):
  """Container for one package's identity and licensing information.

  As used in this file: set either board or build_source_tree, call
  GetPackageInfo() to fill in category/name/version, and, unless skip got
  set, call GetLicenses() to collect license names and scanned license texts.
  """

  def __init__(self):
    """Create an empty record; every field is filled in by later calls."""

    self.board = None
    self.revision = None

    # Array of scanned license texts.
    self.license_text_scanned = []

    self.category = None
    self.name = None
    self.version = None

    # Looks something like this
    # /mnt/host/source/src/
    #   third_party/portage-stable/net-misc/rsync/rsync-3.0.8.ebuild
    self.ebuild_path = None

    # Array of license names retrieved from ebuild or override in this code.
    self.ebuild_license_names = []
    self.homepages = []
    # This contains licenses names we can read from Gentoo or custom licenses.
    # These are supposed to be shared licenses (i.e. licenses referenced by
    # more then one package), but after all processing, we may find out that
    # some are only used once and they get taken out of the shared pool and
    # pasted directly in the sole package that was using them (see
    # GenerateHTMLLicenseOutput).
    self.license_names = set()

    # We set this if the ebuild has a BSD/MIT like license that requires
    # scanning for a LICENSE file in the source code, or a static mapping
    # in PACKAGE_LICENSES. Not finding one once this is set, is fatal.
    self.need_copyright_attribution = False
    # This flag just says we'd like to include licenses from the source, but
    # not finding any is not fatal.
    self.scan_source_for_licenses = False

    # After reading basic package information, we can mark the package as
    # one to skip in licensing.
    self.skip = False

    # If we failed to get licensing for this package, mark it as such so that
    # it can be flagged when the full license file is being generated.
    self.licensing_failed = False

    # If we are called from a hook, we grab package info from the source tree.
    # This is also used as a flag to know whether we should do package work
    # based on an installed package, or one that is being built and we got
    # called from the hook.
    self.build_source_tree = None

  @property
  def fullnamerev(self):
    """category/name-version, plus -r<revision> when a revision is set."""
    s = '%s-%s' % (self.fullname, self.version)
    if self.revision:
      s += '-r%s' % self.revision
    return s

  @property
  def fullname(self):
    """category/name, without version or revision."""
    return '%s/%s' % (self.category, self.name)

  @property
  def license_dump_path(self):
    """e.g. /build/x86-alex//var/db/pkg/sys-apps/dtc-1.4.0/license.yaml."""
    return "%s/%s/%s/license.yaml" % (cros_build_lib.GetSysroot(self.board),
                                      PER_PKG_LICENSE_DIR, self.fullnamerev)

  def _BuildInfo(self, filename):
    """Read one property file from <build_source_tree>/build-info.

    Args:
      filename: name of the property file, e.g. 'LICENSE' or 'HOMEPAGE'.

    Returns:
      The file content with trailing whitespace removed, or "" if the file
      cannot be opened or read.
    """
    filename = '%s/build-info/%s' % (self.build_source_tree, filename)
    # Buildinfo properties we read are in US-ASCII, not Unicode.
    try:
      # The context manager closes the handle even if read() fails.
      with open(filename) as f:
        bi = f.read().rstrip()
    # Some properties like HOMEPAGE may be absent.
    except IOError:
      bi = ""
    return bi

  def _RunEbuildPhases(self, phases):
    """Run a list of ebuild phases on an ebuild.

    Args:
      phases: list of phases like ['clean', 'fetch'] or ['unpack'].

    Returns:
      ebuild command output
    """

    return cros_build_lib.RunCommand(
        ['ebuild-%s' % self.board, self.ebuild_path] + phases, print_cmd=debug,
        redirect_stdout=True)

  def _GetOverrideLicense(self):
    """Look in COPYRIGHT_ATTRIBUTION_DIR for license with copyright attribution.

    For dev-util/bsdiff-4.3-r5, the code will look for
      dev-util/bsdiff-4.3-r5
      dev-util/bsdiff-4.3
      dev-util/bsdiff

    It is ok to have more than one bsdiff license file, and an empty file acts
    as a rubout (i.e. an empty dev-util/bsdiff-4.4 will shadow dev-util/bsdiff
    and tell the licensing code to look in the package source for a license
    instead of using dev-util/bsdiff as an override).

    Returns:
      None (no license found, or the first matching file is an empty rubout)
      or a multiline license string.
    """
    # dev-util/bsdiff-4.3-r5 -> bsdiff-4.3-r5
    filename = os.path.basename(self.fullnamerev)
    license_path = os.path.join(COPYRIGHT_ATTRIBUTION_DIR,
                                os.path.dirname(self.fullnamerev))
    pv = portage_utilities.SplitPV(filename)
    pv_no_rev = '%s-%s' % (pv.package, pv.version_no_rev)
    # Most specific name first; the first existing file wins.
    for filename in (pv.pv, pv_no_rev, pv.package):
      file_path = os.path.join(license_path, filename)
      logging.debug("Looking for override copyright attribution license in %s",
                    file_path)
      if not os.path.exists(file_path):
        continue

      license_text = ReadUnknownEncodedFile(
          file_path, "read copyright attribution license")
      if not license_text.strip():
        # Rubout: without this check the header below would make an empty
        # file look like a real license, defeating the documented behavior.
        return None

      # Turn
      # /../merlin/trunk/src/third_party/chromiumos-overlay/../dev-util/bsdiff
      # into
      # chromiumos-overlay/../dev-util/bsdiff
      short_dir_path = os.path.join(*file_path.rsplit(os.path.sep, 5)[1:])
      return ("Copyright Attribution License %s:\n\n" % short_dir_path +
              license_text)

    return None

  def _ExtractLicenses(self):
    """Scrounge for text licenses in the source of package we'll unpack.

    This is only called if we couldn't get usable licenses from the ebuild,
    or one of them is BSD/MIT like which forces us to look for a file with
    copyright attribution in the source code itself.

    First, we have a shortcut where we scan COPYRIGHT_ATTRIBUTION_DIR to see if
    we find a license for this package. If so, we use that.
    Typically it'll be used if the unpacked source does not have the license
    that we're required to display for copyright attribution (in some cases it's
    plain absent, in other cases, it could be in a filename we don't look for).

    Otherwise, we scan the unpacked source code for what looks like license
    files as defined in LICENSE_NAMES_REGEX. Every license text found is
    appended to self.license_text_scanned.

    Raises:
      AssertionError: on runtime errors
      PackageLicenseError: couldn't find copyright attribution file.
    """
    license_override = self._GetOverrideLicense()
    if license_override:
      self.license_text_scanned = [license_override]
      return

    if self.build_source_tree:
      workdir = "%s/work" % self.build_source_tree
    else:
      self._RunEbuildPhases(['clean', 'fetch'])
      output = self._RunEbuildPhases(['unpack']).output.splitlines()
      # Output is spammy, it looks like this:
      #  * gc-7.2d.tar.gz RMD160 SHA1 SHA256 size ;-) ...                [ ok ]
      #  * checking gc-7.2d.tar.gz ;-) ...                               [ ok ]
      #  * Running stacked hooks for pre_pkg_setup
      #  *    sysroot_build_bin_dir ...
      #  [ ok ]
      #  * Running stacked hooks for pre_src_unpack
      #  *    python_multilib_setup ...
      #  [ ok ]
      # >>> Unpacking source...
      # >>> Unpacking gc-7.2d.tar.gz to /build/x86-alex/tmp/po/[...]ps-7.2d/work
      # >>> Source unpacked in /build/x86-alex/tmp/portage/[...]ops-7.2d/work
      # So we only keep the last 2 lines, the others we don't care about.
      for line in output:
        if line.startswith(">>>") and line != ">>> Unpacking source...":
          logging.info(line)

      args = ['portageq-%s' % self.board, 'envvar', 'PORTAGE_TMPDIR']
      result = cros_build_lib.RunCommand(args, print_cmd=debug,
                                         redirect_stdout=True)
      tmpdir = result.output.splitlines()[0]
      # tmpdir gets something like /build/daisy/tmp/
      workdir = os.path.join(tmpdir, 'portage', self.fullnamerev, 'work')

    if not os.path.exists(workdir):
      raise AssertionError("Unpack of %s didn't create %s. Version mismatch" %
                           (self.fullnamerev, workdir))

    # You may wonder how deep should we go?
    # In case of packages with sub-packages, it could be deep.
    # Let's just be safe and get everything we can find.
    # In the case of libatomic_ops, it's actually required to look deep
    # to find the MIT license:
    # dev-libs/libatomic_ops-7.2d/work/gc-7.2/libatomic_ops/doc/LICENSING.txt
    args = ['find', workdir, '-type', 'f']
    result = cros_build_lib.RunCommand(args, print_cmd=debug,
                                       redirect_stdout=True).output.splitlines()
    # Truncate results to look like this: swig-2.0.4/COPYRIGHT
    files = [x[len(workdir):].lstrip('/') for x in result]
    license_files = []
    for name in files:
      # When we scan a source tree managed by git, this can contain license
      # files that are not part of the source. Exclude those.
      # (e.g. .git/refs/heads/licensing)
      if ".git/" in name:
        continue
      basename = os.path.basename(name)
      # Looking for license.* brings up things like license.gpl, and we
      # never want a GPL license when looking for copyright attribution,
      # so we skip them here. We also skip regexes that can return
      # license.py (seen in some code).
      if "GPL" in basename or basename.endswith(".py"):
        continue
      if any(re.search(regex, basename, re.IGNORECASE)
             for regex in LICENSE_NAMES_REGEX):
        license_files.append(name)

    if not license_files:
      if self.need_copyright_attribution:
        logging.error("""
%s: unable to find usable license.
Typically this will happen because the ebuild says it's MIT or BSD, but there
was no license file that this script could find to include along with a
copyright attribution (required for BSD/MIT).

If this is Google source, please change
LICENSE="BSD"
to
LICENSE="BSD-Google"

If not, go investigate the unpacked source in %s,
and find which license to assign.  Once you found it, you should copy that
license to a file under %s
(or you can modify LICENSE_NAMES_REGEX to pickup a license file that isn't
being scraped currently).""",
                      self.fullnamerev, workdir, COPYRIGHT_ATTRIBUTION_DIR)
        raise PackageLicenseError()
      else:
        # We can get called for a license like as-is where it's preferable
        # to find a better one in the source, but not fatal if we didn't.
        logging.info("Was not able to find a better license for %s "
                     "in %s to replace the more generic one from ebuild",
                     self.fullnamerev, workdir)

    # Examples of multiple license matches:
    # dev-lang/swig-2.0.4-r1: swig-2.0.4/COPYRIGHT swig-2.0.4/LICENSE
    # dev-libs/glib-2.32.4-r1: glib-2.32.4/COPYING pkg-config-0.26/COPYING
    # dev-libs/libnl-3.2.14: libnl-doc-3.2.14/COPYING libnl-3.2.14/COPYING
    # dev-libs/libpcre-8.30-r2: pcre-8.30/LICENCE pcre-8.30/COPYING
    # dev-libs/libusb-0.1.12-r6: libusb-0.1.12/COPYING libusb-0.1.12/LICENSE
    # dev-libs/pyzy-0.1.0-r1: db/COPYING pyzy-0.1.0/COPYING
    # net-misc/strongswan-5.0.2-r4: strongswan-5.0.2/COPYING
    #                               strongswan-5.0.2/LICENSE
    # sys-process/procps-3.2.8_p11: debian/copyright procps-3.2.8/COPYING
    logging.info('License(s) for %s: %s', self.fullnamerev,
                 ' '.join(license_files))
    for license_file in sorted(license_files):
      # Joy and pink ponies. Some license_files are encoded as latin1 while
      # others are utf-8 and of course you can't know but only guess.
      license_path = os.path.join(workdir, license_file)
      license_txt = ReadUnknownEncodedFile(license_path, "Adding License")

      self.license_text_scanned.append(
          "Scanned Source License %s:\n\n%s" % (license_file, license_txt))

    # We used to clean up here, but there have been many instances where
    # looking at unpacked source to see where the licenses were, was useful
    # so let's disable this for now
    # self._RunEbuildPhases(['clean'])

  def GetPackageInfo(self, fullnamewithrev):
    """Populate category, name, version and revision, and decide on skipping.

    The name is split with portage_utilities.SplitCPV. A revision of 0 is
    stored as None. self.skip is set when the category is in
    SKIPPED_CATEGORIES or the package is in SKIPPED_PACKAGES.

    License and homepage data are not read here; that is done by GetLicenses.

    Args:
      fullnamewithrev: e.g. dev-libs/libatomic_ops-7.2d. If empty/None, the
        name is computed from the CATEGORY and PF files under
        build_source_tree/build-info.

    Raises:
      AssertionError: on runtime errors (no name and no source tree, or the
        name could not be split).
    """
    if not fullnamewithrev:
      if not self.build_source_tree:
        raise AssertionError("Cannot continue without full name or source tree")
      fullnamewithrev = "%s/%s" % (self._BuildInfo("CATEGORY"),
                                   self._BuildInfo("PF"))
      logging.debug("Computed package name %s from %s", fullnamewithrev,
                    self.build_source_tree)

    try:
      cpv = portage_utilities.SplitCPV(fullnamewithrev)
      # A bad package can either raise a TypeError exception or return None,
      # so we catch both cases.
      if not cpv:
        raise TypeError
    except TypeError:
      raise AssertionError("portage couldn't find %s, missing version number?" %
                           fullnamewithrev)

    self.category, self.name, self.version, self.revision = (
        cpv.category, cpv.package, cpv.version_no_rev, cpv.rev)

    if self.revision is not None:
      # Normalize 'r5' to '5'; revision 0 is treated as no revision at all.
      self.revision = str(self.revision).lstrip('r')
      if self.revision == '0':
        self.revision = None

    if self.category in SKIPPED_CATEGORIES:
      logging.info("%s in SKIPPED_CATEGORIES, skip package", self.fullname)
      self.skip = True
      return

    if self.fullname in SKIPPED_PACKAGES:
      logging.info("%s in SKIPPED_PACKAGES, skip package", self.fullname)
      self.skip = True
      return

  def _FindEbuildPath(self):
    """Set self.ebuild_path to the ebuild path reported by equery.

    Raises:
      AssertionError: equery failed, returned nothing, or the returned path
        does not exist.
    """
    # By default, equery returns the latest version of the package. A
    # build may have used an older version than what is currently
    # available in the source tree (a build dependency can be pinned
    # to an older version of a package for compatibility
    # reasons). Therefore we need to tell equery that we want the
    # exact version number used in the image build as opposed to the
    # latest available in the source tree.
    args = ['equery-%s' % self.board, '-q', '-C', 'which', self.fullnamerev]
    try:
      path = cros_build_lib.RunCommand(args, print_cmd=True,
                                       redirect_stdout=True).output.strip()
      if not path:
        raise AssertionError
    # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
    except Exception:
      raise AssertionError('GetEbuildPath for %s failed.\n'
                           'Is your tree clean? Delete %s and rebuild' %
                           (self.name,
                            cros_build_lib.GetSysroot(board=self.board)))
    logging.debug("%s -> %s", " ".join(args), path)

    if not os.access(path, os.F_OK):
      raise AssertionError("Can't access %s" % path)

    self.ebuild_path = path

  def _ReadEbuildMetadata(self):
    """Read HOMEPAGE and LICENSE for the package via portageq.

    Sets self.homepages and self.ebuild_license_names from the first and
    second output lines respectively, each split on whitespace.
    """
    args = ['portageq-%s' % self.board, 'metadata',
            cros_build_lib.GetSysroot(board=self.board), 'ebuild',
            self.fullnamerev, 'HOMEPAGE', 'LICENSE']
    tmp = cros_build_lib.RunCommand(args, print_cmd=debug,
                                    redirect_stdout=True)
    lines = tmp.output.splitlines()
    # Runs:
    # portageq metadata /build/x86-alex ebuild net-misc/wget-1.12-r2 \
    #                                               HOMEPAGE LICENSE
    # Returns:
    # http://www.gnu.org/software/wget/
    # GPL-3
    self.homepages, self.ebuild_license_names = (
        lines[0].split(), lines[1].split())

  def _TestEbuildContents(self):
    """Discover if the ebuild installed any files.

    Returns:
      bool which tells if any files were installed.
    """
    # Search for anything the ebuild might install, other than a directory.
    args = ['equery-%s' % self.board, '-q', '-C', 'files', self.fullnamerev,
            '-f', 'obj']
    tmp = cros_build_lib.RunCommand(args, print_cmd=debug, redirect_stdout=True)
    lines = tmp.output.splitlines()

    # lines is an array of the file names installed by the ebuild.
    return bool(lines)

  def GetLicenses(self):
    """Get licenses from the ebuild field and the unpacked source code.

    Some packages have static license mappings applied to them that get
    retrieved from the ebuild.

    For others, we figure out whether the package source should be scanned to
    add licenses found there.

    On return, either self.skip is set (nothing installed, or only skipped
    licenses), or self.license_names and/or self.license_text_scanned hold the
    licenses to show.

    Raises:
      AssertionError: on runtime errors
      PackageLicenseError: couldn't find license in ebuild and source.
    """
    if self.build_source_tree:
      # If the total size installed is zero, we installed no content to license.
      if self._BuildInfo("SIZE").strip() == '0':
        self.skip = True
        return
      self.homepages = self._BuildInfo("HOMEPAGE").split()
      self.ebuild_license_names = self._BuildInfo("LICENSE").split()
    else:
      self._FindEbuildPath()
      self._ReadEbuildMetadata()
      self.skip = self.skip or not self._TestEbuildContents()

    # If this ebuild only uses skipped licenses, skip it.
    if (self.ebuild_license_names and
        all(l in SKIPPED_LICENSES for l in self.ebuild_license_names)):
      self.skip = True

    if self.skip:
      return

    if self.fullname in PACKAGE_HOMEPAGES:
      self.homepages = PACKAGE_HOMEPAGES[self.fullname]

    # Packages with missing licenses or licenses that need mapping (like
    # BSD/MIT) are hardcoded here:
    if self.fullname in PACKAGE_LICENSES:
      self.ebuild_license_names = PACKAGE_LICENSES[self.fullname]
      logging.info("Static license mapping for %s: %s", self.fullnamerev,
                   ",".join(self.ebuild_license_names))
    else:
      logging.info("Read licenses for %s: %s", self.fullnamerev,
                   ",".join(self.ebuild_license_names))

    # Lots of packages in chromeos-base have their license set to BSD instead
    # of BSD-Google:
    new_license_names = []
    for license_name in self.ebuild_license_names:
      # TODO: temp workaround for http://crbug.com/348750 , remove when the bug
      # is fixed.
      if (license_name == "BSD" and
          self.fullnamerev.startswith("chromeos-base/")):
        license_name = "BSD-Google"
        logging.error(
            "Fixed BSD->BSD-Google for %s because it's in chromeos-base. "
            "Please fix the LICENSE field in the ebuild", self.fullnamerev)
      # TODO: temp workaround for http://crbug.com/348749 , remove when the bug
      # is fixed.
      if license_name == "Proprietary":
        license_name = "Google-TOS"
        logging.error(
            "Fixed Proprietary -> Google-TOS for %s. "
            "Please fix the LICENSE field in the ebuild", self.fullnamerev)
      new_license_names.append(license_name)
    self.ebuild_license_names = new_license_names

    # The ebuild license field can look like:
    # LICENSE="GPL-3 LGPL-3 Apache-2.0" (this means AND, as in all 3)
    # for third_party/portage-stable/app-admin/rsyslog/rsyslog-5.8.11.ebuild
    # LICENSE="|| ( LGPL-2.1 MPL-1.1 )"
    # for third_party/portage-stable/x11-libs/cairo/cairo-1.8.8.ebuild

    # The parser isn't very smart and only has basic support for the
    # || ( X Y ) OR logic to do the following:
    # In order to save time needlessly unpacking packages and looking for a
    # cleartext license (which is really a crapshoot), if we have a license
    # like BSD that requires looking for copyright attribution, but we can
    # choose another license like GPL, we do that.

    if not self.skip and not self.ebuild_license_names:
      logging.error("%s: no license found in ebuild. FIXME!", self.fullnamerev)
      # In a bind, you could comment this out. I'm making the output fail to
      # get your attention since this error really should be fixed, but if you
      # comment out the next line, the script will try to find a license inside
      # the source.
      raise PackageLicenseError()

    # This is not invalid, but the parser can't deal with it, so if it ever
    # happens, error out to tell the programmer to do something.
    # dev-python/pycairo-1.10.0-r4: LGPL-3 || ( LGPL-2.1 MPL-1.1 )
    if "||" in self.ebuild_license_names[1:]:
      logging.error("%s: Can't parse || in the middle of a license: %s",
                    self.fullnamerev, ' '.join(self.ebuild_license_names))
      raise PackageLicenseError()

    # Real license names only, without the '||', '(' and ')' tokens.
    license_tokens = [x for x in self.ebuild_license_names
                      if x not in LICENCES_IGNORE]

    # Quick early pass so that the longer pass below can run accordingly:
    # if this is an OR list and at least one alternative can be used in its
    # stock form, we may skip the alternatives needing copyright attribution.
    or_licenses_and_one_is_no_attribution = (
        self.ebuild_license_names[0] == "||" and
        any(x not in COPYRIGHT_ATTRIBUTION_LICENSES for x in license_tokens))

    for license_name in license_tokens:
      # Licenses like BSD or MIT can't be used as is because they do not contain
      # copyright info. They have to be replaced by copyright file given in the
      # source code, or manually mapped by us in PACKAGE_LICENSES
      if license_name in COPYRIGHT_ATTRIBUTION_LICENSES:
        # To limit needless efforts, if a package is BSD or GPL, we ignore BSD
        # and use GPL to avoid scanning the package, but we can only do this if
        # or_licenses_and_one_is_no_attribution has been set above.
        # This ensures that if we have License: || (BSD3 BSD4), we will
        # look in the source.
        if or_licenses_and_one_is_no_attribution:
          logging.info("%s: ignore license %s because ebuild LICENSES had %s",
                       self.fullnamerev, license_name,
                       ' '.join(self.ebuild_license_names))
        else:
          logging.info("%s: can't use %s, will scan source code for copyright",
                       self.fullnamerev, license_name)
          self.need_copyright_attribution = True
          self.scan_source_for_licenses = True
      else:
        self.license_names.add(license_name)
        # We can't display just 2+ because it only contains text that says to
        # read v2 or v3.
        if license_name == 'GPL-2+':
          self.license_names.add('GPL-2')
        if license_name == 'LGPL-2+':
          self.license_names.add('LGPL-2')

      if license_name in LOOK_IN_SOURCE_LICENSES:
        logging.info("%s: Got %s, will try to find better license in source...",
                     self.fullnamerev, license_name)
        self.scan_source_for_licenses = True

    if self.license_names:
      logging.info('%s: using stock|cust license(s) %s',
                   self.fullnamerev, ','.join(self.license_names))

    # If the license(s) could not be found, or one requires copyright
    # attribution, dig in the source code for license files:
    # For instance:
    # Read licenses from ebuild for net-dialup/ppp-2.4.5-r3: BSD,GPL-2
    # We need get the substitution file for BSD and add it to GPL.
    if self.scan_source_for_licenses:
      self._ExtractLicenses()

    # This shouldn't run, but leaving as sanity check.
    if not self.license_names and not self.license_text_scanned:
      raise AssertionError("Didn't find usable licenses for %s" %
                           self.fullnamerev)
+
+
class Licensing(object):
  """Do the actual work of extracting licensing info and outputting html.

  Package objects are kept in self.packages (keyed by the name they were
  registered under).  LoadPackageInfo() / ProcessPackageLicenses() fill them
  in, AddExtraPkg() can register hand-built entries afterwards, and
  GenerateHTMLLicenseOutput() renders the final credits page.
  HookPackageProcess() is a separate entry point used from a package build.
  """

  def __init__(self, board, package_fullnames, gen_licenses):
    """Set up empty bookkeeping for a licensing run.

    Args:
      board: board name, eg x86-alex.
      package_fullnames: iterable of package names; each one is handed to
        PackageInfo.GetPackageInfo() by LoadPackageInfo().
      gen_licenses: if true, (re)generate the per-package license dumps
        instead of only relying on the ones saved at package build time.
    """
    # eg x86-alex
    self.board = board
    # List of stock and custom licenses referenced in ebuilds. Used to
    # print a report. Dict value says which packages use that license.
    self.licenses = {}

    # Licenses are supposed to be generated at package build time and be
    # ready for us, but in case they're not, they can be generated.
    self.gen_licenses = gen_licenses

    # This keeps track of whether we have an incomplete license file due to
    # package errors during parsing.
    # Any non empty list at the end shows the list of packages that caused
    # errors.
    self.incomplete_packages = []

    # Rendered HTML entry for each package, keyed by the package object.
    self.package_text = {}
    # Contents of the per-package entry template; loaded by
    # GenerateHTMLLicenseOutput.
    self.entry_template = None

    # We need to have a dict for the list of packages objects, index by package
    # fullnamerev, so that when we scan our licenses at the end, and find out
    # some shared licenses are only used by one package, we can access that
    # package object by name, and add the license directly in that object.
    self.packages = {}
    self._package_fullnames = package_fullnames

  @property
  def sorted_licenses(self):
    """Names of all shared licenses in use, sorted case-insensitively."""
    # Calling .lower() on each name (instead of passing str.lower) keeps the
    # sort working when a name is a unicode object under Python 2.
    return sorted(self.licenses, key=lambda name: name.lower())

  def _SaveLicenseDump(self, pkg):
    """Write every attribute of |pkg| to its YAML license dump.

    The dump goes to build-info/license.yaml under pkg.build_source_tree when
    that is set, and to pkg.license_dump_path otherwise.  The parent directory
    is created if it does not exist yet.

    Args:
      pkg: PackageInfo object to save.
    """
    if pkg.build_source_tree:
      save_file = "%s/build-info/license.yaml" % pkg.build_source_tree
    else:
      save_file = pkg.license_dump_path
    logging.debug("Saving license to %s", save_file)
    save_dir = os.path.dirname(save_file)
    if not os.path.isdir(save_dir):
      # The 0o prefix is understood by Python 2.6+ and Python 3; a bare
      # leading-zero literal is a syntax error on Python 3.
      os.makedirs(save_dir, 0o755)
    # Stored as a list of [attribute, value] pairs, which is the layout
    # _LoadLicenseDump reads back.
    yaml_dump = [[key, value] for key, value in pkg.__dict__.items()]
    with open(save_file, "w") as f:
      f.write(yaml.dump(yaml_dump))

  def _LoadLicenseDump(self, pkg):
    """Overwrite the attributes of |pkg| with those saved in its dump.

    Args:
      pkg: PackageInfo object; the dump is read from pkg.license_dump_path.
    """
    save_file = pkg.license_dump_path
    logging.debug("Getting license from %s for %s", save_file, pkg.name)
    with open(save_file, "r") as f:
      # yaml.safe_load barfs on unicode it output, but we don't really need it.
      # SECURITY NOTE: the full loader can instantiate arbitrary Python
      # objects, so this must only ever be pointed at dumps produced by
      # _SaveLicenseDump.  The Loader is named explicitly because newer PyYAML
      # releases no longer accept yaml.load() without one; yaml.Loader is the
      # loader older releases used by default.
      yaml_dump = yaml.load(f, Loader=yaml.Loader)
    for key, value in yaml_dump:
      pkg.__dict__[key] = value

  def LicensedPackages(self, license_name):
    """Return list of packages using a given license."""
    return self.licenses[license_name]

  def LoadPackageInfo(self, board):
    """Populate basic package info for all packages from their ebuild.

    Args:
      board: board name assigned to every created PackageInfo.
    """
    for package_name in self._package_fullnames:
      pkg = PackageInfo()
      pkg.board = board
      pkg.GetPackageInfo(package_name)
      self.packages[package_name] = pkg

  def HookPackageProcess(self, pkg_build_path):
    """Different entry point to populate a packageinfo.

    This is called instead of LoadPackageInfo when called by a package build.
    The package's licenses are gathered (unless it is flagged as skipped) and
    its license dump is saved.

    Args:
      pkg_build_path: path of the unpacked package being built by emerge.
    """
    pkg = PackageInfo()
    pkg.build_source_tree = pkg_build_path
    pkg.GetPackageInfo(None)
    if not pkg.skip:
      pkg.GetLicenses()
    self._SaveLicenseDump(pkg)

  def ProcessPackageLicenses(self):
    """Iterate through all packages provided and gather their licenses.

    GetLicenses will scrape licenses from the code and/or gather stock license
    names. We gather the list of stock and custom ones for later processing.

    Packages whose licensing failed end up listed in self.incomplete_packages.

    Do not call this after adding virtual packages with AddExtraPkg.
    """
    for package_name, pkg in self.packages.items():
      if pkg.skip:
        if self.gen_licenses:
          logging.info("Package %s is in skip list", package_name)
        elif pkg.category == "virtual":
          # If we do a licensing run expecting to get licensing objects from
          # an image build, virtual packages will be missing such objects
          # because virtual packages do not get the install hook run at build
          # time. Because this script may not have permissions to write in the
          # /var/db/ directory, we don't want it to generate useless license
          # bits for virtual packages. As a result, ignore virtual packages
          # here.
          logging.debug("Ignoring %s virtual package", package_name)
          continue

      # Other skipped packages get dumped with incomplete info and the skip flag
      dump_missing = not os.path.exists(pkg.license_dump_path)
      if dump_missing and not self.gen_licenses:
        logging.warning(">>> License for %s is missing, creating now <<<",
                        package_name)
      if dump_missing or self.gen_licenses:
        if not pkg.skip:
          try:
            pkg.GetLicenses()
          except PackageLicenseError:
            pkg.licensing_failed = True
        # We dump packages where licensing failed too.
        self._SaveLicenseDump(pkg)

    # To debug the code, we force the data to be re-read from the dumps
    # instead of reusing what we may have in memory.
    for pkg in self.packages.values():
      if pkg.category == "virtual":
        continue

      self._LoadLicenseDump(pkg)
      logging.debug("loaded dump for %s", pkg.fullnamerev)
      if pkg.skip:
        logging.info("Package %s is in skip list", pkg.fullnamerev)
      if pkg.licensing_failed:
        logging.info("Package %s failed licensing", pkg.fullnamerev)
        self.incomplete_packages.append(pkg.fullnamerev)

  def AddExtraPkg(self, pkg_data):
    """Allow adding pre-created virtual packages.

    GetLicenses will not work on them, so add them after having run
    ProcessPackageLicenses.

    Args:
      pkg_data: sequence laid out as
        [category, name, version, homepages (list), license names (list)].
    """
    pkg = PackageInfo()
    pkg.board = self.board
    pkg.category = pkg_data[0]
    pkg.name = pkg_data[1]
    pkg.version = pkg_data[2]
    pkg.homepages = pkg_data[3]  # this is a list
    # license_names gets its own copy: GenerateHTMLLicenseOutput removes
    # entries from it, which must not also alter ebuild_license_names or the
    # caller's data.
    pkg.license_names = list(pkg_data[4])
    pkg.ebuild_license_names = pkg_data[4]
    self.packages[pkg.fullnamerev] = pkg

  # Called directly by src/repohooks/pre-upload.py
  @staticmethod
  def FindLicenseType(license_name):
    """Says if a license is stock Gentoo, custom, or doesn't exist.

    Args:
      license_name: file name of the license to look for.

    Returns:
      "Gentoo Package Stock" if the license is in a stock directory, "Custom"
      if it is in a custom directory or listed in SKIPPED_LICENSES.

    Raises:
      AssertionError: if the license could not be found anywhere.
    """
    for directory in STOCK_LICENSE_DIRS:
      path = '%s/%s' % (directory, license_name)
      if os.path.exists(path):
        return "Gentoo Package Stock"

    for directory in CUSTOM_LICENSE_DIRS:
      path = '%s/%s' % (directory, license_name)
      if os.path.exists(path):
        return "Custom"

    if license_name in SKIPPED_LICENSES:
      return "Custom"

    raise AssertionError("""
license %s could not be found in %s
If the license in the ebuild is correct,
a) a stock license should be added to portage-stable/licenses :
running `cros_portage_upgrade` inside of the chroot should clone this repo
to /tmp/portage/:
https://chromium.googlesource.com/chromiumos/overlays/portage/+/gentoo
find the new licenses under licenses, and add them to portage-stable/licenses

b) if it's a non gentoo package with a custom license, you can copy that license
to third_party/chromiumos-overlay/licenses/

Try re-running the script with -p cat/package-ver --generate
after fixing the license.""" %
                         (license_name,
                          '\n'.join(STOCK_LICENSE_DIRS + CUSTOM_LICENSE_DIRS))
                        )

  @staticmethod
  def ReadSharedLicense(license_name):
    """Read and return stock or cust license file specified in an ebuild.

    Stock directories are searched before custom ones; the first match wins.

    Args:
      license_name: file name of the license to read.

    Returns:
      The license text, as returned by ReadUnknownEncodedFile.

    Raises:
      AssertionError: if no license directory contains |license_name|.
    """
    for directory in STOCK_LICENSE_DIRS + CUSTOM_LICENSE_DIRS:
      path = os.path.join(directory, license_name)
      if os.path.exists(path):
        return ReadUnknownEncodedFile(path, "read license")

    raise AssertionError("license %s could not be found in %s"
                         % (license_name,
                            '\n'.join(STOCK_LICENSE_DIRS +
                                      CUSTOM_LICENSE_DIRS))
                        )

  @staticmethod
  def EvaluateTemplate(template, env):
    """Expand a template with vars like {{foo}} using a dict of expansions.

    Args:
      template: template text.
      env: dict mapping variable names to their replacement strings.

    Returns:
      |template| with every {{key}} replaced by env[key].
    """
    # TODO switch to stock python templates.
    # items() (rather than iteritems()) exists on both Python 2 and 3 dicts.
    for key, val in env.items():
      template = template.replace('{{%s}}' % key, val)
    return template

  def _GeneratePackageLicenseText(self, pkg):
    """Concatenate all licenses related to a pkg.

    This means a combination of ebuild shared licenses and licenses read from
    the pkg source tree, if any.  The rendered entry is stored in
    self.package_text[pkg]; self.entry_template must already be loaded.

    Args:
      pkg: PackageInfo object

    Raises:
      AssertionError: on runtime errors
    """
    license_text = []
    for license_text_scanned in pkg.license_text_scanned:
      license_text.append(license_text_scanned)
      # Visual separator after each scanned license.
      license_text.append('%s\n' % ('-=' * 40))

    license_pointers = []
    # sln: shared license name.
    for sln in pkg.license_names:
      # Says whether it's a stock gentoo or custom license.
      license_type = self.FindLicenseType(sln)
      license_pointers.append(
          "<li><a href='#%s'>%s License %s</a></li>" % (
              sln, license_type, sln))

    # This should get caught earlier, but one extra check.
    if not license_text and not license_pointers:
      # The package name is interpolated here; handing it to AssertionError
      # as a second argument would leave the '%s' unexpanded.
      raise AssertionError('Ended up with no license_text for %s' % pkg.name)

    env = {
        'name': "%s-%s" % (pkg.name, pkg.version),
        'url': cgi.escape(pkg.homepages[0]) if pkg.homepages else '',
        'licenses_txt': cgi.escape('\n'.join(license_text)),
        'licenses_ptr': '\n'.join(license_pointers),
    }
    self.package_text[pkg] = self.EvaluateTemplate(self.entry_template, env)

  def GenerateHTMLLicenseOutput(self, output_file,
                                output_template=TMPL,
                                entry_template=ENTRY_TMPL,
                                license_template=SHARED_LICENSE_TMPL):
    """Generate the combined html license file used in ChromeOS.

    Packages flagged as skipped or as having failed licensing are left out.
    A shared license used by exactly one package is folded into that
    package's own entry instead of being listed in the shared section.

    Args:
      output_file: resulting HTML license output.
      output_template: template for the entire HTML file.
      entry_template: template for per package entries.
      license_template: template for shared license entries.
    """
    self.entry_template = ReadUnknownEncodedFile(entry_template)

    # Keep track of which licenses are used by which packages.
    for pkg in self.packages.values():
      if pkg.skip or pkg.licensing_failed:
        continue
      for sln in pkg.license_names:
        self.licenses.setdefault(sln, []).append(pkg.fullnamerev)

    # Find licenses only used once, and roll them in the package that uses them.
    # We iterate over a snapshot of the names because self.licenses is modified
    # in the loop (list() makes that snapshot on both Python 2 and 3).
    for sln in list(self.licenses):
      if len(self.licenses[sln]) != 1:
        continue
      pkg_fullnamerev = self.licenses[sln][0]
      logging.info("Collapsing shared license %s into single use license "
                   "(only used by %s)", sln, pkg_fullnamerev)
      license_type = self.FindLicenseType(sln)
      license_txt = self.ReadSharedLicense(sln)
      single_license = "%s License %s:\n\n%s" % (license_type, sln,
                                                 license_txt)
      pkg = self.packages[pkg_fullnamerev]
      pkg.license_text_scanned.append(single_license)
      pkg.license_names.remove(sln)
      del self.licenses[sln]

    sorted_license_txt = []
    for pkg in sorted(self.packages.values(),
                      key=lambda x: (x.name.lower(), x.version, x.revision)):
      if pkg.skip:
        logging.debug("Skipping package %s", pkg.fullnamerev)
        continue
      if pkg.licensing_failed:
        logging.debug("Package %s failed licensing, skipping", pkg.fullnamerev)
        continue
      self._GeneratePackageLicenseText(pkg)
      sorted_license_txt.append(self.package_text[pkg])

    # Now generate the bottom of the page that will contain all the shared
    # licenses and a list of who is pointing to them.
    license_template = ReadUnknownEncodedFile(license_template)

    licenses_txt = []
    for license_name in self.sorted_licenses:
      env = {
          'license_name': license_name,
          'license': cgi.escape(self.ReadSharedLicense(license_name)),
          'license_type': self.FindLicenseType(license_name),
          'license_packages': ' '.join(self.LicensedPackages(license_name)),
      }
      licenses_txt.append(self.EvaluateTemplate(license_template, env))

    file_template = ReadUnknownEncodedFile(output_template)
    env = {
        'entries': '\n'.join(sorted_license_txt),
        'licenses': '\n'.join(licenses_txt),
    }
    osutils.WriteFile(output_file,
                      self.EvaluateTemplate(file_template, env).encode('UTF-8'))
+
+
def ListInstalledPackages(board, all_packages=False):
  """Return a list of all packages installed for a particular board.

  Args:
    board: board name, used to pick the equery-<board> / emerge-<board>
      wrapper to run.
    all_packages: if True, all packages visible in the build chroot are
      listed. This is not what you want for a release license file, but it's
      a way to run licensing checks against all packages. If False, only the
      packages used for a release build (as determined by the dependencies
      for virtual/target-os) are listed.

  Returns:
    A list of package strings: the raw output lines of equery when
    |all_packages| is set, otherwise the package part of each matching line
    of emerge's --pretend output.

  Raises:
    AssertionError: if emerge reports a package in a state other than 'R'.
  """
  if all_packages:
    # The following returns all packages that were part of the build tree
    # (many get built or used during the build, but do not get shipped).
    # Note that it also contains packages that are in the build as
    # defined by build_packages but not part of the image we ship.
    args = ["equery-%s" % board, "list", "*"]
    packages = cros_build_lib.RunCommand(args, print_cmd=debug,
                                         redirect_stdout=True
                                        ).output.splitlines()
  else:
    # Ask emerge (in --pretend mode, so nothing gets installed) for the full
    # dependency tree of virtual/target-os, build-time dependencies included.
    args = ["emerge-%s" % board, "--with-bdeps=y", "--usepkgonly",
            "--emptytree", "--pretend", "--color=n", "virtual/target-os"]
    emerge = cros_build_lib.RunCommand(args, print_cmd=debug,
                                       redirect_stdout=True).output.splitlines()
    # Another option which we've decided not to use, is bdeps=n.  This outputs
    # just the packages we ship, but not the packages that were used to build
    # them, including a package like flex which generates a .a that is included
    # and shipped in ChromeOS.
    # We've decided to credit build packages, even if we're not legally required
    # to (it's always nice to do), and that way we get corner case packages like
    # flex. This is why we use bdep=y and not bdep=n.

    packages = []
    # [binary   R    ] x11-libs/libva-1.1.1 to /build/x86-alex/
    pkg_rgx = re.compile(r'\[[^]]+R[^]]+\] (.+) to /build/.*')
    # If we match something else without the 'R' like
    # [binary     U  ] chromeos-base/pepper-flash-13.0.0.133-r1 [12.0.0.77-r1]
    # this is bad and we should die on this.
    pkg_rgx2 = re.compile(r'(\[[^]]+\] .+) to /build/.*')
    for line in emerge:
      match = pkg_rgx.search(line)
      if match:
        packages.append(match.group(1))
        continue
      # Only worth checking the looser pattern once the strict one failed.
      match2 = pkg_rgx2.search(line)
      if match2:
        # Build one message string; passing the two pieces as separate
        # AssertionError arguments would render the error as a tuple.
        raise AssertionError(
            "Package incorrectly installed, try eclean-%s\n%s" %
            (board, match2.group(1)))

  return packages
+
+
+def _HandleIllegalXMLChars(text):
+ """Handles illegal XML Characters.
+
+ XML 1.0 acceptable character range:
+ Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | \
+ [#x10000-#x10FFFF]
+
+ This function finds all illegal characters in the text and filters
+ out all whitelisted characters (e.g. ^L).
+
+ Args:
+ text: text to examine.
+
+ Returns:
+ Filtered |text| and a list of non-whitelisted illegal characters found.
+ """
+ whitelist_re = re.compile(u'[\x0c]')
+ text = whitelist_re.sub('', text)
+ # illegal_chars_re includes all illegal characters (whitelisted or
+ # not), so we can expand the whitelist without modifying this line.
+ illegal_chars_re = re.compile(
+ u'[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]')
+ return (text, illegal_chars_re.findall(text))
+
+
def ReadUnknownEncodedFile(file_path, logging_text=None):
  """Read a file of unknown encoding (UTF-8 or latin) by trying in sequence.

  The file is read once as raw bytes, then decoded as UTF-8, falling back to
  latin1 if the bytes are not valid UTF-8.  Whitelisted illegal XML characters
  are stripped from the result by _HandleIllegalXMLChars.

  Args:
    file_path: what to read.
    logging_text: what to display for logging depending on file read.

  Returns:
    File content as decoded text.

  Raises:
    ValueError: if non-whitelisted illegal XML characters are found in the
      file.
  """
  with open(file_path, 'rb') as f:
    raw = f.read()

  # Decoding the complete byte string in one go is strict about the whole
  # file: a file ending in an incomplete UTF-8 sequence (e.g. a latin1 file
  # whose last byte is an accented letter) fails here and is re-read as
  # latin1, rather than being accepted as UTF-8 with its tail missing.
  try:
    file_txt = raw.decode('utf-8')
    encoding_label = 'UTF-8'
  except UnicodeDecodeError:
    file_txt = raw.decode('latin1')
    encoding_label = 'latin1'

  if logging_text:
    logging.info("%s %s (%s)", logging_text, file_path, encoding_label)

  file_txt, char_list = _HandleIllegalXMLChars(file_txt)

  if char_list:
    raise ValueError('Illegal XML characters %s found in %s.' %
                     (char_list, file_path))

  return file_txt