aboutsummaryrefslogtreecommitdiff
path: root/idsearcher/idsearcher.go
diff options
context:
space:
mode:
Diffstat (limited to 'idsearcher/idsearcher.go')
-rw-r--r--idsearcher/idsearcher.go228
1 files changed, 228 insertions, 0 deletions
diff --git a/idsearcher/idsearcher.go b/idsearcher/idsearcher.go
new file mode 100644
index 0000000..253bdaa
--- /dev/null
+++ b/idsearcher/idsearcher.go
@@ -0,0 +1,228 @@
+// Package idsearcher is used to search for short-form IDs in files
+// within a directory, and to build an SPDX Document containing those
+// license findings.
+// SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later
+package idsearcher
+
+import (
+ "bufio"
+ "fmt"
+ "os"
+ "path/filepath"
+ "regexp"
+ "sort"
+ "strings"
+
+ "github.com/spdx/tools-golang/builder"
+ "github.com/spdx/tools-golang/spdx"
+ "github.com/spdx/tools-golang/utils"
+)
+
// Config is a collection of configuration settings for idsearcher
// (for version 2.1 SPDX Documents). A few mandatory fields are set here
// so that they can be repeatedly reused in multiple calls to
// BuildIDsDocument.
type Config struct {
	// NamespacePrefix should be a URI representing a prefix for the
	// namespace with which the SPDX Document will be associated.
	// BuildIDsDocument forwards it unchanged to the builder's
	// Config2_1.NamespacePrefix; per the builder's contract it is used in
	// the DocumentNamespace field in the CreationInfo section, followed by
	// the per-Document package name and a random UUID.
	NamespacePrefix string

	// BuilderPathsIgnored lists certain paths to be omitted from the built
	// document. Each string should be a path, relative to the package's
	// dirRoot, to a specific file or (for all files in a directory) ending
	// in a slash. Prefix the string with "**" to omit all instances of that
	// file / directory, regardless of where it is in the file tree.
	BuilderPathsIgnored []string

	// SearcherPathsIgnored lists certain paths that should not be searched
	// by idsearcher, even if those paths have Files present. Such files
	// still appear in the Document, but keep NOASSERTION license values.
	// It uses the same format as BuilderPathsIgnored.
	SearcherPathsIgnored []string
}
+
+// BuildIDsDocument creates an SPDX Document (version 2.1) and searches for
+// short-form IDs in each file, filling in license fields as appropriate. It
+// returns that document or error if any is encountered. Arguments:
+// - packageName: name of package / directory
+// - dirRoot: path to directory to be analyzed
+// - namespacePrefix: URI representing a prefix for the
+// namespace with which the SPDX Document will be associated
+func BuildIDsDocument(packageName string, dirRoot string, idconfig *Config) (*spdx.Document2_1, error) {
+ // first, build the Document using builder
+ bconfig := &builder.Config2_1{
+ NamespacePrefix: idconfig.NamespacePrefix,
+ CreatorType: "Tool",
+ Creator: "github.com/spdx/tools-golang/idsearcher",
+ PathsIgnored: idconfig.BuilderPathsIgnored,
+ }
+ doc, err := builder.Build2_1(packageName, dirRoot, bconfig)
+ if err != nil {
+ return nil, err
+ }
+ if doc == nil {
+ return nil, fmt.Errorf("builder returned nil Document")
+ }
+ if doc.Packages == nil {
+ return nil, fmt.Errorf("builder returned nil Package")
+ }
+ if len(doc.Packages) != 1 {
+ return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages))
+ }
+
+ // now, walk through each file and find its licenses (if any)
+ pkg := doc.Packages[0]
+ if pkg.Files == nil {
+ return nil, fmt.Errorf("builder returned nil Files in Package")
+ }
+ licsForPackage := map[string]int{}
+ for _, f := range pkg.Files {
+ // start by initializing / clearing values
+ f.LicenseInfoInFile = []string{"NOASSERTION"}
+ f.LicenseConcluded = "NOASSERTION"
+
+ // check whether the searcher should ignore this file
+ if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) {
+ continue
+ }
+
+ fPath := filepath.Join(dirRoot, f.FileName)
+ // FIXME this is not preferable -- ignoring error
+ ids, _ := searchFileIDs(fPath)
+ // FIXME for now, proceed onwards with whatever IDs we obtained.
+ // FIXME instead of ignoring the error, should probably either log it,
+ // FIXME and/or enable the caller to configure what should happen.
+
+ // separate out for this file's licenses
+ licsForFile := map[string]int{}
+ licsParens := []string{}
+ for _, lid := range ids {
+ // get individual elements and add for file and package
+ licElements := getIndividualLicenses(lid)
+ for _, elt := range licElements {
+ licsForFile[elt] = 1
+ licsForPackage[elt] = 1
+ }
+ // parenthesize if needed and add to slice for joining
+ licsParens = append(licsParens, makeElement(lid))
+ }
+
+ // OK -- now we can fill in the file's details, or NOASSERTION if none
+ if len(licsForFile) > 0 {
+ f.LicenseInfoInFile = []string{}
+ for lic := range licsForFile {
+ f.LicenseInfoInFile = append(f.LicenseInfoInFile, lic)
+ }
+ sort.Strings(f.LicenseInfoInFile)
+ // avoid adding parens and joining for single-ID items
+ if len(licsParens) == 1 {
+ f.LicenseConcluded = ids[0]
+ } else {
+ f.LicenseConcluded = strings.Join(licsParens, " AND ")
+ }
+ }
+ }
+
+ // and finally, we can fill in the package's details
+ if len(licsForPackage) == 0 {
+ pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"}
+ } else {
+ pkg.PackageLicenseInfoFromFiles = []string{}
+ for lic := range licsForPackage {
+ pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic)
+ }
+ sort.Strings(pkg.PackageLicenseInfoFromFiles)
+ }
+
+ return doc, nil
+}
+
// ===== Utility functions =====

// trashRegexp matches every run of characters that cannot be part of a
// license expression: anything other than word characters, whitespace,
// '.', '-', '+' and parentheses. It is compiled once at package scope
// rather than on every stripTrash call.
var trashRegexp = regexp.MustCompile(`[^\w\s\d.\-\+()]+`)

// searchFileIDs scans the file at filePath line by line and returns the
// sorted, de-duplicated short-form license IDs found after
// "SPDX-License-Identifier:" tags.
//
// A line is skipped when more than 5 characters (after stripTrash) precede
// the tag, or when nothing usable follows it. The ID ends at the first "*/"
// if present, and is cleaned with stripTrash and trimmed of surrounding
// whitespace.
//
// An error is returned only if the file cannot be opened. Read errors hit
// while scanning are ignored, so the IDs found up to that point are
// returned.
func searchFileIDs(filePath string) ([]string, error) {
	const tag = "SPDX-License-Identifier:"

	f, err := os.Open(filePath)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	idsMap := map[string]int{}
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		line := scanner.Text()
		idx := strings.Index(line, tag)
		if idx < 0 {
			continue
		}

		// if prefixed by more than n characters, it's probably not a
		// short-form ID; it's probably code to detect short-form IDs.
		// Like this function itself, for example =)
		if prefix := stripTrash(line[:idx]); len(prefix) > 5 {
			continue
		}

		// stop before trailing */ if it is present
		lid := strings.Split(line[idx+len(tag):], "*/")[0]
		// Trim AFTER stripping: removing a trailing comment marker such as
		// '#' can expose whitespace that must not remain in the ID.
		lid = strings.TrimSpace(stripTrash(lid))
		if lid == "" {
			// a tag with nothing usable after it is not an ID
			continue
		}
		idsMap[lid] = 1
	}

	// FIXME for now, ignore scanner errors because we want to return whatever
	// FIXME IDs were in fact found. should probably be changed to either
	// FIXME log the error, and/or be configurable for what should happen.

	// now, convert the set to a sorted slice
	ids := make([]string, 0, len(idsMap))
	for lid := range idsMap {
		ids = append(ids, lid)
	}
	sort.Strings(ids)

	return ids, nil
}

// stripTrash removes from lid every character that is not a word character,
// whitespace, '.', '-', '+' or a parenthesis (e.g. comment markers and
// quotes). Whitespace is preserved, so callers trim separately if needed.
func stripTrash(lid string) string {
	return trashRegexp.ReplaceAllString(lid, "")
}
+
// makeElement returns lic wrapped in parentheses if it is a compound
// expression, i.e. contains an AND or OR operator, so that it can be safely
// joined with other expressions; otherwise it returns lic unchanged.
//
// Operators are recognized as whitespace-delimited tokens, compared
// case-insensitively, which matches how getIndividualLicenses identifies
// them. WITH alone does not trigger parenthesization.
func makeElement(lic string) string {
	for _, tok := range strings.Fields(lic) {
		// tolerate parens glued to the operator token
		tok = strings.Trim(tok, "()")
		if strings.EqualFold(tok, "AND") || strings.EqualFold(tok, "OR") {
			return fmt.Sprintf("(%s)", lic)
		}
	}

	return lic
}
+
// getIndividualLicenses splits the license expression lic into its
// individual identifiers and returns them sorted. Parentheses and '+' are
// treated as separators, tokens are split on any run of whitespace, and the
// operators AND, OR and WITH (in any letter case) are dropped. Duplicates
// are not removed. The result is never nil.
func getIndividualLicenses(lic string) []string {
	// replace parens and '+' with spaces
	lic = strings.NewReplacer("(", " ", ")", " ", "+", " ").Replace(lic)

	// Fields splits on any whitespace (tabs included) and never yields
	// empty tokens, so no per-token trimming is needed.
	lics := []string{}
	for _, elt := range strings.Fields(lic) {
		// don't add case-insensitive operators
		if strings.EqualFold(elt, "AND") ||
			strings.EqualFold(elt, "OR") ||
			strings.EqualFold(elt, "WITH") {
			continue
		}
		lics = append(lics, elt)
	}

	// sort before returning
	sort.Strings(lics)
	return lics
}