// Package idsearcher is used to search for short-form IDs in files // within a directory, and to build an SPDX Document containing those // license findings. // SPDX-License-Identifier: Apache-2.0 OR GPL-2.0-or-later package idsearcher import ( "bufio" "fmt" "github.com/spdx/tools-golang/spdx/v2_3" "os" "path/filepath" "regexp" "sort" "strings" "github.com/spdx/tools-golang/builder" "github.com/spdx/tools-golang/spdx/v2_1" "github.com/spdx/tools-golang/spdx/v2_2" "github.com/spdx/tools-golang/utils" ) // ===== 2.1 Searcher functions ===== // Config2_1 is a collection of configuration settings for docbuilder // (for version 2.1 SPDX Documents). A few mandatory fields are set here // so that they can be repeatedly reused in multiple calls to Build2_1. type Config2_1 struct { // NamespacePrefix should be a URI representing a prefix for the // namespace with which the SPDX Document will be associated. // It will be used in the DocumentNamespace field in the CreationInfo // section, followed by the per-Document package name and a random UUID. NamespacePrefix string // BuilderPathsIgnored lists certain paths to be omitted from the built // document. Each string should be a path, relative to the package's // dirRoot, to a specific file or (for all files in a directory) ending // in a slash. Prefix the string with "**" to omit all instances of that // file / directory, regardless of where it is in the file tree. BuilderPathsIgnored []string // SearcherPathsIgnored lists certain paths that should not be searched // by idsearcher, even if those paths have Files present. It uses the // same format as BuilderPathsIgnored. SearcherPathsIgnored []string } // BuildIDsDocument2_1 creates an SPDX Document (version 2.1) and searches for // short-form IDs in each file, filling in license fields as appropriate. It // returns that document or error if any is encountered. Arguments: // - packageName: name of package / directory // - dirRoot: path to directory to be analyzed // - namespacePrefix: URI representing a prefix for the // namespace with which the SPDX Document will be associated func BuildIDsDocument2_1(packageName string, dirRoot string, idconfig *Config2_1) (*v2_1.Document, error) { // first, build the Document using builder bconfig := &builder.Config2_1{ NamespacePrefix: idconfig.NamespacePrefix, CreatorType: "Tool", Creator: "github.com/spdx/tools-golang/idsearcher", PathsIgnored: idconfig.BuilderPathsIgnored, } doc, err := builder.Build2_1(packageName, dirRoot, bconfig) if err != nil { return nil, err } if doc == nil { return nil, fmt.Errorf("builder returned nil Document") } if doc.Packages == nil { return nil, fmt.Errorf("builder returned nil Packages map") } if len(doc.Packages) != 1 { return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages)) } // now, walk through each file and find its licenses (if any) pkg := doc.Packages[0] if pkg == nil { return nil, fmt.Errorf("builder returned nil Package") } if pkg.Files == nil { return nil, fmt.Errorf("builder returned nil Files in Package") } licsForPackage := map[string]int{} for _, f := range pkg.Files { // start by initializing / clearing values f.LicenseInfoInFiles = []string{"NOASSERTION"} f.LicenseConcluded = "NOASSERTION" // check whether the searcher should ignore this file if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) { continue } fPath := filepath.Join(dirRoot, f.FileName) // FIXME this is not preferable -- ignoring error ids, _ := searchFileIDs(fPath) // FIXME for now, proceed onwards with whatever IDs we obtained. // FIXME instead of ignoring the error, should probably either log it, // FIXME and/or enable the caller to configure what should happen. // separate out for this file's licenses licsForFile := map[string]int{} licsParens := []string{} for _, lid := range ids { // get individual elements and add for file and package licElements := getIndividualLicenses(lid) for _, elt := range licElements { licsForFile[elt] = 1 licsForPackage[elt] = 1 } // parenthesize if needed and add to slice for joining licsParens = append(licsParens, makeElement(lid)) } // OK -- now we can fill in the file's details, or NOASSERTION if none if len(licsForFile) > 0 { f.LicenseInfoInFiles = []string{} for lic := range licsForFile { f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic) } sort.Strings(f.LicenseInfoInFiles) // avoid adding parens and joining for single-ID items if len(licsParens) == 1 { f.LicenseConcluded = ids[0] } else { f.LicenseConcluded = strings.Join(licsParens, " AND ") } } } // and finally, we can fill in the package's details if len(licsForPackage) == 0 { pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"} } else { pkg.PackageLicenseInfoFromFiles = []string{} for lic := range licsForPackage { pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic) } sort.Strings(pkg.PackageLicenseInfoFromFiles) } return doc, nil } // ===== 2.2 Searcher functions ===== // Config2_2 is a collection of configuration settings for docbuilder // (for version 2.2 SPDX Documents). A few mandatory fields are set here // so that they can be repeatedly reused in multiple calls to Build2_2. type Config2_2 struct { // NamespacePrefix should be a URI representing a prefix for the // namespace with which the SPDX Document will be associated. // It will be used in the DocumentNamespace field in the CreationInfo // section, followed by the per-Document package name and a random UUID. NamespacePrefix string // BuilderPathsIgnored lists certain paths to be omitted from the built // document. Each string should be a path, relative to the package's // dirRoot, to a specific file or (for all files in a directory) ending // in a slash. Prefix the string with "**" to omit all instances of that // file / directory, regardless of where it is in the file tree. BuilderPathsIgnored []string // SearcherPathsIgnored lists certain paths that should not be searched // by idsearcher, even if those paths have Files present. It uses the // same format as BuilderPathsIgnored. SearcherPathsIgnored []string } // BuildIDsDocument2_2 creates an SPDX Document (version 2.2) and searches for // short-form IDs in each file, filling in license fields as appropriate. It // returns that document or error if any is encountered. Arguments: // - packageName: name of package / directory // - dirRoot: path to directory to be analyzed // - namespacePrefix: URI representing a prefix for the // namespace with which the SPDX Document will be associated func BuildIDsDocument2_2(packageName string, dirRoot string, idconfig *Config2_2) (*v2_2.Document, error) { // first, build the Document using builder bconfig := &builder.Config2_2{ NamespacePrefix: idconfig.NamespacePrefix, CreatorType: "Tool", Creator: "github.com/spdx/tools-golang/idsearcher", PathsIgnored: idconfig.BuilderPathsIgnored, } doc, err := builder.Build2_2(packageName, dirRoot, bconfig) if err != nil { return nil, err } if doc == nil { return nil, fmt.Errorf("builder returned nil Document") } if doc.Packages == nil { return nil, fmt.Errorf("builder returned nil Packages map") } if len(doc.Packages) != 1 { return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages)) } // now, walk through each file and find its licenses (if any) pkg := doc.Packages[0] if pkg == nil { return nil, fmt.Errorf("builder returned nil Package") } if pkg.Files == nil { return nil, fmt.Errorf("builder returned nil Files in Package") } licsForPackage := map[string]int{} for _, f := range pkg.Files { // start by initializing / clearing values f.LicenseInfoInFiles = []string{"NOASSERTION"} f.LicenseConcluded = "NOASSERTION" // check whether the searcher should ignore this file if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) { continue } fPath := filepath.Join(dirRoot, f.FileName) // FIXME this is not preferable -- ignoring error ids, _ := searchFileIDs(fPath) // FIXME for now, proceed onwards with whatever IDs we obtained. // FIXME instead of ignoring the error, should probably either log it, // FIXME and/or enable the caller to configure what should happen. // separate out for this file's licenses licsForFile := map[string]int{} licsParens := []string{} for _, lid := range ids { // get individual elements and add for file and package licElements := getIndividualLicenses(lid) for _, elt := range licElements { licsForFile[elt] = 1 licsForPackage[elt] = 1 } // parenthesize if needed and add to slice for joining licsParens = append(licsParens, makeElement(lid)) } // OK -- now we can fill in the file's details, or NOASSERTION if none if len(licsForFile) > 0 { f.LicenseInfoInFiles = []string{} for lic := range licsForFile { f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic) } sort.Strings(f.LicenseInfoInFiles) // avoid adding parens and joining for single-ID items if len(licsParens) == 1 { f.LicenseConcluded = ids[0] } else { f.LicenseConcluded = strings.Join(licsParens, " AND ") } } } // and finally, we can fill in the package's details if len(licsForPackage) == 0 { pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"} } else { pkg.PackageLicenseInfoFromFiles = []string{} for lic := range licsForPackage { pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic) } sort.Strings(pkg.PackageLicenseInfoFromFiles) } return doc, nil } // ===== 2.3 Searcher functions ===== // Config2_3 is a collection of configuration settings for docbuilder // (for version 2.3 SPDX Documents). A few mandatory fields are set here // so that they can be repeatedly reused in multiple calls to Build2_3. type Config2_3 struct { // NamespacePrefix should be a URI representing a prefix for the // namespace with which the SPDX Document will be associated. // It will be used in the DocumentNamespace field in the CreationInfo // section, followed by the per-Document package name and a random UUID. NamespacePrefix string // BuilderPathsIgnored lists certain paths to be omitted from the built // document. Each string should be a path, relative to the package's // dirRoot, to a specific file or (for all files in a directory) ending // in a slash. Prefix the string with "**" to omit all instances of that // file / directory, regardless of where it is in the file tree. BuilderPathsIgnored []string // SearcherPathsIgnored lists certain paths that should not be searched // by idsearcher, even if those paths have Files present. It uses the // same format as BuilderPathsIgnored. SearcherPathsIgnored []string } // BuildIDsDocument2_3 creates an SPDX Document (version 2.3) and searches for // short-form IDs in each file, filling in license fields as appropriate. It // returns that document or error if any is encountered. Arguments: // - packageName: name of package / directory // - dirRoot: path to directory to be analyzed // - namespacePrefix: URI representing a prefix for the // namespace with which the SPDX Document will be associated func BuildIDsDocument2_3(packageName string, dirRoot string, idconfig *Config2_3) (*v2_3.Document, error) { // first, build the Document using builder bconfig := &builder.Config2_3{ NamespacePrefix: idconfig.NamespacePrefix, CreatorType: "Tool", Creator: "github.com/spdx/tools-golang/idsearcher", PathsIgnored: idconfig.BuilderPathsIgnored, } doc, err := builder.Build2_3(packageName, dirRoot, bconfig) if err != nil { return nil, err } if doc == nil { return nil, fmt.Errorf("builder returned nil Document") } if doc.Packages == nil { return nil, fmt.Errorf("builder returned nil Packages map") } if len(doc.Packages) != 1 { return nil, fmt.Errorf("builder returned %d Packages", len(doc.Packages)) } // now, walk through each file and find its licenses (if any) pkg := doc.Packages[0] if pkg == nil { return nil, fmt.Errorf("builder returned nil Package") } if pkg.Files == nil { return nil, fmt.Errorf("builder returned nil Files in Package") } licsForPackage := map[string]int{} for _, f := range pkg.Files { // start by initializing / clearing values f.LicenseInfoInFiles = []string{"NOASSERTION"} f.LicenseConcluded = "NOASSERTION" // check whether the searcher should ignore this file if utils.ShouldIgnore(f.FileName, idconfig.SearcherPathsIgnored) { continue } fPath := filepath.Join(dirRoot, f.FileName) // FIXME this is not preferable -- ignoring error ids, _ := searchFileIDs(fPath) // FIXME for now, proceed onwards with whatever IDs we obtained. // FIXME instead of ignoring the error, should probably either log it, // FIXME and/or enable the caller to configure what should happen. // separate out for this file's licenses licsForFile := map[string]int{} licsParens := []string{} for _, lid := range ids { // get individual elements and add for file and package licElements := getIndividualLicenses(lid) for _, elt := range licElements { licsForFile[elt] = 1 licsForPackage[elt] = 1 } // parenthesize if needed and add to slice for joining licsParens = append(licsParens, makeElement(lid)) } // OK -- now we can fill in the file's details, or NOASSERTION if none if len(licsForFile) > 0 { f.LicenseInfoInFiles = []string{} for lic := range licsForFile { f.LicenseInfoInFiles = append(f.LicenseInfoInFiles, lic) } sort.Strings(f.LicenseInfoInFiles) // avoid adding parens and joining for single-ID items if len(licsParens) == 1 { f.LicenseConcluded = ids[0] } else { f.LicenseConcluded = strings.Join(licsParens, " AND ") } } } // and finally, we can fill in the package's details if len(licsForPackage) == 0 { pkg.PackageLicenseInfoFromFiles = []string{"NOASSERTION"} } else { pkg.PackageLicenseInfoFromFiles = []string{} for lic := range licsForPackage { pkg.PackageLicenseInfoFromFiles = append(pkg.PackageLicenseInfoFromFiles, lic) } sort.Strings(pkg.PackageLicenseInfoFromFiles) } return doc, nil } // ===== Utility functions (not version-specific) ===== func searchFileIDs(filePath string) ([]string, error) { idsMap := map[string]int{} ids := []string{} f, err := os.Open(filePath) if err != nil { return nil, err } defer f.Close() scanner := bufio.NewScanner(f) for scanner.Scan() { if strings.Contains(scanner.Text(), "SPDX-License-Identifier:") { strs := strings.SplitN(scanner.Text(), "SPDX-License-Identifier:", 2) // if prefixed by more than n characters, it's probably not a // short-form ID; it's probably code to detect short-form IDs. // Like this function itself, for example =) prefix := stripTrash(strs[0]) if len(prefix) > 5 { continue } // stop before trailing */ if it is present lidToExtract := strs[1] lidToExtract = strings.Split(lidToExtract, "*/")[0] lid := strings.TrimSpace(lidToExtract) lid = stripTrash(lid) idsMap[lid] = 1 } } // FIXME for now, ignore scanner errors because we want to return whatever // FIXME IDs were in fact found. should probably be changed to either // FIXME log the error, and/or be configurable for what should happen. // if err = scanner.Err(); err != nil { // return nil, err // } // now, convert map to string for lid := range idsMap { ids = append(ids, lid) } // and sort it sort.Strings(ids) return ids, nil } func stripTrash(lid string) string { re := regexp.MustCompile(`[^\w\s\d.\-\+()]+`) return re.ReplaceAllString(lid, "") } func makeElement(lic string) string { if strings.Contains(lic, " AND ") || strings.Contains(lic, " OR ") { return fmt.Sprintf("(%s)", lic) } return lic } func getIndividualLicenses(lic string) []string { // replace parens and '+' with spaces lic = strings.Replace(lic, "(", " ", -1) lic = strings.Replace(lic, ")", " ", -1) lic = strings.Replace(lic, "+", " ", -1) // now, split by spaces, trim, and add to slice licElements := strings.Split(lic, " ") lics := []string{} for _, elt := range licElements { elt := strings.TrimSpace(elt) // don't add if empty or if case-insensitive operator if elt == "" || strings.EqualFold(elt, "AND") || strings.EqualFold(elt, "OR") || strings.EqualFold(elt, "WITH") { continue } lics = append(lics, elt) } // sort before returning sort.Strings(lics) return lics }