// Copyright 2020 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package classifier import ( "strings" "testing" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" ) func TestCleanupToken(t *testing.T) { tests := []struct { input string output string }{{ input: "cleanup!", output: "cleanup", }, { input: "12345", output: "12345", }, { input: "r1@zx42-", output: "rzx", }, { input: "12345,", output: "12345", }, { input: "12345-6789", output: "12345-6789", }, { input: "1(a)", output: "1", }, { input: "1.2.3", output: "1.2.3", }, } for _, test := range tests { if got := cleanupToken(test.input); got != test.output { t.Errorf("%q: got %q want %q", test.input, got, test.output) } } } func TestTokenize(t *testing.T) { tests := []struct { name string input string output *document }{ { name: "basic scenario", input: `The AWESOME Project LICENSE Modifi- cations prohibited Copyright 1996-2002, 2006 by A. Developer Introduction The AWESOME Project`, output: &document{ Tokens: []*token{ { Text: "the", Index: 0, Line: 1, }, { Text: "awesome", Index: 1, Line: 1, }, { Text: "project", Index: 2, Line: 1, }, { Text: "license", Index: 3, Line: 1, }, { Text: "modifications", Index: 4, Line: 3, }, { Text: "prohibited", Index: 5, Line: 4, }, { Text: "introduction", Index: 6, Line: 8, }, { Text: "the", Index: 7, Line: 10, }, { Text: "awesome", Index: 8, Line: 10, }, { Text: "project", Index: 9, Line: 10, }, }, }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { d := tokenize(test.input) if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) { t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output)) } }) } } func TestTokenizer(t *testing.T) { // This test focuses primarily on the textual content extracted and does not look // at the other parts of the document. tests := []struct { name string input string output string }{ { name: "Basic Tokens", input: "Here are some words. ", output: "here are some words", }, { name: "skips bullet headers", input: "* item the first\n· item the second", output: "item the first item the second", }, { name: "preserves version numbers but not header numbers", input: "sample rules\n1. Python 2.7.8 is a version of the language.", output: "sample rules python 2.7.8 is a version of the language", }, { name: "preserves version numbers across line breaks", input: "Python version\n2.7.8 is a version of the language.", output: "python version 2.7.8 is a version of the language", }, { name: "preserves punctuation", input: "Bill, Larry, and Sergey agree precision is critical!", output: "bill larry and sergey agree precision is critical", }, { name: "ignores comment characters and bullet formatting", input: "/* * item the first", output: "item the first", }, { name: "produces blank line as needed", input: "/* *", output: "", }, { name: "clobbers header looking thing as appropriate", input: " iv. this is a test", output: "this is a test", }, { name: "clobbers header looking thing as appropriate even in comment", input: "/* 1.2.3. this is a test", output: "this is a test", }, { name: "preserve version number (not a header, but header-looking) not at beginning of sentence", input: "This is version 1.1.", output: "this is version 1.1", }, { name: "copyright inside a comment", input: " /* Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved", output: "", }, { name: "FTL copyright text", input: `The FreeType Project LICENSE 2006-Jan-27 2006-01-27 Copyright 1996-2002, 2006 by David Turner, Robert Wilhelm, and Werner Lemberg Introduction The FreeType Project`, output: "the freetype project license introduction the freetype project", }, { name: "Separated text", input: `distribution and modifi‐ cation follow.`, output: "distribution and modification follow", }, { name: "preserve internal references, even on line break", input: "(ii) should be preserved as (ii) is preserved", output: "ii should be preserved as ii is preserved", }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { d := tokenize(test.input) var b strings.Builder for _, tok := range d.Tokens { b.WriteString(tok.Text) b.WriteString(" ") } actual := strings.TrimSpace(b.String()) if actual != test.output { t.Errorf("Tokenize(%q): got %q want %q", test.name, actual, test.output) } }) } }