diff options
Diffstat (limited to 'v2/tokenizer_test.go')
-rw-r--r-- | v2/tokenizer_test.go | 102 |
1 files changed, 83 insertions, 19 deletions
diff --git a/v2/tokenizer_test.go b/v2/tokenizer_test.go index 662685c..6ddab4c 100644 --- a/v2/tokenizer_test.go +++ b/v2/tokenizer_test.go @@ -15,6 +15,7 @@ package classifier import ( + "io" "strings" "testing" @@ -56,7 +57,7 @@ func TestCleanupToken(t *testing.T) { }, } for _, test := range tests { - if got := cleanupToken(test.input); got != test.output { + if got := cleanupToken(0, test.input, true); got != test.output { t.Errorf("%q: got %q want %q", test.input, got, test.output) } } @@ -66,8 +67,21 @@ func TestTokenize(t *testing.T) { tests := []struct { name string input string - output *document + output *indexedDocument }{ + {name: "hyphenization recovery", + input: `basket- +ball`, + output: &indexedDocument{ + Tokens: []indexedToken{ + { + ID: 1, + Line: 1, + }, + }, + Norm: "basketball", + }, + }, { name: "basic scenario", input: `The AWESOME Project LICENSE @@ -80,63 +94,112 @@ Copyright 1996-2002, 2006 by A. Developer Introduction The AWESOME Project`, - output: &document{ - Tokens: []*token{ + output: &indexedDocument{ + Tokens: []indexedToken{ { - Text: "the", + ID: 1, Line: 1, }, { - Text: "awesome", + ID: 2, Line: 1, }, { - Text: "project", + ID: 3, Line: 1, }, { - Text: "license", + ID: 4, Line: 1, }, { - Text: "modifications", + ID: 5, Line: 3, }, { - Text: "prohibited", + ID: 6, Line: 4, }, { - Text: "introduction", + ID: 7, Line: 8, }, { - Text: "the", + ID: 1, Line: 10, }, { - Text: "awesome", + ID: 2, Line: 10, }, { - Text: "project", + ID: 3, Line: 10, }, }, Matches: Matches{&Match{Name: "Copyright", Confidence: 1.0, MatchType: "Copyright", StartLine: 6, EndLine: 6}}, + Norm: "the awesome project license modifications prohibited introduction the awesome project", }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - d := tokenize([]byte(test.input)) - if !cmp.Equal(d, test.output, cmpopts.IgnoreUnexported(document{})) { - t.Errorf("%s failed: %s", test.name, cmp.Diff(d, test.output)) + d := tokenize([]byte(test.input), newDictionary(), true) + if diff := cmp.Diff(d, test.output, cmpopts.IgnoreUnexported(indexedDocument{})); diff != "" { + t.Errorf("%s failed:\nDiff(+got,-want): %s", test.name, diff) } }) } } +type mockReader struct { + t *testing.T + schedule []int + cur int +} + +func (m *mockReader) Read(buf []byte) (int, error) { + if m.cur > len(m.schedule) { + m.t.Fatal("Unexpected read on mock") + } + + if m.cur == len(m.schedule) { + return 0, io.EOF + } + + if len(buf) != m.schedule[m.cur] { + m.t.Fatalf("step %d: got %d, want %d", m.cur, len(buf), m.schedule[m.cur]) + } + m.cur++ + + for i := range buf { + buf[i] = 'a' + } + + return len(buf), nil +} + +func TestTokenizerBuffering(t *testing.T) { + dict := newDictionary() + mr := mockReader{ + t: t, + schedule: []int{1024, 1020, 1020}, + } + d, err := tokenizeStream(&mr, true, dict, true) + if err != nil { + t.Errorf("Read returned unexpected error: %v", err) + } + + // Do a basic test to make sure the data returned is sound + if len(d.Tokens) != 1 { + t.Errorf("Got %d tokens, expected 1", len(d.Tokens)) + } + + if len(d.Norm) != 3064 { + t.Errorf("Got %d bytes, expected 3064", len(d.Norm)) + } +} + func TestTokenizer(t *testing.T) { // This test focuses primarily on the textual content extracted and does not look // at the other parts of the document. @@ -229,10 +292,11 @@ The FreeType Project`, for _, test := range tests { t.Run(test.name, func(t *testing.T) { - d := tokenize([]byte(test.input)) + dict := newDictionary() + d := tokenize([]byte(test.input), dict, true) var b strings.Builder for _, tok := range d.Tokens { - b.WriteString(tok.Text) + b.WriteString(dict.getWord(tok.ID)) b.WriteString(" ") } actual := strings.TrimSpace(b.String()) |