diff options
Diffstat (limited to 'stringclassifier/searchset/tokenizer/tokenizer_test.go')
-rw-r--r-- | stringclassifier/searchset/tokenizer/tokenizer_test.go | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/stringclassifier/searchset/tokenizer/tokenizer_test.go b/stringclassifier/searchset/tokenizer/tokenizer_test.go new file mode 100644 index 0000000..d058e3f --- /dev/null +++ b/stringclassifier/searchset/tokenizer/tokenizer_test.go @@ -0,0 +1,119 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package tokenizer + +import ( + "reflect" + "testing" +) + +func TestTokenizer_Tokenize(t *testing.T) { + tests := []struct { + text string + want Tokens + }{ + { + text: "Tokenize", + want: Tokens{&token{Text: "Tokenize", Offset: 0}}, + }, + { + text: "Hello world", + want: Tokens{ + &token{Text: "Hello", Offset: 0}, + &token{Text: "world", Offset: 6}, + }, + }, + { + text: `Goodnight, +Irene +`, + want: Tokens{ + &token{Text: "Goodnight", Offset: 0}, + &token{Text: ",", Offset: 9}, + &token{Text: "Irene", Offset: 11}, + }, + }, + { + text: "Copyright © 2017 Yoyodyne, Inc.", + want: Tokens{ + &token{Text: "Copyright", Offset: 0}, + &token{Text: "©", Offset: 10}, + &token{Text: "2017", Offset: 13}, + &token{Text: "Yoyodyne", Offset: 18}, + &token{Text: ",", Offset: 26}, + &token{Text: "Inc", Offset: 28}, + &token{Text: ".", Offset: 31}, + }, + }, + } + + for _, tt := range tests { + if got := Tokenize(tt.text); !reflect.DeepEqual(got, tt.want) { + t.Errorf("Tokenize(%q) = %+v, want %+v", tt.text, got, tt.want) + } + } +} + +func TestTokenizer_GenerateHashes(t *testing.T) { + tests := []struct { + text string + sizeFactor int + wantHash []uint32 + wantRanges TokenRanges + }{ + { + text: "", + sizeFactor: 1, + wantHash: nil, + wantRanges: nil, + }, + { + text: "Hashes", + sizeFactor: 1, + wantHash: []uint32{408116689}, + wantRanges: TokenRanges{{Start: 0, End: 1}}, + }, + { + text: "hello world", + sizeFactor: 1, + wantHash: []uint32{222957957}, + wantRanges: TokenRanges{{Start: 0, End: 2}}, + }, + { + text: "Copyright © 2017 Yoyodyne, Inc.", + sizeFactor: 3, + wantHash: []uint32{2473816729, 966085113, 3025678301, 3199087486, 850352802, 1274745089}, + wantRanges: TokenRanges{ + {Start: 0, End: 2}, + {Start: 1, End: 3}, + {Start: 2, End: 4}, + {Start: 3, End: 5}, + {Start: 4, End: 6}, + {Start: 5, End: 7}, + }, + }, + } + + for _, tt := range tests { + hash := make(Hash) + toks := Tokenize(tt.text) + h, tr := toks.GenerateHashes(hash, len(toks)/tt.sizeFactor) + if !reflect.DeepEqual(h, tt.wantHash) { + t.Errorf("GenerateHashes(hash) = %v, want %v", h, tt.wantHash) + } + if !reflect.DeepEqual(tr, tt.wantRanges) { + t.Errorf("GenerateHashes(ranges) = %v, want %v", tr, tt.wantRanges) + } + } +} |