Diffstat (limited to 'stringclassifier/searchset/tokenizer/tokenizer.go')
-rw-r--r--  stringclassifier/searchset/tokenizer/tokenizer.go  175
1 file changed, 175 insertions(+), 0 deletions(-)
diff --git a/stringclassifier/searchset/tokenizer/tokenizer.go b/stringclassifier/searchset/tokenizer/tokenizer.go
new file mode 100644
index 0000000..0f842d8
--- /dev/null
+++ b/stringclassifier/searchset/tokenizer/tokenizer.go
@@ -0,0 +1,175 @@
+// Copyright 2017 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package tokenizer converts text into a stream of tokens.
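+//
+// A typical flow (illustrative): Tokenize a string into Tokens, then call
+// GenerateHashes to checksum sliding windows of tokens for substring
+// matching.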
+package tokenizer
+
+import (
+ "bytes"
+ "fmt"
+ "hash/crc32"
+ "sort"
+ "unicode"
+ "unicode/utf8"
+)
+
+// token is a non-whitespace sequence (i.e., a word or punctuation mark) in
+// the original string. It is not meant for use outside of this package.
+type token struct {
+	Text   string
+	Offset int
+}
+
+// Tokens is a list of token objects.
+type Tokens []*token
+
+// newToken creates a new token object with an invalid (negative) offset,
+// which will be set before the token is used.
+func newToken() *token {
+	return &token{Offset: -1}
+}
+
+// Tokenize converts a string into a stream of tokens.
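+//
+// For example (illustrative; offsets are byte indices into the input):
+//
+//	Tokenize("Hello, world!") yields
+//	{"Hello", 0}, {",", 5}, {"world", 7}, {"!", 12}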
+func Tokenize(s string) (toks Tokens) {
+	tok := newToken()
+	for i := 0; i < len(s); {
+		r, size := utf8.DecodeRuneInString(s[i:])
+		switch {
+		case unicode.IsSpace(r):
+			if tok.Offset >= 0 {
+				toks = append(toks, tok)
+				tok = newToken()
+			}
+		case unicode.IsPunct(r):
+			if tok.Offset >= 0 {
+				toks = append(toks, tok)
+				tok = newToken()
+			}
+			toks = append(toks, &token{
+				Text:   string(r),
+				Offset: i,
+			})
+		default:
+			if tok.Offset == -1 {
+				tok.Offset = i
+			}
+			tok.Text += string(r)
+		}
+		i += size
+	}
+	if tok.Offset != -1 {
+		// Add any remaining token that wasn't yet included in the list.
+		toks = append(toks, tok)
+	}
+	return toks
+}
+
+// GenerateHashes generates hashes for substrings of "size" tokens. The
+// "stringifyTokens" call takes a long time to run, so we don't hash every
+// possible substring: windows advance by half of "size", skipping the
+// smaller substrings in between.
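+//
+// For example (illustrative): with 8 tokens and size 4, hashes are generated
+// for the token ranges [0, 4), [2, 6), and [4, 8).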
+func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) {
+	if size == 0 {
+		return nil, nil
+	}
+
+	var css []uint32
+	var tr TokenRanges
+	for offset := 0; offset+size <= len(t); offset += size / 2 {
+		var b bytes.Buffer
+		t.stringifyTokens(&b, offset, size)
+		cs := crc32.ChecksumIEEE(b.Bytes())
+		css = append(css, cs)
+		tr = append(tr, &TokenRange{offset, offset + size})
+		h.add(cs, offset, offset+size)
+		if size <= 1 {
+			break
+		}
+	}
+
+	return css, tr
+}
+
+// stringifyTokens serializes a sublist of tokens into a bytes buffer.
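+// For example, the tokens {"foo", ",", "bar"} serialize to "foo , bar"
+// (illustrative).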
+func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) {
+	for j := offset; j < offset+size; j++ {
+		if j != offset {
+			b.WriteRune(' ')
+		}
+		b.WriteString(t[j].Text)
+	}
+}
+
+// TokenRange indicates the range of tokens that map to a particular checksum.
+type TokenRange struct {
+	Start int
+	End   int
+}
+
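+// String formats the range as a half-open interval, e.g.
+// (&TokenRange{Start: 2, End: 6}).String() == "[2, 6)".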
+func (t *TokenRange) String() string {
+	return fmt.Sprintf("[%v, %v)", t.Start, t.End)
+}
+
+// TokenRanges is a list of TokenRange objects. The chance that two different
+// strings map to the same checksum is very small, but unfortunately isn't
+// zero, so we use this instead of making the assumption that they will all be
+// unique.
+type TokenRanges []*TokenRange
+
+func (t TokenRanges) Len() int      { return len(t) }
+func (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
+
+// Less orders ranges by Start, breaking ties by End so that identical ranges
+// sort adjacently, which the deduplication in CombineUnique relies on.
+func (t TokenRanges) Less(i, j int) bool {
+	return t[i].Start < t[j].Start || (t[i].Start == t[j].Start && t[i].End < t[j].End)
+}
+
+// CombineUnique returns the sorted combination of both token range lists,
+// with duplicate ranges removed.
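+//
+// For example (illustrative):
+//
+//	{[0, 4), [2, 6)} combined with {[2, 6), [4, 8)} yields
+//	{[0, 4), [2, 6), [4, 8)}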
+func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges {
+	if len(other) == 0 {
+		return t
+	}
+	if len(t) == 0 {
+		return other
+	}
+
+	// Copy both lists into a fresh slice so that sorting doesn't reorder
+	// the elements of the receiver's backing array.
+	cu := make(TokenRanges, 0, len(t)+len(other))
+	cu = append(cu, t...)
+	cu = append(cu, other...)
+	sort.Sort(cu)
+
+	// After sorting, duplicates are adjacent, so one pass removes them.
+	res := TokenRanges{cu[0]}
+	for prev, i := cu[0], 1; i < len(cu); i++ {
+		if prev.Start != cu[i].Start || prev.End != cu[i].End {
+			res = append(res, cu[i])
+			prev = cu[i]
+		}
+	}
+	return res
+}
+
+// Hash is a map of the hashes of a section of text to the token range covering that text.
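+//
+// Because two different texts can (rarely) share a checksum, each checksum
+// maps to a list of ranges rather than a single range, e.g. a checksum may
+// map to {[0, 4), [10, 14)} when two windows contain identical text
+// (illustrative).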
+type Hash map[uint32]TokenRanges
+
+// add associates a token range, [start, end), with a checksum.
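+// Adding an identical range for the same checksum is a no-op, while a new
+// range for an existing checksum (a collision or repeated text) is appended,
+// e.g. h.add(cs, 0, 4) followed by h.add(cs, 10, 14) yields
+// h[cs] == {[0, 4), [10, 14)} (illustrative).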
+func (h Hash) add(checksum uint32, start, end int) {
+	ntr := &TokenRange{Start: start, End: end}
+	if r, ok := h[checksum]; ok {
+		for _, tr := range r {
+			if tr.Start == ntr.Start && tr.End == ntr.End {
+				// The token range already exists at this
+				// checksum. No need to re-add it.
+				return
+			}
+		}
+	}
+	h[checksum] = append(h[checksum], ntr)
+}