diff options
Diffstat (limited to 'stringclassifier/searchset/tokenizer/tokenizer.go')
-rw-r--r-- | stringclassifier/searchset/tokenizer/tokenizer.go | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/stringclassifier/searchset/tokenizer/tokenizer.go b/stringclassifier/searchset/tokenizer/tokenizer.go new file mode 100644 index 0000000..0f842d8 --- /dev/null +++ b/stringclassifier/searchset/tokenizer/tokenizer.go @@ -0,0 +1,175 @@ +// Copyright 2017 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package tokenizer converts a text into a stream of tokens. +package tokenizer + +import ( + "bytes" + "fmt" + "hash/crc32" + "sort" + "unicode" + "unicode/utf8" +) + +// Token is a non-whitespace sequence (i.e., word or punctuation) in the +// original string. This is not meant for use outside of this package. +type token struct { + Text string + Offset int +} + +// Tokens is a list of Token objects. +type Tokens []*token + +// newToken creates a new token object with an invalid (negative) offset, which +// will be set before the token's used. +func newToken() *token { + return &token{Offset: -1} +} + +// Tokenize converts a string into a stream of tokens. +func Tokenize(s string) (toks Tokens) { + tok := newToken() + for i := 0; i < len(s); { + r, size := utf8.DecodeRuneInString(s[i:]) + switch { + case unicode.IsSpace(r): + if tok.Offset >= 0 { + toks = append(toks, tok) + tok = newToken() + } + case unicode.IsPunct(r): + if tok.Offset >= 0 { + toks = append(toks, tok) + tok = newToken() + } + toks = append(toks, &token{ + Text: string(r), + Offset: i, + }) + default: + if tok.Offset == -1 { + tok.Offset = i + } + tok.Text += string(r) + } + i += size + } + if tok.Offset != -1 { + // Add any remaining token that wasn't yet included in the list. + toks = append(toks, tok) + } + return toks +} + +// GenerateHashes generates hashes for "size" length substrings. The +// "stringifyTokens" call takes a long time to run, so not all substrings have +// hashes, i.e. we skip some of the smaller substrings. +func (t Tokens) GenerateHashes(h Hash, size int) ([]uint32, TokenRanges) { + if size == 0 { + return nil, nil + } + + var css []uint32 + var tr TokenRanges + for offset := 0; offset+size <= len(t); offset += size / 2 { + var b bytes.Buffer + t.stringifyTokens(&b, offset, size) + cs := crc32.ChecksumIEEE(b.Bytes()) + css = append(css, cs) + tr = append(tr, &TokenRange{offset, offset + size}) + h.add(cs, offset, offset+size) + if size <= 1 { + break + } + } + + return css, tr +} + +// stringifyTokens serializes a sublist of tokens into a bytes buffer. +func (t Tokens) stringifyTokens(b *bytes.Buffer, offset, size int) { + for j := offset; j < offset+size; j++ { + if j != offset { + b.WriteRune(' ') + } + b.WriteString(t[j].Text) + } +} + +// TokenRange indicates the range of tokens that map to a particular checksum. +type TokenRange struct { + Start int + End int +} + +func (t *TokenRange) String() string { + return fmt.Sprintf("[%v, %v)", t.Start, t.End) +} + +// TokenRanges is a list of TokenRange objects. The chance that two different +// strings map to the same checksum is very small, but unfortunately isn't +// zero, so we use this instead of making the assumption that they will all be +// unique. +type TokenRanges []*TokenRange + +func (t TokenRanges) Len() int { return len(t) } +func (t TokenRanges) Swap(i, j int) { t[i], t[j] = t[j], t[i] } +func (t TokenRanges) Less(i, j int) bool { return t[i].Start < t[j].Start } + +// CombineUnique returns the combination of both token ranges with no duplicates. +func (t TokenRanges) CombineUnique(other TokenRanges) TokenRanges { + if len(other) == 0 { + return t + } + if len(t) == 0 { + return other + } + + cu := append(t, other...) + sort.Sort(cu) + + if len(cu) == 0 { + return nil + } + + res := TokenRanges{cu[0]} + for prev, i := cu[0], 1; i < len(cu); i++ { + if prev.Start != cu[i].Start || prev.End != cu[i].End { + res = append(res, cu[i]) + prev = cu[i] + } + } + return res +} + +// Hash is a map of the hashes of a section of text to the token range covering that text. +type Hash map[uint32]TokenRanges + +// add associates a token range, [start, end], to a checksum. +func (h Hash) add(checksum uint32, start, end int) { + ntr := &TokenRange{Start: start, End: end} + if r, ok := h[checksum]; ok { + for _, tr := range r { + if tr.Start == ntr.Start && tr.End == ntr.End { + // The token range already exists at this + // checksum. No need to re-add it. + return + } + } + } + h[checksum] = append(h[checksum], ntr) +} |