aboutsummaryrefslogtreecommitdiff
path: root/v2/document.go
diff options
context:
space:
mode:
Diffstat (limited to 'v2/document.go')
-rw-r--r--v2/document.go46
1 files changed, 4 insertions, 42 deletions
diff --git a/v2/document.go b/v2/document.go
index 73ccaab..6f3c1b5 100644
--- a/v2/document.go
+++ b/v2/document.go
@@ -30,25 +30,19 @@ type token struct {
Previous string // for the first token in a line, any previous text.
}
-// document is the representation of the input text for downstream filtering and matching.
-type document struct {
- Tokens []*token // ordered tokens of the document
- Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching
-}
-
type indexedToken struct {
Line int // line position of this token in the source
ID tokenID // identifier of the text in the dictionary
}
type indexedDocument struct {
+ Norm string // The normalized token sequence
Tokens []indexedToken // ordered tokens of the document
Matches Matches // these are matches identified while processing the original, untokenized text via regexp matching
f *frequencyTable // frequencies computed for this document
dict *dictionary // The corpus dictionary for this document
s *searchSet // The searchset for this document
runes []rune
- norm string // The normalized token sequence
}
func (d *indexedDocument) generateSearchSet(q int) {
@@ -101,58 +95,26 @@ func max(a, b int) int {
// AddContent incorporates the provided textual content into the classifier for
// matching. This will not modify the supplied content.
func (c *Classifier) AddContent(category, name, variant string, content []byte) {
- doc := tokenize(content)
+ doc := tokenize(content, c.dict, true)
c.addDocument(category, name, variant, doc)
}
// addDocument takes a textual document and incorporates it into the classifier for matching.
-func (c *Classifier) addDocument(category, name, variant string, doc *document) {
+func (c *Classifier) addDocument(category, name, variant string, id *indexedDocument) {
// For documents that are part of the corpus, we add them to the dictionary and
// compute their associated search data eagerly so they are ready for matching against
// candidates.
indexName := c.generateDocName(category, name, variant)
- id := c.generateIndexedDocument(doc, true)
- id.generateFrequencies()
id.generateSearchSet(c.q)
id.s.origin = indexName
c.docs[indexName] = id
}
-// generateIndexedDocument creates an indexedDocument from the supplied document. if addWords
-// is true, the classifier dictionary is updated with new tokens encountered in the document.
-func (c *Classifier) generateIndexedDocument(d *document, addWords bool) *indexedDocument {
- id := &indexedDocument{
- Tokens: make([]indexedToken, 0, len(d.Tokens)),
- dict: c.dict,
- Matches: d.Matches,
- }
-
- for _, t := range d.Tokens {
- var tokID tokenID
- if addWords {
- tokID = id.dict.add(t.Text)
- } else {
- tokID = id.dict.getIndex(t.Text)
- }
-
- id.Tokens = append(id.Tokens, indexedToken{
- Line: t.Line,
- ID: tokID,
- })
-
- }
- id.generateFrequencies()
- id.runes = diffWordsToRunes(id, 0, id.size())
- id.norm = id.normalized()
- return id
-}
-
// createTargetIndexedDocument creates an indexed document without adding the
// words to the classifier dictionary. This should be used for matching targets, not
// populating the corpus.
func (c *Classifier) createTargetIndexedDocument(in []byte) *indexedDocument {
- doc := tokenize(in)
- return c.generateIndexedDocument(doc, false)
+ return tokenize(in, c.dict, false)
}
func (c *Classifier) generateDocName(category, name, variant string) string {