forked from gitea/gitea
1
0
Fork 0
gitea/vendor/github.com/blevesearch/bleve/v2/index/upsidedown/analysis.go

130 lines
3.8 KiB
Go

// Copyright (c) 2015 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package upsidedown
import (
index "github.com/blevesearch/bleve_index_api"
)
type IndexRow interface {
KeySize() int
KeyTo([]byte) (int, error)
Key() []byte
ValueSize() int
ValueTo([]byte) (int, error)
Value() []byte
}
type AnalysisResult struct {
DocID string
Rows []IndexRow
}
func (udc *UpsideDownCouch) Analyze(d index.Document) *AnalysisResult {
return udc.analyze(d)
}
func (udc *UpsideDownCouch) analyze(d index.Document) *AnalysisResult {
rv := &AnalysisResult{
DocID: d.ID(),
Rows: make([]IndexRow, 0, 100),
}
docIDBytes := []byte(d.ID())
// track our back index entries
backIndexStoredEntries := make([]*BackIndexStoreEntry, 0)
// information we collate as we merge fields with same name
fieldTermFreqs := make(map[uint16]index.TokenFrequencies)
fieldLengths := make(map[uint16]int)
fieldIncludeTermVectors := make(map[uint16]bool)
fieldNames := make(map[uint16]string)
analyzeField := func(field index.Field, storable bool) {
fieldIndex, newFieldRow := udc.fieldIndexOrNewRow(field.Name())
if newFieldRow != nil {
rv.Rows = append(rv.Rows, newFieldRow)
}
fieldNames[fieldIndex] = field.Name()
if field.Options().IsIndexed() {
field.Analyze()
fieldLength := field.AnalyzedLength()
tokenFreqs := field.AnalyzedTokenFrequencies()
existingFreqs := fieldTermFreqs[fieldIndex]
if existingFreqs == nil {
fieldTermFreqs[fieldIndex] = tokenFreqs
} else {
existingFreqs.MergeAll(field.Name(), tokenFreqs)
fieldTermFreqs[fieldIndex] = existingFreqs
}
fieldLengths[fieldIndex] += fieldLength
fieldIncludeTermVectors[fieldIndex] = field.Options().IncludeTermVectors()
}
if storable && field.Options().IsStored() {
rv.Rows, backIndexStoredEntries = udc.storeField(docIDBytes, field, fieldIndex, rv.Rows, backIndexStoredEntries)
}
}
// walk all the fields, record stored fields now
// place information about indexed fields into map
// this collates information across fields with
// same names (arrays)
d.VisitFields(func(field index.Field) {
analyzeField(field, true)
})
if d.HasComposite() {
for fieldIndex, tokenFreqs := range fieldTermFreqs {
// see if any of the composite fields need this
d.VisitComposite(func(field index.CompositeField) {
field.Compose(fieldNames[fieldIndex], fieldLengths[fieldIndex], tokenFreqs)
})
}
d.VisitComposite(func(field index.CompositeField) {
analyzeField(field, false)
})
}
rowsCapNeeded := len(rv.Rows) + 1
for _, tokenFreqs := range fieldTermFreqs {
rowsCapNeeded += len(tokenFreqs)
}
rv.Rows = append(make([]IndexRow, 0, rowsCapNeeded), rv.Rows...)
backIndexTermsEntries := make([]*BackIndexTermsEntry, 0, len(fieldTermFreqs))
// walk through the collated information and process
// once for each indexed field (unique name)
for fieldIndex, tokenFreqs := range fieldTermFreqs {
fieldLength := fieldLengths[fieldIndex]
includeTermVectors := fieldIncludeTermVectors[fieldIndex]
// encode this field
rv.Rows, backIndexTermsEntries = udc.indexField(docIDBytes, includeTermVectors, fieldIndex, fieldLength, tokenFreqs, rv.Rows, backIndexTermsEntries)
}
// build the back index row
backIndexRow := NewBackIndexRow(docIDBytes, backIndexTermsEntries, backIndexStoredEntries)
rv.Rows = append(rv.Rows, backIndexRow)
return rv
}