之前的文章分析过,接收并处理索引请求的代码在 segmenter_worker.go 里:
// segmenterWorker loops forever, taking one indexing request at a time from
// engine.segmenterChannel, turning it into a keyword list, and forwarding the
// result to the indexer and ranker channels of the request's shard.
//
// For each request it:
//  1. collects token -> start positions, either by segmenting the document
//     content (segmenter enabled and content non-empty) or from the tokens
//     supplied by the caller;
//  2. adds the document labels as keywords without positions;
//  3. sends an indexerAddDocumentRequest and a rankerAddDocRequest that share
//     the same dealDocInfoChan.
func (engine *Engine) segmenterWorker() {
	for {
		// Blocks until the next indexing request arrives.
		request := <-engine.segmenterChannel

		tokensMap := make(map[string][]int)
		numTokens := 0
		if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
			// When the document body is not empty, prefer keywords obtained by
			// segmenting the content.
			segments := engine.segmenter.Segment([]byte(request.data.Content))
			for _, segment := range segments {
				token := segment.Token().Text()
				if !engine.stopTokens.IsStopToken(token) {
					tokensMap[token] = append(tokensMap[token], segment.Start())
				}
			}
			// The token count includes every segment, stop tokens as well.
			numTokens = len(segments)
		} else {
			// Otherwise load the keywords supplied by the user.
			for _, t := range request.data.Tokens {
				if !engine.stopTokens.IsStopToken(t.Text) {
					tokensMap[t.Text] = t.Locations
				}
			}
			numTokens = len(request.data.Tokens)
		}

		// Add the document labels, which do not come from segmentation. Stop
		// tokens are filtered only when the segmenter is in use.
		for _, label := range request.data.Labels {
			if engine.initOptions.NotUsingSegmenter || !engine.stopTokens.IsStopToken(label) {
				// Only add the label when the keyword is not present yet;
				// overwriting an existing entry would discard the positions
				// (and therefore the frequency) already collected for it.
				if _, found := tokensMap[label]; !found {
					tokensMap[label] = []int{}
				}
			}
		}

		keywords := make([]types.KeywordIndex, 0, len(tokensMap))
		for text, starts := range tokensMap {
			keywords = append(keywords, types.KeywordIndex{
				Text: text,
				// Keywords that were added only as labels have no positions,
				// so their frequency is 0 and they do not take part in the
				// tf-idf computation.
				Frequency: float32(len(starts)),
				Starts:    starts,
			})
		}

		// One buffered channel is handed to both requests below.
		// NOTE(review): presumably the indexer signals on it when it is done
		// with the document so the ranker can proceed; confirm against the
		// indexer worker.
		dealDocInfoChan := make(chan bool, 1)

		indexerRequest := indexerAddDocumentRequest{
			document: &types.DocumentIndex{
				DocId:       request.docId,
				TokenLength: float32(numTokens),
				Keywords:    keywords,
			},
		}
		indexerRequest.dealDocInfoChan = dealDocInfoChan
		engine.indexerAddDocumentChannels[request.shard] <- indexerRequest

		rankerRequest := rankerAddDocRequest{
			docId:           request.docId,
			fields:          request.data.Fields,
			dealDocInfoChan: dealDocInfoChan,
		}
		engine.rankerAddDocChannels[request.shard] <- rankerRequest
	}
}
上面代码的作用是统计每个词的词频和出现位置(注意:tag 也会作为可搜索的关键词加入索引,但仅以标签形式加入的词位置列表为空、词频为 0,因而不参与 tf-idf 计算),并把结果封装为 indexerRequest,发送给 engine.indexerAddDocumentChannels[request.shard]。
此外,上面代码末尾构造并发送 rankerRequest 的部分(原文中以红色标出)是在为文档评分做准备,对应的处理逻辑在 engine/ranker_worker.go:
func (engine *Engine) rankerAddDocWorker(shard int) { for { request := <-engine.rankerAddDocChannels[shard] //关键 docInfo := engine.rankers[shard].AddDoc(request.docId, request.fields, request.dealDocInfoChan) // save if engine.initOptions.UsePersistentStorage { engine.persistentStorageIndexDocumentChannels[shard] <- persistentStorageIndexDocumentRequest{ typ: "info", docId: request.docId, docInfo: docInfo, } } } }
AddDoc 做的事情很简单:等待索引器处理完成后,把 docId 对应的 fields 信息存储起来,供搜索结果排序(rank)评分时使用:
// 给某个文档添加评分字段 func (ranker *Ranker) AddDoc(docId uint64, fields interface{}, dealDocInfoChan <-chan bool) *types.DocInfo { if ranker.initialized == false { log.Fatal("排序器尚未初始化") } <-dealDocInfoChan // 等待索引器处理完成 ranker.DocInfosShard.Lock() defer ranker.DocInfosShard.Unlock() if _, found := ranker.DocInfosShard.DocInfos[docId]; !found { ranker.DocInfosShard.DocInfos[docId] = new(types.DocInfo) ranker.DocInfosShard.NumDocuments++ } ranker.DocInfosShard.DocInfos[docId].Fields = fields return ranker.DocInfosShard.DocInfos[docId] }
本文转自张昺华-sky博客园博客,原文链接:http://www.cnblogs.com/bonelee/p/6582369.html,如需转载请自行联系原作者