4.1 Core Classes of the Analyzer
1. Analyzer
Lucene's built-in analyzers include SimpleAnalyzer, StopAnalyzer, WhitespaceAnalyzer, and StandardAnalyzer.
Main behaviors (compared in the sketch after this list):
KeywordAnalyzer performs no splitting at all: the whole input becomes a single token;
SimpleAnalyzer gives poor results for Chinese text;
StandardAnalyzer splits Chinese text into individual characters;
StopAnalyzer behaves much like SimpleAnalyzer, but also removes stop words;
WhitespaceAnalyzer splits on whitespace only.
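A minimal sketch comparing these analyzers on the same input, assuming the Lucene 3.5 API used throughout this article (the class name CompareAnalyzers is made up for illustration):

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CompareAnalyzers {
    public static void main(String[] args) throws Exception {
        String txt = "how are you thank you";
        Analyzer[] analyzers = {
                new KeywordAnalyzer(),                     // one token, unchanged
                new SimpleAnalyzer(Version.LUCENE_35),     // splits at non-letters, lower-cases
                new StopAnalyzer(Version.LUCENE_35),       // SimpleAnalyzer behavior + stop words
                new WhitespaceAnalyzer(Version.LUCENE_35), // splits at whitespace only
                new StandardAnalyzer(Version.LUCENE_35)    // grammar-based; single CJK characters
        };
        for (Analyzer a : analyzers) {
            TokenStream stream = a.tokenStream("content", new StringReader(txt));
            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
            System.out.print(a.getClass().getSimpleName() + ": ");
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
            }
            System.out.println();
        }
    }
}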
2. TokenStream
The stream obtained after the analyzer has done its work. It stores all the information about each token, and the individual token units can be read back from the TokenStream.
How the stream is produced: the Analyzer builds a Tokenizer over the incoming Reader and wraps it in a chain of TokenFilters, as sketched below.
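A minimal sketch of such a chain, reusing the Lucene 3.5 classes that appear later in this section (the class name ChainDemo is made up for illustration):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ChainDemo {
    public static void main(String[] args) throws Exception {
        Reader reader = new StringReader("How are you");
        // Reader -> Tokenizer (LetterTokenizer) -> LowerCaseFilter -> StopFilter
        TokenStream ts = new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.print("[" + cta + "]");  // prints [how][you]; "are" is a stop word
        }
    }
}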
The data stored in the stream is exposed through Attributes: typically the term text (CharTermAttribute), the start and end offsets (OffsetAttribute), the position increment (PositionIncrementAttribute), and the token type (TypeAttribute); section 4.2 below shows how to read them.
3. Tokenizer
Mainly responsible for receiving the character stream (Reader) and splitting it into tokens. Implementations include CharTokenizer and its subclasses LetterTokenizer, LowerCaseTokenizer, and WhitespaceTokenizer, as well as KeywordTokenizer and StandardTokenizer.
4. TokenFilter
Applies all kinds of filtering to the token units produced by the tokenizer, for example LowerCaseFilter, StopFilter, and the stemming filters.
5. Extension: the TokenFilter classes in detail:
(1) TokenFilter
A TokenStream whose input parameter is another TokenStream; subclasses must override the incrementToken() method, as in the minimal sketch below.
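A minimal sketch of a custom TokenFilter, assuming the Lucene 3.5 API (the class name MyUpperCaseFilter is made up for illustration):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Upper-cases the text of every token that passes through it
public final class MyUpperCaseFilter extends TokenFilter {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    protected MyUpperCaseFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) return false;  // no more tokens upstream
        String upper = termAtt.toString().toUpperCase();
        termAtt.setEmpty().append(upper);           // rewrite the term text in place
        return true;
    }
}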
(2) LowerCaseFilter
Converts the token text to lower case.
(3) FilteringTokenFilter
An abstract base class for TokenFilters that may remove tokens. A subclass implements the accept() method, returning a boolean that says whether the current token should be kept; incrementToken() calls accept() to decide whether to return the current token to the caller.
(4) StopFilter
Removes stop words from the token stream. Its accept() implementation:
protected boolean accept() {
    return !stopWords.contains(termAtt.buffer(), 0, termAtt.length());
}
(5) TypeTokenFilter
Removes tokens of the specified types from the token stream. Its accept() implementation:
protected boolean accept() {
    return useWhiteList == stopTypes.contains(typeAttribute.type());
}
(6) LetterTokenizer
A tokenizer that divides text at non-letter characters; in other words, it defines a token as a maximal string of adjacent letters.
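A quick sketch of that behavior, assuming the Lucene 3.5 API (the class name LetterTokenizerDemo is made up for illustration):

import java.io.StringReader;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class LetterTokenizerDemo {
    public static void main(String[] args) throws Exception {
        TokenStream ts = new LetterTokenizer(Version.LUCENE_35, new StringReader("I'm a student"));
        CharTermAttribute cta = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.print("[" + cta + "]");  // the apostrophe splits: [I][m][a][student]
        }
    }
}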
(7) Ordering of TokenFilters
The order of the filters matters: convert everything to lower case first, then filter stop words ("The" has to become "the" before it can match the "the" in the stop word set). In the opposite order the stop word "The" would not be removed; and without normalizing case first, the stop set would have to list far too many case variants of each word. The analyzer below chains the filters in the correct order:
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

public class MyStopAnalyzer extends Analyzer {
    private Set<Object> words;

    public MyStopAnalyzer() {}

    public MyStopAnalyzer(String[] words) {
        this.words = StopFilter.makeStopSet(Version.LUCENE_35, words, true);
        this.words.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // lower-case first, then remove stop words
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                this.words);
    }
}
4.2 Attribute

Token information in a TokenStream is read through Attribute classes. The utility method below registers four attributes and prints each token's position increment, term text, offsets, and type:
public static void displayAllTokenInfo(String str, Analyzer a) {
    try {
        TokenStream stream = a.tokenStream("content", new StringReader(str));
        // register the attributes we want to inspect
        PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
        OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
        TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
        while (stream.incrementToken()) {
            System.out.print(pia.getPositionIncrement() + ":");
            System.out.print(cta + "[" + oa.startOffset() + "-" + oa.endOffset() + "]-->" + ta.type() + "\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
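For example, to dump the tokens produced by the StandardAnalyzer (the output format is positionIncrement:term[startOffset-endOffset]-->type):

displayAllTokenInfo("how are you thank you", new StandardAnalyzer(Version.LUCENE_35));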
4.3 Custom Analyzers
1. A custom Stop analyzer
package com.mzsx.analyzer;

import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

public class MyStopAnalyzer extends Analyzer {
    private Set<Object> words;

    public MyStopAnalyzer() {}

    public MyStopAnalyzer(String[] words) {
        this.words = StopFilter.makeStopSet(Version.LUCENE_35, words, true);
        this.words.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)),
                this.words);
    }
}
@Test
public void myStopAnalyzer() {
    Analyzer a1 = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
    Analyzer a2 = new MyStopAnalyzer();
    String txt = "how are you thank you I hate you";
    AnalyzerUtils.displayAllTokenInfo(txt, a1);
}
2. A simple synonym index
First, an interface that supplies the synonyms for a given word:

package com.mzsx.analyzer;

public interface SamewordContext {
    public String[] getSamewords(String name);
}
A simple in-memory implementation:

package com.mzsx.analyzer;

import java.util.HashMap;
import java.util.Map;

public class SimpleSamewordContext implements SamewordContext {
    Map<String, String[]> maps = new HashMap<String, String[]>();

    public SimpleSamewordContext() {
        maps.put("中国", new String[]{"天朝", "大陆"});
        maps.put("我", new String[]{"咱", "俺"});
        maps.put("china", new String[]{"chinese"});
    }

    @Override
    public String[] getSamewords(String name) {
        return maps.get(name);
    }
}
Next, the TokenFilter that injects the synonyms into the stream at the same position as the original token:

package com.mzsx.analyzer;

import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

public class MySameTokenFilter extends TokenFilter {
    private CharTermAttribute cta = null;
    private PositionIncrementAttribute pia = null;
    private AttributeSource.State current;
    private Stack<String> sames = null;
    private SamewordContext samewordContext;

    protected MySameTokenFilter(TokenStream input, SamewordContext samewordContext) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
        this.samewordContext = samewordContext;
    }

    @Override
    public boolean incrementToken() throws IOException {
        // Emit any pending synonyms before pulling the next real token
        if (sames.size() > 0) {
            String str = sames.pop();
            restoreState(current);          // restore the state captured from the original token
            cta.setEmpty();
            cta.append(str);                // replace the term text with the synonym
            pia.setPositionIncrement(0);    // increment 0 = same position as the original token
            return true;
        }
        if (!this.input.incrementToken()) return false;
        if (addSames(cta.toString())) {
            current = captureState();       // remember this token's state for its synonyms
        }
        return true;
    }

    private boolean addSames(String name) {
        String[] sws = samewordContext.getSamewords(name);
        if (sws != null) {
            for (String str : sws) {
                sames.push(str);
            }
            return true;
        }
        return false;
    }
}
An Analyzer that chains the filter after the mmseg4j Chinese tokenizer:

package com.mzsx.analyzer;

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

public class MySameAnalyzer extends Analyzer {
    private SamewordContext samewordContext;

    public MySameAnalyzer(SamewordContext swc) {
        samewordContext = swc;
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        Dictionary dic = Dictionary.getInstance("D:/luceneIndex/dic");
        return new MySameTokenFilter(
                new MMSegTokenizer(new MaxWordSeg(dic), reader), samewordContext);
    }
}
Finally, a test that indexes one document and then searches for 咱, a synonym that never occurs literally in the original text:

@Test
public void testSameAnalyzer() {
    try {
        Analyzer a2 = new MySameAnalyzer(new SimpleSamewordContext());
        String txt = "我来自中国海南儋州第一中学,welcome to china !";
        Directory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Version.LUCENE_35, a2));
        Document doc = new Document();
        doc.add(new Field("content", txt, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();
        IndexSearcher searcher = new IndexSearcher(IndexReader.open(dir));
        TopDocs tds = searcher.search(new TermQuery(new Term("content", "咱")), 10);
        Document d = searcher.doc(tds.scoreDocs[0].doc);
        System.out.println("原文:" + d.get("content"));
        AnalyzerUtils.displayAllTokenInfo(txt, a2);
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
This article is reposted from the 51CTO blog of 梦朝思夕; original link: http://blog.51cto.com/qiangmzsx/1549902