跟益达学Solr5之拼音分词[改进版]-阿里云开发者社区

开发者社区> 人工智能> 正文
登录阅读全文

跟益达学Solr5之拼音分词[改进版]

简介:

   之前一篇介绍过如何自定义实现拼音分词器,不过当初只考虑了全拼这种情况,且有些BUG,趁着抗日胜利70周年阅兵3天假期有时间,又把当初的代码拿起来进行了改进,改进点包括支持全拼,简拼以及全拼+简拼,支持汉字数字是否NGram处理的可配置,支持NGram长度范围的可配置等,特此更新此篇进行分享!如有不妥之处,还望不吝指正!

      废话不多说,直接上代码:

Java代码  收藏代码
  1. import java.io.IOException;  
  2. import java.util.ArrayList;  
  3. import java.util.Collection;  
  4. import java.util.Iterator;  
  5.   
  6. import org.apache.lucene.analysis.TokenFilter;  
  7. import org.apache.lucene.analysis.TokenStream;  
  8. import org.apache.lucene.analysis.pinyin.utils.Constant;  
  9. import org.apache.lucene.analysis.pinyin.utils.Pinyin4jUtil;  
  10. import org.apache.lucene.analysis.pinyin.utils.StringUtils;  
  11. import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  
  12. import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;  
  13. import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;  
  14. import org.apache.lucene.analysis.tokenattributes.TypeAttribute;  
  15. /** 
  16.  * 拼音过滤器[负责将汉字转换为拼音] 
  17.  * @author Lanxiaowei 
  18.  * 
  19.  */  
  20. public class PinyinTokenFilter extends TokenFilter {  
  21.     /**是否输出原中文*/  
  22.     private boolean isOutChinese;  
  23.     /**是否只转换简拼*/  
  24.     private boolean shortPinyin;  
  25.     /**是否转换全拼+简拼*/  
  26.     private boolean pinyinAll;  
  27.     /**中文词组长度过滤,默认超过2位长度的中文才转换拼音*/  
  28.     private int minTermLength;  
  29.   
  30.     /**词元输入缓存*/  
  31.     private char[] curTermBuffer;  
  32.     /**词元输入长度*/  
  33.     private int curTermLength;  
  34.   
  35.     private final CharTermAttribute termAtt = (CharTermAttribute) addAttribute(CharTermAttribute.class);  
  36.     /**位置增量属性*/  
  37.     private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);  
  38.     private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);  
  39.     /**当前输入是否已输出*/  
  40.     private boolean hasCurOut;  
  41.     /**拼音结果集*/  
  42.     private Collection<String> terms;  
  43.     /**拼音结果集迭代器*/  
  44.     private Iterator<String> termIte;  
  45.   
  46.     public PinyinTokenFilter(TokenStream input) {  
  47.         this(input,Constant.DEFAULT_MIN_TERM_LRNGTH);  
  48.     }  
  49.   
  50.     public PinyinTokenFilter(TokenStream input, int minTermLength) {  
  51.         this(input, Constant.DEFAULT_SHORT_PINYIN, Constant.DEFAULT_PINYIN_ALL,minTermLength);  
  52.     }  
  53.   
  54.     public PinyinTokenFilter(TokenStream input, boolean shortPinyin) {  
  55.         this(input, shortPinyin, Constant.DEFAULT_PINYIN_ALL);  
  56.     }  
  57.       
  58.     public PinyinTokenFilter(TokenStream input, boolean shortPinyin,boolean pinyinAll) {  
  59.         this(input, shortPinyin,pinyinAll, Constant.DEFAULT_MIN_TERM_LRNGTH);  
  60.     }  
  61.       
  62.     public PinyinTokenFilter(TokenStream input, boolean shortPinyin,boolean pinyinAll,int minTermLength) {  
  63.         this(input, shortPinyin,pinyinAll,Constant.DEFAULT_OUT_CHINESE, minTermLength);  
  64.     }  
  65.   
  66.     public PinyinTokenFilter(TokenStream input, boolean shortPinyin,boolean pinyinAll,  
  67.             boolean isOutChinese,int minTermLength) {  
  68.         super(input);  
  69.         this.minTermLength = minTermLength;  
  70.         if (this.minTermLength < 1) {  
  71.             this.minTermLength = 1;  
  72.         }  
  73.         this.isOutChinese = isOutChinese;  
  74.         this.shortPinyin = shortPinyin;  
  75.         this.pinyinAll = pinyinAll;  
  76.         // 偏移量属性  
  77.         addAttribute(OffsetAttribute.class);   
  78.     }  
  79.       
  80.     @Override  
  81.     public final boolean incrementToken() throws IOException {  
  82.         while (true) {  
  83.             // 开始处理或上一输入词元已被处理完成  
  84.             if (this.curTermBuffer == null) {  
  85.                 // 获取下一词元输入  
  86.                 if (!this.input.incrementToken()) {   
  87.                     // 没有后继词元输入,处理完成,返回false,结束上层调用  
  88.                     return false;   
  89.                 }  
  90.                 // 缓存词元输入  
  91.                 this.curTermBuffer = ((char[]) this.termAtt.buffer().clone());  
  92.                 this.curTermLength = this.termAtt.length();  
  93.             }  
  94.             String chinese = this.termAtt.toString();  
  95.             // 处理原输入词元  
  96.             if ((this.isOutChinese) && (!this.hasCurOut) && (this.termIte == null)) {  
  97.                 // 准许输出原中文词元且当前没有输出原输入词元且还没有处理拼音结果集  
  98.                 // 标记以保证下次循环不会输出  
  99.                 this.hasCurOut = true;   
  100.                 // 写入原输入词元  
  101.                 this.termAtt.copyBuffer(this.curTermBuffer, 0,  
  102.                         this.curTermLength);  
  103.                 this.posIncrAtt.setPositionIncrement(this.posIncrAtt.getPositionIncrement());  
  104.                 this.typeAtt.setType(StringUtils.isNumeric(chinese)? "numeric_original" :   
  105.                     (StringUtils.containsChinese(chinese)?"chinese_original" : "normal_word"));  
  106.                 return true;  
  107.             }  
  108.               
  109.             String type = this.typeAtt.type();  
  110.             // 若包含中文且中文字符长度不小于限定的最小长度minTermLength  
  111.             if (StringUtils.chineseCharCount(chinese) >= this.minTermLength) {  
  112.                 // 如果需要全拼+简拼  
  113.                 if(this.pinyinAll) {  
  114.                     Collection<String> quanpinColl = Pinyin4jUtil.getPinyinCollection(chinese);  
  115.                     quanpinColl.addAll(Pinyin4jUtil.getPinyinShortCollection(chinese));  
  116.                     this.terms = quanpinColl;  
  117.                 } else {  
  118.                     // 简拼 or 全拼,二选一  
  119.                     this.terms = this.shortPinyin ?   
  120.                             Pinyin4jUtil.getPinyinShortCollection(chinese) :   
  121.                             Pinyin4jUtil.getPinyinCollection(chinese);  
  122.                 }  
  123.                   
  124.                 if (this.terms != null) {  
  125.                     this.termIte = this.terms.iterator();  
  126.                 }  
  127.             } else {  
  128.                 if(null != type && ("numeric_original".equals(type) ||  
  129.                         "normal_word".equals(type))) {  
  130.                     Collection<String> coll = new ArrayList<String>();  
  131.                     coll.add(chinese);  
  132.                     this.terms = coll;  
  133.                     if (this.terms != null) {  
  134.                         this.termIte = this.terms.iterator();  
  135.                     }  
  136.                 }  
  137.             }  
  138.             if (this.termIte != null) {  
  139.                 // 有拼音结果集且未处理完成  
  140.                 while (this.termIte.hasNext()) {   
  141.                     String pinyin = this.termIte.next();  
  142.                     this.termAtt.copyBuffer(pinyin.toCharArray(), 0, pinyin.length());  
  143.                     //同义词的原理  
  144.                     this.posIncrAtt.setPositionIncrement(0);  
  145.                     this.typeAtt.setType(this.shortPinyin ? "short_pinyin" : "pinyin");  
  146.                     return true;  
  147.                 }  
  148.             }  
  149.             // 没有中文或转换拼音失败,不用处理,  
  150.             // 清理缓存,下次取新词元  
  151.             this.curTermBuffer = null;  
  152.             this.termIte = null;  
  153.             this.hasCurOut = false;   
  154.         }  
  155.     }  
  156.   
  157.     @Override  
  158.     public void reset() throws IOException {  
  159.         super.reset();  
  160.     }  
  161. }  

   

Java代码  收藏代码
  1. import java.io.IOException;  
  2.   
  3. import org.apache.lucene.analysis.TokenFilter;  
  4. import org.apache.lucene.analysis.TokenStream;  
  5. import org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;  
  6. import org.apache.lucene.analysis.pinyin.utils.Constant;  
  7. import org.apache.lucene.analysis.pinyin.utils.StringUtils;  
  8. import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  
  9. import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;  
  10. import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;  
  11. import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;  
  12. import org.apache.lucene.analysis.tokenattributes.TypeAttribute;  
  13. import org.apache.lucene.analysis.util.CharacterUtils;  
  14.   
  15. /** 
  16.  * 对转换后的拼音进行NGram处理的TokenFilter 
  17.  *  
  18.  * @author Lanxiaowei 
  19.  *  
  20.  */  
  21. @SuppressWarnings("unused")  
  22. public class PinyinNGramTokenFilter extends TokenFilter {  
  23.     private char[] curTermBuffer;  
  24.     private int curTermLength;  
  25.     private int curCodePointCount;  
  26.     private int curGramSize;  
  27.     private int curPos;  
  28.     private int curPosInc, curPosLen;  
  29.     private int tokStart;  
  30.     private int tokEnd;  
  31.     private boolean hasIllegalOffsets;  
  32.   
  33.     private int minGram;  
  34.     private int maxGram;  
  35.     /** 是否需要对中文进行NGram[默认为false] */  
  36.     private final boolean nGramChinese;  
  37.     /** 是否需要对纯数字进行NGram[默认为false] */  
  38.     private final boolean nGramNumber;  
  39.   
  40.     private final CharacterUtils charUtils;  
  41.     private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);  
  42.     private PositionIncrementAttribute posIncAtt;  
  43.     private PositionLengthAttribute posLenAtt;  
  44.     private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);  
  45.     private TypeAttribute typeAtt;  
  46.   
  47.     public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram,  
  48.             boolean nGramChinese,boolean nGramNumber) {  
  49.         super(new CodepointCountFilter(input, minGram, Integer.MAX_VALUE));  
  50.         this.charUtils = CharacterUtils.getInstance();  
  51.         if (minGram < 1) {  
  52.             throw new IllegalArgumentException(  
  53.                     "minGram must be greater than zero");  
  54.         }  
  55.         if (minGram > maxGram) {  
  56.             throw new IllegalArgumentException(  
  57.                     "minGram must not be greater than maxGram");  
  58.         }  
  59.         this.minGram = minGram;  
  60.         this.maxGram = maxGram;  
  61.         this.nGramChinese = nGramChinese;  
  62.         this.nGramNumber = nGramNumber;  
  63.           
  64.         this.termAtt = addAttribute(CharTermAttribute.class);  
  65.         this.offsetAtt = addAttribute(OffsetAttribute.class);  
  66.         this.typeAtt = addAttribute(TypeAttribute.class);  
  67.         this.posIncAtt = addAttribute(PositionIncrementAttribute.class);  
  68.         this.posLenAtt = addAttribute(PositionLengthAttribute.class);  
  69.     }  
  70.   
  71.     public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram,  
  72.             boolean nGramChinese) {  
  73.         this(input, minGram, maxGram, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);  
  74.     }  
  75.       
  76.     public PinyinNGramTokenFilter(TokenStream input, int minGram, int maxGram) {  
  77.         this(input, minGram, maxGram, Constant.DEFAULT_NGRAM_CHINESE);  
  78.     }  
  79.       
  80.     public PinyinNGramTokenFilter(TokenStream input, int minGram) {  
  81.         this(input, minGram, Constant.DEFAULT_MAX_GRAM);  
  82.     }  
  83.       
  84.     public PinyinNGramTokenFilter(TokenStream input) {  
  85.         this(input, Constant.DEFAULT_MIN_GRAM);  
  86.     }  
  87.   
  88.     @Override  
  89.     public final boolean incrementToken() throws IOException {  
  90.         while (true) {  
  91.             if (curTermBuffer == null) {  
  92.                 if (!input.incrementToken()) {  
  93.                     return false;  
  94.                 }  
  95.                 String type = this.typeAtt.type();  
  96.                 if(null != type && "normal_word".equals(type)) {  
  97.                     return true;  
  98.                 }  
  99.                 if(null != type && "numeric_original".equals(type)) {  
  100.                     return true;  
  101.                 }  
  102.                 if(null != type && "chinese_original".equals(type)) {  
  103.                     return true;  
  104.                 }  
  105.                 if ((!this.nGramNumber)  
  106.                         && (StringUtils.isNumeric(this.termAtt.toString()))) {  
  107.                     return true;  
  108.                 }  
  109.                 if ((!this.nGramChinese)  
  110.                         && (StringUtils.containsChinese(this.termAtt.toString()))) {  
  111.                     return true;  
  112.                 }  
  113.                 curTermBuffer = termAtt.buffer().clone();  
  114.                 curTermLength = termAtt.length();  
  115.                 curCodePointCount = charUtils.codePointCount(termAtt);  
  116.                 curGramSize = minGram;  
  117.                 curPos = 0;  
  118.                 curPosInc = posIncAtt.getPositionIncrement();  
  119.                 curPosLen = posLenAtt.getPositionLength();  
  120.                 tokStart = offsetAtt.startOffset();  
  121.                 tokEnd = offsetAtt.endOffset();  
  122.   
  123.                 hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;  
  124.             }  
  125.   
  126.             if (curGramSize > maxGram  
  127.                     || (curPos + curGramSize) > curCodePointCount) {  
  128.                 ++curPos;  
  129.                 curGramSize = minGram;  
  130.             }  
  131.             if ((curPos + curGramSize) <= curCodePointCount) {  
  132.                 clearAttributes();  
  133.                 final int start = charUtils.offsetByCodePoints(curTermBuffer,  
  134.                         0, curTermLength, 0, curPos);  
  135.                 final int end = charUtils.offsetByCodePoints(curTermBuffer, 0,  
  136.                         curTermLength, start, curGramSize);  
  137.                 termAtt.copyBuffer(curTermBuffer, start, end - start);  
  138.                 posIncAtt.setPositionIncrement(curPosInc);  
  139.                 curPosInc = 0;  
  140.                 posLenAtt.setPositionLength(curPosLen);  
  141.                 offsetAtt.setOffset(tokStart, tokEnd);  
  142.                 curGramSize++;  
  143.                 return true;  
  144.             }  
  145.             curTermBuffer = null;  
  146.         }  
  147.     }  
  148.   
  149.     @Override  
  150.     public void reset() throws IOException {  
  151.         super.reset();  
  152.         curTermBuffer = null;  
  153.     }  
  154. }  

   

Java代码  收藏代码
  1. import java.io.IOException;  
  2.   
  3. import org.apache.lucene.analysis.TokenFilter;  
  4. import org.apache.lucene.analysis.TokenStream;  
  5. import org.apache.lucene.analysis.pinyin.utils.Constant;  
  6. import org.apache.lucene.analysis.pinyin.utils.StringUtils;  
  7. import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  
  8. import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;  
  9. import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;  
  10. import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;  
  11. import org.apache.lucene.analysis.tokenattributes.TypeAttribute;  
  12. import org.apache.lucene.analysis.util.CharacterUtils;  
  13.   
  14. /** 
  15.  * 对转换后的拼音进行EdgeNGram处理的TokenFilter 
  16.  *  
  17.  * @author Lanxiaowei 
  18.  *  
  19.  */  
  20. public class PinyinEdgeNGramTokenFilter extends TokenFilter {  
  21.     private final int minGram;  
  22.     private final int maxGram;  
  23.     /** 是否需要对中文进行NGram[默认为false] */  
  24.     private final boolean nGramChinese;  
  25.     /** 是否需要对纯数字进行NGram[默认为false] */  
  26.     private final boolean nGramNumber;  
  27.     private final CharacterUtils charUtils;  
  28.     private char[] curTermBuffer;  
  29.     private int curTermLength;  
  30.     private int curCodePointCount;  
  31.     private int curGramSize;  
  32.     private int tokStart;  
  33.     private int tokEnd;  
  34.     private int savePosIncr;  
  35.     private int savePosLen;  
  36.   
  37.     private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);  
  38.     private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);  
  39.     private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);  
  40.     private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);  
  41.     private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);  
  42.       
  43.     public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram,  
  44.             int maxGram, boolean nGramChinese, boolean nGramNumber) {  
  45.         super(input);  
  46.         if (minGram < 1) {  
  47.             throw new IllegalArgumentException(  
  48.                     "minGram must be greater than zero");  
  49.         }  
  50.   
  51.         if (minGram > maxGram) {  
  52.             throw new IllegalArgumentException(  
  53.                     "minGram must not be greater than maxGram");  
  54.         }  
  55.   
  56.         this.charUtils = CharacterUtils.getInstance();  
  57.         this.minGram = minGram;  
  58.         this.maxGram = maxGram;  
  59.         this.nGramChinese = nGramChinese;  
  60.         this.nGramNumber = nGramNumber;  
  61.     }  
  62.       
  63.     public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram,  
  64.             int maxGram, boolean nGramChinese) {  
  65.         this(input, minGram, maxGram, nGramChinese, Constant.DEFAULT_NGRAM_NUMBER);  
  66.     }  
  67.       
  68.     public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram,  
  69.             int maxGram) {  
  70.         this(input, minGram, maxGram, Constant.DEFAULT_NGRAM_CHINESE);  
  71.     }  
  72.       
  73.     public PinyinEdgeNGramTokenFilter(TokenStream input, int minGram) {  
  74.         this(input, minGram, Constant.DEFAULT_MAX_GRAM);  
  75.     }  
  76.       
  77.     public PinyinEdgeNGramTokenFilter(TokenStream input) {  
  78.         this(input, Constant.DEFAULT_MIN_GRAM);  
  79.     }  
  80.   
  81.     @Override  
  82.     public final boolean incrementToken() throws IOException {  
  83.         while (true) {  
  84.             if (curTermBuffer == null) {  
  85.                 if (!input.incrementToken()) {  
  86.                     return false;  
  87.                 }  
  88.                 String type = this.typeAtt.type();  
  89.                 if(null != type && "normal_word".equals(type)) {  
  90.                     return true;  
  91.                 }  
  92.                 if(null != type && "numeric_original".equals(type)) {  
  93.                     return true;  
  94.                 }  
  95.                 if(null != type && "chinese_original".equals(type)) {  
  96.                     return true;  
  97.                 }  
  98.                 if ((!this.nGramNumber)  
  99.                         && (StringUtils.isNumeric(this.termAtt.toString()))) {  
  100.                     return true;  
  101.                 }  
  102.                 if ((!this.nGramChinese)  
  103.                         && (StringUtils.containsChinese(this.termAtt.toString()))) {  
  104.                     return true;  
  105.                 }  
  106.                 curTermBuffer = termAtt.buffer().clone();  
  107.                 curTermLength = termAtt.length();  
  108.                 curCodePointCount = charUtils.codePointCount(termAtt);  
  109.                 curGramSize = minGram;  
  110.                 tokStart = offsetAtt.startOffset();  
  111.                 tokEnd = offsetAtt.endOffset();  
  112.                 savePosIncr += posIncrAtt.getPositionIncrement();  
  113.                 savePosLen = posLenAtt.getPositionLength();  
  114.             }  
  115.             if (curGramSize <= maxGram) {   
  116.                 if (curGramSize <= curCodePointCount) {   
  117.                     clearAttributes();  
  118.                     offsetAtt.setOffset(tokStart, tokEnd);  
  119.                     if (curGramSize == minGram) {  
  120.                         posIncrAtt.setPositionIncrement(savePosIncr);  
  121.                         savePosIncr = 0;  
  122.                     } else {  
  123.                         posIncrAtt.setPositionIncrement(0);  
  124.                     }  
  125.                     posLenAtt.setPositionLength(savePosLen);  
  126.                     final int charLength = charUtils.offsetByCodePoints(  
  127.                             curTermBuffer, 0, curTermLength, 0, curGramSize);  
  128.                     termAtt.copyBuffer(curTermBuffer, 0, charLength);  
  129.                     curGramSize++;  
  130.                     return true;  
  131.                 }  
  132.             }  
  133.             curTermBuffer = null;  
  134.         }  
  135.     }  
  136.   
  137.     @Override  
  138.     public void reset() throws IOException {  
  139.         super.reset();  
  140.         curTermBuffer = null;  
  141.         savePosIncr = 0;  
  142.     }  
  143. }  

   

Java代码  收藏代码
  1. package org.apache.lucene.analysis.pinyin.lucene5;  
  2.   
  3. import java.io.BufferedReader;  
  4. import java.io.Reader;  
  5. import java.io.StringReader;  
  6.   
  7. import org.apache.lucene.analysis.Analyzer;  
  8. import org.apache.lucene.analysis.TokenStream;  
  9. import org.apache.lucene.analysis.Tokenizer;  
  10. import org.apache.lucene.analysis.pinyin.utils.Constant;  
  11. import org.wltea.analyzer.lucene.IKTokenizer;  
  12. /** 
  13.  * 自定义拼音分词器 
  14.  * @author Lanxiaowei 
  15.  * 
  16.  */  
  17. public class PinyinAnalyzer extends Analyzer {  
  18.     private int minGram;  
  19.     private int maxGram;  
  20.     private boolean useSmart;  
  21.     /** 是否需要对中文进行NGram[默认为false] */  
  22.     private boolean nGramChinese;  
  23.     /** 是否需要对纯数字进行NGram[默认为false] */  
  24.     private boolean nGramNumber;  
  25.     /**是否开启edgesNGram模式*/  
  26.     private boolean edgesNGram;  
  27.       
  28.     public PinyinAnalyzer() {  
  29.         this(Constant.DEFAULT_IK_USE_SMART);  
  30.     }  
  31.       
  32.     public PinyinAnalyzer(boolean useSmart) {  
  33.         this(Constant.DEFAULT_MIN_GRAM, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM, useSmart,Constant.DEFAULT_NGRAM_CHINESE);  
  34.     }  
  35.       
  36.     public PinyinAnalyzer(int minGram) {  
  37.         this(minGram, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM, Constant.DEFAULT_IK_USE_SMART, Constant.DEFAULT_NGRAM_CHINESE,Constant.DEFAULT_NGRAM_NUMBER);  
  38.     }  
  39.   
  40.     public PinyinAnalyzer(int minGram,boolean useSmart) {  
  41.         this(minGram, Constant.DEFAULT_MAX_GRAM, Constant.DEFAULT_EDGES_GRAM, useSmart,Constant.DEFAULT_NGRAM_CHINESE);  
  42.     }  
  43.       
  44.     public PinyinAnalyzer(int minGram, int maxGram) {  
  45.         this(minGram, maxGram, Constant.DEFAULT_EDGES_GRAM);  
  46.     }  
  47.       
  48.     public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram) {  
  49.         this(minGram, maxGram, edgesNGram, Constant.DEFAULT_IK_USE_SMART);  
  50.     }  
  51.       
  52.     public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram,boolean useSmart) {  
  53.         this(minGram, maxGram, edgesNGram, useSmart,Constant.DEFAULT_NGRAM_CHINESE);  
  54.     }  
  55.   
  56.     public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram,boolean useSmart,  
  57.             boolean nGramChinese) {  
  58.         this(minGram, maxGram, edgesNGram, useSmart,nGramChinese,Constant.DEFAULT_NGRAM_NUMBER);  
  59.     }  
  60.       
  61.     public PinyinAnalyzer(int minGram, int maxGram,boolean edgesNGram,boolean useSmart,  
  62.             boolean nGramChinese,boolean nGramNumber) {  
  63.         super();  
  64.         this.minGram = minGram;  
  65.         this.maxGram = maxGram;  
  66.         this.edgesNGram = edgesNGram;  
  67.         this.useSmart = useSmart;  
  68.         this.nGramChinese = nGramChinese;  
  69.         this.nGramNumber = nGramNumber;  
  70.     }  
  71.   
  72.     @Override  
  73.     protected TokenStreamComponents createComponents(String fieldName) {  
  74.         Reader reader = new BufferedReader(new StringReader(fieldName));  
  75.         Tokenizer tokenizer = new IKTokenizer(reader, useSmart);  
  76.         //转拼音  
  77.         TokenStream tokenStream = new PinyinTokenFilter(tokenizer,  
  78.             Constant.DEFAULT_SHORT_PINYIN,Constant.DEFAULT_PINYIN_ALL, Constant.DEFAULT_MIN_TERM_LRNGTH);  
  79.         //对拼音进行NGram处理  
  80.         if(edgesNGram) {  
  81.             tokenStream = new PinyinEdgeNGramTokenFilter(tokenStream,this.minGram,  
  82.                 this.maxGram,this.nGramChinese,this.nGramNumber);  
  83.         } else {  
  84.             tokenStream = new PinyinNGramTokenFilter(tokenStream,this.minGram,  
  85.                     this.maxGram,this.nGramChinese,this.nGramNumber);  
  86.         }  
  87.         return new Analyzer.TokenStreamComponents(tokenizer, tokenStream);  
  88.     }  
  89. }  

   Lucene5中PinyinAnalyzer分词器使用示例代码如下:

Java代码  收藏代码
  1. import java.io.IOException;  
  2.   
  3. import org.apache.lucene.analysis.Analyzer;  
  4. import org.apache.lucene.analysis.TokenStream;  
  5. import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;  
  6. import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;  
  7. import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;  
  8. import org.apache.lucene.analysis.tokenattributes.TypeAttribute;  
  9. import org.wltea.analyzer.lucene.IKAnalyzer;  
  10.   
  11. import com.yida.framework.lucene5.pinyin.PinyinAnalyzer;  
  12. @SuppressWarnings("resource")  
  13. public class AnalyzerTest {  
  14.     public static void main(String[] args) throws IOException {  
  15.         String s = "京华时报2009年1月23日报道 the this that welcome to beijing 虽然我很丑,但是我很温柔,昨天,受一股来自中西伯利亚的强冷空气影响,本市出现大风降温天气,白天最高气温只有零下7摄氏度,同时伴有6到7级的偏北风。";  
  16.           
  17.         //Analyzer analyzer = new IKAnalyzer();  
  18.         Analyzer analyzer = new PinyinAnalyzer();  
  19.         TokenStream tokenStream = analyzer.tokenStream("text", s);  
  20.         displayTokens(tokenStream);  
  21.   
  22.     }  
  23.       
  24.     public static void displayTokens(TokenStream tokenStream) throws IOException {  
  25.         OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);  
  26.         PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);  
  27.         CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);  
  28.         TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);  
  29.           
  30.         tokenStream.reset();  
  31.         int position = 0;  
  32.         while (tokenStream.incrementToken()) {  
  33.             int increment = positionIncrementAttribute.getPositionIncrement();  
  34.             if(increment > 0) {  
  35.                 position = position + increment;  
  36.                 System.out.print(position + ":");  
  37.             }  
  38.             int startOffset = offsetAttribute.startOffset();  
  39.             int endOffset = offsetAttribute.endOffset();  
  40.             String term = charTermAttribute.toString();  
  41.             System.out.println("[" + term + "]" + ":(" + startOffset + "-->" + endOffset + "):" + typeAttribute.type());  
  42.         }  
  43.         tokenStream.end();  
  44.         tokenStream.close();  
  45.     }  
  46. }  

    

Java代码  收藏代码
  1. package org.apache.lucene.analysis.pinyin.solr5;  
  2.   
  3. import java.util.Map;  
  4.   
  5. import org.apache.lucene.analysis.TokenFilter;  
  6. import org.apache.lucene.analysis.TokenStream;  
  7. import org.apache.lucene.analysis.pinyin.lucene5.PinyinTokenFilter;  
  8. import org.apache.lucene.analysis.pinyin.utils.Constant;  
  9. import org.apache.lucene.analysis.util.TokenFilterFactory;  
  10. /** 
  11.  * PinyinTokenFilter工厂类 
  12.  * @author Lanxiaowei 
  13.  * 
  14.  */  
  15. public class PinyinTokenFilterFactory extends TokenFilterFactory {  
  16.     /**是否输出原中文*/  
  17.     private boolean outChinese;  
  18.     /**是否只转换简拼*/  
  19.     private boolean shortPinyin;  
  20.     /**是否转换全拼+简拼*/  
  21.     private boolean pinyinAll;  
  22.     /**中文词组长度过滤,默认超过minTermLength长度的中文才转换拼音*/  
  23.     private int minTermLength;  
  24.   
  25.     public PinyinTokenFilterFactory(Map<String, String> args) {  
  26.         super(args);  
  27.         this.outChinese = getBoolean(args, "outChinese", Constant.DEFAULT_OUT_CHINESE);  
  28.         this.shortPinyin = getBoolean(args, "shortPinyin", Constant.DEFAULT_SHORT_PINYIN);  
  29.         this.pinyinAll = getBoolean(args, "pinyinAll", Constant.DEFAULT_PINYIN_ALL);  
  30.         this.minTermLength = getInt(args, "minTermLength", Constant.DEFAULT_MIN_TERM_LRNGTH);  
  31.     }  
  32.   
  33.     public TokenFilter create(TokenStream input) {  
  34.         return new PinyinTokenFilter(input, this.shortPinyin,this.outChinese,  
  35.                 this.minTermLength);  
  36.     }  
  37.   
  38.     public boolean isOutChinese() {  
  39.         return outChinese;  
  40.     }  
  41.   
  42.     public void setOutChinese(boolean outChinese) {  
  43.         this.outChinese = outChinese;  
  44.     }  
  45.   
  46.     public boolean isShortPinyin() {  
  47.         return shortPinyin;  
  48.     }  
  49.   
  50.     public void setShortPinyin(boolean shortPinyin) {  
  51.         this.shortPinyin = shortPinyin;  
  52.     }  
  53.   
  54.     public boolean isPinyinAll() {  
  55.         return pinyinAll;  
  56.     }  
  57.   
  58.     public void setPinyinAll(boolean pinyinAll) {  
  59.         this.pinyinAll = pinyinAll;  
  60.     }  
  61.   
  62.     public int getMinTermLength() {  
  63.           
  64.           
  65.         return minTermLength;  
  66.     }  
  67.   
  68.     public void setMinTermLength(int minTermLength) {  
  69.         this.minTermLength = minTermLength;  
  70.     }  
  71. }  

   

Java代码  收藏代码
  1. import java.util.Map;  
  2.   
  3. import org.apache.lucene.analysis.TokenFilter;  
  4. import org.apache.lucene.analysis.TokenStream;  
  5. import org.apache.lucene.analysis.pinyin.lucene5.PinyinEdgeNGramTokenFilter;  
  6. import org.apache.lucene.analysis.pinyin.lucene5.PinyinNGramTokenFilter;  
  7. import org.apache.lucene.analysis.pinyin.utils.Constant;  
  8. import org.apache.lucene.analysis.util.TokenFilterFactory;  
  9. /** 
  10.  * PinyinNGramTokenFilter工厂类 
  11.  * @author Lanxiaowei 
  12.  * 
  13.  */  
  14. public class PinyinNGramTokenFilterFactory extends TokenFilterFactory {  
  15.     private int minGram;  
  16.     private int maxGram;  
  17.     /** 是否需要对中文进行NGram[默认为false] */  
  18.     private boolean nGramChinese;  
  19.     /** 是否需要对纯数字进行NGram[默认为false] */  
  20.     private boolean nGramNumber;  
  21.     /**是否开启edgesNGram模式*/  
  22.     private boolean edgesNGram;  
  23.   
  24.     public PinyinNGramTokenFilterFactory(Map<String, String> args) {  
  25.         super(args);  
  26.   
  27.         this.minGram = getInt(args, "minGram", Constant.DEFAULT_MIN_GRAM);  
  28.         this.maxGram = getInt(args, "maxGram", Constant.DEFAULT_MAX_GRAM);  
  29.         this.edgesNGram = getBoolean(args, "edgesNGram", Constant.DEFAULT_EDGES_GRAM);  
  30.         this.nGramChinese = getBoolean(args, "nGramChinese", Constant.DEFAULT_NGRAM_CHINESE);  
  31.         this.nGramNumber = getBoolean(args, "nGramNumber", Constant.DEFAULT_NGRAM_NUMBER);  
  32.     }  
  33.   
  34.     public TokenFilter create(TokenStream input) {  
  35.         if(edgesNGram) {  
  36.             return new PinyinEdgeNGramTokenFilter(input, this.minGram, this.maxGram,   
  37.                 this.nGramChinese, this.nGramNumber);  
  38.         }  
  39.         return new PinyinNGramTokenFilter(input, this.minGram, this.maxGram,  
  40.                 this.nGramChinese,this.nGramNumber);  
  41.     }  
  42. }  

    我已经将他们打包成了两个jar包:lucene-analyzer-pinyin.5.1.0.jar和solr-analyzer-pinyin.5.1.0.jar(这两个jar包我已经上传到最底下的附件里,特此提醒!!!),只需要把这两个jar放入core的lib目录下,如图:

 然后在schema.xml中添加拼音分词的域类型,如图:

 然后如图应用定义好的text_pinyin这个域类型,看图:

 然后你就可以启动你的tomcat部署solr,进行拼音分词测试了:

 如果你看到如图效果,表明拼音分词已经部署成功且测试成功!如果你有任何疑问,请联系我!我的联系方式请查阅我之前的博客,打完收工,谢谢!就此晚安啦!

转载:http://iamyida.iteye.com/blog/2240657

版权声明:本文内容由阿里云实名注册用户自发贡献,版权归原作者所有,阿里云开发者社区不拥有其著作权,亦不承担相应法律责任。具体规则请查看《阿里云开发者社区用户服务协议》和《阿里云开发者社区知识产权保护指引》。如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。

分享:
人工智能
使用钉钉扫一扫加入圈子
+ 订阅

了解行业+人工智能最先进的技术和实践,参与行业+人工智能实践项目

其他文章
最新文章
相关文章