由于我们的业务系统中有大量 MHT 格式的资料,需要为其建立索引,但搜索了很久也没有找到可用的解析类库,只好自己动手、丰衣足食了。目前已实现内容的提取以及与 Lucene 的整合,稍后会完善编码检测及其他内容的提取,做一个完整的 parser 出来。
-
文本内容提取:首先提取 HTML 部分的内容,按 Quoted-Printable 解码之后,再使用 NekoHTML 提取文本内容。
-
代码如下:
publicclassMhtDocHandlerextendsHtmDocHandler {privateDOMFragmentParser parser =newDOMFragmentParser();publicDocument getDocument(InputStream is)throwsDocumentHandlerException {DocumentFragment node =newHTMLDocumentImpl().createDocumentFragment();try{String mhts = IOUtils.toString(is);inta1 = mhts.indexOf("<HTML");inta2 = mhts.indexOf("</HTML>");String html = mhts.substring(a1, a2 +8);//在mht中文本按照QuotedPrintable格式编码html = decodeQuotedPrintable(html,"UTF-8");StringReader r =newStringReader(html);parser.parse(newInputSource(r), node);}catch(Exception e) {thrownewDocumentHandlerException("Cannot parse MHT document: ", e);}Document doc =newDocument();StringBuffer sb =newStringBuffer();getText(sb, node,"title");String title = sb.toString().trim();sb.setLength(0);getText(sb, node,"body");String text = sb.toString().trim();if(!title.equals("")) {doc.add(newField(WikiDOC.DOC_TITLE, title,Field.Store.YES, Field.Index.TOKENIZED,Field.TermVector.WITH_POSITIONS_OFFSETS));}if(!text.equals("")) {doc.add(newField(WikiDOC.DOC_CONTENT, text,Field.Store.COMPRESS, Field.Index.TOKENIZED,Field.TermVector.WITH_POSITIONS_OFFSETS));}returndoc;}publicstaticString decodeQuotedPrintable(String str, String encoding) {if(str ==null) {returnnull;}try{//str = str.replaceAll("=\n", "");//??byte[] bytes = str.getBytes("US-ASCII");ByteArrayOutputStream buffer =newByteArrayOutputStream();for(inti =0; i < bytes.length; i++) {intb = bytes[i];if(b =='=') {intu = Character.digit((char) bytes[++i],16);intl = Character.digit((char) bytes[++i],16);if(u == -1|| l == -1) {//??continue;}buffer.write((char) ((u <<4) + l));}else{buffer.write(b);}}returnbuffer.toString(encoding);}catch(Exception e) {e.printStackTrace();returnstr;}}}
本文转自 独孤环宇 51CTO博客,原文链接:http://blog.51cto.com/snowtiger/1963087