由于我们的业务系统中有大量 MHT 格式的资料,需要对其建立索引,但搜索了很久一直没有找到能够解析这种格式的类库,只好自己动手、丰衣足食了。目前已实现正文内容的提取以及与 Lucene 的整合,稍后会完善编码检测及其他内容的提取,做出一个完整的 parser。
-
文本内容提取:首先提取 MHT 文件中 HTML 部分的内容,按 Quoted-Printable 解码之后,再使用 NekoHTML 提取文本内容。
-
MhtDocHandler 的完整代码如下:
/**
 * Document handler for MHT (MHTML web archive) input.
 *
 * <p>The handler cuts the HTML part out of the raw MHT text, decodes it from
 * Quoted-Printable, parses it into a DOM fragment and copies the title and body
 * text into the fields of a {@code Document}.
 *
 * <p>NOTE(review): {@code getText} is not defined in this class; it is presumably
 * inherited from {@code HtmDocHandler} - confirm against the parent class.
 */
public class MhtDocHandler extends HtmDocHandler {

    /** Marker that starts the HTML part inside the MHT text (matched case-sensitively). */
    private static final String HTML_OPEN = "<HTML";

    /** Marker that ends the HTML part inside the MHT text (matched case-sensitively). */
    private static final String HTML_CLOSE = "</HTML>";

    private final DOMFragmentParser parser = new DOMFragmentParser();

    /**
     * Builds an index document from an MHT stream.
     *
     * <p>The title field is added only when the extracted title is non-empty, and the
     * content field only when the extracted body text is non-empty.
     *
     * @param is stream providing the MHT data; it is read but not closed here
     * @return the document holding the extracted title and content fields
     * @throws DocumentHandlerException if the stream cannot be read, contains no
     *         {@code <HTML ... </HTML>} section, or the HTML cannot be parsed
     */
    public Document getDocument(InputStream is) throws DocumentHandlerException {
        DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
        try {
            // NOTE(review): this overload takes no charset argument, so the bytes are
            // presumably decoded with the platform default charset - confirm.
            String mhts = IOUtils.toString(is);

            int start = mhts.indexOf(HTML_OPEN);
            if (start < 0) {
                throw new IllegalArgumentException("no " + HTML_OPEN + " tag found in MHT data");
            }
            // Look for the closing tag only after the opening one, so that an earlier
            // stray "</HTML>" cannot produce an inverted range.
            int end = mhts.indexOf(HTML_CLOSE, start);
            if (end < 0) {
                throw new IllegalArgumentException("no " + HTML_CLOSE + " tag found in MHT data");
            }
            // Use the real tag length: a fixed offset of 8 overruns the string when
            // "</HTML>" is the very last thing in the input.
            String html = mhts.substring(start, end + HTML_CLOSE.length());

            // Text inside an MHT file is Quoted-Printable encoded.
            html = decodeQuotedPrintable(html, "UTF-8");

            parser.parse(new InputSource(new StringReader(html)), node);
        } catch (Exception e) {
            throw new DocumentHandlerException("Cannot parse MHT document: ", e);
        }

        Document doc = new Document();
        StringBuffer sb = new StringBuffer();

        getText(sb, node, "title");
        String title = sb.toString().trim();

        // Reuse the same buffer for the body text.
        sb.setLength(0);
        getText(sb, node, "body");
        String text = sb.toString().trim();

        if (title.length() > 0) {
            doc.add(new Field(WikiDOC.DOC_TITLE, title,
                    Field.Store.YES, Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
        }
        if (text.length() > 0) {
            doc.add(new Field(WikiDOC.DOC_CONTENT, text,
                    Field.Store.COMPRESS, Field.Index.TOKENIZED,
                    Field.TermVector.WITH_POSITIONS_OFFSETS));
        }
        return doc;
    }

    /**
     * Decodes a Quoted-Printable encoded string.
     *
     * <ul>
     *   <li>{@code =XX} with two hexadecimal digits becomes the byte {@code 0xXX}.</li>
     *   <li>{@code =} directly followed by CRLF, LF or CR is a soft line break and is
     *       removed together with the line terminator.</li>
     *   <li>Any other {@code =} (malformed or truncated escape) is copied through
     *       unchanged and the characters after it are processed normally.</li>
     * </ul>
     *
     * <p>The input is first converted to bytes as US-ASCII, so characters outside
     * ASCII are not preserved. The decoded bytes are then interpreted in
     * {@code encoding}.
     *
     * @param str      the Quoted-Printable text; may be {@code null}
     * @param encoding name of the charset used to turn the decoded bytes into a string
     * @return the decoded text; {@code null} if {@code str} is {@code null}; or
     *         {@code str} itself if the bytes cannot be converted (for example when
     *         {@code encoding} is not a supported charset name)
     */
    public static String decodeQuotedPrintable(String str, String encoding) {
        if (str == null) {
            return null;
        }
        try {
            byte[] bytes = str.getBytes("US-ASCII");
            ByteArrayOutputStream buffer = new ByteArrayOutputStream(bytes.length);
            int i = 0;
            while (i < bytes.length) {
                int b = bytes[i];
                if (b != '=') {
                    buffer.write(b);
                    i++;
                    continue;
                }

                int remaining = bytes.length - i - 1; // bytes available after '='

                // Soft line break: drop '=' and the line terminator, nothing else.
                if (remaining >= 1 && bytes[i + 1] == '\n') {
                    i += 2;
                    continue;
                }
                if (remaining >= 1 && bytes[i + 1] == '\r') {
                    i += (remaining >= 2 && bytes[i + 2] == '\n') ? 3 : 2;
                    continue;
                }

                // Escaped byte: '=' followed by two hexadecimal digits.
                if (remaining >= 2) {
                    int hi = Character.digit((char) bytes[i + 1], 16);
                    int lo = Character.digit((char) bytes[i + 2], 16);
                    if (hi != -1 && lo != -1) {
                        buffer.write((hi << 4) + lo);
                        i += 3;
                        continue;
                    }
                }

                // Malformed or truncated escape: keep the '=' literally rather than
                // discarding it along with the following characters.
                buffer.write(b);
                i++;
            }
            return buffer.toString(encoding);
        } catch (Exception e) {
            // Best effort: report the problem and hand back the undecoded input.
            e.printStackTrace();
            return str;
        }
    }
}
本文转自 独孤环宇 51CTO博客,原文链接:http://blog.51cto.com/snowtiger/1963087