package org.apache.nutch.parse.html; import java.text.ParseException; /** private static final String TIME_REGEX = "(:|>|\\s)?20[0-9]{2}(-|/|\\.|\\u5e74)\\d{1,2}(-|/|\\.|\\u6708)\\d{1,2}(\\u65e5)?.\\d{2}(:|\\u65f6)\\d{2}((:|\\u5206)\\d{2})?"; /** Matcher m = pattern.matcher(content); // BBS分析最后一个发表时间 String dateStr = null; Date date = null; while (m.find()) { dateStr = m.group(); if (dateStr == null) dateStr = dateStr.trim().replaceAll(">", ""); if (dateStr.startsWith(":")) { dateStr = dateStr.replaceAll("\\.|/|\\u5e74|\\u6708|\\u65e5", Date tempDate; try { if (tempDate.after(now)) { } catch (ParseException e) { if (date == null) { if (date != null) { return (date.getTime() + (long) 8 * 3600 * 1000) + ""; } else { // 新闻网页分析第一个出现的时间 String dateStr = null; if (m.find()) { if (dateStr != null) { dateStr = dateStr.trim().replaceAll(">", ""); if (dateStr.startsWith(":")) { dateStr = dateStr.replaceAll("\\.|/|\\u5e74|\\u6708|\\u65e5", try { return (sdf.parse(dateStr).getTime() + (long) 8 * 3600 * 1000) } catch (ParseException e) { return ((new Date()).getTime() + (long) 8 * 3600 * 1000) + ""; } |
本文转自william_xu 51CTO博客,原文链接:http://blog.51cto.com/williamx/790610,如需转载请自行联系原作者