采集一个简单的页面，我配置的分页一直不对，只能采集第一页?报错

以下是我配置的XML文件：

<?xml version="1.0" encoding="UTF-8"?>
<!-- webpasser task: crawls the list at /zhuzhan/flfg/ on www.nhfpc.gov.cn and
     extracts fields from the linked article pages. -->
<task>

  <!-- Fetch settings (charset, timeout, retry and thread counts as named by the attributes). -->
  <fetchConfig charset="utf-8"
               timeOutSecond="5"
               errorRetry="5"
               errorDelayTime="10"
               runThreadNum="5"
               fetchPrepareDelayTime="5">
    <userAgent>Mozilla/5.0 (compatible; webpasser;)</userAgent>
    <headers>
      <header name="Referer" value="http://www.nhfpc.gov.cn" />
    </headers>
    <cookies>
    </cookies>
  </fetchConfig>

  <!-- Host restriction for the whole task. -->
  <scope>
    <limitHost value="www.nhfpc.gov.cn" />
  </scope>

  <!-- Entry point: first page of the list. -->
  <seeds>
    <seed url="http://www.nhfpc.gov.cn/zhuzhan/flfg/lists.shtml" />
  </seeds>

  <!-- List pages: any .shtml URL directly under /zhuzhan/flfg/. -->
  <page>
    <scope>
      <rule type="regex" value="http://www.nhfpc.gov.cn/zhuzhan/flfg/(.*).shtml" />
    </scope>

    <!-- Article links: take each href inside div.contents, remove "../",
         then combine with the site root (presumably [$this] is the value
         produced by the preceding rule; confirm against webpasser docs). -->
    <digLink>
      <rules>
        <rule type="xpath" value="//div[@class='contents']//ul//li//a[@href]" attr="href" />
        <rule type="replace">
          <oldChars>../</oldChars>
          <newChars></newChars>
        </rule>
        <rule type="combine" value="http://www.nhfpc.gov.cn/[$this]" />
      </rules>
    </digLink>

    <!-- Pagination links: take each href inside div.pagination_index_num and
         combine with the list directory.
         NOTE(review): the reply in this thread says the pager is generated by
         JavaScript, so this XPath presumably matches nothing in the fetched HTML. -->
    <digLink>
      <rules>
        <rule type="xpath" value="//div[@class='pagination_index_num']//a[@href]" attr="href" />
        <rule type="combine" value="http://www.nhfpc.gov.cn/zhuzhan/flfg/[$this]" />
      </rules>
    </digLink>
  </page>

  <!-- Article pages: URLs shaped like /segment/segment/digits/name.shtml. -->
  <page name="films">
    <scope>
      <rule type="regex" value="http://www.nhfpc.gov.cn/\w+/\w+/\d*/\w+.shtml" />
    </scope>

    <!-- Title from div.content_title. -->
    <field name="title">
      <rules>
        <rule type="xpath" value="//div[@class='content_title']" exp="text()" />
        <rule type="toText">
        </rule>
      </rules>
    </field>

    <!-- Subtitle from the first span under div.content_subtitle. -->
    <field name="subTitle">
      <rules>
        <rule type="xpath" value="//div[@class='content_subtitle']//span[1]" exp="text()" />
        <rule type="toText">
        </rule>
      </rules>
    </field>

    <!-- Submit date from the third span under div.content_subtitle (no toText step). -->
    <field name="submitdate">
      <rules>
        <rule type="xpath" value="//div[@class='content_subtitle']//span[3]" exp="text()" />
      </rules>
    </field>

    <!-- Body from div.content. -->
    <field name="content">
      <rules>
        <rule type="xpath" value="//div[@class='content']" exp="text()" />
        <rule type="toText">
        </rule>
      </rules>
    </field>
  </page>

  <!-- Result sink: DiskJsonHandleResult configured with an output directory and charset. -->
  <resultHandler target="handleResultMapInterface"
                 classPath="com.hxt.webpasser.persistent.impl.DiskJsonHandleResult">
    <property name="rootDir" value="d:/www.nhfpc.gov.cn/data"></property>
    <property name="charSet" value="gbk"></property>
  </resultHandler>

</task>

@ hxt168

这个网站的分页是 JS 生成的，要用自定义处理链（项目需要再 git pull 一下）。

这个分页的处理链简单写了下：

publicclassCustomDemoRuleimplementsDecideRule{

publicList<Object>handle(Rulerule,List<Object>contentList,MapvalueMap){
//valueMap默认会有fetchUrl和 taskName的值
StringfetchUrl=String.valueOf(valueMap.get("fetchUrl"));
if(contentList!=null)
{
for(inti=0;i<contentList.size();i++)
{
//('page_div',4,2,'lists','shtml',80)
Stringcon=String.valueOf(contentList.get(i));
String[]arr=con.split(",");
if(arr.length>5){
inttotalPage=Integer.parseInt(arr[1]);
StringlistStr=StringUtil.cutNotContainStartAndEnd(arr[3],"'","'");
StringshtmlStr=StringUtil.cutNotContainStartAndEnd(arr[4],"'","'");
StringpreUrl=StringUtil.cutNotContainStartAndEnd(fetchUrl,"",listStr);
List<Object>list=newArrayList<Object>();
for(intj=1;j<=totalPage;j++){
Stringurl=preUrl+listStr+"_"+j+"."+shtmlStr;
list.add(url);
}
returnlist;

}

}

}

returnnull;
}

}

配置：

<?xml version="1.0" encoding="UTF-8"?>
<!-- webpasser task for the list at /zhuzhan/flfg/ on www.nhfpc.gov.cn.
     Pagination is handled by the custom rule "digJumpPage" registered below. -->
<task>

  <!-- Fetch settings (charset, timeout, retry and thread counts as named by the attributes). -->
  <fetchConfig charset="utf-8" timeOutSecond="5" errorRetry="5" errorDelayTime="10" runThreadNum="5" fetchPrepareDelayTime="5">
    <userAgent>Mozilla/5.0 (compatible; webpasser;)</userAgent>
    <headers>
      <header name="Referer" value="http://www.nhfpc.gov.cn" />
    </headers>
    <cookies>
    </cookies>
  </fetchConfig>

  <!-- Registers the rule type "digJumpPage", backed by CustomDemoRule. -->
  <customConfig>
    <customRules>
      <customRule name="digJumpPage" classPath="com.hxt.webpasser.regular.CustomDemoRule" />
    </customRules>
  </customConfig>

  <!-- Host restriction for the whole task. -->
  <scope>
    <limitHost value="www.nhfpc.gov.cn" />
  </scope>

  <!-- Entry point: first page of the list. -->
  <seeds>
    <seed url="http://www.nhfpc.gov.cn/zhuzhan/flfg/lists.shtml" />
  </seeds>

  <!-- List pages: any .shtml URL one directory below /zhuzhan/. -->
  <page>
    <scope>
      <rule type="regex" value="http://www.nhfpc.gov.cn/zhuzhan/(.*)/(.*).shtml" />
    </scope>

    <!-- Article links: take each href inside div.contents, remove "../",
         then combine with the site root. -->
    <digLink>
      <rules>
        <rule type="xpath" value="//div[@class='contents']//ul//li//a[@href]" attr="href" />
        <rule type="replace">
          <oldChars>../</oldChars>
          <newChars></newChars>
        </rule>
        <rule type="combine" value="http://www.nhfpc.gov.cn/[$this]" />
      </rules>
    </digLink>

    <!-- Pagination links: cut out the text between "createPageHTML(" and ");"
         and pass it to the custom digJumpPage rule. -->
    <digLink>
      <rules>
        <rule type="cut">
          <pre><![CDATA[createPageHTML(]]></pre>
          <end><![CDATA[);]]></end>
        </rule>
        <rule type="digJumpPage" />
      </rules>
    </digLink>
  </page>

  <!-- Article pages: URLs shaped like /segment/segment/digits/name.shtml. -->
  <page name="films">
    <scope>
      <rule type="regex" value="http://www.nhfpc.gov.cn/\w+/\w+/\d*/\w+.shtml" />
    </scope>

    <field name="title">
      <rules>
        <rule type="xpath" value="//div[@class='content_title']" exp="text()" />
        <rule type="toText">
        </rule>
      </rules>
    </field>

    <field name="subTitle">
      <rules>
        <rule type="xpath" value="//div[@class='content_subtitle']//span[1]" exp="text()" />
        <rule type="toText">
        </rule>
      </rules>
    </field>

    <field name="submitdate">
      <rules>
        <rule type="xpath" value="//div[@class='content_subtitle']//span[3]" exp="text()" />
      </rules>
    </field>

    <field name="content">
      <rules>
        <rule type="xpath" value="//div[@class='content']" exp="text()" />
        <rule type="toText">
        </rule>
      </rules>
    </field>
  </page>

  <!-- Result sink: DiskJsonHandleResult configured with an output directory and charset. -->
  <resultHandler target="handleResultMapInterface" classPath="com.hxt.webpasser.persistent.impl.DiskJsonHandleResult">
    <property name="rootDir" value="d:/www.nhfpc.gov.cn/data"></property>
    <property name="charSet" value="gbk"></property>
  </resultHandler>

</task>

@hxt168 谢谢~

采集一个简单的页面，我配置的分页一直不对，只能采集第一页?报错

相关文章

相关电子书