1.先导入依赖
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.4</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.11.0</version>
</dependency>
2.获取目录
参数说明:root 是书目录的起始页地址;next 是当前要抓取的目录页地址(首次调用时与 root 相同);dir 是用于收集各章节网址的集合。
privatestaticvoidgetDir(Stringroot,Stringnext,List<String>dir) throwsException { Documentdocument=Jsoup.connect(next).get(); Elementselements=document.select("a[href$=\".html\"]"); List<String>list=elements.eachAttr("href"); list.remove(0); if (elements.last().text().equals("下一页")){ StringnextPage=list.get(list.size() -1); nextPage=root+nextPage.substring(nextPage.lastIndexOf("/") +1); list.remove(list.size() -1); if (elements.get(elements.size() -2).text().equals("上一页")){ list.remove(list.size() -1); } dir.addAll(list); getDir(root,nextPage,dir); return; } if ((elements.last().text().equals("上一页"))){ list.remove(list.size() -1); } dir.addAll(list); }
3.根据目录获取章节信息写入文件
参数说明:dir 是刚才收集到的章节地址列表;root 是书目录的起始页地址;writer 用于将全书内容写入文件。
privatestaticvoidgetContent(List<String>dir,Stringroot, Writerwriter) throwsException { StringBuildertemp=newStringBuilder(); for (Stringurl : dir) { Documentdocument=Jsoup.connect(root+url).get(); Stringtitle=document.select("h1").text() +"\n"; System.out.println(title); Elementscontent=document.select("div[id=\"content\"]"); Stringtext=content.toString(); inti=text.indexOf("&"); if (i!=-1){ text=text.substring(i); } text=text.replaceAll(" ","").replaceAll("<br><br>","").replaceAll("</div>",""); temp.append(title+text); } IOUtils.write(temp,writer); writer.close(); IOUtils.close(); }
总的代码:
publicclassSoup { publicstaticvoidmain(String[] args) throwsException { Stringurl="https://www.bbiquge.net/book/132488/"; StringfileName=Jsoup.connect(url).get().select("h1").text(); fileName=fileName.replace("/","") +".txt"; Filefile=newFile(fileName); Writerwriter=newFileWriter(file,true); List<String>dir=newArrayList<>(); getDir(url,url,dir); getContent(dir,url,writer); } privatestaticvoidgetDir(Stringroot,Stringnext,List<String>dir) throwsException { Documentdocument=Jsoup.connect(next).get(); Elementselements=document.select("a[href$=\".html\"]"); List<String>list=elements.eachAttr("href"); list.remove(0); if (elements.last().text().equals("下一页")){ StringnextPage=list.get(list.size() -1); nextPage=root+nextPage.substring(nextPage.lastIndexOf("/") +1); list.remove(list.size() -1); if (elements.get(elements.size() -2).text().equals("上一页")){ list.remove(list.size() -1); } dir.addAll(list); getDir(root,nextPage,dir); return; } if ((elements.last().text().equals("上一页"))){ list.remove(list.size() -1); } dir.addAll(list); } privatestaticvoidgetContent(List<String>dir,Stringroot, Writerwriter) throwsException { StringBuildertemp=newStringBuilder(); for (Stringurl : dir) { Documentdocument=Jsoup.connect(root+url).get(); Stringtitle=document.select("h1").text() +"\n"; System.out.println(title); Elementscontent=document.select("div[id=\"content\"]"); Stringtext=content.toString(); inti=text.indexOf("&"); if (i!=-1){ text=text.substring(i); } text=text.replaceAll(" ","").replaceAll("<br><br>","").replaceAll("</div>",""); temp.append(title+text); } IOUtils.write(temp,writer); writer.close(); IOUtils.close(); } }
这是闲暇时写出来的,运行起来效率也比较感人,希望各位能给出一些优化方案,让我学习一下。