package com.xh.crawle;

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.sql.rowset.spi.SyncFactory;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;

/**
 * Multi-threaded crawler: every worker thread downloads {@link #strHomePage},
 * saves each fetched page to disk, extracts same-domain links from it and then
 * drains the shared queue of pending URLs ({@link #urls}).
 *
 * <p>A single instance is shared by all worker threads (see {@link #main}), so
 * per-request state is kept in local variables and the shared queue is only
 * touched through synchronized operations.
 *
 * @author kali
 */
public class Test implements Runnable {

    /** Path prefix of the saved pages; index, thread name and ".html" are appended. */
    private static final String OUTPUT_PREFIX = "F:\\jd2\\jd_";

    /** Console marker printed when a new link is queued. */
    private static final String QUEUED_MARK =
            "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<";

    /** Console marker printed after a queued URL has been processed. */
    private static final String CRAWLED_MARK =
            "*************************************************";

    /** Number of worker threads started by {@link #main}. */
    private static final int WORKER_COUNT = 10;

    /** Pause between starting two workers, in milliseconds. */
    private static final long WORKER_START_DELAY_MS = 500;

    /** Extracts the registrable part (e.g. "jd.com") from an "http://" home page URL. */
    private static final Pattern DOMAIN_PATTERN = Pattern.compile(
            "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)",
            Pattern.CASE_INSENSITIVE);

    /**
     * Every URL that {@link #getUrl} has ever queued. Kept separately from
     * {@link #urls} because entries leave the queue once they are fetched;
     * without this record a page linking back to an already crawled page would
     * re-queue it and the crawl would never finish.
     */
    private static final Set<String> seenUrls =
            Collections.synchronizedSet(new HashSet<String>());

    // The four fields below are retained for backward compatibility. They hold
    // the value written by the most recent call on any thread and are never
    // read back by this class.
    volatile String content;
    volatile HttpClient client;
    volatile GetMethod getMethod;
    volatile String myDomain;

    /** Start page of the crawl; also determines the domain that links must contain. */
    static String strHomePage;

    /** Queue of URLs waiting to be fetched, shared by all worker threads. */
    static List<String> urls = Collections.synchronizedList(new ArrayList<String>());

    /** Counter used in saved file names; only modified through {@link #nextFileIndex()}. */
    int i = 0;

    /**
     * Downloads {@code url}. When the server answers 200, the body is saved to
     * disk and its same-domain links are added to {@link #urls}.
     *
     * @param url address to fetch
     * @return the downloaded page, or {@code null} when no body was read
     *         (non-200 status or a failure before the body was available)
     */
    public String getContent(String url) {
        int fileIndex = nextFileIndex();
        GetMethod request = null;
        String page = null;
        try {
            HttpClient httpClient = new HttpClient();
            request = new GetMethod(url);
            client = httpClient;
            getMethod = request;

            int status = httpClient.executeMethod(request);
            if (status == HttpStatus.SC_OK) {
                page = in_str(request.getResponseBodyAsStream());
                content = page;
                savePage(page, fileIndex);
                // Only a freshly downloaded page is scanned; a failed request
                // must not re-process whatever was fetched before it.
                getUrl(page);
            }
        } catch (Exception e) {
            // Worker boundary: one bad URL must not terminate the whole thread.
            System.err.println("Failed to process " + url);
            e.printStackTrace();
        } finally {
            if (request != null) {
                request.releaseConnection();
            }
        }
        return page;
    }

    /**
     * Starts {@link #WORKER_COUNT} threads that all run the same crawler
     * instance against http://www.jd.com.
     *
     * @param args unused
     * @throws InterruptedException if interrupted while pausing between thread starts
     */
    public static void main(String[] args) throws InterruptedException {
        Test test = new Test();
        strHomePage = "http://www.jd.com";
        for (int n = 0; n < WORKER_COUNT; n++) {
            new Thread(test).start();
            Thread.sleep(WORKER_START_DELAY_MS);
        }
    }

    /**
     * Scans {@code cont} for {@code href} values that are absolute http(s)
     * URLs containing the home page domain, and queues each one that has not
     * been queued before.
     *
     * @param cont page text to scan; {@code null} is treated as "nothing to scan"
     * @return the shared pending-URL queue {@link #urls}
     */
    public List<String> getUrl(String cont) {
        String domain = getDomain();
        myDomain = domain;
        if (cont == null || domain == null) {
            return urls;
        }

        // The scheme is matched literally (the earlier "[http://]" was a
        // character class matching a single character), and the domain is
        // quoted so that its dots are not treated as wildcards.
        String regUrl = "(?<=(href=)[\"]?[\']?)https?://[^\\s\"\'\\?]*"
                + Pattern.quote(domain) + "[^\\s\"\'>]*";
        Pattern linkPattern = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE); // case-insensitive
        Matcher m = linkPattern.matcher(cont);
        while (m.find()) {
            String link = m.group(0);
            // Set.add reports whether the link is new, so the check and the
            // registration are a single atomic step across threads.
            if (seenUrls.add(link)) {
                urls.add(link);
                System.out.println(QUEUED_MARK + link);
                System.out.println(QUEUED_MARK + urls.size());
            }
        }
        return urls;
    }

    /**
     * Derives the domain (for example "jd.com") from {@link #strHomePage}.
     *
     * @return the matched domain, or {@code null} when the home page is unset
     *         or does not match the expected "http://host.tld" shape
     */
    public String getDomain() {
        String homePage = strHomePage;
        if (homePage == null) {
            return null;
        }
        Matcher m = DOMAIN_PATTERN.matcher(homePage);
        return m.find() ? m.group(0) : null;
    }

    /**
     * Fetches queued URLs until {@code list} is empty. The lock on {@code list}
     * is held only while an entry is dequeued, never during the download, so
     * several threads can drain the same queue at the same time.
     *
     * @param list queue of URLs to fetch; entries are removed as they are taken
     */
    public void Crawle(List<String> list) {
        if (list == null) {
            return;
        }
        while (true) {
            String next;
            synchronized (list) {
                if (list.isEmpty()) {
                    return;
                }
                next = list.remove(0);
            }
            getContent(next);
            System.out.println(CRAWLED_MARK + next);
            System.out.println(CRAWLED_MARK + urls.size());
        }
    }

    /**
     * Reads {@code in} to the end as text using the platform default charset.
     * Each line is terminated with {@code '\n'} so that adjacent lines are not
     * fused together. The stream is not closed here; it remains owned by the
     * caller.
     *
     * @param in stream to read; {@code null} yields an empty string
     * @return the text read from the stream
     * @throws IOException if reading fails
     */
    public String in_str(InputStream in) throws IOException {
        if (in == null) {
            return "";
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        StringBuilder text = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            text.append(line).append('\n');
        }
        return text.toString();
    }

    /** Worker entry point: fetch the home page, then help drain the shared queue. */
    @Override
    public void run() {
        getContent(strHomePage);
        Crawle(urls);
    }

    /** Returns the next file index; synchronized because the instance is shared by all workers. */
    private synchronized int nextFileIndex() {
        i += 1;
        return i;
    }

    /** Writes {@code page} to its output file, closing the writer even when the write fails. */
    private void savePage(String page, int fileIndex) throws IOException {
        FileWriter writer = new FileWriter(
                OUTPUT_PREFIX + fileIndex + "_" + Thread.currentThread().getName() + ".html");
        try {
            writer.write(page);
            writer.flush();
        } finally {
            writer.close();
        }
    }
}