HttpClient在java中的使用http://www.bieryun.com/1011.html
项目中一直在使用HttpClient,版本是3.6的,负责维护的同事离职后,就没有更新过,在这次项目改版中决定对这块进行升级到4.3版本,查阅了一些资料写了部分出来,还不是很完善,后期有时间在更新
这里对httpclient如何获取原页面编码进行了一些处理,感觉还不是很完善,后期想到在处理吧
具体的理论东西就不说了,代码中注释都有一些说明。
直接上代码
- /**
- * 项目名:NewsCrawlerV3.1
- * 文件名:InfoLoad.java
- * 作者:zhouyh
- * 时间:2015-10-14 上午08:46:47
- * 描述:TODO(用一句话描述该文件做什么)
- */
- package com.boryou.module.load;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.InterruptedIOException;
- import java.net.MalformedURLException;
- import java.net.URI;
- import java.net.URISyntaxException;
- import java.net.URL;
- import java.net.UnknownHostException;
- import java.nio.charset.Charset;
- import java.security.KeyManagementException;
- import java.security.KeyStoreException;
- import java.security.NoSuchAlgorithmException;
- import java.security.cert.CertificateException;
- import java.security.cert.X509Certificate;
- import java.util.Random;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import java.util.zip.GZIPInputStream;
- import javax.net.ssl.SSLContext;
- import javax.net.ssl.SSLException;
- import org.apache.http.Header;
- import org.apache.http.HeaderElement;
- import org.apache.http.HttpEntity;
- import org.apache.http.HttpEntityEnclosingRequest;
- import org.apache.http.HttpHost;
- import org.apache.http.HttpRequest;
- import org.apache.http.client.ClientProtocolException;
- import org.apache.http.client.HttpRequestRetryHandler;
- import org.apache.http.client.config.CookieSpecs;
- import org.apache.http.client.config.RequestConfig;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.client.protocol.HttpClientContext;
- import org.apache.http.conn.ConnectTimeoutException;
- import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
- import org.apache.http.entity.ContentType;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.impl.client.LaxRedirectStrategy;
- import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
- import org.apache.http.protocol.HttpContext;
- import org.apache.http.ssl.SSLContextBuilder;
- import org.apache.http.ssl.TrustStrategy;
- import org.apache.http.util.ByteArrayBuffer;
- import com.boryou.constant.Constant;
- import com.boryou.util.Common;
- /**
- * 类名: InfoLoad
- * 包名: com.boryou.module.load
- * 作者: zhouyh
- * 时间: 2015-10-14 上午08:46:47
- * 描述: 下载基础类,共监控、下载等模块使用
- */
- public class InfoLoad {
- // 创建httpclient连接池
- private PoolingHttpClientConnectionManager httpClientConnectionManager = null;
- /******单例模式声明开始******/
- //类初始化时,自动实例化,饿汉单例模式
- private static final InfoLoad infoLoad = new InfoLoad();
- /**
- *
- * 方法名:getInfoLoadInstance
- * 作者:zhouyh
- * 创建时间:2015-10-14 上午08:59:54
- * 描述:单例的静态方法,返回InfoLoad的实例
- * @return
- */
- public static InfoLoad getInfoLoadInstance(){
- return infoLoad;
- }
- /******单例模式声明结束******/
- /**
- * 私有的构造函数
- */
- private InfoLoad(){
- //初始化httpClient
- initHttpClient();
- }
- /**
- *
- * 方法名:initHttpClient
- * 作者:zhouyh
- * 创建时间:2015-10-14 上午11:00:30
- * 描述:创建httpclient连接池,并初始化httpclient
- */
- public void initHttpClient(){
- //创建httpclient连接池
- httpClientConnectionManager = new PoolingHttpClientConnectionManager();
- //设置连接池最大数量
- httpClientConnectionManager.setMaxTotal(Constant.HTTPCLIENT_CONNECTION_COUNT);
- //设置单个路由最大连接数量
- httpClientConnectionManager.setDefaultMaxPerRoute(Constant.HTTPCLIENT_MAXPERROUTE_COUNT);
- }
- //请求重试机制
- HttpRequestRetryHandler myRetryHandler = new HttpRequestRetryHandler() {
- public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
- if (executionCount >= 3) {
- // 超过三次则不再重试请求
- return false;
- }
- if (exception instanceof InterruptedIOException) {
- // Timeout
- return false;
- }
- if (exception instanceof UnknownHostException) {
- // Unknown host
- return false;
- }
- if (exception instanceof ConnectTimeoutException) {
- // Connection refused
- return false;
- }
- if (exception instanceof SSLException) {
- // SSL handshake exception
- return false;
- }
- HttpClientContext clientContext = HttpClientContext.adapt(context);
- HttpRequest request = clientContext.getRequest();
- boolean idempotent = !(request instanceof HttpEntityEnclosingRequest);
- if (idempotent) {
- // Retry if the request is considered idempotent
- return true;
- }
- return false;
- }
- };
- /**
- *
- * 方法名:getHttpClient
- * 作者:zhouyh
- * 创建时间:2016-2-18 下午01:23:32
- * 描述:多线程调用时,需要创建自己的httpclient
- * @return
- */
- public CloseableHttpClient getHttpClient(){
- // 创建全局的requestConfig
- RequestConfig requestConfig = RequestConfig.custom()
- .setConnectTimeout(Constant.HTTPCLIENT_CONNECT_TIMEOUT)
- .setSocketTimeout(Constant.HTTPCLIENT_SOCKET_TIMEOUT)
- .setCookieSpec(CookieSpecs.BEST_MATCH).build();
- // 声明重定向策略对象
- LaxRedirectStrategy redirectStrategy = new LaxRedirectStrategy();
- CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(httpClientConnectionManager)
- .setDefaultRequestConfig(requestConfig)
- .setRedirectStrategy(redirectStrategy)
- .setRetryHandler(myRetryHandler)
- .build();
- return httpClient;
- }
- /**
- *
- * 方法名:loadForString
- * 作者:zhouyh
- * 创建时间:2015-10-14 下午02:22:19
- * 描述:根据传入的url获取下载信息
- * @param url
- * @param type
- * @return
- */
- public static String loadForString(String urlString, int type){
- String src = "";
- if(null==urlString || urlString.isEmpty() || !urlString.startsWith("http")){//如果urlString为null或者urlString为空,或urlString非http开头,返回src空值
- return src;
- }
- //创建response
- CloseableHttpResponse response = null;
- HttpGet httpGet = null;
- urlString = urlString.trim();//防止传入的urlString首尾有空格
- //转化String url为URI,解决url中包含特殊字符的情况
- try {
- URL url = new URL(urlString);
- URI uri = new URI(url.getProtocol(), url.getHost(), url.getPath(), url.getQuery(), null);
- httpGet = new HttpGet(uri);
- //针对https采用SSL的方式创建httpclient
- // if(urlString.startsWith("https")){
- // httpClient = createSSLClientDefault();
- // System.setProperty ("jsse.enableSNIExtension", "false");
- // }
- //设置请求头
- httpGet.addHeader("Accept","*/*");
- // httpGet.addHeader("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");
- httpGet.addHeader("Connection","keep-alive");
- httpGet.addHeader("Accept-Encoding", "gzip, deflate");
- //设置USER_AGENT
- Random random = new Random();
- int randomInt = random.nextInt(4);
- System.err.println(randomInt);
- httpGet.addHeader("User-Agent", Constant.USER_AGENT[randomInt]);
- //此处的代理暂时注释
- // String[] proxys = Constant.HTTPCLIENT_PROXY[randomInt].split("\\s+");
- // //添加代理
- // HttpHost proxy = new HttpHost(proxys[0].trim(), Integer.parseInt(proxys[1].trim()), "http");
- // RequestConfig config = RequestConfig.custom().setProxy(proxy).build();
- // httpGet.setConfig(config);
- //执行请求
- try {
- if(urlString.startsWith("https")){
- System.setProperty ("jsse.enableSNIExtension", "false");
- response = createSSLClientDefault().execute(httpGet);
- }else{
- response = infoLoad.getHttpClient().execute(httpGet);
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- //得到响应状态码
- int statuCode = response.getStatusLine().getStatusCode();
- //根据状态码进行逻辑处理
- switch (statuCode){
- case 200:
- //获得响应实体
- HttpEntity entity = response.getEntity();
- /**
- * 仿浏览器获取网页编码
- * 浏览器是先从content-type的charset(响应头信息)中获取编码,
- * 如果获取不了,则会从meta(HTML里的代码)中获取charset的编码值
- */
- //第一步-->处理网页字符编码
- String charset = null;
- ContentType contentType = null;
- contentType = ContentType.getOrDefault(entity);
- Charset charsets = contentType.getCharset();
- if(null != charsets){
- charset = charsets.toString();
- }
- //判断返回的数据流是否采用了gzip压缩
- Header header = entity.getContentEncoding();
- boolean isGzip = false;
- if(null != header){
- for(HeaderElement headerElement : header.getElements()){
- if(headerElement.getName().equalsIgnoreCase("gzip")){
- isGzip = true;
- }
- }
- }
- //获得响应流
- InputStream inputStream = entity.getContent();
- ByteArrayBuffer buffer = new ByteArrayBuffer(4096);
- byte[] tmp = new byte[4096];
- int count;
- if(isGzip){//如果采用了Gzip压缩,则进行gizp压缩处理
- GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
- while((count=gzipInputStream.read(tmp)) != -1){
- buffer.append(tmp, 0, count);
- }
- }else{//处理非gzip格式的数据
- while((count=inputStream.read(tmp)) != -1){
- buffer.append(tmp, 0, count);
- }
- }
- //第二步--->如果第一步contenttyp未获取到编码,这里从meta标签中获取
- if(null==charset || "".equals(charset) || "null".equals(charset)
- || "zh-cn".equalsIgnoreCase(charset)){
- charset = getCharsetFromMetaTag(buffer, urlString);
- }
- //根据获取的字符编码转为string类型
- src = new String(buffer.toByteArray(), charset);
- //替换特殊编码
- src = replaceStr(src);
- //转化Unicode编码格式]
- src = Common.decodeUnicode(src);
- // System.out.println(src);
- break;
- case 400:
- System.out.println("下载400错误代码,请求出现语法错误" + urlString);
- //TODO 要进行判断是列表页还是正文页下载,再去修改数据库,下同
- //TODO 此处添加对mongodb数据库的操作,将该url的isStart改为0,暂时不在进行监控,后续根据模板状态为0的进行修改
- break;
- case 403:
- System.out.println("下载403错误代码,资源不可用" + urlString);
- //TODO 此处添加对mongodb数据库的操作,将该url的isStart改为0,暂时不在进行监控,后续根据模板状态为0的进行修改
- break;
- case 404:
- System.out.println("下载404错误代码,无法找到指定资源地址" + urlString);
- //TODO 此处添加对mongodb数据库的操作,将该url的isStart改为0,暂时不在进行监控,后续根据模板状态为0的进行修改
- break;
- case 503:
- System.out.println("下载503错误代码,服务不可用" + urlString);
- //TODO 此处添加对mongodb数据库的操作,将该url的isStart改为0,暂时不在进行监控,后续根据模板状态为0的进行修改
- break;
- case 504:
- System.out.println("下载504错误代码,网关超时" + urlString);
- //TODO 此处添加对mongodb数据库的操作,将该url的isStart改为0,暂时不在进行监控,后续根据模板状态为0的进行修改
- break;
- }
- } catch (MalformedURLException e) {
- //执行URL url = new URL()的异常
- e.printStackTrace();
- } catch (URISyntaxException e) {
- //执行URI uri = new URI()的异常
- e.printStackTrace();
- } catch (ClientProtocolException e) {
- // 执行httpClient.execute(httpGet)的异常
- e.printStackTrace();
- } catch (IOException e) {
- // 执行httpClient.execute(httpGet)的异常
- e.printStackTrace();
- } finally{
- if(response != null){
- try {
- response.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- httpGet.abort(); //结束后关闭httpGet请求
- /**
- * httpclient的链接有线程池管理,这里不用直接关闭
- */
- // try {//关闭连接
- // httpClient.close();
- // } catch (IOException e) {
- // e.printStackTrace();
- // }
- }
- return src;
- }
- /**
- *
- * 方法名:createSSLClientDefault
- * 作者:zhouyh
- * 创建时间:2015-10-14 下午03:03:30
- * 描述:针对https采用SSL的方式创建httpclient
- * @return
- */
- public static CloseableHttpClient createSSLClientDefault(){
- try {
- SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, new TrustStrategy(){
- //信任所有
- public boolean isTrusted(X509Certificate[] chain, String authType) throws CertificateException {
- return true;
- }}).build();
- SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext);
- return HttpClients.custom().setSSLSocketFactory(sslsf).build();
- } catch (KeyManagementException e) {
- e.printStackTrace();
- } catch (NoSuchAlgorithmException e) {
- e.printStackTrace();
- } catch (KeyStoreException e) {
- e.printStackTrace();
- }
- return HttpClients.createDefault();
- }
- /**
- *
- * 方法名:getCharsetFromMetaTag
- * 作者:zhouyh
- * 创建时间:2015-10-14 下午05:23:08
- * 描述:从meta标签中获取编码格式
- * @param buffer
- * @param url
- * @return
- */
- public static String getCharsetFromMetaTag(ByteArrayBuffer buffer,String url){
- String charset = null;
- String regEx = Constant.CHARSET_REGEX;
- Pattern p = Pattern.compile(regEx,
- Pattern.CASE_INSENSITIVE);
- Matcher m = p.matcher(new String(buffer.toByteArray()));
- boolean result = m.find();
- if (result) {
- if (m.groupCount() == 1) {
- charset = m.group(1);
- }
- System.err.println("网页 中的编码:" + charset + "\t url:" + url);
- } else {
- //出现未匹配的编码,先赋值为gbk
- charset = "gbk";
- System.out.println("字符编码未匹配到 : " + url);
- }
- return charset;
- }
- /**
- *
- * 方法名:replaceStr
- * 作者:zhouyh
- * 创建时间:2015-10-14 下午05:33:01
- * 描述:替换原网页中的特殊字符
- * @param src
- * @return
- */
- public static String replaceStr(String src){
- if (src == null || "".equals(src)) return null;
- src = src.replaceAll("<!--", "");
- src = src.replaceAll("-->", "");
- src = src.replaceAll("<", "<");
- src = src.replaceAll(">", ">");
- src = src.replaceAll(""", "\"");
- src = src.replaceAll(" ", " ");
- src = src.replaceAll("&", "&");
- return src;
- }
- /**
- * 方法名:main
- * 作者:zhouyh
- * 创建时间:2015-10-14 上午08:46:47
- * 描述:main方法
- * @param args
- */
- public static void main(String[] args) {
- // TODO Auto-generated method stub
- Random random = new Random();
- int randomInt = random.nextInt(4);
- System.out.println(randomInt);
- // InfoLoad.getInfoLoadInstance().loadForString("http://weixin.sogou.com/remind/doc_list.php?callback=jQuery111006747886650961997_1446517725478&from=web&uid=B31F8214DA30BE47F750B8BE2BF0E4AA%40qq.sohu.com&start=0&num=20&wordid=237&clear=1&_=1446517725480", 0);
- InfoLoad.getInfoLoadInstance().loadForString("http://www.xinli001.com/info", 0);
- }
- }
这里用到了IO常用的工具类IOUtils,详细了解可以参考这篇文章[file]IO常用工具类IOUtils(Java读文件、写文件、打Zip包)http://www.bieryun.com/1003.html