package com.sdyc.jise.fetch.crawler.requester; import cn.edu.hfut.dmic.webcollector.model.CrawlDatum; import cn.edu.hfut.dmic.webcollector.net.HttpResponse; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang.StringUtils; import org.apache.http.Header; import org.apache.http.client.methods.HttpGet; import org.apache.http.config.Registry; import org.apache.http.config.RegistryBuilder; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import org.apache.http.util.EntityUtils; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; import java.io.Closeable; import java.io.IOException; import java.net.URI; import java.security.KeyManagementException; import java.security.NoSuchAlgorithmException; import java.security.cert.CertificateException; import java.util.Map; import java.util.Objects; /** *
 *
 * Created by zhenqin.
 * User: zhenqin
 * Date: 17/4/25
 * Time: 16:26
 * Vendor: NowledgeData
 * To change this template use File | Settings | File Templates.
 *
 * 
* * @author zhenqin */ @Slf4j public class HttpClientRequester extends JavaNativeRequester implements Closeable { protected final CloseableHttpClient httpClient; final PoolingHttpClientConnectionManager cm; public HttpClientRequester() { try { //设置协议http和https对应的处理socket链接工厂的对象 Registry socketFactoryRegistry = RegistryBuilder.create() .register("http", PlainConnectionSocketFactory.INSTANCE) .register("https", new SSLConnectionSocketFactory(createIgnoreVerifySSL())) .build(); cm = new PoolingHttpClientConnectionManager(socketFactoryRegistry); httpClient = HttpClients.custom() .setConnectionManager(cm) .setRetryHandler(new DefaultHttpRequestRetryHandler(3, false)) .build(); } catch (Exception e) { throw new IllegalStateException(e); } } /** * 绕过验证 * * @return * @throws NoSuchAlgorithmException * @throws KeyManagementException */ public static SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { SSLContext sc = SSLContext.getInstance("SSLv3"); // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 X509TrustManager trustManager = new X509TrustManager() { @Override public void checkClientTrusted( java.security.cert.X509Certificate[] paramArrayOfX509Certificate, String paramString) throws CertificateException { } @Override public void checkServerTrusted( java.security.cert.X509Certificate[] paramArrayOfX509Certificate, String paramString) throws CertificateException { } @Override public java.security.cert.X509Certificate[] getAcceptedIssuers() { return null; } }; sc.init(null, new TrustManager[] { trustManager }, null); return sc; } /** * 覆盖父类的方法,使用 HttpClient 实现 * @param crawlDatum * @return * @throws Exception */ @Override public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception { HttpGet get = new HttpGet(crawlDatum.url()); if(StringUtils.isNotBlank(cookie)) { header.put("Cookie", cookie); } if(StringUtils.isNotBlank(userAgent)) { header.put("User-Agent", userAgent); } if(!header.isEmpty()) { for (Map.Entry entry : header.entrySet()) { get.addHeader(entry.getKey(), entry.getValue()); } } URI url = new URI(crawlDatum.url()); org.apache.http.HttpResponse resp = httpClient.execute(new HttpGet(url)); HttpResponse response = new HttpResponse(url.toURL()); log.debug("fetch url {} return code {}", crawlDatum.url(), resp.getStatusLine().getStatusCode()); Header[] allHeaders = resp.getAllHeaders(); if(allHeaders != null) { for (Header header : allHeaders) { response.addHeader(header.getName(), header.getValue()); } } response.setNotFound(Objects.equals(404, resp.getStatusLine().getStatusCode())); response.setRedirect(Objects.equals(302, resp.getStatusLine().getStatusCode())); response.code(resp.getStatusLine().getStatusCode()); response.setHtml(EntityUtils.toString(resp.getEntity())); EntityUtils.consumeQuietly(resp.getEntity()); return response; } @Override public void close() throws IOException { httpClient.close(); log.info("closed http client."); } }