package com.sdyc.jise.fetch.crawler.requester;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.Closeable;
import java.io.IOException;
import java.net.URI;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.util.Map;
import java.util.Objects;
/**
*
*
* Created by zhenqin.
* User: zhenqin
* Date: 17/4/25
* Time: 16:26
* Vendor: NowledgeData
* To change this template use File | Settings | File Templates.
*
*
*
* @author zhenqin
*/
@Slf4j
public class HttpClientRequester extends JavaNativeRequester implements Closeable {
protected final CloseableHttpClient httpClient;
final PoolingHttpClientConnectionManager cm;
public HttpClientRequester() {
try {
//设置协议http和https对应的处理socket链接工厂的对象
Registry socketFactoryRegistry = RegistryBuilder.create()
.register("http", PlainConnectionSocketFactory.INSTANCE)
.register("https", new SSLConnectionSocketFactory(createIgnoreVerifySSL()))
.build();
cm = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
httpClient = HttpClients.custom()
.setConnectionManager(cm)
.setRetryHandler(new DefaultHttpRequestRetryHandler(3, false))
.build();
} catch (Exception e) {
throw new IllegalStateException(e);
}
}
/**
* 绕过验证
*
* @return
* @throws NoSuchAlgorithmException
* @throws KeyManagementException
*/
public static SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
SSLContext sc = SSLContext.getInstance("SSLv3");
// 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
X509TrustManager trustManager = new X509TrustManager() {
@Override
public void checkClientTrusted(
java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
String paramString) throws CertificateException {
}
@Override
public void checkServerTrusted(
java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
String paramString) throws CertificateException {
}
@Override
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
return null;
}
};
sc.init(null, new TrustManager[] { trustManager }, null);
return sc;
}
/**
* 覆盖父类的方法,使用 HttpClient 实现
* @param crawlDatum
* @return
* @throws Exception
*/
@Override
public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
HttpGet get = new HttpGet(crawlDatum.url());
if(StringUtils.isNotBlank(cookie)) {
header.put("Cookie", cookie);
}
if(StringUtils.isNotBlank(userAgent)) {
header.put("User-Agent", userAgent);
}
if(!header.isEmpty()) {
for (Map.Entry entry : header.entrySet()) {
get.addHeader(entry.getKey(), entry.getValue());
}
}
URI url = new URI(crawlDatum.url());
org.apache.http.HttpResponse resp = httpClient.execute(new HttpGet(url));
HttpResponse response = new HttpResponse(url.toURL());
log.debug("fetch url {} return code {}", crawlDatum.url(), resp.getStatusLine().getStatusCode());
Header[] allHeaders = resp.getAllHeaders();
if(allHeaders != null) {
for (Header header : allHeaders) {
response.addHeader(header.getName(), header.getValue());
}
}
response.setNotFound(Objects.equals(404, resp.getStatusLine().getStatusCode()));
response.setRedirect(Objects.equals(302, resp.getStatusLine().getStatusCode()));
response.code(resp.getStatusLine().getStatusCode());
response.setHtml(EntityUtils.toString(resp.getEntity()));
EntityUtils.consumeQuietly(resp.getEntity());
return response;
}
@Override
public void close() throws IOException {
httpClient.close();
log.info("closed http client.");
}
}