| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155 |
- package com.sdyc.jise.fetch.crawler.requester;
- import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
- import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
- import lombok.extern.slf4j.Slf4j;
- import org.apache.commons.lang.StringUtils;
- import org.apache.http.Header;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.config.Registry;
- import org.apache.http.config.RegistryBuilder;
- import org.apache.http.conn.socket.ConnectionSocketFactory;
- import org.apache.http.conn.socket.PlainConnectionSocketFactory;
- import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
- import org.apache.http.impl.client.HttpClients;
- import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
- import org.apache.http.util.EntityUtils;
- import javax.net.ssl.SSLContext;
- import javax.net.ssl.TrustManager;
- import javax.net.ssl.X509TrustManager;
- import java.io.Closeable;
- import java.io.IOException;
- import java.net.URI;
- import java.security.KeyManagementException;
- import java.security.NoSuchAlgorithmException;
- import java.security.cert.CertificateException;
- import java.util.Map;
- import java.util.Objects;
- /**
- * <pre>
- *
- * Created by zhenqin.
- * User: zhenqin
- * Date: 17/4/25
- * Time: 16:26
- * Vendor: NowledgeData
- * To change this template use File | Settings | File Templates.
- *
- * </pre>
- *
- * @author zhenqin
- */
- @Slf4j
- public class HttpClientRequester extends JavaNativeRequester implements Closeable {
- protected final CloseableHttpClient httpClient;
- final PoolingHttpClientConnectionManager cm;
- public HttpClientRequester() {
- try {
- //设置协议http和https对应的处理socket链接工厂的对象
- Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
- .register("http", PlainConnectionSocketFactory.INSTANCE)
- .register("https", new SSLConnectionSocketFactory(createIgnoreVerifySSL()))
- .build();
- cm = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
- httpClient = HttpClients.custom()
- .setConnectionManager(cm)
- .setRetryHandler(new DefaultHttpRequestRetryHandler(3, false))
- .build();
- } catch (Exception e) {
- throw new IllegalStateException(e);
- }
- }
- /**
- * 绕过验证
- *
- * @return
- * @throws NoSuchAlgorithmException
- * @throws KeyManagementException
- */
- public static SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
- SSLContext sc = SSLContext.getInstance("SSLv3");
- // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
- X509TrustManager trustManager = new X509TrustManager() {
- @Override
- public void checkClientTrusted(
- java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
- String paramString) throws CertificateException {
- }
- @Override
- public void checkServerTrusted(
- java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
- String paramString) throws CertificateException {
- }
- @Override
- public java.security.cert.X509Certificate[] getAcceptedIssuers() {
- return null;
- }
- };
- sc.init(null, new TrustManager[] { trustManager }, null);
- return sc;
- }
- /**
- * 覆盖父类的方法,使用 HttpClient 实现
- * @param crawlDatum
- * @return
- * @throws Exception
- */
- @Override
- public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
- HttpGet get = new HttpGet(crawlDatum.url());
- if(StringUtils.isNotBlank(cookie)) {
- header.put("Cookie", cookie);
- }
- if(StringUtils.isNotBlank(userAgent)) {
- header.put("User-Agent", userAgent);
- }
- if(!header.isEmpty()) {
- for (Map.Entry<String, String> entry : header.entrySet()) {
- get.addHeader(entry.getKey(), entry.getValue());
- }
- }
- URI url = new URI(crawlDatum.url());
- org.apache.http.HttpResponse resp = httpClient.execute(new HttpGet(url));
- HttpResponse response = new HttpResponse(url.toURL());
- log.debug("fetch url {} return code {}", crawlDatum.url(), resp.getStatusLine().getStatusCode());
- Header[] allHeaders = resp.getAllHeaders();
- if(allHeaders != null) {
- for (Header header : allHeaders) {
- response.addHeader(header.getName(), header.getValue());
- }
- }
- response.setNotFound(Objects.equals(404, resp.getStatusLine().getStatusCode()));
- response.setRedirect(Objects.equals(302, resp.getStatusLine().getStatusCode()));
- response.code(resp.getStatusLine().getStatusCode());
- response.setHtml(EntityUtils.toString(resp.getEntity()));
- EntityUtils.consumeQuietly(resp.getEntity());
- return response;
- }
- @Override
- public void close() throws IOException {
- httpClient.close();
- log.info("closed http client.");
- }
- }
|