HttpClientRequester.java 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. package com.sdyc.jise.fetch.crawler.requester;
  2. import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
  3. import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
  4. import lombok.extern.slf4j.Slf4j;
  5. import org.apache.commons.lang.StringUtils;
  6. import org.apache.http.Header;
  7. import org.apache.http.client.methods.HttpGet;
  8. import org.apache.http.config.Registry;
  9. import org.apache.http.config.RegistryBuilder;
  10. import org.apache.http.conn.socket.ConnectionSocketFactory;
  11. import org.apache.http.conn.socket.PlainConnectionSocketFactory;
  12. import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
  13. import org.apache.http.impl.client.CloseableHttpClient;
  14. import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
  15. import org.apache.http.impl.client.HttpClients;
  16. import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
  17. import org.apache.http.util.EntityUtils;
  18. import javax.net.ssl.SSLContext;
  19. import javax.net.ssl.TrustManager;
  20. import javax.net.ssl.X509TrustManager;
  21. import java.io.Closeable;
  22. import java.io.IOException;
  23. import java.net.URI;
  24. import java.security.KeyManagementException;
  25. import java.security.NoSuchAlgorithmException;
  26. import java.security.cert.CertificateException;
  27. import java.util.Map;
  28. import java.util.Objects;
  29. /**
  30. * <pre>
  31. *
  32. * Created by zhenqin.
  33. * User: zhenqin
  34. * Date: 17/4/25
  35. * Time: 16:26
  36. * Vendor: NowledgeData
  37. * To change this template use File | Settings | File Templates.
  38. *
  39. * </pre>
  40. *
  41. * @author zhenqin
  42. */
  43. @Slf4j
  44. public class HttpClientRequester extends JavaNativeRequester implements Closeable {
  45. protected final CloseableHttpClient httpClient;
  46. final PoolingHttpClientConnectionManager cm;
  47. public HttpClientRequester() {
  48. try {
  49. //设置协议http和https对应的处理socket链接工厂的对象
  50. Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
  51. .register("http", PlainConnectionSocketFactory.INSTANCE)
  52. .register("https", new SSLConnectionSocketFactory(createIgnoreVerifySSL()))
  53. .build();
  54. cm = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
  55. httpClient = HttpClients.custom()
  56. .setConnectionManager(cm)
  57. .setRetryHandler(new DefaultHttpRequestRetryHandler(3, false))
  58. .build();
  59. } catch (Exception e) {
  60. throw new IllegalStateException(e);
  61. }
  62. }
  63. /**
  64. * 绕过验证
  65. *
  66. * @return
  67. * @throws NoSuchAlgorithmException
  68. * @throws KeyManagementException
  69. */
  70. public static SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
  71. SSLContext sc = SSLContext.getInstance("SSLv3");
  72. // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法
  73. X509TrustManager trustManager = new X509TrustManager() {
  74. @Override
  75. public void checkClientTrusted(
  76. java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
  77. String paramString) throws CertificateException {
  78. }
  79. @Override
  80. public void checkServerTrusted(
  81. java.security.cert.X509Certificate[] paramArrayOfX509Certificate,
  82. String paramString) throws CertificateException {
  83. }
  84. @Override
  85. public java.security.cert.X509Certificate[] getAcceptedIssuers() {
  86. return null;
  87. }
  88. };
  89. sc.init(null, new TrustManager[] { trustManager }, null);
  90. return sc;
  91. }
  92. /**
  93. * 覆盖父类的方法,使用 HttpClient 实现
  94. * @param crawlDatum
  95. * @return
  96. * @throws Exception
  97. */
  98. @Override
  99. public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
  100. HttpGet get = new HttpGet(crawlDatum.url());
  101. if(StringUtils.isNotBlank(cookie)) {
  102. header.put("Cookie", cookie);
  103. }
  104. if(StringUtils.isNotBlank(userAgent)) {
  105. header.put("User-Agent", userAgent);
  106. }
  107. if(!header.isEmpty()) {
  108. for (Map.Entry<String, String> entry : header.entrySet()) {
  109. get.addHeader(entry.getKey(), entry.getValue());
  110. }
  111. }
  112. URI url = new URI(crawlDatum.url());
  113. org.apache.http.HttpResponse resp = httpClient.execute(new HttpGet(url));
  114. HttpResponse response = new HttpResponse(url.toURL());
  115. log.debug("fetch url {} return code {}", crawlDatum.url(), resp.getStatusLine().getStatusCode());
  116. Header[] allHeaders = resp.getAllHeaders();
  117. if(allHeaders != null) {
  118. for (Header header : allHeaders) {
  119. response.addHeader(header.getName(), header.getValue());
  120. }
  121. }
  122. response.setNotFound(Objects.equals(404, resp.getStatusLine().getStatusCode()));
  123. response.setRedirect(Objects.equals(302, resp.getStatusLine().getStatusCode()));
  124. response.code(resp.getStatusLine().getStatusCode());
  125. response.setHtml(EntityUtils.toString(resp.getEntity()));
  126. EntityUtils.consumeQuietly(resp.getEntity());
  127. return response;
  128. }
  129. @Override
  130. public void close() throws IOException {
  131. httpClient.close();
  132. log.info("closed http client.");
  133. }
  134. }