// JavaNativeRequester.java
  1. package com.sdyc.jise.fetch.crawler.requester;
  2. import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
  3. import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
  4. import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
  5. import cn.edu.hfut.dmic.webcollector.net.Requester;
  6. import org.apache.commons.lang.StringUtils;
  7. import org.slf4j.Logger;
  8. import org.slf4j.LoggerFactory;
  9. import java.net.URL;
  10. import java.util.HashMap;
  11. import java.util.Map;
  12. /**
  13. * <pre>
  14. *
  15. * Created by zhenqin.
  16. * User: zhenqin
  17. * Date: 17/4/25
  18. * Time: 16:26
  19. * Vendor: NowledgeData
  20. * To change this template use File | Settings | File Templates.
  21. *
  22. * </pre>
  23. *
  24. * @author zhenqin
  25. */
  26. public class JavaNativeRequester implements Requester {
  27. /**
  28. * Http Cookie
  29. */
  30. protected String cookie;
  31. /**
  32. * UserAgent
  33. */
  34. protected String userAgent;
  35. /**
  36. * 访问超时时间
  37. */
  38. protected int connectTimeout = -1;
  39. /**
  40. * Http Header
  41. */
  42. protected final Map<String, String> header = new HashMap<String, String>(5);
  43. /**
  44. * 日志系统
  45. */
  46. protected static Logger LOG = LoggerFactory.getLogger(JavaNativeRequester.class);
  47. public JavaNativeRequester() {
  48. }
  49. @Override
  50. public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
  51. HttpRequest request = new HttpRequest(crawlDatum);
  52. if(StringUtils.isNotBlank(cookie)) {
  53. request.setCookie(cookie);
  54. }
  55. if(StringUtils.isNotBlank(userAgent)) {
  56. request.setUserAgent(userAgent);
  57. }
  58. if(connectTimeout > 0) {
  59. request.setTimeoutForConnect(connectTimeout);
  60. }
  61. if(!header.isEmpty()) {
  62. for (Map.Entry<String, String> entry : header.entrySet()) {
  63. request.addHeader(entry.getKey(), entry.getValue());
  64. }
  65. }
  66. LOG.info("fetch url: {}", crawlDatum.url());
  67. HttpResponse response = null;
  68. int retry = 0;
  69. do {
  70. try {
  71. response = request.response();
  72. break;
  73. } catch (Exception e) {
  74. retry++;
  75. LOG.info("不知道是否IP发生切换,发送抓取异常, 稍等 " + (retry * 2) + "s ,进行重试。");
  76. Thread.sleep(retry * 2 * 1000);
  77. LOG.info("等待后,重试开始, 当前重试第 " + retry + " 次。");
  78. }
  79. } while(retry < 5);
  80. if(retry >= 5){
  81. response = new HttpResponse(new URL(crawlDatum.url()));
  82. response.setNotFound(true);
  83. response.setRedirect(false);
  84. response.code(404);
  85. response.setHtml("");
  86. }
  87. return response;
  88. }
  89. public String getCookie() {
  90. return cookie;
  91. }
  92. public void setCookie(String cookie) {
  93. this.cookie = cookie;
  94. }
  95. public String getUserAgent() {
  96. return userAgent;
  97. }
  98. public void setUserAgent(String userAgent) {
  99. this.userAgent = userAgent;
  100. }
  101. public int getConnectTimeout() {
  102. return connectTimeout;
  103. }
  104. public void setConnectTimeout(int connectTimeout) {
  105. this.connectTimeout = connectTimeout;
  106. }
  107. public Map<String, String> getHeader() {
  108. return header;
  109. }
  110. public String addHeader(String key, String value) {
  111. return header.put(key, value);
  112. }
  113. }