123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143 |
- package com.sdyc.jise.fetch.crawler.requester;
- import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
- import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
- import cn.edu.hfut.dmic.webcollector.net.HttpResponse;
- import cn.edu.hfut.dmic.webcollector.net.Requester;
- import org.apache.commons.lang.StringUtils;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import java.net.URL;
- import java.util.HashMap;
- import java.util.Map;
- /**
- * <pre>
- *
- * Created by zhenqin.
- * User: zhenqin
- * Date: 17/4/25
- * Time: 16:26
- * Vendor: NowledgeData
- * To change this template use File | Settings | File Templates.
- *
- * </pre>
- *
- * @author zhenqin
- */
- public class JavaNativeRequester implements Requester {
- /**
- * Http Cookie
- */
- protected String cookie;
- /**
- * UserAgent
- */
- protected String userAgent;
- /**
- * 访问超时时间
- */
- protected int connectTimeout = -1;
- /**
- * Http Header
- */
- protected final Map<String, String> header = new HashMap<String, String>(5);
- /**
- * 日志系统
- */
- protected static Logger LOG = LoggerFactory.getLogger(JavaNativeRequester.class);
- public JavaNativeRequester() {
- }
- @Override
- public HttpResponse getResponse(CrawlDatum crawlDatum) throws Exception {
- HttpRequest request = new HttpRequest(crawlDatum);
- if(StringUtils.isNotBlank(cookie)) {
- request.setCookie(cookie);
- }
- if(StringUtils.isNotBlank(userAgent)) {
- request.setUserAgent(userAgent);
- }
- if(connectTimeout > 0) {
- request.setTimeoutForConnect(connectTimeout);
- }
- if(!header.isEmpty()) {
- for (Map.Entry<String, String> entry : header.entrySet()) {
- request.addHeader(entry.getKey(), entry.getValue());
- }
- }
- LOG.info("fetch url: {}", crawlDatum.url());
- HttpResponse response = null;
- int retry = 0;
- do {
- try {
- response = request.response();
- break;
- } catch (Exception e) {
- retry++;
- LOG.info("不知道是否IP发生切换,发送抓取异常, 稍等 " + (retry * 2) + "s ,进行重试。");
- Thread.sleep(retry * 2 * 1000);
- LOG.info("等待后,重试开始, 当前重试第 " + retry + " 次。");
- }
- } while(retry < 5);
- if(retry >= 5){
- response = new HttpResponse(new URL(crawlDatum.url()));
- response.setNotFound(true);
- response.setRedirect(false);
- response.code(404);
- response.setHtml("");
- }
- return response;
- }
- public String getCookie() {
- return cookie;
- }
- public void setCookie(String cookie) {
- this.cookie = cookie;
- }
- public String getUserAgent() {
- return userAgent;
- }
- public void setUserAgent(String userAgent) {
- this.userAgent = userAgent;
- }
- public int getConnectTimeout() {
- return connectTimeout;
- }
- public void setConnectTimeout(int connectTimeout) {
- this.connectTimeout = connectTimeout;
- }
- public Map<String, String> getHeader() {
- return header;
- }
- public String addHeader(String key, String value) {
- return header.put(key, value);
- }
- }
|