Spring Boot + WebMagic: example code for a website crawler
A while ago a company project needed to scrape various kinds of data. My Python skills aren't great, so I looked into a Java crawler solution instead; this post is a summary of that work.
Development environment:
Spring Boot 2.2.6, JDK 1.8.
1. Import the dependencies
<!-- WebMagic core -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
    <!-- You can exclude WebMagic's bundled logging here (it is very verbose):
    <exclusions>
        <exclusion>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
        </exclusion>
    </exclusions>
    -->
</dependency>
<!-- WebMagic extension -->
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
<!-- Guava, needed for WebMagic's Bloom-filter deduplication -->
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>16.0</version>
</dependency>
Without further ado, here is the code.
Basic example
The code below uses a list-style page as the example.
package com.crawler.project.proTask;

import com.alibaba.fastjson.JSONObject;
import org.springframework.scheduling.annotation.Scheduled;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.BloomFilterDuplicateRemover;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

public class TaskProcessor implements PageProcessor {

    /*
     * The crawling logic itself.
     */
    @Override
    public void process(Page page) {
        // 1. The crawler receives a page; parse the item list on it.
        List<Selectable> list = page.getHtml().css("css selector").nodes();
        if (list.size() > 0) {
            // This is a list page: push the link of every list item onto the queue of pages to fetch.
            for (Selectable selectable : list) {
                page.addTargetRequest(selectable.links().toString());
            }
            // Also queue the URL of the next list page.
            page.addTargetRequest("url of the next page");
        } else {
            // This is the detail page of a single list item.
            // Handle it in a custom method and extract the data we need.
            handle(page);
        }
    }

    private void handle(Page page) {
        // For example, the processed data ends up in a JSONObject.
        JSONObject tmp = new JSONObject();
        // Hand tmp over to the custom TaskPipeline. If no Pipeline is set on the Spider,
        // the framework simply prints tmp to the console.
        page.putField("obj", tmp);
    }

    /*
     * Crawler configuration parameters.
     */
    private Site site = Site.me()
            .setCharset("UTF-8")
            .setTimeOut(60 * 1000)
            .setRetrySleepTime(60 * 1000)
            .setCycleRetryTimes(5);

    @Override
    public Site getSite() {
        return site;
    }

    /*
     * Scheduled job that kicks off the crawl.
     */
    @Scheduled(initialDelay = 1 * 1000, fixedDelay = 2 * 1000)
    public void process() {
        System.out.println("Starting the crawl task");
        Spider.create(new TaskProcessor()) // the class name here must match the current class
                .addUrl("start page url")
                .addPipeline(new TaskPipeline()) // custom result-handling pipeline (see handle())
                .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
                .thread(3) // number of threads (not too many; ideally matches the number of items on a list page)
                .run();
    }
}
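In the skeleton above, handle() only creates an empty JSONObject. As a rough illustration of what the extraction step usually looks like, here is a minimal sketch of handle() for a hypothetical detail page; the CSS selectors and field names ("h1.title", "div.article-content", "title", "content") are assumptions and have to be adapted to the real target site.

    private void handle(Page page) {
        JSONObject tmp = new JSONObject();
        // Selectors below are placeholders for whatever the real detail page uses.
        tmp.put("title", page.getHtml().css("h1.title", "text").toString());
        tmp.put("content", page.getHtml().css("div.article-content", "text").toString());
        tmp.put("url", page.getUrl().toString());
        // Hand the result to the pipeline under the key "obj".
        page.putField("obj", tmp);
    }

The TaskPipeline set on the Spider then receives this "obj" field: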
package com.crawler.project.proTask;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class TaskPipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        if (resultItems.getAll().size() > 0) {
            Object obj = resultItems.getAll().get("obj");
            JSONObject jsonObject = JSON.parseObject(obj.toString());
            // With the JSONObject in hand, apply whatever business logic you need.
        }
    }
}
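Note that the crawl is triggered by @Scheduled, which only fires if the processor is registered as a Spring bean (for example by annotating TaskProcessor with @Component) and scheduling is enabled on the application. The original setup for that part is not shown, so the entry class below is only a sketch under those assumptions; the package and class names are made up.

    package com.crawler.project;

    import org.springframework.boot.SpringApplication;
    import org.springframework.boot.autoconfigure.SpringBootApplication;
    import org.springframework.scheduling.annotation.EnableScheduling;

    @SpringBootApplication
    @EnableScheduling // without this, the @Scheduled crawl task never runs
    public class CrawlerApplication {
        public static void main(String[] args) {
            SpringApplication.run(CrawlerApplication.class, args);
        }
    }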
Special case 1
Downloading an image or file from a link
e.g. the detail page mentioned above contains an iframe.
1. First, get the iframe's src
// Get the iframe's src. Note whether the src you get is absolute or relative;
// a relative path has to be joined with the site's base URL.
// (html here is the detail page's Html object, i.e. page.getHtml(); the parsing below needs the jsoup dependency.)
String src = html.css("css selector", "src").toString();
// Parse the iframe document with jsoup (1000 ms timeout).
Document document = Jsoup.parse(new URL(src), 1000);
// Select the element we need.
Element ele = document.select("css selector").last();
// Read the link of the file to download.
String downUrl = ele.attr("href");
// Download the file from the link; returns the saved file name.
String fileName = downloadFile(downUrl);
// Download a file from a URL.
// FileUtils here is org.apache.commons.io.FileUtils (Apache Commons IO).
public String downloadFile(String fileUrl) throws FileNotFoundException {
    try {
        URL httpUrl = new URL(fileUrl);
        String fileName = UUID.randomUUID().toString() + ".mp3";
        File file = new File(this.STATIC_FILEPATH + fileName);
        System.out.println("============ downloadFile was called ===============");
        FileUtils.copyURLToFile(httpUrl, file);
        return fileName;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
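downloadFile() relies on a STATIC_FILEPATH field that is never shown in the original snippets. A possible definition and a sample call, purely as an assumption-laden sketch (the directory path is made up, and downUrl comes from the jsoup step above):

    // Assumed download directory; not part of the original code.
    private final String STATIC_FILEPATH = "/data/crawler/files/";

    try {
        String fileName = downloadFile(downUrl);
        if (fileName != null) {
            System.out.println("file saved as " + STATIC_FILEPATH + fileName);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }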
Special case 2
Some HTTPS sites cannot be fetched with WebMagic's default downloader. In that case we can modify the downloader to match the site's SSL/TLS setup.
Create a package in the project to hold the customized (modified) downloader classes.
(Copied from WebMagic's HttpClientDownloader and modified from that class.)
/* This class must use the custom generator (HttpClientGenerator) defined further below. */
package com.crawler.project.spider_download;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.downloader.AbstractDownloader;
import us.codecraft.webmagic.downloader.HttpClientRequestContext;
import us.codecraft.webmagic.downloader.HttpUriRequestConverter;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.ProxyProvider;
import us.codecraft.webmagic.selector.PlainText;
import us.codecraft.webmagic.utils.CharsetUtils;
import us.codecraft.webmagic.utils.HttpClientUtils;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;

/**
 * The http downloader based on HttpClient.
 *
 * @author code4crafter@gmail.com <br>
 * @since 0.1.0
 */
public class HttpClientDownloader extends AbstractDownloader {

    private Logger logger = LoggerFactory.getLogger(getClass());

    private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();

    // The custom generator. Make sure the import resolves to your own HttpClientGenerator class,
    // not the HttpClientGenerator shipped with the WebMagic dependency.
    private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();

    private HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();

    private ProxyProvider proxyProvider;

    private boolean responseHeader = true;

    public void setHttpUriRequestConverter(HttpUriRequestConverter httpUriRequestConverter) {
        this.httpUriRequestConverter = httpUriRequestConverter;
    }

    public void setProxyProvider(ProxyProvider proxyProvider) {
        this.proxyProvider = proxyProvider;
    }

    private CloseableHttpClient getHttpClient(Site site) {
        if (site == null) {
            return httpClientGenerator.getClient(null);
        }
        String domain = site.getDomain();
        CloseableHttpClient httpClient = httpClients.get(domain);
        if (httpClient == null) {
            synchronized (this) {
                httpClient = httpClients.get(domain);
                if (httpClient == null) {
                    httpClient = httpClientGenerator.getClient(site);
                    httpClients.put(domain, httpClient);
                }
            }
        }
        return httpClient;
    }

    @Override
    public Page download(Request request, Task task) {
        if (task == null || task.getSite() == null) {
            throw new NullPointerException("task or site can not be null");
        }
        CloseableHttpResponse httpResponse = null;
        CloseableHttpClient httpClient = getHttpClient(task.getSite());
        Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null;
        HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy);
        Page page = Page.fail();
        try {
            httpResponse = httpClient.execute(requestContext.getHttpUriRequest(), requestContext.getHttpClientContext());
            page = handleResponse(request, request.getCharset() != null ? request.getCharset() : task.getSite().getCharset(), httpResponse, task);
            onSuccess(request);
            logger.info("downloading page success {}", request.getUrl());
            return page;
        } catch (IOException e) {
            logger.warn("download page {} error", request.getUrl(), e);
            onError(request);
            return page;
        } finally {
            if (httpResponse != null) {
                // ensure the connection is released back to the pool
                EntityUtils.consumeQuietly(httpResponse.getEntity());
            }
            if (proxyProvider != null && proxy != null) {
                proxyProvider.returnProxy(proxy, page, task);
            }
        }
    }

    @Override
    public void setThread(int thread) {
        httpClientGenerator.setPoolSize(thread);
    }

    protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
        byte[] bytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
        String contentType = httpResponse.getEntity().getContentType() == null ? "" : httpResponse.getEntity().getContentType().getValue();
        Page page = new Page();
        page.setBytes(bytes);
        if (!request.isBinaryContent()) {
            if (charset == null) {
                charset = getHtmlCharset(contentType, bytes);
            }
            page.setCharset(charset);
            page.setRawText(new String(bytes, charset));
        }
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        page.setStatusCode(httpResponse.getStatusLine().getStatusCode());
        page.setDownloadSuccess(true);
        if (responseHeader) {
            page.setHeaders(HttpClientUtils.convertHeaders(httpResponse.getAllHeaders()));
        }
        return page;
    }

    private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException {
        String charset = CharsetUtils.detectCharset(contentType, contentBytes);
        if (charset == null) {
            charset = Charset.defaultCharset().name();
            logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
        }
        return charset;
    }
}
Then, in the custom HttpClientGenerator class, modify the SSL-related parameters.
(Copied from WebMagic's HttpClientGenerator and modified from that class.)
/* The custom HttpClientGenerator. It must live in the same package as the custom downloader above. */
package com.crawler.project.spider_download;

import org.apache.http.HttpException;
import org.apache.http.HttpRequest;
import org.apache.http.HttpRequestInterceptor;
import org.apache.http.client.CookieStore;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.config.SocketConfig;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.DefaultHostnameVerifier;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.*;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.protocol.HttpContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.downloader.CustomRedirectStrategy;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

/**
 * @author code4crafter@gmail.com <br>
 * @since 0.4.0
 */
public class HttpClientGenerator {

    private transient Logger logger = LoggerFactory.getLogger(getClass());

    private PoolingHttpClientConnectionManager connectionManager;

    public HttpClientGenerator() {
        Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", PlainConnectionSocketFactory.INSTANCE)
                .register("https", buildSSLConnectionSocketFactory())
                .build();
        connectionManager = new PoolingHttpClientConnectionManager(reg);
        connectionManager.setDefaultMaxPerRoute(100);
    }

    /* The SSL-related parameters are configured in this method. */
    private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {
        try {
            return new SSLConnectionSocketFactory(createIgnoreVerifySSL(),
                    new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"},
                    null,
                    new DefaultHostnameVerifier()); // skip strict certificate verification
        } catch (KeyManagementException e) {
            logger.error("ssl connection fail", e);
        } catch (NoSuchAlgorithmException e) {
            logger.error("ssl connection fail", e);
        }
        return SSLConnectionSocketFactory.getSocketFactory();
    }

    private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException {
        // An X509TrustManager that bypasses certificate verification; the methods stay empty on purpose.
        X509TrustManager trustManager = new X509TrustManager() {
            @Override
            public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            }

            @Override
            public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            }

            @Override
            public X509Certificate[] getAcceptedIssuers() {
                return null;
            }
        };

        /*
         * The framework default is:
         *     SSLContext sc = SSLContext.getInstance("SSLv3");
         * Change it to whatever SSL/TLS protocol the target site needs.
         */
        SSLContext sc = SSLContext.getInstance("TLS");
        sc.init(null, new TrustManager[]{trustManager}, null);
        return sc;
    }

    public HttpClientGenerator setPoolSize(int poolSize) {
        connectionManager.setMaxTotal(poolSize);
        return this;
    }

    public CloseableHttpClient getClient(Site site) {
        return generateClient(site);
    }

    private CloseableHttpClient generateClient(Site site) {
        HttpClientBuilder httpClientBuilder = HttpClients.custom();
        httpClientBuilder.setConnectionManager(connectionManager);
        if (site.getUserAgent() != null) {
            httpClientBuilder.setUserAgent(site.getUserAgent());
        } else {
            httpClientBuilder.setUserAgent("");
        }
        if (site.isUseGzip()) {
            httpClientBuilder.addInterceptorFirst(new HttpRequestInterceptor() {
                public void process(final HttpRequest request, final HttpContext context) throws HttpException, IOException {
                    if (!request.containsHeader("Accept-Encoding")) {
                        request.addHeader("Accept-Encoding", "gzip");
                    }
                }
            });
        }
        // Fix the post/redirect/post problem on 302 redirects.
        httpClientBuilder.setRedirectStrategy(new CustomRedirectStrategy());

        SocketConfig.Builder socketConfigBuilder = SocketConfig.custom();
        socketConfigBuilder.setSoKeepAlive(true).setTcpNoDelay(true);
        socketConfigBuilder.setSoTimeout(site.getTimeOut());
        SocketConfig socketConfig = socketConfigBuilder.build();
        httpClientBuilder.setDefaultSocketConfig(socketConfig);
        connectionManager.setDefaultSocketConfig(socketConfig);
        httpClientBuilder.setRetryHandler(new DefaultHttpRequestRetryHandler(site.getRetryTimes(), true));
        generateCookie(httpClientBuilder, site);
        return httpClientBuilder.build();
    }

    private void generateCookie(HttpClientBuilder httpClientBuilder, Site site) {
        if (site.isDisableCookieManagement()) {
            httpClientBuilder.disableCookieManagement();
            return;
        }
        CookieStore cookieStore = new BasicCookieStore();
        for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
            BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
            cookie.setDomain(site.getDomain());
            cookieStore.addCookie(cookie);
        }
        for (Map.Entry<String, Map<String, String>> domainEntry : site.getAllCookies().entrySet()) {
            for (Map.Entry<String, String> cookieEntry : domainEntry.getValue().entrySet()) {
                BasicClientCookie cookie = new BasicClientCookie(cookieEntry.getKey(), cookieEntry.getValue());
                cookie.setDomain(domainEntry.getKey());
                cookieStore.addCookie(cookie);
            }
        }
        httpClientBuilder.setDefaultCookieStore(cookieStore);
    }
}
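Finally, the customized downloader still has to be plugged into the Spider, which the snippets above do not show explicitly. A minimal sketch, assuming the class names used in this post and a placeholder start URL:

    Spider.create(new TaskProcessor())
            .addUrl("https://example.com/start") // placeholder start URL
            .addPipeline(new TaskPipeline())
            // use the modified downloader from com.crawler.project.spider_download
            .setDownloader(new HttpClientDownloader())
            .setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(100000)))
            .thread(3)
            .run();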
That wraps up this summary of building a crawler with the WebMagic framework, including the use of jsoup.