I have recently been working on an ETL project, and of course there has to be data before anything can be extracted, transformed, and loaded between the various tools. Following the lectures from the 天亮 crawler project, I crawled the loan-platform listings from 网易之家. The code breaks down into four parts: the crawling module, the entity classes, the utility classes, and the control class. I am recording the code here so that it is not forgotten.
First come two utility classes. The first one writes the crawled data to a file for safekeeping:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

/**
 * File writing utility.
 */
public class IOUtil {
    public static void writeFile(String filePath, String value, String encoding) {
        FileOutputStream fos = null;
        try {
            fos = new FileOutputStream(new File(filePath));
            fos.write(value.getBytes(encoding));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    public static void main(String[] args) {
        String filePath = "test.txt";
        String value = "中国人民万岁,hello world,123";
        String encoding = "utf-8";
        IOUtil.writeFile(filePath, value, encoding);
        System.out.println("done!");
    }
}
The second utility class parses the crawled data; since the data comes back as JSON, a small parsing helper is needed:

import java.util.Iterator;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.JSONValue;

/**
 * JSON parsing utility (json-simple).
 */
public class JsonOperatorUtil {
    public static JSONObject toJSONObject(String str) {
        return (JSONObject) JSONValue.parse(str);
    }

    public static JSONArray toJSONArray(String str) {
        return (JSONArray) JSONValue.parse(str);
    }

    public static void main(String[] args) {
        String str = "[{\"one\":1,\"two\":\"2\"}]";
        JSONArray jsonArray = JsonOperatorUtil.toJSONArray(str);
        Iterator<JSONObject> iterator = jsonArray.iterator();
        while (iterator.hasNext()) {
            System.out.println(iterator.next());
        }
    }
}
Next, an enum that defines the priority level of a crawl task:

/**
 * Priority level of a crawl task.
 */
public enum TaskLevel {
    HIGH, MIDDLE, LOW
}
Then the crawler interface:

public interface ICrawler {
    public CrawlResultPojo crawl(UrlPojo urlPojo);
}
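The interface and both implementations rely on two entity classes, UrlPojo and CrawlResultPojo, whose source is not reproduced in this post. Judging from how they are used below (getUrl, getConnection, getParasMap/setParasMap, setSuccess, setPageContent, getPageContent), a minimal sketch could look like the following; the field names, the TaskLevel default, and getConnection's behaviour are my assumptions rather than the original definitions, and each class would live in its own file:

import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Map;

/** Minimal sketch of the URL task POJO; the real class may carry more fields. */
public class UrlPojo {
    private String url;
    private Map<String, Object> parasMap;           // POST parameters, if any
    private TaskLevel taskLevel = TaskLevel.MIDDLE; // assumed default priority

    public UrlPojo(String url) {
        this.url = url;
    }

    public String getUrl() {
        return url;
    }

    public Map<String, Object> getParasMap() {
        return parasMap;
    }

    public void setParasMap(Map<String, Object> parasMap) {
        this.parasMap = parasMap;
    }

    public TaskLevel getTaskLevel() {
        return taskLevel;
    }

    // Opens a plain GET connection; used by the HttpURLConnection-based crawler.
    public HttpURLConnection getConnection() {
        try {
            URL u = new URL(this.url);
            return (HttpURLConnection) u.openConnection();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}

/** Minimal sketch of the crawl result POJO. */
public class CrawlResultPojo {
    private boolean success;
    private String pageContent;

    public boolean isSuccess() {
        return success;
    }

    public void setSuccess(boolean success) {
        this.success = success;
    }

    public String getPageContent() {
        return pageContent;
    }

    public void setPageContent(String pageContent) {
        this.pageContent = pageContent;
    }

    @Override
    public String toString() {
        return "CrawlResultPojo [success=" + success + ", pageContent=" + pageContent + "]";
    }
}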
There are two implementations of the interface: one fetches the data directly with the traditional HttpURLConnection, the other uses the Apache HttpClient library.
The HttpURLConnection implementation:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;

import com.ztl.simple.iface.crawl.ICrawler;
import com.ztl.simple.pojos.CrawlResultPojo;
import com.ztl.simple.pojos.UrlPojo;

public class HttpUrlConnectionCrawlerImpl implements ICrawler {

    @Override
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        if (urlPojo == null || urlPojo.getUrl() == null) {
            crawlResultPojo.setSuccess(false);
            crawlResultPojo.setPageContent(null);
            return crawlResultPojo;
        }
        StringBuilder stringBuilder = new StringBuilder();
        HttpURLConnection httpURLConnection = urlPojo.getConnection();
        if (httpURLConnection != null) {
            BufferedReader br = null;
            String line = null;
            try {
                // The target pages are served in gb2312, so decode the stream accordingly.
                br = new BufferedReader(new InputStreamReader(
                        httpURLConnection.getInputStream(), "gb2312"));
                while ((line = br.readLine()) != null) {
                    stringBuilder.append(line + "\n");
                }
                crawlResultPojo.setSuccess(true);
                crawlResultPojo.setPageContent(stringBuilder.toString());
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                try {
                    if (br != null) {
                        br.close();
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return crawlResultPojo;
    }
}
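Unlike the HttpClient version below, this class has no main method of its own; a minimal sketch of how it might be exercised (the URL here is only an illustration) is:

public class HttpUrlConnectionCrawlerTest {
    public static void main(String[] args) {
        // Illustrative only: any page reachable with a plain GET will do.
        UrlPojo urlPojo = new UrlPojo("http://www.wangdaizhijia.com/");
        ICrawler crawler = new HttpUrlConnectionCrawlerImpl();
        CrawlResultPojo result = crawler.crawl(urlPojo);
        if (result != null && result.getPageContent() != null) {
            System.out.println(result.getPageContent());
        }
    }
}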
The HttpClient implementation:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import com.ztl.simple.iface.crawl.ICrawler;
import com.ztl.simple.pojos.CrawlResultPojo;
import com.ztl.simple.pojos.UrlPojo;

public class HttpClientCrawlerImpl implements ICrawler {

    public CloseableHttpClient httpclient = HttpClients.custom().build();

    @Override
    public CrawlResultPojo crawl(UrlPojo urlPojo) {
        if (urlPojo == null) {
            return null;
        }
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        CloseableHttpResponse response1 = null;
        BufferedReader br = null;
        try {
            HttpGet httpget = new HttpGet(urlPojo.getUrl());
            response1 = httpclient.execute(httpget);
            HttpEntity entity = response1.getEntity();
            InputStreamReader isr = new InputStreamReader(entity.getContent(), "utf-8");
            br = new BufferedReader(isr);
            String line = null;
            StringBuilder stringBuilder = new StringBuilder();
            while ((line = br.readLine()) != null) {
                stringBuilder.append(line + "\n");
            }
            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(stringBuilder.toString());
            return crawlResultPojo;
        } catch (Exception e) {
            e.printStackTrace();
            crawlResultPojo.setSuccess(false);
        } finally {
            if (response1 != null) {
                try {
                    response1.close();
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
        }
        return crawlResultPojo;
    }

    /**
     * Crawl the URL with an HTTP POST, using the parameters carried by the UrlPojo.
     */
    public CrawlResultPojo crawl4Post(UrlPojo urlPojo) {
        if (urlPojo == null) {
            return null;
        }
        CrawlResultPojo crawlResultPojo = new CrawlResultPojo();
        CloseableHttpResponse response1 = null;
        BufferedReader br = null;
        try {
            RequestBuilder rb = RequestBuilder.post().setUri(new URI(urlPojo.getUrl()));
            Map<String, Object> parasMap = urlPojo.getParasMap();
            if (parasMap != null) {
                for (Entry<String, Object> entry : parasMap.entrySet()) {
                    rb.addParameter(entry.getKey(), entry.getValue().toString());
                }
            }
            HttpUriRequest httpRequest = rb.build();
            response1 = httpclient.execute(httpRequest);
            HttpEntity entity = response1.getEntity();
            InputStreamReader isr = new InputStreamReader(entity.getContent(), "utf-8");
            br = new BufferedReader(isr);
            String line = null;
            StringBuilder stringBuilder = new StringBuilder();
            while ((line = br.readLine()) != null) {
                stringBuilder.append(line + "\n");
            }
            crawlResultPojo.setSuccess(true);
            crawlResultPojo.setPageContent(stringBuilder.toString());
            return crawlResultPojo;
        } catch (Exception e) {
            e.printStackTrace();
            crawlResultPojo.setSuccess(false);
        } finally {
            if (response1 != null) {
                try {
                    response1.close();
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
        }
        return crawlResultPojo;
    }

    public static void main(String[] args) throws Exception {
        HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl();
        String url = "http://www.wangdaizhijia.com/front_select-plat";
        UrlPojo urlPojo = new UrlPojo(url);
        Map<String, Object> parasMap = new HashMap<String, Object>();
        parasMap.put("currPage", 30);
        parasMap.put("params", "");
        parasMap.put("sort", 0);
        urlPojo.setParasMap(parasMap);
        CrawlResultPojo resultPojo = httpClientCrawlerImpl.crawl4Post(urlPojo);
        if (resultPojo != null) {
            System.out.println(resultPojo);
        }
    }
}
Finally, the crawl controller class:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

import com.ztl.simple.impl.crawl.HttpClientCrawlerImpl;
import com.ztl.simple.pojos.CrawlResultPojo;
import com.ztl.simple.pojos.UrlPojo;
import com.ztl.simple.utils.IOUtil;
import com.ztl.simple.utils.JsonOperatorUtil;

/**
 * Crawl manager for the loan-platform data.
 *
 * @author zel
 */
public class WangYiDaiCrawlManager {

    public static HttpClientCrawlerImpl httpClientCrawlerImpl = new HttpClientCrawlerImpl();

    // Columns to extract from each record of the JSON "list" array.
    public static String[] column_key = { "platName", "locationAreaName",
            "locationCityName", "platUrl" };

    public static int item_count = 0;

    private static CrawlResultPojo crawlOnePage(UrlPojo urlPojo) {
        return httpClientCrawlerImpl.crawl4Post(urlPojo);
    }

    public static String parserOnePage(String jsonStr) {
        // Parse the JSON object and walk its "list" array.
        JSONObject jsonObj = JsonOperatorUtil.toJSONObject(jsonStr);
        JSONArray jsonArray = JsonOperatorUtil.toJSONArray(jsonObj.get("list").toString());
        StringBuilder stringBuilder = new StringBuilder();
        for (Object json : jsonArray) {
            JSONObject itemJson = (JSONObject) json;
            for (String column : column_key) {
                stringBuilder.append(itemJson.get(column) + "\t");
            }
            stringBuilder.append("\n");
            item_count++;
        }
        return stringBuilder.toString();
    }

    public static void processWangYiDai(String url, int max_page_number, String filePath) {
        // Accumulates every extracted item before writing them out in one go.
        StringBuilder all_items = new StringBuilder();
        UrlPojo urlPojo = new UrlPojo(url);
        Map<String, Object> parasMap = new HashMap<String, Object>();
        int have_download_page_count = 0;
        // Detects when the site starts returning the same page again,
        // which means every page has been fetched.
        Set<String> uniqSet = new HashSet<String>();
        for (int pageNumber = 1; pageNumber <= max_page_number; pageNumber++) {
            parasMap.put("currPage", pageNumber);
            parasMap.put("params", "");
            parasMap.put("sort", 0);
            urlPojo.setParasMap(parasMap);
            CrawlResultPojo resultPojo = crawlOnePage(urlPojo);
            if (resultPojo == null || resultPojo.getPageContent() == null) {
                continue;
            }
            if (uniqSet.contains(resultPojo.getPageContent())) {
                System.out.println("Duplicate page encountered - the crawl is complete!");
                break;
            }
            uniqSet.add(resultPojo.getPageContent());
            String page_items = parserOnePage(resultPojo.getPageContent());
            all_items.append(page_items);
            have_download_page_count++;
        }
        System.out.println("all items size---" + item_count);
        System.out.println("pages downloaded---" + have_download_page_count);
        IOUtil.writeFile(filePath, all_items.toString(), "utf-8");
        System.out.println("save successfully~");
    }

    public static void main(String[] args) {
        String url = "http://www.wangdaizhijia.com/front_select-plat";
        int max_page_number = 1000;
        String fileName = "网易贷_数据集1.txt";
        processWangYiDai(url, max_page_number, fileName);
        System.out.println("done!");
    }
}
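For reference, parserOnePage assumes the POST response is a JSON object with a "list" array whose items carry the four column_key fields. A small sketch with placeholder values (the real values come from the site; these are made up for illustration) shows the shape the parser expects and lets it be tried offline:

public class ParserDemo {
    public static void main(String[] args) {
        // Placeholder record in the assumed response shape: {"list":[{...}]}
        String sample = "{\"list\":[{\"platName\":\"Example Platform\","
                + "\"locationAreaName\":\"Example Province\","
                + "\"locationCityName\":\"Example City\","
                + "\"platUrl\":\"http://www.example.com\"}]}";
        // Prints one tab-separated line with the four configured columns.
        System.out.println(WangYiDaiCrawlManager.parserOnePage(sample));
    }
}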