package com.test.pic.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;

import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @Title: PicCrawler.java
 *
 * @Package com.test.pic.crawler
 *
 * @Description: Crawls the images under the specified tags of a given site, or under all tags.
 *
 * @author CoderZZ
 *
 * @date 2018-01-12 23:22:41
 *
 * @version V1.0
 */
public class PicCrawler implements Runnable {
    // Root directory where downloaded images are stored
    private static String pathString = "G:/test/pic/";
    // Holds the detail-page URLs that the worker threads actually crawl
    static BlockingQueue<String> urlBlockingQueue = new LinkedBlockingDeque<String>(1000);
    static int threadNum = 10;

    /**
     * @Title: main
     *
     * @Description: Resolves the wanted tag URLs, enqueues every image detail page, then starts the download workers.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String homeurlString = "https://www.xxxx.com"; // base address of the crawled site
        String tagPageUrl = "https://www.xxxx.com/tag/"; // tag index page
        // Full URLs of the tag pages to crawl
        Set<String> tagFullHrefSet = new HashSet<String>(16);
        // Tags to crawl; if the array is empty, every tag is crawled, otherwise only the listed ones
        String[] crawlerTagArray = {"風景"};
        List<String> crawlerTagList = Arrays.asList(crawlerTagArray);
        try {
            // 1. Resolve the full URL of each wanted tag
            Document tagListDocument = Jsoup.connect(tagPageUrl).get();
            Elements tagsListDivElements = tagListDocument.getElementsByClass("tags_list");
            for (Element element : tagsListDivElements) {
                Elements aElements = element.getElementsByTag("a");
                for (Element a : aElements) {
                    if (crawlerTagList.isEmpty() || crawlerTagList.contains(a.text())) {
                        String tagUrlString = homeurlString + a.attr("href");
                        // e.g. https://www.xxxx.com/tag/fengjing.html -> https://www.xxxx.com/tag/fengjing/1.html
                        tagUrlString = tagUrlString.substring(0, tagUrlString.lastIndexOf(".")) + "/1.html";
                        tagFullHrefSet.add(tagUrlString);
                    }
                }
            }
            // 2. Walk each tag page by page and enqueue the image detail pages
            for (String tagUrl : tagFullHrefSet) {
                String tempTagUrlString = tagUrl;
                int currentPageNum = 1;
                while (true) {
                    try {
                        Document imagePageDocument = Jsoup.connect(tempTagUrlString).get();
                        Elements imageListElements = imagePageDocument.getElementsByClass("Pli-litpic");
                        if (imageListElements.isEmpty()) {
                            break; // no thumbnails left: past the last page of this tag
                        }
                        for (Element image : imageListElements) {
                            urlBlockingQueue.offer(homeurlString + image.attr("href"));
                        }
                        // e.g. https://www.xxxx.com/tag/fengjing/1.html -> https://www.xxxx.com/tag/fengjing/2.html
                        tempTagUrlString = tempTagUrlString.substring(0, tempTagUrlString.lastIndexOf("/") + 1)
                                + (++currentPageNum) + ".html";
                    } catch (Exception e) {
                        break; // request failed: treat as the end of this tag's pagination
                    }
                }
            }
            ScheduledExecutorService executor = new ScheduledThreadPoolExecutor(threadNum,
                    new BasicThreadFactory.Builder().namingPattern("my-crawler-thread-%d").daemon(false).build());
            for (int i = 0; i < threadNum; i++) {
                executor.submit(new PicCrawler());
            }
            // Let the submitted workers drain the queue, then release the pool so the JVM can exit
            executor.shutdown();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void run() {
        while (true) {
            try {
                long begin = System.currentTimeMillis();
                String url = urlBlockingQueue.poll();
                if (null == url) {
                    // The queue is fully populated before the workers start, so empty means done
                    System.out.println("======================== BlockingQueue is empty, crawl finished ========================");
                    break;
                }
                Document doc = Jsoup.connect(url).get();
                Elements titleElements = doc.select("#photos > h1");
                if (!titleElements.isEmpty()) {
                    Set<String> imgSrcSet = new HashSet<String>(16);
                    Element titleElement = titleElements.get(0);
                    // The title looks like "name(1/20)": the part before "(" is the folder name,
                    // the number after "/" is the gallery's page count
                    String foldNameString = titleElement.text();
                    String[] nameArray = foldNameString.split("\\(");
                    foldNameString = nameArray[0];
                    nameArray = nameArray[1].split("/");
                    int totalPages = Integer.parseInt(nameArray[1].replace(")", ""));
                    for (int i = 1; i <= totalPages; i++) {
                        String urlTemp = url.replace(".html", "_" + i + ".html");
                        Document docTemp = Jsoup.connect(urlTemp).get();
                        Element element = docTemp.getElementById("big-pic");
                        if (null == element) {
                            continue; // page without the expected image container
                        }
                        Elements imgElements = element.getElementsByTag("img");
                        for (Element imgElement : imgElements) {
                            imgSrcSet.add(imgElement.attr("src"));
                        }
                    }
                    for (String imgSrc : imgSrcSet) {
                        URL imgurl = new URL(imgSrc);
                        URLConnection con = imgurl.openConnection();
                        // 10s connect timeout
                        con.setConnectTimeout(10 * 1000);
                        File sf = new File(pathString, foldNameString);
                        if (!sf.exists()) {
                            sf.mkdirs();
                        }
                        String filename = imgSrc.substring(imgSrc.lastIndexOf("/") + 1);
                        // try-with-resources closes both streams even if a read/write fails
                        try (InputStream is = con.getInputStream();
                             OutputStream os = new FileOutputStream(new File(sf, filename))) {
                            byte[] bs = new byte[1024 * 500]; // 500 KB buffer
                            int len;
                            while ((len = is.read(bs)) != -1) {
                                os.write(bs, 0, len);
                            }
                        }
                        System.out.println(imgSrc + " downloaded!");
                    }
                    long end = System.currentTimeMillis();
                    System.out.println("================================================================");
                    System.out.println(Thread.currentThread().getName()
                            + " ****************** gallery fully downloaded, took " + ((end - begin) / 1000) + "s");
                }
            } catch (Exception e) {
                System.out.println("======================== crawl error: " + e.getMessage() + " ========================");
            }
        }
    }
}
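
/*
 * A possible simplification of the download step above (a sketch under assumptions, not part
 * of the original crawler): java.nio.file.Files.copy can replace the manual byte-buffer loop,
 * and a read timeout bounds how long a stalled download can block a worker thread.
 * DownloadSketch and downloadImage are hypothetical names introduced only for illustration.
 */
class DownloadSketch {
    static void downloadImage(String imgSrc, java.nio.file.Path targetDir) throws IOException {
        URLConnection con = new URL(imgSrc).openConnection();
        con.setConnectTimeout(10 * 1000); // 10s to establish the connection
        con.setReadTimeout(10 * 1000);    // also bound how long a stalled read may block
        java.nio.file.Files.createDirectories(targetDir);
        java.nio.file.Path target = targetDir.resolve(imgSrc.substring(imgSrc.lastIndexOf('/') + 1));
        try (InputStream is = con.getInputStream()) {
            // Copies the stream to the target file, overwriting any partial file from an earlier run
            java.nio.file.Files.copy(is, target, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
        }
    }
}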