圖片爬蟲工具，可以爬取指定網頁的圖片

本文轉載自查看原文 2018-07-11 13:27 1768

 
         /**
          * Small image crawler: fetches one web page, extracts every quoted link that ends in
          * an image extension, and downloads the images concurrently into a local directory.
          *
          * <p>Requires Apache Commons IO ({@code IOUtils}, {@code FileUtils},
          * {@code FilenameUtils}) on the classpath.
          */
         public class Demo {

             /** Page that is crawled when no URL is passed on the command line. */
             private static final String DEFAULT_PAGE_URL =
                     "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=%B6%AB%B7%BD%BD%F0%B9%DD%B3%A4&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=111111";

             /** Directory the images are written to when none is passed on the command line. */
             private static final String DEFAULT_SAVE_DIR = "F:\\pic";

             /** Charset used to decode the page body (the default URL asks for {@code ie=gbk}). */
             private static final String PAGE_CHARSET = "GBK";

             /** Upper bound for the number of parallel download threads. */
             private static final int MAX_DOWNLOAD_THREADS = 50;

             /** Give up connecting after this many milliseconds so a dead host cannot hang a worker. */
             private static final int CONNECT_TIMEOUT_MS = 10_000;

             /** Give up waiting for data after this many milliseconds. */
             private static final int READ_TIMEOUT_MS = 30_000;

             /**
              * Matches an opening double quote followed by a run of characters (no quotes,
              * brackets, braces, angle brackets or carets) that ends in an image extension.
              * Compiled once because {@link Pattern} instances are immutable and reusable.
              */
             private static final Pattern IMAGE_LINK =
                     Pattern.compile("\"[^\"^(^)^}^>^<^{]+\\.(jpg|png|jpeg|gif)");

             /** Extensions recognised by {@link #IMAGE_LINK}; used to validate thumbnail stripping. */
             private static final String[] IMAGE_SUFFIXES = {".jpg", ".png", ".jpeg", ".gif"};

             /**
              * Crawls a page and downloads every image link found in it.
              *
              * @param args optional overrides: {@code args[0]} is the page URL and {@code args[1]}
              *             the target directory; the built-in defaults are used when absent
              * @throws IOException if the page itself cannot be fetched or read
              */
             public static void main(String[] args) throws IOException {
                 final String url = (args != null && args.length > 0) ? args[0] : DEFAULT_PAGE_URL;
                 final String saveDir = (args != null && args.length > 1) ? args[1] : DEFAULT_SAVE_DIR;

                 // Read the page body; try-with-resources guarantees the stream is closed.
                 String html;
                 try (InputStream is = getConnection(url, null).getInputStream()) {
                     html = IOUtils.toString(is, PAGE_CHARSET);
                 }

                 // Extract all image links; nothing to do when the page contains none.
                 List<String> picPaths = parseHtml(html);
                 if (picPaths.isEmpty()) {
                     return;
                 }

                 // One thread per image, capped at MAX_DOWNLOAD_THREADS.
                 ExecutorService es =
                         Executors.newFixedThreadPool(Math.min(picPaths.size(), MAX_DOWNLOAD_THREADS));
                 try {
                     for (final String picPath : picPaths) {
                         // The crawled page is sent as referer to get past hot-link protection.
                         es.execute(() -> downLoad(pathHandle(picPath), saveDir, url));
                     }
                 } finally {
                     // Stop accepting tasks; already submitted downloads still run to completion.
                     es.shutdown();
                 }
             }

             /**
              * Generates a random identifier that is used as the file name of a downloaded image.
              *
              * @return a random UUID rendered without the dash separators
              */
             private static String getUUID() {
                 return UUID.randomUUID().toString().replace("-", "");
             }

             /**
              * Normalises an image link extracted from the page.
              *
              * <p>Links that do not start with {@code http} get {@code http:} prepended, which
              * turns protocol-relative links ({@code //host/a.jpg}) into absolute ones.
              * NOTE(review): site-relative links such as {@code /img/a.jpg} are not resolved
              * against the page URL and will still fail to download.
              *
              * <p>Tmall thumbnail links look like
              * {@code http://img.alicdn.com/bao/uploaded/i4/TB19FGse7KWBuNjy1zjefkOypXa_032207.jpg_b.jpg};
              * dropping everything from the last underscore on yields the full-size image. The
              * suffix is only dropped when the remaining link still ends in an image extension,
              * so ordinary names that merely contain underscores ({@code my_pic_1.jpg}) are kept.
              *
              * @param picPath raw link as found in the HTML
              * @return the link to download
              */
             private static String pathHandle(String picPath) {
                 if (!picPath.startsWith("http")) {
                     picPath = "http:" + picPath;
                 }

                 int cut = picPath.lastIndexOf('_');
                 // The suffix must sit in the last path segment and leave a valid image link behind.
                 if (cut > 0 && picPath.indexOf('/', cut) < 0) {
                     String withoutSuffix = picPath.substring(0, cut);
                     if (hasImageExtension(withoutSuffix)) {
                         picPath = withoutSuffix;
                     }
                 }
                 return picPath;
             }

             /** Tells whether the link ends in one of the extensions the crawler looks for. */
             private static boolean hasImageExtension(String link) {
                 for (String suffix : IMAGE_SUFFIXES) {
                     if (link.endsWith(suffix)) {
                         return true;
                     }
                 }
                 return false;
             }

             /**
              * Downloads one image into {@code dir} under a random file name that keeps the
              * extension of the link. Failures are reported on standard error and do not
              * propagate, so one broken link cannot stop the other downloads.
              *
              * @param picPath absolute link of the image
              * @param dir     directory the file is written to
              * @param referer value for the {@code referer} request header
              */
             private static void downLoad(String picPath, String dir, String referer) {
                 String name = getUUID() + "." + FilenameUtils.getExtension(picPath);

                 // FileUtils.copyToFile leaves the source stream open, so it is closed here.
                 try (InputStream in = getConnection(picPath, referer).getInputStream()) {
                     FileUtils.copyToFile(in, new File(new File(dir), name));
                     System.out.println(picPath + "下載完畢！");
                 } catch (IOException e) {
                     // Keep the cause visible instead of silently dropping it.
                     System.err.println(picPath + "下載失敗！ (" + e + ")");
                 }
             }

             /**
              * Extracts image links from HTML with a regular expression.
              *
              * @param html page content
              * @return the distinct links in order of first appearance, without the leading quote
              */
             private static List<String> parseHtml(String html) {
                 // A page often repeats the same link; the set keeps each one only once.
                 java.util.Set<String> links = new java.util.LinkedHashSet<>();
                 Matcher m = IMAGE_LINK.matcher(html);
                 while (m.find()) {
                     // Drop the opening double quote that is part of the match.
                     links.add(m.group().substring(1));
                 }
                 return new ArrayList<>(links);
             }

             /**
              * Opens a connection to {@code url} with headers that imitate a browser request and
              * satisfy referer-based hot-link protection, and with connect/read timeouts set.
              *
              * @param url     address to connect to
              * @param referer value for the {@code referer} header; {@code http://www.baidu.com/}
              *                is used when {@code null}
              * @return the configured, not yet connected, connection (never {@code null})
              * @throws IOException if the URL is malformed or the connection cannot be created
              */
             private static URLConnection getConnection(String url, String referer) throws IOException {
                 URLConnection uc = new URL(url).openConnection();

                 // Hot-link protection: pretend the request originates from the given page.
                 uc.setRequestProperty("referer", referer == null ? "http://www.baidu.com/" : referer);

                 // Imitate a desktop browser.
                 uc.setRequestProperty("user-agent",
                         "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36");

                 // Without timeouts a stalled server would block a worker thread forever.
                 uc.setConnectTimeout(CONNECT_TIMEOUT_MS);
                 uc.setReadTimeout(READ_TIMEOUT_MS);
                 return uc;
             }

         }

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 python 圖片爬蟲抓取圖片系列二——爬取指定網頁中的圖片精細版 python 圖片爬蟲抓取圖片系列一——爬取指定網頁中的圖片【爬蟲】網頁圖片爬蟲工具——從谷歌必應上爬取圖片 Python爬蟲爬取網頁圖片 node：爬蟲爬取網頁圖片 Python爬蟲功能（爬取網頁圖片） java爬蟲-簡單爬取網頁圖片 Python爬蟲——爬取網頁圖片使用截圖工具截取指定大小的圖片 python爬蟲爬取指定用戶微博圖片及內容，並進行微博分類及使用習慣分析，生成可視化圖表