圖片爬蟲工具,可以爬取指定網頁的圖片


public class Demo {
     public static void main(String[] args) throws IOException {
         //要抓取圖片的網址連接
         String url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=%B6%AB%B7%BD%BD%F0%B9%DD%B3%A4&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=111111" ;
         //根據連接獲取一個Connection對象
         InputStream is = getConnection(url, null ).getInputStream();
         //調用commonsio工具包中IOUtils的方法,返回HTML內容;
         String html = IOUtils.toString(is, "GBK" );
         //解析HTML內容,獲取所有圖片鏈接地址
         List<String> picPaths = parseHtml(html);
         //判斷是否獲取到圖片鏈接
         if (picPaths.size() > 0 ) {
             //創建一個線程池,處理下載任務
             ExecutorService es = Executors.newFixedThreadPool(picPaths.size() < 50 ? picPaths.size() : 50 );
             //循環處理資源
             for ( final String picPath : picPaths) {
                 //根據具體的資源,創建下載任務,提交到線程池中
                 es.execute(()->{downLoad(pathHandle(picPath), "F:\\pic" ,url);});
             }
             //關閉線程池
             es.shutdown();
         }
     }
     /**
      * 獲取唯一序列號,做為文件名
      * @return
      */
     private static String getUUID() {
         UUID uuid = UUID.randomUUID();
         return uuid.toString().replaceAll( "-" , "" );
     }
     /**
      * 處理獲取到的圖片鏈接
      * @param picPath
      * @return
      */
     private static String pathHandle(String picPath) {
         if (!picPath.startsWith( "http" )) {
             picPath = "http:" + picPath;
         }
         //這個處理,是針對天貓的圖片鏈接,用於下載大圖;
         //天貓的商品圖片鏈接示例如下:
         //http://img.alicdn.com/bao/uploaded/i4/TB19FGse7KWBuNjy1zjefkOypXa_032207.jpg_b.jpg
         //去掉最后一個_以后的內容,可以下載大圖;否則就下載的是小圖
         if (picPath.indexOf( "_" ) != picPath.lastIndexOf( "_" )) {
             picPath = picPath.substring( 0 ,picPath.lastIndexOf( "_" ));
         }
         return picPath;
     }
     /**
      * 下載圖片
      * @param picPath
      * @param dir
      * @param referer
      */
     private static void downLoad(String picPath, String dir,String referer){
         try {
             //生成文件名
             String name = getUUID()+ "." +FilenameUtils.getExtension(picPath);
             FileUtils.copyToFile(getConnection(picPath,referer).getInputStream(), new File( new File(dir),name));
             System.out.println(picPath + "下載完畢!" );
         } catch (IOException e) {
             System.err.println(picPath+ "下載失敗!" );
         }
     }
     /**
      * 使用正則表達式解析html內容,獲取圖片鏈接
      * @param html
      * @return
      */
     private static List<String> parseHtml(String html) {
         String regex = "\"[^\"^(^)^}^>^<^{]+\\.(jpg|png|jpeg|gif)" ;
         List<String> list = new ArrayList<>();
         Pattern p = Pattern.compile(regex);
         Matcher m = p.matcher(html);
         while (m.find()) {
             list.add(m.group().substring( 1 ));
         }
         return list;
     }
     /**
      * 根據url地址,獲取一個連接對象,同時設置請求頭,避免服務器防盜鏈,以及模擬瀏覽器請求
      * @param url
      * @param referer
      * @return
      */
     private static URLConnection getConnection(String url,String referer) {
         try {
             URLConnection uc = new URL(url).openConnection();
             //解決防盜鏈問題
             uc.setRequestProperty( "referer" , referer== null ? "http://www.baidu.com/" :referer);
             //模擬瀏覽器
             uc.setRequestProperty( "user-agent" , "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" );
             return uc;
         } catch (IOException e) {
             e.printStackTrace();
             System.out.println(url);
             return null ;
         }
     }
}


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM