Continuing from the previous post.
After finding the interface I browsed through a few of the images and noticed they were all very small, so I downloaded one on my phone and took a look at the URL with Wireshark.
The URL the interface had been returning used images_min; when the image was actually downloaded, it switched to images.
Aha. As soon as I realized that, I tried it myself,
and sure enough, it worked.
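If you want to sanity-check the swap before writing a whole crawler, two HEAD requests are enough to compare the file sizes. Below is a minimal sketch using the same HttpClient 4.x library as the crawler code further down; the thumbnail URL is a made-up placeholder, not the real interface path.

import org.apache.http.Header;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.impl.client.HttpClients;

public class CompareSize {

    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClients.createDefault();
        // placeholder thumbnail link -- substitute the real images_min URL here
        String thumb = "http://img.example.com/images_min/001.jpg";
        String full = thumb.replace("images_min", "images");
        for (String url : new String[] { thumb, full }) {
            // HEAD request: we only care about the Content-Length header, not the body
            HttpResponse resp = client.execute(new HttpHead(url));
            Header len = resp.getFirstHeader("Content-Length");
            System.out.println(url + " -> "
                    + (len == null ? "size unknown" : len.getValue() + " bytes"));
        }
    }
}

If the second URL reports a noticeably larger size, the swap really does point at the full-resolution file.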
But there was no way I was going to view and download them one by one,
so I stayed up that night and wrote a Java crawler.
Here is the code.
package com.feng.main;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;

public class DownLoadImg {

    // extensions treated as images; everything else goes to the video folder
    List<String> imgFormat = new ArrayList<String>();

    DownLoadImg() {
        imgFormat.add("jpg");
        imgFormat.add("jpeg");
        imgFormat.add("png");
        imgFormat.add("gif");
        imgFormat.add("bmp");
    }

    /**
     * Entry point: fetch the page, collect the media URLs and download them one by one.
     * @param startUrl the page to crawl
     */
    public void start(String startUrl) {
        String content = getContent(startUrl);
        // collect every image/video link found on the page
        List<String> urls = getAllImageUrls(content);
        for (int i = 0; i < urls.size(); i++) {
            downloadImage(urls.get(i));
        }
        System.out.println("----------------------------------");
        System.out.println("----------download done-----------");
        System.out.println("----------------------------------");
    }

    /**
     * Execute a GET request.
     * @return the HttpEntity of the response, or null if the request failed
     */
    private HttpEntity getHttpEntity(String url) {
        HttpClient httpClient = HttpClients.createDefault();
        HttpGet get = new HttpGet(url);
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(5000)             // socket read timeout
                .setConnectionRequestTimeout(5000)  // timeout for leasing a connection from the pool
                .build();
        get.setConfig(requestConfig);
        try {
            // the status code is not checked; a 200 response is simply assumed
            HttpResponse response = httpClient.execute(get);
            return response.getEntity();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Read the whole HTML page and return it as a String.
     */
    private String getContent(String url) {
        HttpEntity httpEntity = getHttpEntity(url);
        if (httpEntity == null) {
            return "";
        }
        StringBuilder content = new StringBuilder();
        try {
            InputStream is = httpEntity.getContent();
            InputStreamReader isr = new InputStreamReader(is);
            char[] c = new char[1024];
            int l = 0;
            while ((l = isr.read(c)) != -1) {
                content.append(c, 0, l);
            }
            isr.close();
            is.close();
        } catch (IllegalStateException | IOException e) {
            e.printStackTrace();
        }
        return content.toString();
    }

    /**
     * Extract all image/video URLs from the page content fetched at start-up.
     */
    private List<String> getAllImageUrls(String content) {
        String regex = "http://([\\w-]+\\.)+[\\w-]+(/[\\w-./?%&=]*)?"
                + "\\.(jpg|jpeg|png|gif|bmp|mp4|rmvb|mkv|flv|avi|asf|rm|wmv)";
        // site-specific alternative: "http://www.sslingyu.com/mz_pbl/images_min\\S*\\.jpg"
        Pattern p = Pattern.compile(regex);
        Matcher m = p.matcher(content);
        List<String> urls = new ArrayList<String>();
        while (m.find()) {
            String url = m.group();
            // rewrite the thumbnail link into the full-resolution one
            url = getHDImageUrl(url);
            System.out.println("found url: " + url + "\nvalid: " + isTrueUrl(url));
            if (isTrueUrl(url)) {
                urls.add(url);
            }
        }
        System.out.println("----------------------------------");
        System.out.println("-------collected all urls---------");
        System.out.println("----------------------------------");
        return urls;
    }

    /**
     * Download a single file.
     */
    private int downloadImage(String url) {
        try {
            HttpEntity httpEntity = getHttpEntity(url);
            if (httpEntity == null) {
                System.out.println("failed to download " + url);
                return 0;
            }
            // note: getContentLength() is negative when the server sends no Content-Length header
            long len = httpEntity.getContentLength() / 1024;
            System.out.println("file size: " + len + "k");
            if (len < 150) {
                System.out.println("Warning: file too small, skipping--------");
                return 0;
            }
            String realPath = getRealPath(url);
            String name = getName(url);
            System.out.println("folder: " + realPath);
            System.out.println("file name: " + name);
            InputStream is = httpEntity.getContent();
            int l = 0;
            byte[] b = new byte[1024];
            FileOutputStream fos = new FileOutputStream(new File(realPath + "/" + name));
            while ((l = is.read(b)) != -1) {
                fos.write(b, 0, l);
            }
            fos.flush();
            fos.close();
            is.close();
            System.out.println("downloaded " + url + " successfully\n");
        } catch (Exception e) {
            System.out.println("failed to download " + url);
            e.printStackTrace();
        }
        return 1;
    }

    /**
     * Build the local folder for a URL, create it if necessary and return the path.
     */
    private String getRealPath(String url) {
        Pattern p = Pattern.compile("images/[a-z]+/[a-z_0-9]+");
        Matcher m = p.matcher(url);
        String format = getName(url).split("\\.")[1];
        String path = null;
        // choose between the image and the video folder by extension
        if (imgFormat.contains(format)) {
            path = "media/images/";
        } else {
            path = "media/video/";
        }
        // append the second-to-last path segment of the URL as the album folder
        path += url.split("/")[url.split("/").length - 2];
        // if the URL already carries an images/<album>/<set> structure, mirror that instead
        if (m.find()) {
            path = m.group();
        }
        // prefix the drive letter
        path = "D:/" + path;
        File file = new File(path);
        if (!file.exists()) {
            file.mkdirs();
        }
        return path;
    }

    /**
     * Get the file name, i.e. everything after the last '/'.
     */
    private String getName(String url) {
        return url.substring(url.lastIndexOf("/") + 1);
    }

    /**
     * Get the HD image address: simply replace images_min with images.
     */
    private String getHDImageUrl(String url) {
        if (url.contains("images_min")) {
            return url.replace("images_min", "images");
        }
        return url;
    }

    /**
     * Check that the URL is well formed: it must start with http and end with a known media extension.
     */
    private boolean isTrueUrl(String url) {
        return url.matches("^http://([\\w-]+\\.)+[\\w-]+(/[\\w-./?%&=]*)?"
                + "\\.(jpg|jpeg|png|gif|bmp|mp4|rmvb|mkv|flv|avi|asf|rm|wmv)$");
    }
}
That was the download part; below is the main entry point.
package com.feng.main;

public class MainTest {

    public static void main(String[] args) {
        DownLoadImg down = new DownLoadImg();
        String startUrl = "http://www.example.com";
        down.start(startUrl);
    }
}
The crawler grabs every image and video on a single page:
it pulls the URLs out of the page source with a regex, and I added a dedicated method that swaps images_min for images, as the small standalone example below shows.
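To make the extraction step concrete, here is a minimal, self-contained sketch of the same idea: scan an HTML fragment for media links with a regex and apply the images_min-to-images swap. The HTML string and domain are made-up placeholders, and the pattern is a slightly simplified version of the one in getAllImageUrls.

package com.feng.main;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexDemo {

    public static void main(String[] args) {
        // made-up fragment standing in for the real page source
        String html = "<img src=\"http://img.example.com/images_min/ab/01.jpg\">"
                + "<a href=\"http://img.example.com/video/ab/01.mp4\">";
        // simplified version of the pattern used in getAllImageUrls
        Pattern p = Pattern.compile(
                "http://[\\w.-]+(/[\\w./?%&=-]*)?\\.(jpg|jpeg|png|gif|bmp|mp4|flv|avi|wmv|mkv)");
        Matcher m = p.matcher(html);
        while (m.find()) {
            // same swap as getHDImageUrl: thumbnail path -> full-resolution path
            String url = m.group().replace("images_min", "images");
            System.out.println(url);
        }
    }
}

Running it prints the rewritten full-resolution image link next to the untouched video link, which is exactly the kind of list the crawler then feeds into downloadImage.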
I plugged in the interface link and hit run!!!
After a few dozen minutes the download finished and I opened the folder.
Very nice: everything downloaded.
(To be continued)