java爬蟲入門技術
我們需要用到 HTTP 協議,從而建立 Java 程序和網頁之間的連接
// Open a connection from this program to the target web page.
URL url = new URL("https://www.ivsky.com/tupian/ziranfengguang/");
URLConnection urlConnection = url.openConnection();
urlConnection.connect();

// Wrap the response body in a line-oriented reader so the page's HTML
// can be consumed one line at a time.
InputStream pageStream = urlConnection.getInputStream();
InputStreamReader pageReader = new InputStreamReader(pageStream);
BufferedReader br = new BufferedReader(pageReader);
上面的代碼就是建立java程序和網頁的連接
我們爬蟲首先是將網頁的Html代碼爬下來
接下來我們需要從這些代碼中找到有用的東西,我們發現大部分圖片會有一個src資源
如果只是找一張圖片,我們可以直接用find函數查找;但是我們需要多張圖片,此時正則表達式就顯示出它的威力
我們直接用正則表達式匹配出來src資源
代碼如下:
String line = null;
//正則表達式,解釋如下在最少的""里面匹配到子表達式 ?相當於懶惰(匹配盡可能少) Pattern pattern = Pattern.compile("src=\"(.+?)\""); List<String> list = new ArrayList<String>(); while((line = br.readLine()) != null) { Matcher m = pattern.matcher(line); while(m.find()) { //查到之后添加list里面 list.add(m.group()); } }
匹配出這些字符串之后,我們還需要對它們進行一些處理,篩選出圖片的地址
代碼如下
//篩選src,找到jpg和png和gif結尾的(設置新格式也就是截取字符串)放到圖片集合里面 List<String> imglist = new ArrayList<String>(); for(String xString : list) { if(xString.endsWith(".jpg\"") || xString.endsWith(".png\"") || xString.endsWith(".gif\"")) { //截取字符串的一部分也就是圖片的地址 String partString = xString.substring(5,xString.length() - 1);
imglist.add(partString); } }
處理好之后我們就剩下最后一步了(下載)
下載就是將圖片從網絡讀取出來并寫入本地磁盤
代碼如下
//開始下載 Date beginDate = new Date(); for(String xString : imglist) { Date partbeginDate = new Date(); URL partUrl; if(!xString.startsWith("http:")) { partUrl = new URL("http:"+xString); if(!("http:"+xString).startsWith("http://")) { continue; } }else { partUrl = new URL(xString); } System.out.println(partUrl); String nameString = xString.substring(xString.lastIndexOf("/") + 1,xString.length()); File file = new File("E:\\圖片下載\\"+nameString); InputStream is = partUrl.openStream(); BufferedInputStream bis = new BufferedInputStream(is); BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file)); System.out.println("開始下載" + xString); int len = 0; while((len = bis.read()) != -1){ bos.write(len); } System.out.println("下載完成"); Date partendDate = new Date(); double ti = (partendDate.getTime() - partbeginDate.getTime()) / 1000; System.out.println("用時" + String.format("%.8f", ti) + "s"); bis.close(); bos.close(); } Date endDate = new Date(); double ti = (endDate.getTime() - beginDate.getTime() ) / 1000; System.out.println("全部下載完成"); System.out.println("總用時" + String.format("%.8f", ti) + "s");
代碼綜合如下
package worm;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.text.html.HTMLDocument.HTMLReader.IsindexAction;

/**
 * Minimal image crawler: fetches one web page, extracts every {@code src="..."}
 * value that ends in .jpg, .png or .gif, and downloads each image into a local
 * folder while printing how long each download and the whole run took.
 */
public class Main {

    /** Page whose images are downloaded. */
    private static final String PAGE_URL = "https://www.ivsky.com/tupian/ziranfengguang/";

    /** Folder prefix (including the trailing separator) the images are written under. */
    private static final String OUTPUT_DIR = "E:\\圖片下載\\";

    /**
     * Lazy match of the shortest quoted value following {@code src=}; group 1 is
     * the value without the surrounding quotes.
     */
    private static final Pattern SRC_PATTERN = Pattern.compile("src=\"(.+?)\"");

    /** File endings that identify a {@code src} value as an image. */
    private static final String[] IMAGE_SUFFIXES = {".jpg", ".png", ".gif"};

    /**
     * Downloads every image referenced by {@link #PAGE_URL}.
     *
     * @param args unused
     * @throws Exception if the page or one of the images cannot be read, or a
     *         target file cannot be written
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL(PAGE_URL);
        List<String> imglist = collectImageSources(url);

        long beginNanos = System.nanoTime();
        for (String src : imglist) {
            long partBeginNanos = System.nanoTime();

            URL partUrl = resolveImageUrl(url, src);
            if (partUrl == null) {
                continue;
            }
            System.out.println(partUrl);

            // The file name is whatever follows the last slash of the address.
            String nameString = src.substring(src.lastIndexOf("/") + 1);
            File file = new File(OUTPUT_DIR + nameString);

            System.out.println("開始下載" + src);
            download(partUrl, file);
            System.out.println("下載完成");
            System.out.println("用時" + String.format("%.8f", secondsSince(partBeginNanos)) + "s");
        }
        System.out.println("全部下載完成");
        System.out.println("總用時" + String.format("%.8f", secondsSince(beginNanos)) + "s");
    }

    /**
     * Reads the page line by line and returns, in document order, every
     * {@code src} value that ends in one of {@link #IMAGE_SUFFIXES}.
     */
    private static List<String> collectImageSources(URL pageUrl) throws IOException {
        URLConnection urlConnection = pageUrl.openConnection();
        urlConnection.connect();

        List<String> sources = new ArrayList<String>();
        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader br =
                new BufferedReader(new InputStreamReader(urlConnection.getInputStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = SRC_PATTERN.matcher(line);
                while (m.find()) {
                    String value = m.group(1);
                    if (isImageSource(value)) {
                        sources.add(value);
                    }
                }
            }
        }
        return sources;
    }

    /** Tells whether the given {@code src} value ends in a known image suffix. */
    private static boolean isImageSource(String value) {
        for (String suffix : IMAGE_SUFFIXES) {
            if (value.endsWith(suffix)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Resolves a {@code src} value against the page URL, so absolute
     * {@code http://} / {@code https://} addresses, protocol-relative
     * {@code //host/path} addresses and site-relative paths all yield a usable
     * URL.
     *
     * @return the resolved URL, or {@code null} when the value cannot be parsed
     *         or does not resolve to an http/https address
     */
    private static URL resolveImageUrl(URL pageUrl, String src) {
        URL resolved;
        try {
            resolved = new URL(pageUrl, src);
        } catch (MalformedURLException e) {
            // Not a usable address (e.g. an unknown scheme): skip this entry.
            return null;
        }
        String protocol = resolved.getProtocol();
        if ("http".equals(protocol) || "https".equals(protocol)) {
            return resolved;
        }
        return null;
    }

    /**
     * Copies the content behind {@code source} into {@code target}, creating the
     * target's parent folder first when it does not exist yet.
     */
    private static void download(URL source, File target) throws IOException {
        File folder = target.getParentFile();
        if (folder != null && !folder.isDirectory() && !folder.mkdirs()) {
            throw new IOException("Cannot create output directory: " + folder);
        }
        // Both streams are closed (and the output flushed) even if the copy fails.
        try (BufferedInputStream bis = new BufferedInputStream(source.openStream());
             BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(target))) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = bis.read(buffer)) != -1) {
                bos.write(buffer, 0, len);
            }
        }
    }

    /**
     * Returns the seconds elapsed since the given {@link System#nanoTime()}
     * reading, keeping the fractional part.
     */
    private static double secondsSince(long startNanos) {
        return (System.nanoTime() - startNanos) / 1e9;
    }
}
運行截圖如下: