Java crawlers: crawling URLs, videos, and images


1. Crawling URLs

import java.io.*;
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A simple crawler in Java: reads a page and writes every URL that matches a regex to a file.
 */
public class Robot {
    public static void main(String[] args) {
        URL url = null;
        URLConnection urlconn = null;
        BufferedReader br = null;
        PrintWriter pw = null;
//        String regex = "http://[\\w+\\.?/?]+\\.[A-Za-z]+";
        String regex = "https://[\\w+\\.?/?]+\\.[A-Za-z]+";// URL matching pattern (https only; use the commented line above for http)
        Pattern p = Pattern.compile(regex);
        try {
            url = new URL("https://www.cnblogs.com/peachh/p/9740229.html");// the page to crawl
            urlconn = url.openConnection();
            pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true);// write the extracted links to C:/SiteURL.txt
            br = new BufferedReader(new InputStreamReader(
                    urlconn.getInputStream()));
            String buf = null;
            while ((buf = br.readLine()) != null) {
                Matcher buf_m = p.matcher(buf);
                while (buf_m.find()) {
                    pw.println(buf_m.group());
                }
            }
            System.out.println("爬取成功^_^");
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (br != null) {
                    br.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            if (pw != null) {
                pw.close();
            }
        }
    }
}
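A design note on the stream handling above: the explicit finally block with its null checks can be avoided entirely with try-with-resources (Java 7+). A minimal sketch of the same crawl written that way, assuming the same page and output path:

import java.io.*;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RobotTryWithResources {
    public static void main(String[] args) throws IOException {
        Pattern p = Pattern.compile("https://[\\w+\\.?/?]+\\.[A-Za-z]+");
        URL url = new URL("https://www.cnblogs.com/peachh/p/9740229.html");
        // Both streams are closed automatically, even if an exception is thrown.
        try (BufferedReader br = new BufferedReader(
                     new InputStreamReader(url.openConnection().getInputStream()));
             PrintWriter pw = new PrintWriter(new FileWriter("C:/SiteURL.txt"), true)) {
            String line;
            while ((line = br.readLine()) != null) {
                Matcher m = p.matcher(line);
                while (m.find()) {
                    pw.println(m.group());
                }
            }
        }
    }
}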

2. Crawling Videos

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Purpose: crawl short videos from www.budejie.com
 * @author cxd
 *
 */
public class WebSpiderDemo1 {
 
    public static void main(String[] args) throws Exception {
 
        String source = "http://www.budejie.com/video/";
        String destDir = "C:/rob/";
 
        Map<String, String> urlMap = getUrlInSource(source);
 
        for (Map.Entry<String, String> entry : urlMap.entrySet()) {
            String title = entry.getKey();// video title
            String url = entry.getValue();// video URL
            File destFile = new File(destDir + title + ".mp4");
            download(url, destFile);
        }
    }
 
    /**
     * Download a video from its URL and save it locally.
     * 
     * @param url      the video URL
     * @param destFile where to save the video
     * @throws IOException
     */
    public static void download(String url, File destFile) throws IOException {
        URL videoUrl = new URL(url);
 
        // try-with-resources closes both streams even if the copy fails midway
        try (InputStream is = videoUrl.openStream();
             FileOutputStream fos = new FileOutputStream(destFile)) {
            int len;
            byte[] buffer = new byte[1024];
            while ((len = is.read(buffer)) != -1) {
                fos.write(buffer, 0, len);
            }
            fos.flush();
        }
    }
 
    /**
     * Collect video titles and URLs from the listing pages into a HashMap.
     * 
     * @param source base listing URL
     * @return map of video title to video URL
     * @throws IOException
     */
    public static Map<String, String> getUrlInSource(String source) throws IOException {
 
        Map<String, String> hashMap = new HashMap<>();
 
        for (int index = 1; index <= 1; index++) { // the listing goes up to 50 pages; this demo only crawls page 1
            String pageUrl = source + index;
            URL url = new URL(pageUrl);
            InputStream is = url.openStream();
 
//            If the site has anti-crawler checks, use the following instead to disguise the request as a browser:
//            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
//            conn.setRequestMethod("GET");
//            conn.setRequestProperty("user-agent",
//                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");
//            BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
 
            BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));
 
            String info = null;
            String title = null;
            // Read a fixed number of lines and null-check each one before matching,
            // otherwise a null line would cause a java.lang.NullPointerException.
            for (int i = 0; i < 10000; i++) {
                info = br.readLine();
 
                if (null != info) {
                    String urlRegex = "data-mp4=\"(.*?\\.mp4)";
 
                    if (info.contains("data-title")) {
                        title = info;
                    }
 
                    Pattern pattern = Pattern.compile(urlRegex);
                    Matcher matcher = pattern.matcher(info);
                    if (matcher.find()) {
                        for (int j = 0; j <= matcher.groupCount(); j++) {
                            String tmp = matcher.group(j);
                            if (!tmp.startsWith("data-mp4=")) {
                                String videoTitle = getTitle(title.trim());
                                hashMap.put(videoTitle, tmp);
                            }
                        }
                    }
                }
            }
        }
        return hashMap;
    }
 
    /**
     * Clean up the raw data-title line and extract the title string.
     * 
     * @param info the trimmed line containing data-title
     * @return the extracted title
     */
    private static String getTitle(String info) {
 
        int len = info.length();
        // strips what is presumably the leading `data-title="` prefix (12 characters) and the trailing quote
        String title = info.substring(12, len - 1);
        return title;
    }

}
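The commented-out block in getUrlInSource shows the idea of working around anti-crawler checks by sending a browser-like User-Agent header. A minimal standalone sketch of that approach (the class name and header value here are illustrative):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

public class BrowserFetch {
    // Fetch a page while presenting a browser-like User-Agent header.
    public static String fetch(String pageUrl) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(pageUrl).openConnection();
        conn.setRequestMethod("GET");
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36");
        StringBuilder sb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }
}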

3. Crawling Images

 

import com.obcy.util.DownLoad;
import com.obcy.util.GetHTML;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

public class BiAn {

    // Build all the listing (first-level) pages, from page 2 to page 946.
    public ArrayList<String> getTopUrl(){
        //String topurl = "http://www.netbian.com/hd3840x2160/index_2.htm"

        // A list to hold every listing page URL.
        ArrayList<String> list = new ArrayList<String>();
        for (int i = 2; i <= 946; i++) {
            list.add("http://www.netbian.com/hd3840x2160/index_"+i+".htm");
        }

        return list;
    }

    // Collect every image-view address on one listing page.
    // The parameter is the listing page URL.
    public HashMap<String,String> getGpjView(String topUrl){

        String url = topUrl;
        String html = GetHTML.getHTML(url);
        // Parse the page source into a Document object.
        Document document = Jsoup.parse(html);
        // The element with class "list" is the single container
        // that holds all the image <a> tags on the page.
        Elements list = document.getElementsByClass("list");
        Elements a = null;
        try {
            a = list.get(0).select("ul>li>a");
        } catch (Exception e) {
            System.out.println("No <a> tags found");
            return new HashMap<String, String>(); // nothing to collect on this page
        }
        // Walk the <a> tags, read each href, build the full image-view
        // address, and store it in a map keyed by the image title.
        HashMap<String,String> map = new HashMap<String, String>();
        for (int i = 0; i < a.size(); i++) {
            String href = "http://www.netbian.com"+a.get(i).attr("href");
            String name = a.get(i).attr("title");
            //System.out.println(href); e.g. http://www.netbian.com/desk/22138.htm
            map.put(name,href);
        }

        // Done collecting this page's image-view addresses.
        return map;
    }

    // Visit every image-view page, extract its image download address, and download it.
    // Takes the map of image-view addresses gathered from one listing page.
    public void getDownload(HashMap<String,String> map){

        for (Map.Entry<String, String> entry : map.entrySet()) {

            String html = GetHTML.getHTML(entry.getValue());
            Document document = Jsoup.parse(html);

            // Grab the image <img> tag (there is only one).
            Elements endpage = null;
            try {
                endpage = document.getElementsByClass("endpage").get(0).select("div>p>a>img");
            } catch (Exception e) {
                System.out.println("Page element not found, moving on to the next one");
                continue;
            }

            //System.out.println(endpage.get(0).attr("src"));

            // The download address.
            String target = endpage.get(0).attr("src");

            String path = "F:/BiAn/"+entry.getKey()+".jpg";
            // Start the download.
            DownLoad.downLoad(target,path);
        }
    }

    @Test
    public void test(){
        // Make sure the download folder F:/BiAn exists.
        File file = new File("F:/BiAn");
        if (!file.exists()){
            file.mkdirs();
            System.out.println("Created download folder F:/BiAn");
        }else {
            System.out.println("Folder already exists, ready to download");
        }

        // Single-threaded: fetch all listing pages, then for each page
        // 1. collect the view-page map, 2. extract download addresses, 3. download.
        ArrayList<String> topUrl = getTopUrl();
        for (String url : topUrl) {
            HashMap<String, String> gpjView = getGpjView(url);
            getDownload(gpjView);
        }
    }
}
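The class above relies on two helpers, com.obcy.util.GetHTML and com.obcy.util.DownLoad, whose source is not included in the post. Minimal sketches of what they might look like follow; these are assumed implementations, not the originals (the GBK charset in GetHTML is also an assumption about how netbian.com serves its pages):

// GetHTML.java (assumed implementation)
package com.obcy.util;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

public class GetHTML {
    // Fetch a page and return its source as a string.
    public static String getHTML(String pageUrl) {
        try (InputStream is = new URL(pageUrl).openStream();
             ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            byte[] buf = new byte[4096];
            int len;
            while ((len = is.read(buf)) != -1) {
                bos.write(buf, 0, len);
            }
            return bos.toString("GBK"); // charset is an assumption
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }
}

// DownLoad.java (assumed implementation)
package com.obcy.util;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

public class DownLoad {
    // Stream the resource at `target` into the local file at `path`.
    public static void downLoad(String target, String path) {
        try (InputStream is = new URL(target).openStream();
             FileOutputStream fos = new FileOutputStream(path)) {
            byte[] buf = new byte[4096];
            int len;
            while ((len = is.read(buf)) != -1) {
                fos.write(buf, 0, len);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}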

 

4. How do you analyze a page's structure in order to crawl it?

https://www.cnblogs.com/518894-lu/p/9021548.html
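The linked article covers inspecting a page before writing a crawler. As a quick, hypothetical illustration with Jsoup (the URL and selector are examples only), the sketch below prints every link on a page, which is the kind of structure you would first confirm in the browser's developer tools:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class InspectPage {
    public static void main(String[] args) throws Exception {
        // Fetch and parse the page (replace the URL with your target).
        Document doc = Jsoup.connect("http://www.netbian.com/hd3840x2160/index_2.htm")
                .userAgent("Mozilla/5.0") // present a browser-like User-Agent
                .get();
        // Print every link's text and absolute href.
        for (Element a : doc.select("a[href]")) {
            System.out.println(a.text() + " -> " + a.attr("abs:href"));
        }
    }
}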

