Golang - 爬蟲案例實踐
Golang - 爬蟲案例實踐
1. 爬蟲步驟
- 明確目標(確定在哪個網址搜索)
- 爬(爬下數據)
- 取(去掉沒用的數據)
- 處理數據(按具體業務去使用數據)
2. 正則表達式
- 文檔:https://studygolang.com/pkgdoc
- API
- re := regexp.MustCompile(reStr):傳入正則表達式,得到正則表達式對象
- ret := re.FindAllStringSubmatch(srcStr,-1):用正則對象提取頁面中內容,srcStr是頁面內容,-1代表取所有
- 爬郵箱
- 爬超鏈接
- 爬手機號
- http://www.zhaohaowang.com/
- 爬身份證號
- http://henan.qq.com/a/20171107/069413.htm
- 爬圖片鏈接
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// Regular-expression sources used by the crawler examples. They are kept as
// plain pattern strings (not compiled objects) because other code in the
// package compiles them itself, e.g. regexp.MustCompile(reImg).
var (
	// reQQEmail matches a QQ mailbox; group 1 is the QQ number (\d is a digit).
	// The dot is escaped so that it only matches a literal ".".
	reQQEmail = `(\d+)@qq\.com`
	// reEmail matches a generic e-mail address with an optional second-level suffix.
	reEmail = `\w+@\w+\.\w+(\.\w+)?`
	// reLink matches an absolute http(s) hyperlink; group 1 is the URL.
	// [^"]+ keeps the match inside the quoted attribute value.
	reLink = `href="(https?://[^"]+)"`
	// rePhone matches an 11-digit mobile number, optionally split 3-4-4 by whitespace.
	rePhone = `1[3456789]\d\s?\d{4}\s?\d{4}`
	// reIdcard matches an 18-character ID card number laid out as
	// 6-digit region, 4-digit year, 2-digit month, 2-digit day, 3-digit
	// sequence and a check character, e.g. 410222 1987 06 13 4038.
	reIdcard = `[1-8]\d{5}(19\d{2}|20\d{2})(0[1-9]|1[012])(0[1-9]|[12]\d|3[01])\d{3}[\dXx]`
	// reImg matches a double-quoted image URL; group 1 is the URL without quotes.
	reImg = `"(https?://[^"]+?(\.((jpg)|(jpeg)|(png)|(gif)|(ico))))"`
)

// main2 is the manual entry point for the examples below; uncomment the call
// you want to try.
func main2() {
	//1. crawl e-mail addresses
	//GetEmail()
	//2. crawl e-mail addresses through the extracted helper
	//GetEmail2("http://tieba.baidu.com/p/2544042204")
	//3. crawl hyperlinks
	//GetLink("http://www.baidu.com/s?wd=島國%20留下郵箱")
	//4. crawl mobile numbers
	//GetPhone("http://www.zhaohaowang.com/")
	//5. crawl ID card numbers
	//GetIdcard("http://henan.qq.com/a/20171107/069413.htm")
	//6. crawl image links
	//GetImg("http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=%E7%BE%8E%E5%A5%B3")
}

// findMatches compiles pattern and returns every match in text together with
// its capture groups. It panics if pattern is not a valid expression.
func findMatches(pattern string, text string) [][]string {
	return regexp.MustCompile(pattern).FindAllStringSubmatch(text, -1)
}

// GetEmail downloads a fixed forum page, prints it, and prints every QQ
// e-mail address found on it together with the QQ number.
func GetEmail() {
	pageStr := GetPageStr("http://tieba.baidu.com/p/2544042204")
	fmt.Println(pageStr)

	for _, result := range findMatches(reQQEmail, pageStr) {
		// result[0] is the whole address, result[1] the captured QQ number.
		fmt.Printf("email=%s qq=%s\n", result[0], result[1])
	}
}

// HandleError prints why followed by err when err is non-nil; it does nothing
// otherwise. It never aborts, so callers must still stop on failure themselves.
func HandleError(err error, why string) {
	if err != nil {
		fmt.Println(why, err)
	}
}

// GetEmail2 prints every e-mail address (with capture groups) found on the
// page at url.
func GetEmail2(url string) {
	pageStr := GetPageStr(url)
	for _, result := range findMatches(reEmail, pageStr) {
		fmt.Println(result)
	}
}

// GetPageStr fetches url and returns the response body as a string.
// If the request or the read fails, the error is reported through HandleError
// and the empty string is returned.
func GetPageStr(url string) (pageStr string) {
	resp, err := http.Get(url)
	if err != nil {
		// resp is nil here; touching resp.Body would panic.
		HandleError(err, "http.Get url")
		return ""
	}
	defer resp.Body.Close()

	pageBytes, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		HandleError(err, "ioutil.ReadAll")
		return ""
	}
	return string(pageBytes)
}

// GetLink prints the page at url, then the number of absolute hyperlinks on
// it, then each link URL.
func GetLink(url string) {
	pageStr := GetPageStr(url)
	fmt.Println(pageStr)

	results := findMatches(reLink, pageStr)
	fmt.Printf("找到%d條結果:\n", len(results))
	for _, result := range results {
		fmt.Println(result[1])
	}
}

// GetPhone prints the page at url, then the number of mobile numbers on it,
// then each match.
func GetPhone(url string) {
	pageStr := GetPageStr(url)
	fmt.Println(pageStr)

	results := findMatches(rePhone, pageStr)
	fmt.Printf("找到%d條結果:\n", len(results))
	for _, result := range results {
		fmt.Println(result)
	}
}

// GetIdcard prints the page at url, then the number of ID card numbers on it,
// then each match with its capture groups.
func GetIdcard(url string) {
	pageStr := GetPageStr(url)
	fmt.Println(pageStr)

	results := findMatches(reIdcard, pageStr)
	fmt.Printf("找到%d條結果:\n", len(results))
	for _, result := range results {
		fmt.Println(result)
	}
}

// GetImg prints the page at url, then the number of image links on it, then
// each image URL.
func GetImg(url string) {
	pageStr := GetPageStr(url)
	fmt.Println(pageStr)

	results := findMatches(reImg, pageStr)
	fmt.Printf("找到%d條結果:\n", len(results))
	for _, result := range results {
		// Group 1 is the URL itself; result[0] would include the quotes.
		fmt.Println(result[1])
	}
}
3. 並發爬取美圖
- http://www.umei.cc/bizhitupian/meinvbizhi/1.htm
- 基本分析:
- 先測試獲取頁面所有內容
- 完成圖片下載
- 並發爬蟲分析:
- 初始化數據通道(2個)
- 爬蟲協程:65個協程向管道中添加圖片鏈接
- 任務統計協程:檢查65個任務是否都完成,完成就關閉通道
- 下載協程:從管道中讀取鏈接並下載
package main import ( "fmt" "net/http" "io/ioutil" "sync" "strconv" "regexp" "strings" "time" ) //測試是否能拿到數據 func myTest() { //1.獲取頁面內容 pageStr := GetPageStr("http://www.umei.cc/bizhitupian/meinvbizhi/1.htm") fmt.Println(pageStr) //2.獲取圖片鏈接 GetImg("http://www.umei.cc/bizhitupian/meinvbizhi/1.htm") } //圖片下載 func TestDownloadImg() { ok := DownloadFile("http://i1.whymtj.com/uploads/tu/201903/9999/rne35bbd2303.jpg", "1.jpg") if ok { fmt.Println("下載成功") } else { fmt.Println("下載失敗") } } //下載 func DownloadFile(url string, filename string) (ok bool) { //發請求 resp, err := http.Get(url) if err != nil { HandleError(err, "http.Get") return } //關閉資源 defer resp.Body.Close() //讀取響應內容 fBytes, e := ioutil.ReadAll(resp.Body) HandleError(e, "ioutil resp.Body") //拼接 filename = "D:/go_work/src/goapp01/07/img/" + filename //寫入硬盤 err = ioutil.WriteFile(filename, fBytes, 644) HandleError(err, "http.GetWrite") if err != nil { return false } else { return true } } var ( //存圖片鏈接的數據通道,string chanImageUrls chan string //監控通道 chanTask chan string waitGroup sync.WaitGroup ) func main() { //myTest() //TestDownloadImg() //1.初始化數據通道 chanImageUrls = make(chan string, 1000000) chanTask = make(chan string, 65) //2.爬蟲協程 for i := 1; i < 66; i++ { waitGroup.Add(1) //獲取某個頁面所有圖片鏈接 //strconv.Itoa(i):將整數轉為字符串 go getImgUrls("http://www.umei.cc/bizhitupian/weimeibizhi/" + strconv.Itoa(i) + ".htm") } //3.任務統計協程 waitGroup.Add(1) go CheckOk() //4.下載協程 //少開幾個下載協程,開5個 for i := 0; i < 5; i++ { waitGroup.Add(1) //下載 go DownloadImg() } waitGroup.Wait() } //爬當前頁所有圖片鏈接,並添加到管道 func getImgUrls(url string) { //爬當前頁所有圖片鏈接 urls := getImgs(url) //添加到管道 for _, url := range urls { chanImageUrls <- url } //標志當前協程任務完成 chanTask <- url waitGroup.Done() } //拿圖片鏈接 func getImgs(url string) (urls []string) { //根據url取內容 pageStr := GetPageStr(url) //獲取正則對象 re := regexp.MustCompile(reImg) results := re.FindAllStringSubmatch(pageStr, -1) fmt.Printf("找到%d條結果:\n", len(results)) for _, result := range results { //fmt.Println(result) 
//fmt.Println(result) url := result[1] urls = append(urls, url) } return } //監控65個任務是否完成,完成則關閉通道 func CheckOk() { //計數 var count int for { url := <-chanTask fmt.Printf("%s 完成爬取任務\n", url) count++ if count == 65 { close(chanImageUrls) break } } waitGroup.Done() } //下載圖片 func DownloadImg() { for url := range chanImageUrls { //得到全路徑 filename := GetFilenameFromUrl(url, "D:/go_work/src/goapp01/07/img/") //保存到硬盤 ok := DownloadFile(url, filename) if ok { fmt.Printf("%s 下載成功\n", filename) } else { fmt.Printf("%s 下載失敗\n", filename) } } } //拼接文件名 func GetFilenameFromUrl(url string, dirPath string) (filename string) { //strings包的方法,截取最后一個/ lastIndex := strings.LastIndex(url, "/") filename = url[lastIndex+1:] //加一個時間戳,防止重名 timePrefix := strconv.Itoa(int(time.Now().UnixNano())) filename = timePrefix + "_" + filename filename = dirPath + filename return }