Python3.6多線程爬蟲


Python版本 3.6

簡單寫一個爬蟲,在寫的過程中熟悉Python語法,不得不說Python用起來真666。

  代碼功能是訪問網站首頁,將所有a標籤的文字作為文件夾名稱,並將當前網頁的所有圖片下載到對應文件夾中;其實還有很多很多需要修改和完善的地方,比如異常處理、多線程、遞歸等;以後有機會再說吧。歡迎拍磚。

 1 # -*- UTF-8 -*-
 2 from urllib import request
 3 from bs4 import BeautifulSoup
 4 import os
 5 import time, threading
 6 
 7 
# Starting value handed to the first CallView call as its exe_count argument.
exe_Count = 1
# hrefs already collected by CallView on any page; used to skip repeated links.
aList = []
10 
def CallView(url, timeout, directoryPath, exe_count):
    """Fetch one page, hand its links to ``ForRequest`` and save its images.

    Every ``<a>`` element that has text and an http(s) target not yet present
    in the module-level ``aList`` is recorded and passed to ``ForRequest`` on
    a new thread. Every distinct ``<img>`` of the page is downloaded into
    ``directoryPath``.

    Args:
        url: Absolute URL of the page to fetch; also the base against which
            relative ``href``/``src`` values are resolved.
        timeout: Timeout in seconds passed to ``urlopen``.
        directoryPath: Directory receiving this page's images. It is created,
            including missing parents, when it does not exist.
        exe_count: Counter forwarded unchanged to ``ForRequest``.

    Returns:
        None. Errors are reported on stdout rather than raised.
    """
    # Function-scope import so the block does not depend on edits elsewhere.
    from urllib.parse import urljoin, urlparse

    try:
        listAvalue = []
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 UBrowser/6.1.2716.5 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
        }
        rep = request.Request(url, headers=headers)
        # The context manager closes the HTTP response once it is parsed.
        with request.urlopen(rep, timeout=timeout) as response:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(response, "html.parser")

        # Collect unseen links as [directory name, absolute URL] pairs.
        # NOTE(review): aList is shared by every crawl thread without a lock;
        # the check-then-append below is not atomic, so a URL can
        # occasionally be queued twice.
        for a in soup.find_all("a"):
            href = a.get("href")
            # Anchors without text or without an href cannot be followed.
            if a.string is None or not href:
                continue
            # Resolve relative links; Request() rejects URLs with no scheme.
            link = urljoin(url, href.strip())
            # Skip javascript:, mailto: and other non-web targets.
            if not link.startswith(("http://", "https://")):
                continue
            if link in aList:
                continue
            aList.append(link)
            listAvalue.append([a.string.strip()[0:11], link])

        # Create the target directory (and parents) only when it is missing,
        # and announce it only when it really is new.
        if not os.path.isdir(directoryPath):
            os.makedirs(directoryPath, exist_ok=True)
            print("新目錄:" + directoryPath)

        # Follow the collected links on a separate thread.
        thread = threading.Thread(target=ForRequest, args=(listAvalue, timeout, directoryPath, exe_count))
        thread.start()

        # Download every distinct image referenced by the page.
        seenImgSrc = set()
        for img in soup.find_all("img"):
            src = img.get("src")
            if not src:
                continue
            imgSrc = urljoin(url, src.strip())
            print(imgSrc)
            if imgSrc in seenImgSrc:
                continue
            seenImgSrc.add(imgSrc)
            # Take the file name from the URL path so a query string does
            # not end up in the name on disk.
            filename = os.path.basename(urlparse(imgSrc).path)
            if not filename:
                continue
            try:
                # Read the whole image before opening the output file so a
                # failed download does not leave an empty file behind.
                with request.urlopen(request.Request(imgSrc), timeout=timeout) as imgResponse:
                    data = imgResponse.read()
                with open(os.path.join(directoryPath, filename), "wb") as o:
                    o.write(data)
            except Exception as e:
                # One broken image must not stop the remaining downloads.
                print("訪問圖片或者寫入本地Error", e)
    except request.HTTPError as e:
        print(e.code)
    except Exception as e:
        print("CallView Error", e)
59 
60 
def ForRequest(listA, timeout, directoryPath, exe_count, max_depth=2):
    """Crawl every link in ``listA`` one level deeper via ``CallView``.

    Args:
        listA: Sequence of ``[name, url]`` pairs. ``name`` becomes the
            sub-directory of ``directoryPath`` used for that page.
        timeout: Forwarded unchanged to ``CallView``.
        directoryPath: Parent directory for the per-link sub-directories.
        exe_count: Current level counter; it is incremented before being
            passed on to ``CallView``.
        max_depth: Level at which crawling stops. Defaults to 2, the limit
            that used to be hard-coded.

    Returns:
        None.
    """
    print("當前已執行:" + str(exe_count) + "")
    # Stop once the level limit is reached. A plain return ends the worker
    # thread cleanly; raising SystemError here only produced a traceback.
    if exe_count >= max_depth:
        return
    exe_count = exe_count + 1

    for info in listA:
        directoryChildPath = directoryPath + "/" + info[0]
        try:
            # exist_ok avoids the exists-then-mkdir race between threads.
            os.makedirs(directoryChildPath, exist_ok=True)
        except OSError as e:
            # A link text that is not a valid directory name should not
            # abort the remaining links.
            print("ForRequest mkdir Error", directoryChildPath, e)
            continue
        CallView(info[1], timeout, directoryChildPath, exe_count)
75 
# Script entry: crawl the start page and save its images under the root folder.
try:
    print("爬蟲開始活動了")
    # urlopen timeouts are given in seconds; the former value of 5000 meant
    # a stalled connection could block for roughly 83 minutes.
    CallView("http://www.xxxxx.com", 5, "D:/PythonTest/Img/素材公社", exe_Count)
    print("爬蟲正在偷偷活動,不要着急哦!")
except Exception as e:
    # Catch Exception rather than everything so Ctrl+C still stops the
    # script, and show what went wrong.
    print("Error", e)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM