This script is adapted and optimized from code that other people have shared online.
Environment setup:
pip install requests
pip install lxml
pip install bs4
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import os
import random


class mzitu():

    def all_url(self, url):
        html = self.request(url)
        all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find_all('a')
        for a in all_a:
            title = a.get_text()
            print(u'Saving:', title)
            title = title.replace(':', '')
            path = str(title).replace("?", '_')
            if not self.mkdir(path):  ## skip folders that already exist
                print(u'Already skipped:', title)
                continue
            href = a['href']
            self.html(href)

    def html(self, href):
        html = self.request(href)
        max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
        for page in range(1, int(max_span) + 1):
            page_url = href + '/' + str(page)
            self.img(page_url)

    def img(self, page_url):
        img_html = self.request(page_url)
        img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
        self.save(img_url, page_url)

    def save(self, img_url, page_url):
        name = img_url[-9:-4]
        try:
            img = self.requestpic(img_url, page_url)
            f = open(name + '.jpg', 'ab')
            f.write(img.content)
            f.close()
        except FileNotFoundError:  ## catch the exception and keep going
            print(u'Image does not exist, skipped:', img_url)
            return False

    def mkdir(self, path):  ## create the album folder
        path = path.strip()
        isExists = os.path.exists(os.path.join("D:\\mzitu", path))
        if not isExists:
            print(u'Created a folder named', path)
            path = path.replace(':', '')
            os.makedirs(os.path.join("D:\\mzitu", path))
            os.chdir(os.path.join("D:\\mzitu", path))  ## switch into the new directory
            return True
        else:
            print(u'A folder named', path, u'already exists!')
            return False

    def requestpic(self, url, Referer):  ## fetch an image response and return it
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]
        ua = random.choice(user_agent_list)
        headers = {'User-Agent': ua, "Referer": Referer}  ## compared with the earlier version, the Referer header here is the key to fetching the images
        content = requests.get(url, headers=headers)
        return content

    def request(self, url):  ## fetch a page response and return it
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
        content = requests.get(url, headers=headers)
        return content


Mzitu = mzitu()  ## instantiate the crawler
Mzitu.all_url('http://www.mzitu.com/all/')  ## pass the start URL to all_url - this is the crawler's entry point
print(u'Congratulations, the download is complete!')
Running the script:
If you run the code again, albums that have already been saved are skipped instead of being downloaded a second time (see the sketch below).
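The skip happens at the folder level: mkdir() returns False when the album folder already exists, so all_url() moves on to the next album. If you also want to skip individual images inside a folder, a minimal sketch (not part of the original script; it reuses the naming rule from save() and assumes the current directory is the album folder) could look like this:

import os

def already_saved(img_url):
    # same file-name rule as save(): the 5 characters before the extension
    name = img_url[-9:-4]
    return os.path.exists(name + '.jpg')

# one could then call this at the top of img()/save(), for example:
# if already_saved(img_url):
#     return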
Troubleshooting common errors:
1. Error message: requests.exceptions.ChunkedEncodingError: ("Connection broken: ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None)", ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Cause: a burst of requests triggered the site's anti-scraping protection, which cut the connection.
Fix: wait a while and run the script again (a throttling sketch is shown after this list).
2. The same ChunkedEncodingError / ConnectionResetError (10054) as above.
Cause: the target server may have anti-scraping measures in place.
Fix: add request headers manually when calling requests (see the sketch after this list).
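For the first case, besides simply waiting and rerunning, adding a random pause between requests can reduce the chance of triggering the block again. A minimal sketch, not part of the original script; the delay range is only an assumption:

import time
import random
import requests

def polite_get(url, headers=None, min_delay=1.0, max_delay=3.0):
    # sleep for a random interval before every request to spread out the load
    time.sleep(random.uniform(min_delay, max_delay))
    return requests.get(url, headers=headers)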
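For the second case, headers are passed to requests through the headers argument; the script above already does this in request() and requestpic(). A minimal standalone sketch, where the User-Agent and Referer values are only examples:

import requests

headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                  "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Referer': "http://www.mzitu.com/",
}
response = requests.get("http://www.mzitu.com/all/", headers=headers)
print(response.status_code)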