Python爬取圖片(你懂的)
requests與Bs4
這兩個模塊是本文使用的主要模塊:requests用於發送HTTP請求並獲取網頁內容;bs4全名BeautifulSoup,是編寫Python爬蟲的常用庫之一,主要用來解析HTML標籤。這兩個模塊可以在cmd終端中通過pip安裝(下文代碼使用lxml作為解析器,因此還需要另外執行 pip install lxml):
pip install bs4
pip install requests
代碼實現
import requests
from bs4 import BeautifulSoup
import os
class Mzitu():
    """Crawler that walks the listing pages of www.mzitu.com and saves every
    photo set into its own folder under the current working directory.

    The parsing methods rely on the page layout the script was written
    against (element ids/classes such as ``pins``, ``pagenavi``,
    ``main-image`` and ``page-numbers``); an ``IndexError`` is raised when an
    expected element is missing from a page.
    """

    # Seconds to wait for any single HTTP response; without a timeout a
    # stalled connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Request headers for the main site: per the original author, the
        # HTML pages only need a browser-like User-Agent.
        self.headers = {
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        # Download root: the directory the script is started from.
        self.base_path = os.getcwd()

    def get_url(self, html):
        """Yield the link of every photo set found in one listing page.

        :param html: HTML text of a listing page.
        :return: generator of ``href`` strings taken from the ``<a>`` tags
            inside the first ``<ul id="pins">`` element.
        """
        soup = BeautifulSoup(html, 'lxml')
        pins = soup.find_all('ul', attrs={'id': 'pins'})[0]
        for anchor in pins.find_all('a'):
            yield anchor['href']

    def get_img_url_max(self, url):
        """Return the number of images in a photo set, as page text.

        :param url: URL of the photo set's first page.
        :return: the text of the second-to-last link in the ``pagenavi``
            block (a string; presumably the last link is "next page", so the
            one before it carries the highest page number).
        """
        page = requests.get(url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).text
        soup = BeautifulSoup(page, 'lxml')
        navigation = soup.find_all('div', attrs={'class': 'pagenavi'})[0]
        return navigation.find_all('a')[-2].span.text

    def get_img_url(self, url):
        """Return the ``src`` of the main image shown on a photo-set page.

        :param url: URL of a photo-set page.
        :return: image URL string read from ``div.main-image > p > a > img``.
        """
        page = requests.get(url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).text
        soup = BeautifulSoup(page, 'lxml')
        main_image = soup.find_all('div', attrs={'class': 'main-image'})[0]
        return main_image.p.a.img['src']

    def download_img(self, name, url):
        """Fetch one image and return its raw bytes.

        :param name: photo-set identifier, used to build the Referer header.
        :param url: direct URL of the image file.
        :return: response body as ``bytes``.
        """
        # The image request sends a Referer pointing at the set's own page;
        # presumably the image host rejects requests without it.
        headers = {
            'Accept':'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Connection':'keep-alive',
            'Host': 'i.meizitu.net',
            'Referer': 'http://www.mzitu.com/%s'%name,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        return requests.get(url, headers=headers,
                            timeout=self.REQUEST_TIMEOUT).content

    @staticmethod
    def _numbered_img_urls(first_url, count):
        """Return the URLs of images 1..count (inclusive) of one photo set.

        Every URL is derived from the unchanged ``first_url``, which is
        assumed to end in a two-digit index followed by a four-character
        extension (e.g. ``...01.jpg``). A one-digit index replaces only the
        final digit; a longer index replaces both digits.
        """
        extension = first_url[-4:]
        urls = []
        for index in range(1, count + 1):
            number = str(index)
            cut = -5 if len(number) < 2 else -6
            urls.append(first_url[:cut] + number + extension)
        return urls

    def get_img(self, name, max, img_url):
        """Download all images of one photo set into ``<base_path>/<name>``.

        :param name: photo-set identifier; also the target folder name.
        :param max: number of images in the set (int or numeric string).
            The name shadows the builtin but is kept for compatibility.
        :param img_url: URL of the first image of the set.
        """
        folder = os.path.join(self.base_path, name)
        os.makedirs(folder, exist_ok=True)
        # Images are numbered 1..max inclusive, so the last one is included.
        image_urls = self._numbered_img_urls(img_url, int(max))
        for index, image_url in enumerate(image_urls, start=1):
            target = os.path.join(folder, str(index) + '.jpg')
            content = self.download_img(name, image_url)
            with open(target, 'wb') as f:
                f.write(content)

    def get_html_url_link_max(self):
        """Return the total number of listing pages on the main site.

        :return: the page number as a string, taken from the ``href`` of the
            second-to-last ``a.page-numbers`` link on the home page (assumes
            the href looks like ``http://www.mzitu.com/page/N/``, so that the
            fifth ``/``-separated field is ``N``).
        """
        url = 'http://www.mzitu.com/'
        page = requests.get(url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).text
        soup = BeautifulSoup(page, 'lxml')
        last_page_href = soup.find_all('a', attrs={'class': 'page-numbers'})[-2]['href']
        return last_page_href.split('/')[4]

    def main(self):
        """Crawl every listing page and download every photo set on it."""
        page_count = int(self.get_html_url_link_max())
        for page_number in range(1, page_count + 1):
            # Build the URL of each listing page in turn.
            listing_url = 'http://www.mzitu.com/page/%d/' % page_number
            html = requests.get(listing_url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT).text
            for set_url in self.get_url(html):
                # The last path segment of the set URL names the folder.
                name = set_url.split('/')[-1]
                image_count = self.get_img_url_max(set_url)
                first_img_url = self.get_img_url(set_url)
                self.get_img(name, image_count, first_img_url)
if __name__ == '__main__':
    # Start the full crawl only when the file is executed as a script.
    Mzitu().main()
運行程序後,即可在當前工作目錄下看到不斷有包含圖片的文件夾生成