# 導入庫
import os
import time

import requests
from bs4 import BeautifulSoup
# 生成請求headers
def res_headers():
    """Build the HTTP headers sent with every request made by this script.

    Returns:
        A new dict with the ``User-Agent`` and ``Referer`` header values.
    """
    user_agent = 'Mozilla/5.0 '
    # NOTE(review): presumably the Referer is needed for the image host to
    # accept the request -- confirm against the site's behaviour.
    referer = 'https://i5.meizitu.net/pfiles/style.css?091102'
    return {'User-Agent': user_agent, 'Referer': referer}
# 網站請求
def get_page(url, timeout=10):
    """Fetch *url* with the script's standard headers and return its text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response body decoded to text (``Response.text``).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Using the session as a context manager closes its pooled connections on
    # exit. The earlier ``s.keep_alive = False`` assignment was removed: it is
    # not a documented Session attribute, so it did not actually disable
    # connection reuse.
    with requests.Session() as session:
        response = session.get(url, headers=res_headers(), timeout=timeout)
        return response.text
# 獲取頁面all girls的詳情頁url
def get_all_girls(url):
    """Collect the detail-page links listed on the archive page at *url*.

    Args:
        url: Address of the archive page.

    Returns:
        A list with the ``href`` value of every ``<a>`` tag found inside the
        first element whose class is ``archives``. The list is empty when the
        page has no such element.
    """
    soup = BeautifulSoup(get_page(url), 'lxml')

    archive = soup.find(class_='archives')
    if archive is None:
        # The page did not contain the expected container (for example an
        # error page was returned); report "no links" instead of crashing
        # with AttributeError.
        return []

    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError on lookup.
    return [anchor['href'] for anchor in archive.find_all('a', href=True)]
# 獲取girl的所有圖片url
def _gallery_page_count(soup):
    """Return the page count shown in the 'pagenavi' block, or None if absent."""
    navigation = soup.find(class_='pagenavi')
    if navigation is None:
        return None
    anchors = navigation.find_all('a')
    if len(anchors) < 2:
        return None
    # The second-to-last anchor holds a <span> whose text is the page count.
    span = anchors[-2].find('span')
    if span is None or span.string is None:
        return None
    try:
        return int(span.string)
    except ValueError:
        return None


def get_girl_all_page(url):
    """Gather the image URL of every page of one gallery and download them.

    The gallery's first page supplies the title (element with class
    ``main-title``) and the number of pages (from the ``pagenavi`` block).
    Each page ``url/1`` .. ``url/N`` is then fetched and the ``src`` of its
    first ``<img>`` tag is collected. The result is handed to
    ``download_Pic``.

    Args:
        url: Address of the gallery's detail page.

    Returns:
        None. If the title or the page count cannot be read from the page, a
        message is printed and nothing is downloaded.
    """
    soup = BeautifulSoup(get_page(url), 'lxml')

    title_tag = soup.find(class_='main-title')
    title = title_tag.string if title_tag is not None else None
    page_count = _gallery_page_count(soup)
    if not title or page_count is None:
        # Unexpected page layout (e.g. an error page): skip this gallery
        # rather than crash on a missing element.
        print('skipped %s : title or page count not found' % url)
        return

    pic_url_list = []
    for page_number in range(1, page_count + 1):
        page_soup = BeautifulSoup(get_page('%s/%s' % (url, page_number)), 'lxml')
        image = page_soup.find('img')
        pic_url = image.get('src') if image is not None else None
        if pic_url:
            # Pages without a usable image are left out so the downloader
            # never receives a None URL.
            pic_url_list.append(pic_url)
        # Short pause between page requests.
        time.sleep(0.1)

    download_Pic(title, pic_url_list)
# 下載圖片,以標題為文件夾名
def download_Pic(title, pic_url_list, delay=100, timeout=10):
    """Download each image in *pic_url_list* into a folder named *title*.

    Images are saved as ``1.jpg``, ``2.jpg``, ... following their position in
    the list.

    Args:
        title: Name of the destination folder; created if it does not exist.
        pic_url_list: Image URLs to download, in order.
        delay: Seconds to pause after each image. Defaults to the 100-second
            pause the script has always used.
        timeout: Seconds to wait for the server on each image request.

    Raises:
        requests.RequestException: If an image request fails or times out.
        OSError: If the folder or a file cannot be created.
    """
    # exist_ok lets a re-run reuse the folder instead of failing with
    # FileExistsError.
    os.makedirs(title, exist_ok=True)
    headers = res_headers()

    for index, pic_url in enumerate(pic_url_list, start=1):
        filename = '%s/%s.jpg' % (title, index)
        print('downloading....%s : NO.%s' % (title, index))

        # Fetch before opening the file so a failed request does not leave an
        # empty file behind.
        response = requests.get(pic_url, headers=headers, timeout=timeout)
        if response.ok:
            with open(filename, 'wb') as image_file:
                image_file.write(response.content)
        else:
            # Do not store an HTTP error body under a .jpg name.
            print('skipped %s : HTTP %s' % (pic_url, response.status_code))

        time.sleep(delay)
# 主程序
if __name__ == '__main__':
    # Script entry point: read the gallery links from the archive page, then
    # process the galleries one after another.
    archive_url = "https://www.mzitu.com/all"
    for gallery_url in get_all_girls(archive_url):
        get_girl_all_page(gallery_url)
# *本文根據崔老師視頻及自己實際測試得出,仍存在請求的問題,有待後續改進
