# 導入庫
import os
import time

import requests
from bs4 import BeautifulSoup
# 生成請求headers
def res_headers():
    """Build the HTTP request headers used by this scraper.

    Returns:
        A newly created dict containing the ``User-Agent`` and ``Referer``
        header values.
    """
    return {
        'User-Agent': 'Mozilla/5.0 ',
        'Referer': 'https://i5.meizitu.net/pfiles/style.css?091102',
    }
# 網站請求
def get_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    headers = res_headers()
    # Use a short-lived session and close it on exit so its pooled
    # connection is released. The former ``keep_alive = False`` assignment
    # was dropped: Session has no such setting, so it never had any effect.
    with requests.Session() as session:
        res = session.get(url, headers=headers, timeout=timeout)
        # Fail here with a clear HTTP error rather than handing an error
        # page to the HTML parser.
        res.raise_for_status()
        return res.text
# 獲取頁面all girls的詳情頁url
def get_all_girls(url):
    """Collect the detail-page URLs linked from the archive page.

    Args:
        url: Address of the archive page.

    Returns:
        A list of ``href`` values, in document order, taken from the ``<a>``
        tags inside the element whose class is ``archives``. The list is
        empty when the page contains no such element.
    """
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')

    archive = soup.find(class_='archives')
    if archive is None:
        # The container is missing (presumably an error or blocked
        # response); report "no albums" instead of raising AttributeError.
        return []

    # href=True skips anchors without an href, which would otherwise raise
    # KeyError on lookup.
    return [anchor['href'] for anchor in archive.find_all('a', href=True)]
# 獲取girl的所有圖片url
def get_girl_all_page(url):
    """Collect every image URL of one album and download the album.

    The album's first page supplies the page count and the title. Each
    numbered sub-page (``url/1`` .. ``url/N``) is then fetched and the
    ``src`` of its first ``<img>`` tag is recorded. The collected URLs are
    passed to ``download_Pic`` together with the title.

    Args:
        url: Address of the album's detail page.

    Raises:
        AttributeError, IndexError, ValueError: If the first page lacks the
            expected ``pagenavi`` / ``main-title`` markup or the page count
            is not an integer.
    """
    html = get_page(url)
    soup = BeautifulSoup(html, 'lxml')

    # The page count is the <span> text of the SECOND-to-last link inside
    # the 'pagenavi' element (index -2). Presumably the last link is the
    # "next page" button -- TODO confirm against the live markup.
    max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
    title = soup.find(class_='main-title').string

    pic_url_list = []
    for page_no in range(1, int(max_page) + 1):
        page_html = get_page(url + "/%s" % page_no)
        img = BeautifulSoup(page_html, 'lxml').find('img')
        pic_url = img.get('src') if img is not None else None
        # Skip a page with no usable image instead of aborting the whole
        # album after most of its pages have already been fetched.
        if pic_url:
            pic_url_list.append(pic_url)
        # Brief pause between page requests.
        time.sleep(0.1)

    download_Pic(title, pic_url_list)
# 下載圖片,以標題為文件夾名
def download_Pic(title, pic_url_list):
    """Download every image in *pic_url_list* into a folder named *title*.

    Files are written as ``<title>/1.jpg``, ``<title>/2.jpg``, ... in list
    order, and a progress line is printed for each image.

    Args:
        title: Album title; used as the folder name.
        pic_url_list: Image URLs to download.

    Raises:
        requests.RequestException: If an image request fails or times out.
        OSError: If the folder or a file cannot be created.
    """
    # exist_ok lets an interrupted run be restarted without failing on the
    # folder that the previous run already created.
    os.makedirs(title, exist_ok=True)
    headers = res_headers()

    for j, item in enumerate(pic_url_list, start=1):
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        # Fetch before opening the file so a failed request does not leave
        # an empty .jpg behind.
        img = requests.get(item, headers=headers, timeout=30).content
        with open(filename, 'wb') as f:
            f.write(img)

    # NOTE(review): the original indentation was lost, so it is unclear
    # whether this pause ran per image or per album; it is applied once per
    # album here -- confirm the intended placement.
    time.sleep(100)
# 主程序
if __name__ == '__main__':
    # Read the archive index, then process each linked album in order.
    for album_url in get_all_girls("https://www.mzitu.com/all"):
        get_girl_all_page(album_url)
*本文根據崔老師視頻及自己實際測試得出,仍存在請求的問題,有待後續改進。