廢話不多, 直接上代碼, python3.6:
"""Scraper for mzitu.com.

Walks every listing page of the site, opens each album found there, and
downloads every image page of the album into ``D:/mzitu/<album title>/``.

pip install BeautifulSoup4 -i https://pypi.douban.com/simple
pip install requests -i https://pypi.douban.com/simple
"""
import os
import random
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.mzitu.com'
SAVE_ROOT = 'D:/mzitu/'  # download root; one sub-directory per album

# HTTP headers for page requests; the Referer defeats the site's
# hotlink protection ("破解盜圖鏈接" in the original).
HOST_HEADERS = {
    'Referer': 'http://www.mzitu.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'),
}
# Headers for the image download itself (identical in the original).
PIC_HEADERS = dict(HOST_HEADERS)


def _get_soup(url):
    """Fetch *url* with the standard headers and return parsed HTML."""
    resp = requests.get(url, headers=HOST_HEADERS)
    return BeautifulSoup(resp.text, "html.parser")


def _max_listing_page():
    """Return the highest listing-page number shown on the front page."""
    soup = _get_soup(BASE_URL)
    pages = soup.find_all('a', class_='page-numbers')
    # The last pagination anchor is "next"; the one before it is the count.
    return int(pages[-2].text)


def _download_album(album_url, title):
    """Download every image of one album into its own directory.

    :param album_url: URL of the album's first page.
    :param title: album title (the img ``alt``), used as directory name.
    """
    # BUG FIX: the original appended each title onto a shared ``path``
    # variable, so every album after the first was saved under a
    # directory name that concatenated all previous titles.
    album_dir = SAVE_ROOT + title.strip().replace('?', '')
    if not os.path.exists(album_dir):
        os.makedirs(album_dir)

    soup = _get_soup(album_url)
    nav_links = soup.find('div', class_='pagenavi').find_all('a')
    last_page = int(nav_links[-2].text)

    # BUG FIX: the original used range(1, last_page) and never fetched
    # the album's final image page.
    for page_no in range(1, last_page + 1):
        time.sleep(random.randint(1, 5))  # throttle: be polite to the server
        page_url = album_url + '/' + str(page_no) + '/'
        print('開始爬→%s' % page_url)
        page_soup = _get_soup(page_url)
        img_src = (page_soup.find('div', class_='main-image')
                   .find('a').find('img').get('src'))
        img_resp = requests.get(img_src, headers=PIC_HEADERS)
        file_name = album_dir + '/' + img_src.split('/')[-1]
        # BUG FIX: use a context manager so the file handle is closed
        # even if the write raises.
        with open(file_name, 'wb') as fh:
            fh.write(img_resp.content)
        print('圖片保存到%s' % file_name)


def main():
    """Crawl every listing page and every album it links to."""
    for page_no in range(1, _max_listing_page() + 1):
        if page_no == 1:
            listing_url = BASE_URL
        else:
            listing_url = BASE_URL + "/page/" + str(page_no) + "/"
        print('開始爬第 %s 頁, 網址是 %s' % (page_no, listing_url))

        soup = _get_soup(listing_url)
        for anchor in soup.find(id='pins').find_all('a', target='_blank'):
            img = anchor.find('img', class_='lazy')
            if img is None:
                # BUG FIX: the original ``break`` abandoned the rest of
                # the listing page at the first text-only anchor; each
                # album emits one image anchor and one text anchor, so
                # skip the text one and keep going.
                continue
            _download_album(anchor.get('href'), img.get('alt'))


if __name__ == "__main__":
    main()