
# 廢話不多, 直接上代碼, python3.6:
import requests
from bs4 import BeautifulSoup
import os
import time;
import random
#pip install BeautifulSoup4 -i https://pypi.douban.com/simple
#pip install requests -i https://pypi.douban.com/simple
# HTTP request headers sent with every HTML page request made by this script.
Hostreferer = {
    'Referer': 'http://www.mzitu.com',
    'Upgrade-Insecure-Requests': '1',
    # Desktop Chrome 69 on Windows 10; split across lines only for readability.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}
# Headers sent when downloading the image files themselves. Per the original
# author's note, the Referer is what gets past the image host's hotlink
# protection.
#
# Alternatives that were tried earlier and are kept here for reference:
#   'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'
#   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3679.0 Safari/537.36'
#   'Referer': 'http://i.meizitu.net'
# Example of an image page URL: https://www.mzitu.com/224497/3
Picreferer = {
    'Referer': 'http://www.mzitu.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}
all_url = 'https://www.mzitu.com'
# Request the site's home page once and parse it, so the total number of
# listing pages can be read from its pagination links.
# The timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() turns an HTTP error into an explicit exception
# instead of letting an error page fail later with an unrelated IndexError.
start_html = requests.get(all_url, headers=Hostreferer, timeout=30)
start_html.raise_for_status()
soup = BeautifulSoup(start_html.text, "html.parser")
page = soup.find_all('a', class_='page-numbers')
if len(page) < 2:
    raise RuntimeError(
        'expected at least two "page-numbers" links on %s, found %d'
        % (all_url, len(page))
    )
# Highest listing-page number: the text of the second-to-last pagination link.
# NOTE(review): presumably the last link is a "next page" control - confirm
# against the live markup.
max_page = page[-2].text
# Main crawl: walk every listing page, open each album linked from it, and
# save every image of the album into a per-album directory under `path`.

# Seconds to wait for a single HTTP response before giving up.
REQUEST_TIMEOUT = 30
# Characters that are not allowed in Windows file/directory names; they are
# removed from album titles before the title is used as a directory name.
_INVALID_DIR_CHARS = str.maketrans('', '', '\\/:*?"<>|')


def _fetch_soup(page_url):
    """Download ``page_url`` with the page headers and return the parsed HTML.

    Raises ``requests.HTTPError`` on a 4xx/5xx response so that an error page
    is not parsed as if it were real content.
    """
    response = requests.get(page_url, headers=Hostreferer, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def _album_dir(root, title):
    """Return the directory for an album: ``root`` + the sanitised title."""
    return root + title.strip().translate(_INVALID_DIR_CHARS)


path = 'D:/mzitu/'  # storage root; album directories are created below it

for n in range(1, int(max_page) + 1):
    # Page 1 is the bare site URL; later pages live under /page/<n>/.
    all_url = 'https://www.mzitu.com'
    if n != 1:
        all_url = all_url + "/page/" + str(n) + "/"
    print('開始爬第 %s 頁, 網址是 %s' % (n, all_url))
    soup = _fetch_soup(all_url)
    # Album links are the target="_blank" anchors inside the element id="pins".
    hrefs = soup.find(id='pins').find_all('a', target='_blank')
    for href in hrefs:
        imgs = href.find('img', class_='lazy')
        if imgs is None:
            # NOTE(review): this stops processing the rest of the listing page
            # at the first anchor without a lazy-loaded thumbnail. Kept as in
            # the original; `continue` may have been intended - confirm.
            break
        alt = imgs.get('alt')
        url = href.get('href')
        soup2 = _fetch_soup(url)
        page2 = soup2.find('div', class_='pagenavi').find_all('a')
        # Number of image pages in this album: second-to-last pager link.
        max_page2 = page2[-2].text
        # Build the album directory from the fixed root every time. The
        # original appended to `path` itself, so each later album's title
        # was concatenated onto the previous album's directory name.
        album_dir = _album_dir(path, alt)
        os.makedirs(album_dir, exist_ok=True)
        # +1 so the last image page is included, matching how the outer loop
        # treats max_page (the original range stopped one page short).
        for m in range(1, int(max_page2) + 1):
            # Random 1-5 second pause between requests.
            time.sleep(random.randint(1, 5))
            url3 = url + '/' + str(m) + '/'
            print('開始爬→%s' % url3)
            soup3 = _fetch_soup(url3)
            # The image sits at div.main-image > a > img.
            picSrc = soup3.find('div', class_='main-image').find('a').find('img').get('src')
            html = requests.get(picSrc, headers=Picreferer, timeout=REQUEST_TIMEOUT)
            if not html.ok:
                # Do not save an error response body under an image file name.
                print('圖片下載失敗 (HTTP %s): %s' % (html.status_code, picSrc))
                continue
            # File name is the last path segment of the image URL.
            file_name = album_dir + '/' + picSrc.split('/')[-1]
            # `with` closes the file even if the write fails.
            with open(file_name, 'wb') as f:
                f.write(html.content)
            print('圖片保存到%s' % file_name)
