20200311_最新爬取mzitu


 

 廢話不多, 直接上代碼, python3.6:

import requests
from bs4 import BeautifulSoup
import os
import time;
import random

#pip install BeautifulSoup4 -i  https://pypi.douban.com/simple
#pip install requests -i  https://pypi.douban.com/simple

# HTTP request headers used when fetching HTML pages. The Referer is
# required because the site rejects requests that appear to come from
# outside www.mzitu.com; the User-Agent makes the request look like a
# desktop Chrome browser.
Hostreferer = {
    'Referer': 'http://www.mzitu.com',
    
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}

# Headers used when downloading the image files themselves; this Referer
# defeats the site's hot-link (anti-leech) protection.
Picreferer = {
    # 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3679.0 Safari/537.36',
    # 'Referer': 'http://i.meizitu.net',
    # https://www.mzitu.com/224497/3
    'Referer': 'http://www.mzitu.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
# Fetch the site's front page once so we can discover how many listing
# pages there are in total.
all_url = 'https://www.mzitu.com'
start_html = requests.get(all_url, headers=Hostreferer)

# Parse the returned HTML so the pagination bar can be inspected.
soup = BeautifulSoup(start_html.text, "html.parser")
page_links = soup.find_all('a', class_='page-numbers')
# The last pagination anchor is "next"; the one before it carries the
# highest page number.
max_page = page_links[-2].text
# Crawl every listing page, every gallery on it, and every image page of
# each gallery, saving the images under D:/mzitu/<gallery title>/.
for n in range(1, int(max_page) + 1):
    base_dir = 'D:/mzitu/'  # root directory for all downloads
    list_url = 'https://www.mzitu.com'
    if n != 1:
        # Listing pages after the first live under /page/<n>/.
        list_url = list_url + "/page/" + str(n) + "/"
    print('開始爬第 %s 頁, 網址是 %s' % (n, list_url))
    list_html = requests.get(list_url, headers=Hostreferer)
    list_soup = BeautifulSoup(list_html.text, "html.parser")
    # Every gallery on a listing page appears as <a target="_blank"> inside
    # the element with id="pins" (each gallery contributes a thumbnail
    # anchor and a title-only anchor).
    hrefs = list_soup.find(id='pins').find_all('a', target='_blank')

    for href in hrefs:
        imgs = href.find('img', class_='lazy')
        if imgs is None:
            # Title-only anchors carry no thumbnail; skip them. (The
            # original `break` aborted the whole listing page after the
            # first gallery.)
            continue
        alt = imgs.get('alt')    # gallery title, used as directory name
        url = href.get('href')   # gallery landing page
        gallery_html = requests.get(url, headers=Hostreferer)
        gallery_soup = BeautifulSoup(gallery_html.text, "html.parser")
        nav_links = gallery_soup.find('div', class_='pagenavi').find_all('a')
        # Second-to-last pagination link holds the gallery's page count.
        max_page2 = nav_links[-2].text
        # Build a fresh directory path per gallery. (The original kept
        # appending to a shared `path` variable, so from the second gallery
        # on, the save path contained every previous gallery's title.)
        gallery_dir = base_dir + alt.strip().replace('?', '')
        if not os.path.exists(gallery_dir):
            os.makedirs(gallery_dir)
        # `+ 1` so the gallery's final page is downloaded too — the
        # original `range(1, int(max_page2))` skipped it.
        for m in range(1, int(max_page2) + 1):
            # Random delay between requests to avoid being rate-limited.
            time.sleep(random.randint(1, 5))
            url3 = url + '/' + str(m) + '/'
            print('開始爬→%s' % url3)
            page_html = requests.get(url3, headers=Hostreferer)
            page_soup = BeautifulSoup(page_html.text, "html.parser")
            # The full-size image sits inside <div class="main-image">.
            pic_src = page_soup.find('div', class_='main-image').find('a').find('img').get('src')
            pic = requests.get(pic_src, headers=Picreferer)
            # File name is the last path segment of the image URL.
            file_name = gallery_dir + '/' + pic_src.split('/')[-1]
            # `with` guarantees the file handle is closed even on error.
            with open(file_name, 'wb') as f:
                f.write(pic.content)
            print('圖片保存到%s' % file_name)
  


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM