爬取mzi.com妹子圖片網站(requests庫)


看了崔大佬的文章,寫了這個爬蟲,學習了!原文地址

現在該網站加了反爬機制,不過在headers里加上Referer參數就行了。

以下代碼僅做學習記錄之用:

from bs4 import BeautifulSoup
import requests
import os
import time
# Build the paginated listing URLs and crawl each one.
def get_mzi_page():
    """Read the total page count from the site index, then visit every
    listing page and hand it off to get_mzi_channel()."""
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1"
                      " (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    response = requests.get('http://www.mzitu.com', headers=headers)
    parsed = BeautifulSoup(response.text, 'lxml')
    # The second-to-last pagination anchor holds the last page number.
    nav_links = parsed.select('.nav-links a')
    page_count = nav_links[-2].get_text()
    for page_no in range(1, int(page_count) + 1):
        listing_url = f"http://www.mzitu.com/page/{page_no}/"
        # Progress hint before descending into the listing page.
        print(f"總計{page_count}頁,當前第{page_no}頁:")
        get_mzi_channel(listing_url)


def get_mzi_channel(url):
    """Fetch one listing page and download every gallery it links to.

    url: absolute URL of a listing page produced by get_mzi_page().
    Side effects: ensures D:\\mziPic exists, creates one sub-directory
    per gallery, and delegates image downloading to get_mzi_img().
    """
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1"
                      " (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    # Each gallery ("channel") appears as an <a> inside '#pins li span'.
    channel = soup.select('#pins li span a')
    print(channel)
    # BUG FIX: the original ran os.mkdir('D;\mziPic') — mistyped drive
    # letter — on EVERY loop iteration, which raised FileExistsError from
    # the second gallery onward. makedirs(..., exist_ok=True) is idempotent.
    base_dir = 'D:\\mziPic'
    os.makedirs(base_dir, exist_ok=True)
    # Walk every gallery URL and download its images.
    for count, c in enumerate(channel, start=1):
        channel_name = c.get_text()
        filepath = os.path.join(base_dir, channel_name)
        os.makedirs(filepath, exist_ok=True)
        channel_url = c.get('href')
        print(f"本頁總計{len(channel)}個妹子,當前第{count}個妹子")  # progress hint
        get_mzi_img(filepath, channel_url)


def get_mzi_img(filepath, url):
    """Walk every numbered sub-page of a single gallery and download
    the image shown on each one into *filepath*."""
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1"
                      " (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    # A gallery's pictures are spread across numbered sub-pages; the
    # second-to-last pagination anchor carries the total page count.
    nav = soup.select('.pagenavi a')
    img_page_count = nav[-2].get_text()
    # Visit each sub-page in order; the 1-based index doubles as the
    # saved file name downstream.
    for idx in range(1, int(img_page_count) + 1):
        img_page_url = url + f'/{idx}'
        print(f"本妹子共{img_page_count}圖片,現第{idx}張")
        print("img_page_url(refere):", img_page_url)
        download(filepath, img_page_url, idx)


def download(filepath, img_page_url, count2):
    """Download the single image shown on one gallery sub-page.

    filepath: local directory of the gallery.
    img_page_url: URL of the sub-page; also sent as the Referer because
        the image host answers 403 without it.
    count2: 1-based index used as the saved file name (<count2>.jpg).
    """
    headers = {
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
               }
    # Second header set for the image request itself: the host rejects
    # requests lacking a Referer pointing back at the gallery sub-page.
    headers2 = {'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
               'Connection': 'keep-alive',
               'DNT': '1',
               'Host': 'i.meizitu.net',
               'Referer': img_page_url,
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
               }
    res = requests.get(img_page_url, headers=headers)
    print("res.status_code", res.status_code)
    if res.status_code != 200:
        # Sub-page unavailable; nothing to download.
        return
    soup = BeautifulSoup(res.text, 'lxml')
    img_url = soup.find('div', class_='main-image').find('img').get('src')
    print("圖片地址", img_url)
    res2 = requests.get(img_url, headers=headers2)
    print("res2.status_code:", res2.status_code)
    # ROBUSTNESS: only write the file on a successful response — the
    # original saved 403 error bodies as .jpg files.
    if res2.status_code == 200:
        # BUG FIX: the original opened in append mode ('ab') and called
        # f.close() inside the with-block; re-running the scraper appended
        # a second copy and corrupted the image. 'wb' overwrites cleanly,
        # and the with-statement already closes the file.
        with open(filepath + f'/{count2}.jpg', 'wb') as f:
            f.write(res2.content)
    # time.sleep(1)  # throttle here if the site starts blocking the IP


if __name__ == '__main__':
    # Script entry point: crawl every listing page of the site.
    get_mzi_page()

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM