用 Python 爬取全網妹子圖片【附源碼筆記】


   這是晚上沒事無聊寫的 Python 爬蟲小程序,專門爬取妹子圖的,養眼用的,嘻嘻!身為程序狗只會這個了!

  廢話不多說,代碼附上,僅供參考學習!

    

"""
功能:爬取妹子圖全網妹子圖片,可以選擇爬取年份,自動分類保存
作者:68喜科技
"""
import requests
from lxml import etree
# import re
import os
# from time import sleep

class Meizitu(object):
    """Download the image galleries listed on the mzitu.com archive page.

    Galleries are selected by year.  Every image of a gallery is written to
    ``./妹子圖/<category><image-url-directory><title>/<title><n>.jpg``.
    """

    # Browser-like User-Agent sent with both page and image requests.
    _USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"
    # Seconds to wait for a response; without a timeout requests can block
    # forever on a stalled connection.
    _TIMEOUT = 30
    # The N-th ``div.year`` block on the archive page is addressed as
    # ``_YEAR_INDEX_BASE - year`` (so 2018 is block 1).
    # NOTE(review): this presumes the page lists years newest-first starting
    # at 2018 -- confirm against the live page before relying on it.
    _YEAR_INDEX_BASE = 2019
    # Characters that are illegal in file names on common filesystems.
    _UNSAFE_CHARS = str.maketrans('\\/:*?"<>|', "_" * 9)

    def __init__(self, year):
        """Store the archive URL, the default request headers and *year*."""
        self.url = "http://www.mzitu.com/all/"
        self.headers = {"User-Agent": Meizitu._USER_AGENT}
        self.year = year

    @staticmethod
    def _safe_name(name):
        """Return *name* with filesystem-unsafe characters replaced by ``_``."""
        return name.translate(Meizitu._UNSAFE_CHARS)

    @staticmethod
    def _image_dir(first_img_url):
        """Return the directory part of an image URL's path, e.g. ``/2018/02/``."""
        from urllib.parse import urlsplit

        return urlsplit(first_img_url).path.rpartition("/")[0] + "/"

    @staticmethod
    def _count_images(detail_html_content):
        """Return the number of images in a gallery, read from its pager.

        The largest numeric label among the pager links is used instead of a
        fixed list position, so short galleries do not raise ``IndexError``.
        At least 1 is returned because the caller has already found the
        first image on the page.
        """
        labels = detail_html_content.xpath("//div[@class='pagenavi']/a/span/text()")
        numbers = [int(text) for text in (label.strip() for label in labels) if text.isdecimal()]
        return max(numbers, default=1)

    def get_page(self, url, headers):
        """Fetch *url* and return the response body decoded as text.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-success HTTP status.
        """
        response = requests.get(url, headers=headers, timeout=self._TIMEOUT)
        response.raise_for_status()
        return response.content.decode()

    def get_detail_urls_list(self, page_content, year):
        """Extract the gallery URLs listed under *year* on the archive page.

        Returns:
            A list of URLs, or ``None`` when the page has fewer year blocks
            than the one requested.
        """
        html_content = etree.HTML(page_content)
        year_list = html_content.xpath("//div[@class='year']/text()")
        index = self._YEAR_INDEX_BASE - year
        if index > len(year_list):
            return None
        # Gallery links live in the element that directly follows the year heading.
        xpath_var = "//div[@class='year'][{}]/following-sibling::*[1]//p[@class='url']/a/@href".format(index)
        return html_content.xpath(xpath_var)

    def save_path(self, detail_html_content, first_img_url, img_name):
        """Build the gallery's target directory, create it and return it.

        The path is made of the second breadcrumb entry of the detail page,
        the directory part of the image URL and the sanitised gallery title.
        """
        path_prefix1 = detail_html_content.xpath("//div[@class='currentpath']/a/text()")[1]
        path_prefix2 = self._image_dir(first_img_url)
        save_path = "./妹子圖/" + path_prefix1 + path_prefix2 + self._safe_name(img_name) + "/"

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(save_path, exist_ok=True)
        return save_path

    def save_img(self, img_url, img_headers, img_save_path):
        """Download *img_url* and write its bytes to *img_save_path*.

        Raises:
            requests.RequestException: on connection errors, timeouts or a
                non-success HTTP status; nothing is written in that case, so
                error pages are never stored as ``.jpg`` files.
        """
        response = requests.get(img_url, headers=img_headers, timeout=self._TIMEOUT)
        response.raise_for_status()
        with open(img_save_path, "wb") as f:
            f.write(response.content)

    def img_url(self, first_img_url, img_index):
        """Return the URL of image number *img_index* of a gallery.

        Image URLs look like ``http://i.meizitu.net/2018/02/18c01.jpg``; only
        the two-digit counter before ``.jpg`` changes between images.
        """
        first_suffix = "01.jpg"
        if first_img_url.endswith(first_suffix):
            # Works for any scheme or host length, not just the 32-char prefix.
            prefix = first_img_url[:-len(first_suffix)]
        else:
            # Unknown shape: keep the original fixed-offset behaviour.
            prefix = first_img_url[:32]
        return "{}{:02d}.jpg".format(prefix, img_index)

    def img_headers(self, url, img_index):
        """Return the request headers for image number *img_index*.

        The Referer is the gallery page that shows the image: *url* itself
        for the first image, ``url/<n>`` for the others.
        """
        refer_url = url if img_index == 1 else url + "/" + str(img_index)
        return {
            "Host": "i.meizitu.net",
            "Referer": refer_url,
            "User-Agent": Meizitu._USER_AGENT,
        }

    def get_img_urls(self, url, detail_html_content, first_img_url, img_name, save_path):
        """Download every image of one gallery into *save_path*.

        A failed download is reported and skipped so that one missing image
        does not abort the rest of the gallery.
        """
        img_total_num = self._count_images(detail_html_content)
        file_stem = save_path + self._safe_name(img_name)

        for img_index in range(1, img_total_num + 1):
            img_url = self.img_url(first_img_url, img_index)
            img_headers = self.img_headers(url, img_index)
            img_save_path = file_stem + str(img_index) + ".jpg"
            try:
                self.save_img(img_url, img_headers, img_save_path)
            except requests.RequestException as err:
                print("Skipping image {}: {}".format(img_url, err))

    def get_image(self, detail_urls_list):
        """Download all galleries whose detail-page URLs are given.

        ``None`` or an empty list is accepted and means there is nothing to
        do.  Galleries that cannot be fetched or parsed are reported and
        skipped.
        """
        if not detail_urls_list:
            return

        for url in detail_urls_list:
            try:
                detail_page = self.get_page(url, headers=self.headers)
            except requests.RequestException as err:
                print("Skipping gallery {}: {}".format(url, err))
                continue
            detail_html_content = etree.HTML(detail_page)

            first_img_urls = detail_html_content.xpath("//div[@class='main-image']/p/a/img/@src")
            img_names = detail_html_content.xpath("//h2[@class='main-title']/text()")
            if not first_img_urls or not img_names:
                print("Skipping gallery {}: first image or title not found".format(url))
                continue
            first_img_url = first_img_urls[0]
            img_name = img_names[0]

            save_path = self.save_path(detail_html_content, first_img_url, img_name)
            self.get_img_urls(url, detail_html_content, first_img_url, img_name, save_path)

    def run_spider(self):
        """Fetch the archive page and download every gallery of ``self.year``."""
        page_content = self.get_page(self.url, self.headers)
        detail_urls_list = self.get_detail_urls_list(page_content, self.year)
        if detail_urls_list is None:
            print("No galleries are listed for year {}".format(self.year))
            return
        self.get_image(detail_urls_list)

if __name__ == "__main__":
    # Prompt for the year to crawl, build the crawler for it and start it.
    # int() raises ValueError when the input is not a whole number.
    Meizitu(int(input("請輸入您要爬取的年份:"))).run_spider()

  

    


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM