python爬取圖片並保存到本地


Python爬取圖片(你懂得)

requests與Bs4

這兩個模塊是本文使用的主要模塊:requests用來發送HTTP請求並獲取網頁內容;bs4全名BeautifulSoup,是編寫python爬蟲的常用庫之一,主要用來解析html標籤。下面的代碼使用lxml作為BeautifulSoup的解析器,因此還需要安裝lxml。這些模塊都可以在cmd終端中通過pip下載:

pip install bs4
pip install requests
pip install lxml

代碼實現

import requests
from bs4 import BeautifulSoup
import os
class Mzitu:
    """Crawler that downloads the photo sets listed on www.mzitu.com.

    Each photo set is saved into its own folder (named after the last path
    segment of the set URL) under the directory the script was started from.
    Images inside a folder are named ``1.jpg``, ``2.jpg``, ...
    """

    # Seconds to wait for a single HTTP response; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # The listing pages only need a browser-like User-Agent header.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        # Photo-set folders are created under the current working directory.
        self.base_path = os.getcwd()

    def get_url(self, html):
        """Yield the link of every photo set found on one listing page.

        Args:
            html: HTML text of a listing page.

        Yields:
            The ``href`` of each ``<a>`` inside the ``<ul id="pins">`` element.

        Raises:
            IndexError: if the page contains no ``<ul id="pins">`` element.
        """
        soup = BeautifulSoup(html, 'lxml')
        pins = soup.find_all('ul', attrs={'id': 'pins'})[0]
        for anchor in pins.find_all('a'):
            yield anchor['href']

    def get_img_url_max(self, url):
        """Return the number of images in a photo set, as a string.

        The value is the text of the second-to-last link in the
        ``div.pagenavi`` pager (presumably the last link is "next page",
        so the one before it carries the highest page number).

        Args:
            url: URL of the photo set's first page.
        """
        page = requests.get(url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).text
        soup = BeautifulSoup(page, 'lxml')
        pager = soup.find_all('div', attrs={'class': 'pagenavi'})[0]
        return pager.find_all('a')[-2].span.text

    def get_img_url(self, url):
        """Return the ``src`` of the main image shown on a photo-set page.

        Args:
            url: URL of a photo-set page.
        """
        page = requests.get(url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).text
        soup = BeautifulSoup(page, 'lxml')
        main_image = soup.find_all('div', attrs={'class': 'main-image'})[0]
        return main_image.p.a.img['src']

    def download_img(self, name, url):
        """Fetch one image and return its raw bytes.

        Args:
            name: photo-set name; used to build the ``Referer`` header sent
                to the image host.
            url: direct URL of the image.

        Returns:
            The response body as ``bytes``.
        """
        headers = {
            'Accept':'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate',
            'Accept-Language':'zh-CN,zh;q=0.9',
            'Connection':'keep-alive',
            'Host': 'i.meizitu.net',
            'Referer': 'http://www.mzitu.com/%s'%name,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        response = requests.get(url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        return response.content

    @staticmethod
    def _numbered_img_url(first_url, index):
        """Build the URL of image number ``index`` from the first image URL.

        Assumes ``first_url`` ends with a two-character image number followed
        by a four-character extension such as ``01.jpg`` (TODO confirm the
        naming the site uses for sets with 100+ images).
        """
        # A one-digit number replaces only the last digit (the character
        # before it is kept); longer numbers replace both digits.
        cut = -5 if index < 10 else -6
        return first_url[:cut] + str(index) + first_url[-4:]

    def get_img(self, name, max, img_url):
        """Download every image of one photo set into its own folder.

        Args:
            name: photo-set name; also the folder name under ``base_path``.
            max: number of images in the set (int or numeric string).
            img_url: URL of the set's first image.
        """
        folder = os.path.join(self.base_path, name)
        os.makedirs(folder, exist_ok=True)
        # +1 so the last image of the set is downloaded as well.
        for index in range(1, int(max) + 1):
            # Always derive the URL from the untouched first-image URL so the
            # slice offsets stay valid for every index.
            target_url = self._numbered_img_url(img_url, index)
            content = self.download_img(name, target_url)
            with open(os.path.join(folder, '%d.jpg' % index), 'wb') as f:
                f.write(content)

    def get_html_url_link_max(self):
        """Return the number of listing pages on the site, as a string.

        Read from the ``href`` of the second-to-last ``a.page-numbers`` link
        on the home page; the fifth ``/``-separated piece of that link is
        taken as the page number (e.g. ``http://host/page/190/`` -> ``190``).
        """
        url = 'http://www.mzitu.com/'
        page = requests.get(url, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT).text
        soup = BeautifulSoup(page, 'lxml')
        last_page_link = soup.find_all('a', attrs={'class': 'page-numbers'})[-2]['href']
        return last_page_link.split('/')[4]

    def main(self):
        """Walk every listing page and download every photo set on it."""
        page_count = int(self.get_html_url_link_max())
        for page_no in range(1, page_count + 1):
            # Listing pages follow the pattern /page/<n>/.
            page_url = 'http://www.mzitu.com/page/%d/' % page_no
            html = requests.get(page_url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT).text
            for set_url in self.get_url(html):
                # Strip a trailing slash so the folder name is never empty.
                name = set_url.rstrip('/').split('/')[-1]
                img_count = self.get_img_url_max(set_url)
                first_img_url = self.get_img_url(set_url)
                self.get_img(name, img_count, first_img_url)

if __name__ == '__main__':
    # Script entry point: build the crawler and start the full crawl.
    crawler = Mzitu()
    crawler.main()

運行程序後,即可在同文件夾下發現不斷有包含圖片的文件夾生成。

封裝後的exe下載


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM