用Python爬E站本

一、前言

參考並改進自 OverJerry 大佬的教你怎么用Python爬取E站的本子_OverJerry。

本文為技術學習記錄，不提供訪問「不存在的網站」的任何方法，也不包含不和諧內容。

環境：

Python版本為從Win10應用商店安裝的Python 3.7.5（若系統中尚未安裝任何版本，在cmd輸入python大概就會自動打開商店頁面）。這種安裝方式不用設置PATH，但無法使用 py 命令。安裝的位置在 C:\Users\<用戶名>\AppData\Local\Microsoft\WindowsApps\，pip安裝的模塊位置大概在 C:\users\<用戶名>\appdata\local\packages\

編輯器為VSCode，使用推薦的Python插件

語法檢查工具flake8：python -m pip install flake8

格式化工具autopep8：python -m pip install autopep8

依賴：

BeautifulSoup4：python -m pip install BeautifulSoup4

requests：python -m pip install requests

lxml：python -m pip install lxml

二、改進內容

支持分頁下載；
允許一次輸入多條鏈接，方便批量執行；
文件名使用id+序號的方式，方便排序；
允許對同名文件跳過；
對於某些圖片不穩定導致卡死問題，做了請求超時處理，允許設置超時時長和最大重新請求次數，可以超時時間短但重發次數多，或者時間長但次數少；
對於用本名創建文件夾可能存在的名稱有不合法字符問題，允許檢查並替換字符；
對於站點某些本的內容不和諧提示：在cookie中添加nw=1，避免重定向導致錯誤；
代理池其實沒有用：原先以為卡住是被反爬蟲了，後來發現只是單純下載卡住了，而且網上扒來的代理方法似乎也只會報錯。
想到但沒做的：添加傳入參數，方便批處理。

三、最終代碼

# -*- coding: utf-8 -*-
# ehentai本子爬取，學習from：https://blog.csdn.net/weixin_41732074/article/details/87287726
import requests
import os
import re
import time
from bs4 import BeautifulSoup
# import random
# import multiprocessing

# Default headers sent with every HTTP request made by this script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # Answers the site's "view unsuitable content?" check. Author's note: it
    # has to be written into the raw cookie header; passing it through the
    # `cookies` argument did not work.
    'cookie': 'nw=1',
    # Tells the server that an http -> https upgrade is acceptable.
    'Upgrade-Insecure-Requests': '1',
    # Do Not Track.
    'DNT': '1',
}

# Root folder; one sub-folder per gallery is placed below it.
rootdir = 'E:/MyGallery/comic/'
# Whether an already existing file with the same name is overwritten.
overwrite = False
# Replacement for characters that are not allowed in file names.
replacechar = '_'
# Maximum number of seconds to wait for the connection to the server.
conndelay = 5
# Maximum number of seconds to wait while reading the response.
readdelay = 30
# Number of retries after a single picture failed to download.
maxretry = 2
# Proxy IP pool (only used by the commented-out proxy helpers).
ip_list = []


# def get_ip_list(url, headers):  # 從匿名ip提供網站獲取ip列表
#     web_data = requests.get(url, headers=headers)
#     soup = BeautifulSoup(web_data.text, 'lxml')
#     ips = soup.find_all('tr')
#     ip_list = []
#     for i in range(1, len(ips)):
#         ip_info = ips[i]
#         tds = ip_info.find_all('td')
#         ip_list.append(tds[1].text + ':' + tds[2].text)
#     return ip_list


# def get_random_ip(ip_list):  # 生成隨機ip加端口號
#     proxy_list = []
#     for ip in ip_list:
#         proxy_list.append('http://' + ip)
#     proxy_ip = random.choice(proxy_list)
#     proxies = {'http': proxy_ip}
#     return proxies


# def init_proxies():  # 初始化隨機代理
#     url = 'http://www.xicidaili.com/nn/'
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
#     }
#     global ip_list
#     ip_list = get_ip_list(url, headers=headers)
#     # proxies = get_random_ip(ip_list)
#     # print(proxies)


def saveFile(url, path):
    """Download ``url`` and write the response body to ``path``.

    The request is sent with the module-level ``headers`` and is bounded by
    the ``conndelay`` (connect) and ``readdelay`` (read) timeouts.

    Args:
        url: Address of the resource to download.
        path: Destination file. It is opened in binary write mode, so it is
            created when missing and overwritten when present.

    Raises:
        requests.exceptions.RequestException: If the connection fails, a
            timeout expires, or the server answers with a 4xx/5xx status.
        OSError: If ``path`` cannot be opened or written.
    """
    # A proxy was tried here once (proxies=get_random_ip(ip_list)) and dropped.
    response = requests.get(url, headers=headers,
                            timeout=(conndelay, readdelay))
    # Fail before touching the disk: otherwise the body of an HTTP error
    # response would be stored under the picture's name and the download
    # would be reported as a success.
    response.raise_for_status()
    with open(path, 'wb') as f:  # leaving the block flushes and closes
        f.write(response.content)


def getPicUrl(url):
    """Return the picture source address found on a single-picture page.

    Args:
        url: Address of the page that shows one picture.

    Returns:
        The ``src`` attribute of the first element whose id is ``img``, or
        ``None`` when the page contains no such element.

    Raises:
        requests.exceptions.RequestException: If the connection fails or a
            timeout expires.
        KeyError: If the element exists but has no ``src`` attribute.
    """
    # Bounded like the download in saveFile, so an unresponsive server raises
    # instead of blocking the whole run.
    page = requests.get(url, headers=headers,
                        timeout=(conndelay, readdelay))
    soup = BeautifulSoup(page.text, 'lxml')
    img = soup.find(id="img")  # the picture element's id is literally "img"
    if img is None:
        return None
    return img['src']


def _savePicWithRetry(picUrl, picPath):
    """Download one picture, retrying up to ``maxretry`` times after a failure.

    Args:
        picUrl: Address of the page that shows the picture.
        picPath: Destination file for the picture.

    Returns:
        True when the picture was saved, False when every attempt failed.
    """
    retries = max(maxretry, 0)
    for attempt in range(retries + 1):
        if attempt > 0:
            print('>> Retry times ' + str(attempt) + '：')
        try:
            # The picture address is looked up again on every attempt.
            saveFile(getPicUrl(picUrl), picPath)
        except Exception as e:  # best effort: report and move on
            print(e)
            if attempt == retries:
                print('Failed <<')
            time.sleep(1)
        else:
            print('Succeed <<')
            return True
    return False


def getPicList(url):
    """Download every picture listed on one thumbnail page of a gallery.

    Pictures are stored as ``<rootdir><title>/<name>.jpg``. The target folder
    is not created here and has to exist already. Unless ``overwrite`` is
    set, a file that already exists is skipped and counted as a success.

    Args:
        url: Address of one thumbnail page of the gallery.

    Returns:
        A two-item list ``[pictures processed, pictures succeeded]``;
        ``[0, 0]`` when the thumbnail page itself could not be fetched.
    """
    try:
        # Bounded request: without a timeout a stalled server blocks forever.
        site = requests.get(url, headers=headers,
                            timeout=(conndelay, readdelay))
    except requests.exceptions.RequestException as e:
        # Report and skip this page so the remaining pages/galleries still run.
        print(e)
        print('Failed <<')
        return [0, 0]
    soup = BeautifulSoup(site.text, 'lxml')
    # Author's note: "gdtm" is the site's default small-thumbnail class
    # ("gdtl" is the large-thumbnail one).
    divs = soup.find_all(class_='gdtm')
    print('||共 %d 張圖，開始下載...' % len(divs))
    # Same sanitising as in getGallery, so both resolve to the same folder.
    title = re.sub(r'[\\/:*?"<>|\r\n]', replacechar, soup.h1.get_text())
    imgnum = 0
    succeeded = 0
    for div in divs:
        picUrl = div.a.get('href')
        picAlt = div.a.img.get('alt')
        # Author's note: the last path segment looks like
        # <gallery id>-<picture index>; the index is not zero-padded, which
        # could break sorting, so the thumbnail's alt text replaces it.
        picName = picUrl.rpartition('/')[2].rpartition('-')[0] + '-' + picAlt
        imgnum += 1
        print('>> Saving：' + picName + '.jpg')
        picPath = '%s%s/%s.jpg' % (rootdir, title, picName)
        if not overwrite and os.path.isfile(picPath):
            print('Already Exists <<')
            print('Succeed <<')
            succeeded += 1
        elif _savePicWithRetry(picUrl, picPath):
            succeeded += 1
    print('||本頁共下載 %d 個文件，其中 %d 個成功。' % (imgnum, succeeded))
    return [imgnum, succeeded]


def getGallery(url):
    """Download a whole gallery, thumbnail page by thumbnail page.

    The gallery's first page is fetched to read its titles and page count, a
    folder named after the sanitised title is created below ``rootdir``, and
    ``getPicList`` is then called once per thumbnail page.

    Args:
        url: Gallery address; it must contain ``https://e-hentai.org/g/``.
            Everything from the first ``?p`` onwards is dropped.

    Returns:
        None. Progress and errors are printed.
    """
    if 'https://e-hentai.org/g/' not in url:
        print('<錯誤："' + url + '" 不是一個有效的eh漫畫目錄頁面的地址。>\n')
        return
    # partition() splits at the first occurrence; only the part before the
    # page parameter is kept.
    url = url.partition('?p')[0]
    print('== 正在獲取內容...==')
    try:
        # Bounded request: without a timeout a stalled server blocks forever.
        site = requests.get(url, headers=headers,
                            timeout=(conndelay, readdelay))
        soup = BeautifulSoup(site.text, 'lxml')
        # Author's note: "ptt" is the class of the pager table at the top of
        # the page ("ptd" is the bottom one, "ptds" marks the current page).
        pages = soup.find(class_='ptt').find_all('a')
        # The second-to-last link carries the highest page number. The index
        # is spelled len - 2 on purpose: for a one-link list it evaluates to
        # -1 and picks that single link, where pages[-2] would raise.
        pagecount = int(pages[len(pages) - 2].get_text())
        # h1 holds the main title; the element with id "gj" holds the
        # Japanese title.
        title = str(soup.h1.get_text())
        title2 = str(soup.find(id="gj").get_text())
        print('||[漫畫名] 《%s》\n||[日文名] 《%s》\n||共 %d 頁' %
              (title, title2, pagecount))
        # Replace characters Windows does not allow in file names.
        title = re.sub(r'[\\/:*?"<>|\r\n]', replacechar, title)
        # makedirs also creates rootdir itself when it is missing.
        os.makedirs(rootdir + title, exist_ok=True)
    except Exception as e:
        print(e)
        print('== 未知錯誤！已停止解析。==')
        return

    totalfile = 0
    succeedfile = 0
    for pagenum in range(pagecount):
        print('||當前第 %d 頁' % (pagenum + 1))
        # Every page after the first needs the zero-based page parameter.
        targeturl = url if pagenum == 0 else url + '?p=' + str(pagenum)
        pagetotal, pagesucceed = getPicList(targeturl)
        totalfile += pagetotal
        succeedfile += pagesucceed
    print('== 《%s》下載完成！共 %d 個文件，其中 %d 個成功！==' %
          (title, totalfile, succeedfile))


def main():
    """Interactively collect gallery links and download them in batches.

    Links are read one per line until an empty line is entered; every
    collected link is then passed to ``getGallery``. Afterwards the prompt
    starts over, and entering an empty line right away ends the program.
    """
    # A loop instead of main() calling itself, so the call stack does not
    # grow with every batch.
    while True:
        urls = []  # several links per batch, handy for unattended runs
        url = input('<請輸入鏈接（輸入空白內容結束）：>\n')
        while url != "":
            urls.append(url)
            url = input('== 已輸入鏈接列表 ==\n' + str(urls) + '\n<請輸入鏈接（輸入空白內容結束）：>\n')
        print('== 輸入結束 ==')
        if not urls:
            print('== 結束運行 ==')
            return
        for item in urls:
            getGallery(item)


# Only start prompting when run as a script, not when imported.
if __name__ == '__main__':
    main()

四、效果圖

運行效果

五、參考來源

教你怎么用Python爬取E站的本子_OverJerry - weixin_41732074的博客

Python 3.7.5 文檔

requests 模塊官方文檔

Beautiful Soup 4.4.0 文檔

python 判斷文件是否存在 - 熔遁丶螺旋手里劍 - 博客園

Python Windows文件名稱檢查 - Just do IT

python捕獲異常及方法總結 - Mr、北樂 - 博客園

python中全局變量與局部變量 - CoderWangSon

vscode 編寫python如何禁止 flake8 提示 line too long - 你好阿湯哥 - 博客園

Autopep8的使用 - WrYcF - 博客園

Python爬取大量數據時防止被封IP - freeking101的博客

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 簡單python爬蟲練習 E站本爬取 Python 爬取b站專欄圖片 Python爬取b站視頻 python爬取某站磁力鏈 Python如何實現爬取B站視頻 Python 自動爬取B站視頻 python B站彈幕爬取 python爬蟲（BeautifulSoup）爬取B站視頻字幕使用python爬取B站彈幕和三連 Python爬取B站視頻信息