Python | 一人之下漫畫爬取並保存為pdf文件


最近在看騰訊視頻的一人之下4『陳朵篇』,但是這一季只有12集,且已經完結了,對陳朵仍舊充滿好奇的我,耐不住下一季了,所以嘻嘻

本文主人公:
36漫畫網

因為這個網站的反爬措施做得還OK,值得表揚,所以我就不一一講解了,因為這是一個非常簡單的爬蟲流程,圖片還是懶加載,很容易找到。

直接上代碼了:

from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import time
import glob
import os

# Suppress the InsecureRequestWarning emitted for every request made with
# verify=False (the image host serves an invalid TLS certificate).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Desktop Chrome User-Agent so the site serves the normal page markup.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Fetch the comic index page and collect per-chapter links.

    Args:
        url: URL of the comic's chapter-list page.

    Returns:
        (folder_path, chapters): folder_path is a local directory named
        after the page title; chapters is a list of [name, href] pairs
        in ascending chapter order.
    """
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    doc = PyQuery(html.text)
    # Directory named after the comic title (text before the first '_').
    # os.path.join instead of '\\'.join keeps the path portable.
    folder_path = os.path.join(os.getcwd(), doc('title').text().split('_')[0])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = []
    for a in doc('#chapter-list-4 li a').items():
        text = a.text()
        # Keep only numbered chapters; the explicit 'text and' guard fixes
        # an IndexError the original raised on empty link text (text[0]).
        if text and (text.split('.')[0].isdigit() or text[0].isdigit()):
            chapters.append([text, a.attr('href')])
    chapters.reverse()  # site lists newest first; we want ascending order
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Resolve each chapter page to its lazy-loaded image URLs and download them.

    Args:
        path_chapters: (root_folder, [[name, href], ...]) from get_chapter().
    """
    root, chapters = path_chapters
    for title, link in chapters:
        page = requests.get(link, headers=headers)
        page.encoding = 'utf-8'
        # Real image URLs sit on <mip-img src=...> inside <mip-link>.
        links = []
        for img in PyQuery(page.text)('div > mip-link mip-img').items():
            links.append(img.attr('src'))
        chapter_dir = '\\'.join([root, title])
        if not os.path.exists(chapter_dir):
            os.mkdir(chapter_dir)
        img_download(chapter_dir, links)


def img_download(path, pics):
    """Download every image URL in *pics* into *path* as 1.jpg, 2.jpg, ...

    Retries each failed download once after a 5 s pause, then converts the
    whole folder into a single PDF via jpg_to_pdf().
    """
    print(f"開始下載  >>>  {os.path.split(path)[1]}  >> 共{len(pics)}張")
    for num, pic in enumerate(pics, start=1):
        print(num, end=' ')
        target = '\\'.join([path, str(num) + '.jpg'])
        try:
            with open(target, 'wb') as f:
                # verify=False: the image host has an invalid TLS certificate.
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            # Narrowed from a bare 'except:' so Ctrl-C (KeyboardInterrupt)
            # still stops the script; retry once after a pause.
            print("出現錯誤!請等候5s...")
            time.sleep(5)
            with open(target, 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)


def jpg_to_pdf(path):
    """Merge the numbered .jpg files under *path* into '<path>.pdf'.

    Page size is taken from the first image and reused for every page.
    """
    print(f"--->>> 正在圖片轉pdf文件  文件路徑{path}.pdf")
    # os.path.join avoids the original's invalid '\*' escape sequence in an
    # f-string (DeprecationWarning) and keeps the pattern portable.
    jpg_path = glob.glob(os.path.join(path, '*.jpg'))
    # Sort numerically (1.jpg, 2.jpg, ..., 10.jpg), not lexicographically.
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    if not jpg_path:
        # Nothing was downloaded: avoid IndexError on jpg_path[0].
        print("沒有圖片可轉換!")
        return
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()  # finish current page before starting the next
    ca.save()


def main():
    """Entry point: crawl the chapter index, then download every chapter."""
    comic_url = 'https://m.36mh.net/manhua/yirenzhixia/'
    get_pic_linking(get_chapter(comic_url))


if __name__ == '__main__':
    main()


代碼運行的時候,可能會報錯誤requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443)
解決python爬蟲requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443)問題

為了解決這一問題,同時也為了沒必要下載全部章節的需要,我就重整了下代碼。
用法:輸入1,則下載1-10話,輸入2,則下載11-20話,以此類推......
就每10話為一PDF,不需要下載全部章節了哈哈。

點擊查看代碼
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

# Suppress the InsecureRequestWarning emitted for every request made with
# verify=False (the image host serves an invalid TLS certificate).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Desktop Chrome User-Agent so the site serves the normal page markup.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Fetch the comic index page and collect per-chapter links.

    Args:
        url: URL of the comic's chapter-list page.

    Returns:
        (folder_path, chapters): folder_path is a local directory named
        after the page title; chapters is a list of [name, href] pairs
        in ascending chapter order.
    """
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    doc = PyQuery(html.text)
    # Directory named after the comic title (text before the first '_').
    # os.path.join instead of '\\'.join keeps the path portable.
    folder_path = os.path.join(os.getcwd(), doc('title').text().split('_')[0])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = []
    for a in doc('#chapter-list-4 li a').items():
        text = a.text()
        # Keep only numbered chapters; the explicit 'text and' guard fixes
        # an IndexError the original raised on empty link text (text[0]).
        if text and (text.split('.')[0].isdigit() or text[0].isdigit()):
            chapters.append([text, a.attr('href')])
    chapters.reverse()  # site lists newest first; we want ascending order
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Collect the image URLs of every selected chapter, then download them.

    All chapters' images are concatenated into one list so the whole
    selection becomes a single PDF.

    Args:
        path_chapters: (folder_path, [[name, href], ...]) from select_section().
    """
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        # Lazy-loaded images: real URLs sit on <mip-img src=...>.
        pics_linking += [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception as e:
        # Narrowed from a bare 'except:' so Ctrl-C still interrupts; report
        # the actual error before discarding the partial download.
        print(e)
        print("出錯了,請重新嘗試o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """Download all image URLs into *path*, printing 30 numbers per row.

    Retries each failed download once after 5 s, converts the folder to a
    PDF, then deletes the raw images.
    """
    print(f"開始下載  >>>  {os.path.split(path)[1]}  >> 共{len(pics)}張")
    for num, pic in enumerate(pics, start=1):
        print(num, end=' ')
        if num % 30 == 0:
            # Line break every 30 images. Replaces the original's list-based
            # bookkeeping, which behaved the same but silently stopped
            # breaking lines after 870 images.
            print()
        target = '\\'.join([path, str(num) + '.jpg'])
        try:
            with open(target, 'wb') as f:
                # verify=False: the image host has an invalid TLS certificate.
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            # One retry after a pause; the original's 'as e' was unused.
            print("出現錯誤!請耐心等待5s!")
            time.sleep(5)
            with open(target, 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    # Images are no longer needed once the PDF exists.
    shutil.rmtree(path)


def jpg_to_pdf(path):
    """Merge the numbered .jpg files under *path* into '<path>.pdf'.

    Page size is taken from the first image and reused for every page.
    """
    print(f"\n--->>> 正在圖片轉pdf文件  文件路徑{path}.pdf")
    # os.path.join avoids the original's invalid '\*' escape sequence in an
    # f-string (DeprecationWarning) and keeps the pattern portable.
    jpg_path = glob.glob(os.path.join(path, '*.jpg'))
    # Sort numerically (1.jpg, 2.jpg, ..., 10.jpg), not lexicographically.
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    if not jpg_path:
        # Nothing was downloaded: avoid IndexError on jpg_path[0].
        print("沒有圖片可轉換!")
        return
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()  # finish current page before starting the next
    ca.save()


def select_section(section, chapters):
    """Slice the chapter list down to the ten-chapter batch *section*.

    Batch 1 covers chapters 1-10, batch 2 covers 11-20, and so on. The
    site is missing chapters 425-428, hence the offsets of 4 below.

    Args:
        section: batch number as str or int.
        chapters: (folder_path, [[name, href], ...]) from get_chapter().

    Returns:
        (pdf_folder_path, chapter_sublist) for the chosen batch.
    """
    folder, all_chapters = chapters
    sec = int(section)
    total = len(all_chapters) + 4  # true latest chapter number (4 missing)
    start, stop = (sec - 1) * 10, sec * 10
    name = f'{start + 1}-{stop}'
    if stop > total + 10:
        print(f"漫畫一共才更到{total}話,你想下載{start + 1}-{stop},你有毛病吧!")
        exit()
    if sec < 43:
        # Entirely before the gap: direct slice.
        chapter = all_chapters[start:stop]
    elif sec == 43:
        # Batch containing the gap (421-430 minus 425-428).
        chapter = all_chapters[start:stop - 4]
        print("注意,缺少425-428話!")
    elif stop < total:
        # After the gap: shift indices back by the 4 missing chapters.
        chapter = all_chapters[start - 4:stop - 4]
    else:
        # Requested range runs past the newest chapter: clamp to the end.
        print(f"漫畫一共才更到{total}話,所以只能下載{start + 1}-{total}  o(╥﹏╥)o")
        chapter = all_chapters[start - 4:]
        name = f"{start + 1}-{total}"
    return folder + f"\\{name}章", chapter


def main():
    """Ask which ten-chapter batch to fetch, then download it as one PDF."""
    comic_url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("輸入1,則下載1-10話,輸入2,則下載11-20話,以此類推......")
    choice = input("請輸入指定數字:")
    chapter_info = get_chapter(comic_url)
    get_pic_linking(select_section(choice, chapter_info))


if __name__ == '__main__':
    main()

因為這個網站,少了425-428的章節,見下圖

所以使用了一個函數做判斷(若網站以后更新有了這些章節,小伙伴們可自行更改喔,或者私信給我哈):

def select_section(section, chapters):
    """Select the download range: batch *section* covers chapters
    (section-1)*10+1 .. section*10.

    Args:
        section: batch number as str or int (1 -> chapters 1-10, ...).
        chapters: (folder_path, [[name, href], ...]) from get_chapter().

    Returns:
        (pdf_folder_path, chapter_sublist) for the chosen batch.

    Note: the site is missing chapters 425-428, hence the offsets of 4
    and the special-casing of batch 43 below.
    """
    sec = int(section)
    name = f'{(sec - 1) * 10+1}-{sec * 10}'
    # len(chapters[1]) + 4 is the true latest chapter number (4 missing).
    if sec * 10 > len(chapters[1])+14:
        print(f"漫畫一共才更到{len(chapters[1])+4}話,你想下載{(sec-1)*10+1}-{sec*10},你有毛病吧!")
        exit()
    if sec < 43:
        # Entirely before the gap: direct slice.
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        # Batch containing the gap (421-430 minus 425-428).
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("注意,缺少425-428話!")
    elif sec*10 < len(chapters[1])+4:
        # After the gap: shift indices back by the 4 missing chapters.
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        # Requested range runs past the newest chapter: clamp to the end.
        print(f"漫畫一共才更到{len(chapters[1])+4}話,所以只能下載{(sec-1)*10+1}-{len(chapters[1])+4}  o(╥﹏╥)o")
        chapter = chapters[1][(sec-1)*10-4:]
        name = f"{(sec-1)*10+1}-{len(chapters[1])+4}"
    return chapters[0]+f"\\{name}章", chapter



進度條展示

點擊查看代碼
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

# Suppress the InsecureRequestWarning emitted for every request made with
# verify=False (the image host serves an invalid TLS certificate).
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Desktop Chrome User-Agent so the site serves the normal page markup.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Fetch the comic index page and collect per-chapter links.

    Args:
        url: URL of the comic's chapter-list page.

    Returns:
        (folder_path, chapters): folder_path is a local directory named
        after the page title; chapters is a list of [name, href] pairs
        in ascending chapter order.
    """
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    doc = PyQuery(html.text)
    # Directory named after the comic title (text before the first '_').
    # os.path.join instead of '\\'.join keeps the path portable.
    folder_path = os.path.join(os.getcwd(), doc('title').text().split('_')[0])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = []
    for a in doc('#chapter-list-4 li a').items():
        text = a.text()
        # Keep only numbered chapters; the explicit 'text and' guard fixes
        # an IndexError the original raised on empty link text (text[0]).
        if text and (text.split('.')[0].isdigit() or text[0].isdigit()):
            chapters.append([text, a.attr('href')])
    chapters.reverse()  # site lists newest first; we want ascending order
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Gather image URLs for all selected chapters, then download them.

    All chapters' images go into one list so the selection becomes a
    single PDF.

    Args:
        path_chapters: (folder_path, [[name, href], ...]) from select_section().
    """
    folder_path, chapters = path_chapters
    pics = []
    for _, chapter_url in chapters:
        page = requests.get(chapter_url, headers=headers)
        page.encoding = 'utf-8'
        # Lazy-loaded images: real URLs sit on <mip-img src=...>.
        for node in PyQuery(page.text)('div > mip-link mip-img').items():
            pics.append(node.attr('src'))
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics)
    except Exception as e:
        print(e)
        print("出錯了,請重新嘗試o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """Download all images into *path* with a text progress bar, then
    convert the folder to a PDF and delete the raw images.

    Retries each failed download once after a 5 s pause.
    """
    print(f"開始下載  >>>  {os.path.split(path)[1]}  >> 共{len(pics)}張")
    total = len(pics)  # hoisted: loop-invariant
    for num, pic in enumerate(pics):
        # '\r' rewrites the same console line: one bar block per 2 images.
        print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / total * 100:.0f}%', end='')
        target = '\\'.join([path, str(num + 1) + '.jpg'])
        try:
            with open(target, 'wb') as f:
                # verify=False: the image host has an invalid TLS certificate.
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            # One retry after a pause; the original's 'as e' was unused.
            time.sleep(5)
            with open(target, 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    # Images are no longer needed once the PDF exists.
    shutil.rmtree(path)


def jpg_to_pdf(path):
    """Merge the numbered .jpg files under *path* into '<path>.pdf'.

    Page size is taken from the first image and reused for every page.
    """
    print(f"\n--->>> 正在圖片轉pdf文件  文件路徑{path}.pdf")
    # os.path.join avoids the original's invalid '\*' escape sequence in an
    # f-string (DeprecationWarning) and keeps the pattern portable.
    jpg_path = glob.glob(os.path.join(path, '*.jpg'))
    # Sort numerically (1.jpg, 2.jpg, ..., 10.jpg), not lexicographically.
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    if not jpg_path:
        # Nothing was downloaded: avoid IndexError on jpg_path[0].
        print("沒有圖片可轉換!")
        return
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()  # finish current page before starting the next
    ca.save()


def select_section(section, chapters):
    """Pick the ten-chapter batch *section* out of the chapter list.

    Batch 1 covers chapters 1-10, batch 2 covers 11-20, and so on. The
    site is missing chapters 425-428, hence the offsets of 4 below.

    Args:
        section: batch number as str or int.
        chapters: (folder_path, [[name, href], ...]) from get_chapter().

    Returns:
        (pdf_folder_path, chapter_sublist) for the chosen batch.
    """
    root, chapter_list = chapters
    sec = int(section)
    latest = len(chapter_list) + 4  # true latest chapter number (4 missing)
    lo, hi = (sec - 1) * 10, sec * 10
    name = f'{lo + 1}-{hi}'
    if hi > latest + 10:
        print(f"漫畫一共才更到{latest}話,你想下載{lo + 1}-{hi},你有毛病吧!")
        exit()
    if sec < 43:
        # Entirely before the gap: direct slice.
        picked = chapter_list[lo:hi]
    elif sec == 43:
        # Batch containing the gap (421-430 minus 425-428).
        picked = chapter_list[lo:hi - 4]
        print("注意,缺少425-428話!")
    elif hi < latest:
        # After the gap: shift indices back by the 4 missing chapters.
        picked = chapter_list[lo - 4:hi - 4]
    else:
        # Requested range runs past the newest chapter: clamp to the end.
        print(f"漫畫一共才更到{latest}話,所以只能下載{lo + 1}-{latest}  o(╥﹏╥)o")
        picked = chapter_list[lo - 4:]
        name = f"{lo + 1}-{latest}"
    return root + f"\\{name}章", picked


def main():
    """Prompt for a batch number, then download that batch as a single PDF."""
    comic_url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("輸入1,則下載1-10話,輸入2,則下載11-20話,以此類推......")
    batch = input("請輸入指定數字:")
    chapter_info = get_chapter(comic_url)
    get_pic_linking(select_section(batch, chapter_info))


if __name__ == '__main__':
    main()


我用的是PyCharm運行的,貌似用自帶的IDLE不可以,怪怪的


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM