I've been watching season 4 of 一人之下 (the 陳朵 arc) on Tencent Video recently, but the season is only 12 episodes long and has already finished. Still curious about 陳朵 and too impatient to wait for the next season, I went for the manga instead, hehe.
The subject of this post:
36漫畫網 (m.36mh.net)
The site's anti-scraping measures are light (commendably so), so I won't walk through them step by step; the whole thing is a very simple scraping flow. The images are lazy-loaded, but their URLs are easy to find in the page source.
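For instance, the chapter list sits in a #chapter-list-4 element, and the selector here is exactly the one the full script below uses; a minimal sketch of that inspection step:

# A minimal sketch of the page inspection: list every chapter link.
# The selector '#chapter-list-4 li a' is the one the full script uses.
import requests
from pyquery import PyQuery

headers = {'User-Agent': 'Mozilla/5.0'}
html = requests.get('https://m.36mh.net/manhua/yirenzhixia/', headers=headers)
html.encoding = 'utf-8'
for a in PyQuery(html.text)('#chapter-list-4 li a').items():
    print(a.text(), a.attr('href'))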
Straight to the code:
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the URL of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    # Name the download folder after the comic title from the page <title>
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    # Keep only links whose text starts with a chapter number
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()  # the page lists chapters newest-first
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links of every chapter."""
    path, chapters = path_chapters
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        folder_path = '\\'.join([path, name])
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
        img_download(folder_path, pic_linking)


def img_download(path, pics):
    """Download the images."""
    num = 1
    print(f"Downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            print("Error! Waiting 5 s and retrying...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)


def jpg_to_pdf(path):
    """Bundle the downloaded images into a PDF file."""
    print(f"--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    # Use the first image's dimensions as the PDF page size
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    _chapter = get_chapter(_url)
    get_pic_linking(_chapter)


if __name__ == '__main__':
    main()
When the code runs, it may raise requests.exceptions.SSLError: HTTPSConnectionPool(host='XXX', port=443).
To get around that problem, and since there is usually no need to download every chapter anyway, I reworked the code.
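The workaround used throughout the script is simply to skip certificate verification and silence the warning that this produces; in isolation:

# The SSLError workaround in isolation: verify=False skips certificate
# checks, and disable_warnings hides the InsecureRequestWarning it causes.
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
resp = requests.get('https://m.36mh.net/manhua/yirenzhixia/', verify=False)
print(resp.status_code)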
Usage: enter 1 to download chapters 1-10, enter 2 for chapters 11-20, and so on.
Every 10 chapters become one PDF, so there's no need to download all the chapters at once, haha.
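Ignoring for the moment the missing-chapter offset handled below, the number you enter maps to a plain list slice:

# Entering sec selects chapters (sec-1)*10+1 through sec*10,
# which (ignoring the gap handled below) is this slice:
sec = 2
start, end = (sec - 1) * 10, sec * 10
print(f"chapters[{start}:{end}]")  # chapters[10:20], i.e. chapters 11-20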
The reworked code:
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the URL of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links of the selected chapters."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception:
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """Download the images."""
    num = 1
    row = list(range(1, 30))  # thresholds for wrapping the counter output
    print(f"Downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for pic in pics:
        print(num, end=' ')
        if num // 30 in row:  # start a new output line every 30 images
            print()
            row.pop(0)
        try:
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            print("Error! Waiting 5 s and retrying...")
            time.sleep(5)
            with open('\\'.join([path, str(num) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        num += 1
    jpg_to_pdf(path)
    shutil.rmtree(path)  # the images are in the PDF now, drop the folder


def jpg_to_pdf(path):
    """Bundle the downloaded images into a PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Pick the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # The site is missing chapters 425-428, hence the +4/-4 offsets below
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want {(sec - 1) * 10 + 1}-{sec * 10}? No way!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter


def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()
Because the site is missing chapters 425-428, I used a function to account for the gap (if the site adds these chapters later, you can adjust it yourself, or message me):
def select_section(section, chapters):
    """Pick the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # The site is missing chapters 425-428, hence the +4/-4 offsets below
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want {(sec - 1) * 10 + 1}-{sec * 10}? No way!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter
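To see the offset at work: past the gap, a chapter's list index lags its number by 4, so entering 44 (chapters 431-440) needs the slice shifted back by 4:

# Worked example of the -4 offset (assuming the 425-428 gap):
# chapter 431 sits at list index 431 - 1 - 4 = 426, so the slice
# for chapters 431-440 starts 4 entries before (sec - 1) * 10.
sec = 44
start, end = (sec - 1) * 10 - 4, sec * 10 - 4
print(start, end)  # 426 436 -> list entries for chapters 431-440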
Progress bar display
Finally, a version that replaces the plain counter with an in-place progress bar:
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from pyquery import PyQuery
from PIL import Image
import requests
import shutil
import time
import glob
import os

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
}


def get_chapter(url):
    """Get the URL of every chapter."""
    html = requests.get(url, headers=headers)
    html.encoding = 'utf-8'
    folder_path = '\\'.join([os.getcwd(), PyQuery(html.text)('title').text().split('_')[0]])
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    chapters = [[a.text(), a.attr('href')] for a in PyQuery(html.text)('#chapter-list-4 li a').items()
                if a.text().split('.')[0].isdigit() or a.text()[0].isdigit()]
    chapters.reverse()
    return folder_path, chapters


def get_pic_linking(path_chapters):
    """Get the image links of the selected chapters."""
    folder_path, chapters = path_chapters
    pics_linking = []
    for name, chapter in chapters:
        html = requests.get(chapter, headers=headers)
        html.encoding = 'utf-8'
        pic_linking = [pic_url.attr('src') for pic_url in PyQuery(html.text)('div > mip-link mip-img').items()]
        pics_linking += pic_linking
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    try:
        img_download(folder_path, pics_linking)
    except Exception as e:
        print(e)
        print("Something went wrong, please try again o(╥﹏╥)o")
        shutil.rmtree(folder_path)


def img_download(path, pics):
    """Download the images, showing a progress bar."""
    print(f"Downloading >>> {os.path.split(path)[1]} >> {len(pics)} images in total")
    for num, pic in enumerate(pics):
        # '\r' rewinds to the start of the line so the bar redraws in place
        print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / len(pics) * 100:.0f}%', end='')
        try:
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
        except Exception:
            time.sleep(5)  # wait, then retry once
            with open('\\'.join([path, str(num + 1) + '.jpg']), 'wb') as f:
                f.write(requests.get(pic, verify=False).content)
    jpg_to_pdf(path)
    shutil.rmtree(path)


def jpg_to_pdf(path):
    """Bundle the downloaded images into a PDF file."""
    print(f"\n--->>> Converting images to PDF, output: {path}.pdf")
    jpg_path = glob.glob(f"{path}\\*.jpg")
    jpg_path.sort(key=lambda x: int(os.path.basename(x).split('.')[0]))
    w, h = Image.open(jpg_path[0]).size
    ca = canvas.Canvas(path + '.pdf', pagesize=portrait((w, h)))
    for jpg in jpg_path:
        ca.drawImage(jpg, 0, 0, w, h)
        ca.showPage()
    ca.save()


def select_section(section, chapters):
    """Pick the range of chapters to download."""
    sec = int(section)
    name = f'{(sec - 1) * 10 + 1}-{sec * 10}'
    # The site is missing chapters 425-428, hence the +4/-4 offsets below
    if sec * 10 > len(chapters[1]) + 14:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, and you want {(sec - 1) * 10 + 1}-{sec * 10}? No way!")
        exit()
    if sec < 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10]
    elif sec == 43:
        chapter = chapters[1][(sec - 1) * 10:sec * 10 - 4]
        print("Note: chapters 425-428 are missing!")
    elif sec * 10 < len(chapters[1]) + 4:
        chapter = chapters[1][(sec - 1) * 10 - 4:sec * 10 - 4]
    else:
        print(f"The comic only goes up to chapter {len(chapters[1]) + 4}, so you only get {(sec - 1) * 10 + 1}-{len(chapters[1]) + 4} o(╥﹏╥)o")
        chapter = chapters[1][(sec - 1) * 10 - 4:]
        name = f"{(sec - 1) * 10 + 1}-{len(chapters[1]) + 4}"
    return chapters[0] + f"\\Chapters {name}", chapter


def main():
    _url = 'https://m.36mh.net/manhua/yirenzhixia/'
    print("Enter 1 to download chapters 1-10, 2 for chapters 11-20, and so on...")
    _section = input("Enter a number: ")
    _chapter = get_chapter(_url)
    _chapters = select_section(_section, _chapter)
    get_pic_linking(_chapters)


if __name__ == '__main__':
    main()
I ran this in PyCharm; it doesn't seem to work in Python's bundled IDLE, where the output looks odd.
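My guess is the carriage return is to blame: the bar redraws by printing '\r' to jump back to the start of the line, which ordinary terminals (and PyCharm's run console) honor but IDLE's shell does not, so the frames pile up instead of overwriting. The trick in isolation:

# The in-place progress bar in isolation: '\r' returns the cursor to the
# start of the line, so each print overwrites the previous frame.
import time

total = 50
for num in range(total):
    print(f'\r{"▇" * ((num + 1) // 2)} {(num + 1) / total * 100:.0f}%', end='')
    time.sleep(0.05)
print()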