根據名稱搜索小說並下載到本地【全書小說網】


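"""
Download the full text of any book from the Quanshu novel site (全書小說網).

Each chapter is saved as its own .txt file, and all of the files are placed
in a folder named after the book. The original note suggested fetching only
the first 5 chapters when a book has many chapters; no such limit is
implemented in this script.
"""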
import os
import re
from urllib.parse import quote
from urllib.parse import urljoin

import requests


class QuanShu:
    """Search quanshuxs.com for a novel and download it chapter by chapter.

    Every chapter is written to ``<book title>/<chapter name>.txt`` relative
    to the current working directory.

    Attributes:
        host: Base URL of the site.
        url: Search URL built from the name passed to the constructor.
        title: Book title as scraped from the book page; set by
            :meth:`get_works_html`.
    """

    # Seconds to wait for the server before a request is abandoned.
    _TIMEOUT = 10
    # Path separators, characters Windows forbids in file names, and
    # control whitespace that would break a file name.
    _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, name=''):
        """Build the search URL for the novel called *name*.

        The keyword is sent as percent-encoded GBK bytes. GBK encodes every
        GB2312 character to the same bytes, so names that worked with the
        previous ``gb2312`` codec produce an identical URL, while names with
        traditional characters no longer raise.

        Raises:
            UnicodeEncodeError: If *name* contains characters outside GBK.
        """
        keyword = quote(name.encode('gbk'))
        self.host = 'http://www.quanshuxs.com/'
        self.url = f'{self.host}search.asp?key={keyword}&x=0&y=0'

    @classmethod
    def _safe_filename(cls, name):
        """Return *name* with characters unsafe in file names replaced by ``_``."""
        cleaned = cls._UNSAFE_CHARS.sub('_', name.strip())
        return cleaned or '_'

    def _fetch(self, url):
        """Download *url* and return its HTML as text."""
        resp = requests.get(url, timeout=self._TIMEOUT)
        # Decode with the charset detected from the response body.
        resp.encoding = resp.apparent_encoding
        return resp.text

    def get_search_html(self):
        """List the search results, ask which one to download, and download it.

        Prints one numbered line per result, prompts for a number until a
        valid one is entered, then hands the chosen book page to
        :meth:`get_works_html`. When the search yields nothing, a message is
        printed and the method returns without prompting.
        """
        html = self._fetch(self.url)
        tables = re.findall(
            r'<table cellspacing="0" cellpadding="0" width="962" border="0" align="center" class="m9">(.*?)</table>',
            html, re.S)
        url_list = []
        for table in tables:
            # ``[^"]*`` keeps the capture inside a single href attribute; a
            # greedy ``.*`` with re.S would run on to the last matching
            # anchor in the table.
            links = re.findall(r'<a href="([^"]*)" target="_blank">', table)
            if not links:
                continue
            works_info = re.findall(r'<a href=".*?">(.*?)</a>', table, re.S)
            works_status = re.findall(r'狀態: </font>\r\n(.*?)&nbsp;\|', table, re.S)
            # Pad so that a result with missing fields prints blanks instead
            # of raising IndexError.
            raw_name, new_chapter, works_author, works_type = (works_info + [''] * 4)[:4]
            works_name = raw_name.replace("<font color='red'>", "").replace("</font>", "")
            status = works_status[0] if works_status else ''
            # Number by position in url_list so the printed number is always
            # the index used for the lookup below.
            i = len(url_list)
            print(
                f'序號:{i:3}作品名稱:{works_name} 最新章節:{new_chapter} 作者: {works_author} 類型: {works_type} 狀態: {status}',
                end='\n\n')
            url_list.append(urljoin(self.url, links[0]))

        if not url_list:
            print("沒有搜索到對應作品!")
            return

        while True:
            choice = input("請選擇需要下載的作品序號(enter):").strip()
            try:
                index = int(choice)
            except ValueError:
                index = -1
            if 0 <= index < len(url_list):
                break
            print("序號無效,請重新輸入!")
        self.get_works_html(url_list[index])

    def get_works_html(self, url):
        """Open the book page at *url* and download the book from chapter one.

        Sets ``self.title``, creates the output folder, and passes the first
        chapter link to :meth:`get_chapter_html`.

        Raises:
            IndexError: If the page does not contain the expected chapter
                table, title, or chapter link.
        """
        html = self._fetch(url)
        table = re.findall(r'class="mread">(.*?)</table>', html, re.S)[0]
        self.title = re.findall(r'<font color="#7B352B">(.*?)全文閱讀</font>', table, re.S)[0]
        os.makedirs(self._safe_filename(self.title), exist_ok=True)
        first_href, _ = re.findall(r'<div class="bai"><a href="(.*?)">(.*?)</a>', table, re.S)[0]
        # Resolve against the page URL so relative links work; absolute
        # links pass through urljoin unchanged.
        self.get_chapter_html(urljoin(url, first_href))

    def get_chapter_html(self, url):
        """Download the chapter at *url* and every chapter linked after it.

        Follows the "next chapter" link iteratively (not recursively, so the
        number of chapters is not bounded by the recursion limit) and stops
        when a page has no such link or links back to a page already saved.

        Raises:
            AttributeError: If ``self.title`` has not been set.
            IndexError: If a chapter page lacks the expected heading or
                content cell.
        """
        book_dir = self._safe_filename(self.title)
        os.makedirs(book_dir, exist_ok=True)
        visited = set()
        while url and url not in visited:
            visited.add(url)
            html = self._fetch(url)
            chapter_name = re.findall(r"<strong>(.*?)</strong>", html, re.S)[0]
            print(f'正在下載---->{chapter_name}')
            # ``[^']*`` keeps the capture inside the one href that belongs to
            # the "next chapter" anchor.
            next_links = re.findall(
                r"<a href='([^']*)'><font color='#7B352B'>下一章</font></a> \( → \)", html)
            content = re.findall(r'<td colspan="2" class="content">(.*?)</td>', html, re.S)[0]
            text = (content.replace('<br><br>', '\n')
                    .replace('<img src="image/', '')
                    .replace('.jpg">', ' ')
                    .replace('&mdash;', '—'))
            path = os.path.join(book_dir, f'{self._safe_filename(chapter_name)}.txt')
            # Explicit UTF-8 so the output does not depend on the locale and
            # cannot fail on characters the locale codec lacks.
            with open(path, 'w', encoding='utf-8') as f:
                f.write("  ")
                f.write(text)
            print(f'已下載---->{chapter_name},{url}')
            url = urljoin(url, next_links[0]) if next_links else None


if __name__ == '__main__':
    # Ask for a novel name, then search for it. get_search_html() prompts for
    # the result to download and starts the download itself, so nothing more
    # is called here (a bare get_works_html() call would raise TypeError
    # because its url argument is required).
    name = input("請輸入小說名稱:")
    quanshu = QuanShu(name)
    quanshu.get_search_html()


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱 yoyou2525@163.com 刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM