爬取並下載「舊時光文學」小說


要爬取小說的全部章節,所以從小說目錄頁開始爬取。由於只涉及文字內容,因此用 XPath 解析。

# -*- coding: utf-8 -*-
# @Time    : 2020/6/21 11:09
# @Author  : banshaohuan
# @Site    :
# @File    : pa_xiaoshuo.py
# @Software: PyCharm
import os
from urllib.parse import urljoin

import requests
from fake_useragent import UserAgent
from lxml import etree

# HTTP request headers shared by every page fetch.
# The User-Agent string is drawn from fake_useragent once, when the module is
# loaded, and reused for all requests made during the run.
ua = UserAgent()
headers = {
    # "br" is deliberately not advertised: requests only decodes Brotli
    # responses when an optional Brotli package is installed, so offering it
    # can yield a body that cannot be decoded into text.
    "Accept-Encoding": "gzip, deflate",
    # Language tags use a hyphen ("zh-CN"); "zh_CN" is not a valid tag.
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Connection": "close",
    "User-Agent": ua.random,
}

def get_xml(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    The module-level ``headers`` dict is sent as the HTTP request headers and
    the request is given a 10 second timeout. The body is decoded using the
    encoding detected from its content (``apparent_encoding``) before parsing.

    Args:
        url: Address of the page to fetch.

    Returns:
        The value returned by ``etree.HTML`` for the decoded page text.
    """
    # ``headers`` must be passed by keyword: the second positional parameter
    # of requests.get is ``params`` (the query string), so passing the dict
    # positionally would append it to the URL instead of sending it as headers.
    res = requests.get(url, headers=headers, timeout=10)
    res.encoding = res.apparent_encoding
    return etree.HTML(res.text)


def get_content(xml, f):
    """Write the text of the ``readcontent`` div of a parsed page to *f*.

    Args:
        xml: Parsed page exposing an ``xpath`` method.
        f: Writable text stream that receives the chapter text.

    Text nodes consisting of exactly one newline are skipped; every other
    node is written with all of its newline characters removed.
    """
    for fragment in xml.xpath('//div[@class="readcontent"]/text()'):
        if fragment != "\n":
            f.write(fragment.replace("\n", ""))


def download_book(url, output_dir="D:/"):
    """Download every chapter of a novel into a single UTF-8 text file.

    The table-of-contents page at *url* is fetched, the book title is read
    from its ``<h1>`` and the chapter titles/links from the
    ``list-chapterAll`` div. Each chapter page is then fetched in order and
    its text appended to ``<output_dir>/<title>.txt``. Progress is printed to
    standard output.

    Args:
        url: Address of the novel's table-of-contents page. Relative chapter
            links are resolved against it, so it should end with "/".
        output_dir: Directory the text file is written to. Defaults to
            "D:/", the location that used to be hard-coded.

    Raises:
        IndexError: If the page has no ``<h1>`` text to use as the title.
    """
    xml_list = get_xml(url)
    name = xml_list.xpath("//h1/text()")
    # Chapter titles and their links, taken from the same anchors.
    chapters = xml_list.xpath('//div[@id="list-chapterAll"]//dd/a/text()')
    links = xml_list.xpath('//div[@id="list-chapterAll"]//dd/a/@href')

    print(f"《{name[0]}》獲取中,共{len(links)}章")

    file_name = os.path.join(output_dir, f"{name[0]}.txt")

    with open(file_name, "w", encoding="utf-8") as f:
        # Walk every listed chapter; the previous ``len(links) - 200`` bound
        # dropped the last 200 chapters and fetched nothing for shorter books.
        for chapter, link in zip(chapters, links):
            f.write("\n")
            f.write(chapter)
            # urljoin handles both bare file names and absolute paths in href.
            url_text = urljoin(url, link)
            xml_content = get_xml(url_text)
            page = xml_content.xpath('//div[@class="book read"]//small/text()')
            get_content(xml_content, f)
            # ``page`` is a list of text nodes, so look for the marker inside
            # each node rather than testing list membership.
            if any("(1/2)" in marker for marker in page):
                # The second page lives at "<chapter>_2.html": strip the
                # trailing ".html" (5 characters) and append the suffix.
                # Only a second page is followed; further pages are not.
                url_text2 = f"{url_text[0:-5]}_2.html"
                xml_content2 = get_xml(url_text2)
                get_content(xml_content2, f)
            print(f"{chapter}:已完成")
    print("下載完成")


# Script entry point: download one book when the file is run directly.
if __name__ == "__main__":
    # url is the novel's table-of-contents (chapter index) page
    url = "https://www.oldtimescc.cc/go/16078/"
    download_book(url)

參考:https://www.52pojie.cn/thread-1200971-1-1.html


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM