python3爬蟲-使用requests爬取起點小說


import requests
from lxml import etree
from urllib import parse
import os, time


def get_page_html(url):
    """Send a GET request to ``url`` and return the response on success.

    The request is made through the module-level ``session`` using the
    module-level ``headers`` and ``timeout`` settings.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response object when the server answers with status code 200,
        otherwise ``None`` (request failure or any other status code).
    """
    try:
        # The network call is the statement that can actually raise
        # (timeout, connection error, malformed URL), so it must be guarded.
        resoponse = session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if resoponse.status_code == 200:
        return resoponse
    return None


def get_next_url(resoponse):
    """Return the absolute URL of the next chapter page.

    Args:
        resoponse: Response of the current chapter page; may be falsy.

    Returns:
        The ``href`` of the ``<a id="j_chapterNext">`` link resolved against
        ``resoponse.url``, or ``None`` when ``resoponse`` is falsy or the
        page has no such link.
    """
    if not resoponse:
        return None
    try:
        tree = etree.HTML(resoponse.text)
        hrefs = tree.xpath("//a[@id='j_chapterNext']/@href")
        # Indexing an empty result raises IndexError, handled below.
        return parse.urljoin(resoponse.url, hrefs[0])
    except IndexError:
        return None


def xs_content(resoponse):
    """Extract the chapter title and chapter text from a chapter page.

    Args:
        resoponse: Response of a chapter page; may be falsy.

    Returns:
        A ``(title, paragraphs)`` tuple, where ``title`` is the first text
        node of ``<h3 class="j_chapterName">`` and ``paragraphs`` is the list
        of text nodes of the ``<p>`` elements inside the read-content
        container. Returns ``None`` when ``resoponse`` is falsy or the page
        has no chapter heading.
    """
    if not resoponse:
        return None
    selector = etree.HTML(resoponse.text)
    if selector is None:
        # Guard for a document the parser could not build a root element for.
        return None
    titles = selector.xpath("//h3[@class='j_chapterName']/text()")
    if not titles:
        # Without a heading there is no chapter name to save the text under;
        # report "nothing found" instead of raising IndexError.
        return None
    content_xpath = selector.xpath(
        "//div[contains(@class,'read-content') and contains(@class,'j_readContent')]//p/text()")
    return titles[0], content_xpath


def write_to_txt(info_tuple: tuple):
    """Save one chapter as ``<BASE_PATH>/<title>.txt``.

    Args:
        info_tuple: ``(title, lines)`` pair; ``title`` becomes the file name
            and each item of ``lines`` is written on its own line. A falsy
            value is ignored.

    An existing chapter file is left untouched, so re-running the crawler
    does not rewrite chapters that were already saved.
    """
    if not info_tuple:
        return
    title, lines = info_tuple[0], info_tuple[1]
    # Check the exact file that would be written (including the extension);
    # testing the extension-less path never matches the saved file.
    file_path = os.path.join(BASE_PATH, title + ".txt")
    if os.path.exists(file_path):
        return
    # Make sure the output directory exists before the first write.
    os.makedirs(BASE_PATH, exist_ok=True)
    with open(file_path, "wt", encoding="utf-8") as f:
        f.write("".join(line + "\n" for line in lines))


def run(url):
    """Crawl chapter pages starting at ``url`` and save each one to disk.

    Each page is fetched, its chapter is written with ``write_to_txt``, and
    the crawl moves on to the page's next-chapter link. It stops when a page
    yields no chapter content or no next-chapter link.

    Args:
        url: Address of the first chapter page to crawl.
    """
    # Iterate instead of recursing once per chapter: a long novel would
    # otherwise exceed Python's recursion limit.
    while True:
        html = get_page_html(url)
        next_url = get_next_url(html)
        info_tupe = xs_content(html)
        if not info_tupe:
            break
        print("正在寫入")
        write_to_txt(info_tupe)
        if not next_url:
            # Last chapter: it has been saved above, nothing left to fetch.
            break
        time.sleep(sleep_time)  # delay between requests to reduce server load
        print("正在爬取%s" % info_tupe[0])
        print("正在爬取%s" % next_url)
        url = next_url


if __name__ == "__main__":
    # Crawl settings. The functions above read these as module-level globals.
    BASE_PATH = r"D:\圖片\LSZJ"  # directory where the chapter files are stored
    timeout = 5  # passed as the timeout of every request
    sleep_time = 5  # pause between two consecutive requests
    headers = {
        "Referer": "read.qidian.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
    }
    # URL of the first chapter of the novel to crawl
    # (here: chapter 1 of "Doupo Cangqiong").
    url = "https://read.qidian.com/chapter/8iw8dkb_ZTxrZK4x-CuJuw2/fWJwrOiObhn4p8iEw--PPw2"
    session = requests.Session()

    print('開始運行爬蟲')
    run(url)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM