python3.6 單文件爬蟲 斷點續存 普通版 文件續存方式

本文轉載自查看原文 2018-04-28 14:27 1124

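# Import the required packages.
# This example crawls the novel "完美世界" (Perfect World) from the 頂點小說 (x23us) site.
# aa.text and bb.text, used below, are text files that you create yourself.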
import requests
from bs4 import BeautifulSoup

# Target URL to crawl: the novel's index page. Chapter URLs are built by
# appending each link's href to this string.

url = 'https://www.x23us.com/html/42/42377/'
# Browser-like User-Agent header passed to the requests.get() calls below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3393.4 Safari/537.36'
}


def open_href():
    """Return the checkpoint lines stored in ``bb.text``.

    ``bb.text`` holds the URLs of the chapters that were already saved, one
    per line. The lines are returned exactly as read, i.e. each one keeps its
    trailing newline.

    Returns:
        A list of the raw lines of ``bb.text``. An empty list is returned
        when the file does not exist yet (first run, nothing saved so far)
        instead of letting ``FileNotFoundError`` abort the crawl.
    """
    try:
        # The ``with`` block closes the file; no explicit close() is needed.
        with open('bb.text', 'r', encoding='utf-8') as f:
            return f.readlines()
    except FileNotFoundError:
        # No checkpoint file means no chapter has been recorded yet.
        return []


def page_index():
    """Request the index page at ``url`` and return its source text.

    Returns:
        The response body as text when the response object is truthy
        (``requests`` treats a response as truthy when its status is not an
        error status); ``None`` otherwise.
    """
    response = requests.get(url, headers=headers)
    return response.text if response else None

def page_list(html):
    """Parse the index page and collect every chapter's title and URL.

    Args:
        html: HTML text of the index page, or a falsy value (``None`` or an
            empty string) when the page could not be fetched.

    Returns:
        A list with one ``{'title': ..., 'href': ...}`` dict per link matched
        by the ``.L a`` selector, in document order. ``href`` is the
        module-level ``url`` with the link's ``href`` attribute appended.
        An empty list is returned when ``html`` is falsy, so callers can
        always iterate over the result.
    """
    if not html:
        return []
    soup = BeautifulSoup(html, 'lxml')
    chapters = []
    for link in soup.select('.L a'):
        relative = link.get('href')
        if not relative:
            # An anchor without an href cannot be turned into a chapter URL
            # (and ``url + None`` would raise TypeError).
            continue
        chapters.append({
            'title': link.get_text(),
            'href': url + relative,
        })
    return chapters


def text_cun_html(title, html):
    """Append one chapter to ``aa.text``: the title line, then the content.

    Args:
        title: chapter title, written on its own line.
        html: chapter text; nothing is written when it is falsy.

    Returns:
        ``'yes'`` once the chapter has been written, ``None`` when ``html``
        is falsy.
    """
    if not html:
        return None
    # Append mode, so earlier chapters already in the file are preserved.
    with open('aa.text', 'a+', encoding='utf-8') as archive:
        archive.write(title + '\n' + html + '\n')
    print('存檔成功！！！！')
    return 'yes'


def text_cun_href(href):
    """Record a chapter URL in the ``bb.text`` checkpoint file.

    The URL is appended on its own line.

    Args:
        href: URL of the chapter; nothing is written when it is falsy.

    Returns:
        ``'ok'`` once the URL has been written, ``None`` when ``href`` is
        falsy.
    """
    if not href:
        return None
    record = href + '\n'
    with open('bb.text', 'a+', encoding='utf-8') as checkpoint:
        checkpoint.write(record)
    print('href存檔成功！！')
    return 'ok'


def html_list_index(title, href):
    """Request one chapter page and store its content and URL.

    The page at ``href`` is fetched; for every element matched by the
    ``#contents`` selector its text is passed to ``text_cun_html``, and when
    that call returns ``'yes'`` the URL is recorded with ``text_cun_href``.

    Args:
        title: chapter title passed through to ``text_cun_html``.
        href: chapter URL; nothing happens when it is falsy.

    Returns:
        ``None`` in every case.
    """
    if not href:
        return
    response = requests.get(url=href, headers=headers)
    if response.status_code != 200:
        return
    page = BeautifulSoup(response.text, 'lxml')
    for node in page.select('#contents'):
        saved = text_cun_html(title, node.get_text())
        # Record the URL only after its content was reported as stored.
        if saved == 'yes':
            text_cun_href(href)


def main():
    """Crawl every chapter whose URL is not yet recorded in ``bb.text``.

    Reads the checkpoint lines with ``open_href``, fetches and parses the
    index page, then calls ``html_list_index`` for each chapter whose URL is
    not in the checkpoint. Re-running the script after an interruption
    therefore continues with the chapters that are still missing.
    """
    # Load the URLs recorded by earlier runs.
    number = open_href()
    print(number)
    # A set gives O(1) membership tests instead of scanning the list for
    # every chapter; line endings are stripped so the comparison does not
    # depend on how each stored line was terminated.
    done = {line.rstrip('\r\n') for line in number}

    html = page_index()
    data = page_list(html)
    if not data:
        # page_list yields nothing (possibly None) when the index page could
        # not be fetched or contains no chapter links; iterating None would
        # raise TypeError.
        print('未獲取到章節列表')
        return

    for i in data:
        title = i.get('title')
        href = i.get('href')
        print(href)
        # Skip chapters already saved by a previous (interrupted) run.
        if href not in done:
            html_list_index(title, href)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


#簡單思路實現斷點續存，不喜勿噴，歡迎共同討論

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 python3.6 簡單爬蟲 Python3.6 import源文件與編譯文件的關系 Python3.6多線程爬蟲 CentOS安裝tgz文件，安裝python3.6 python3.6版本安裝dlib python3.7與python3.6,python2.7 pyc文件頭部差異 Python3.6安裝protobuf模塊+將proto文件轉換成pb2.py文件華為雲照片的爬蟲程序更新(python3.6) python獲取數據網頁數據並創建文件夾保存（基於python3.6）【Python】將python3.6軟件的py文件打包成exe程序

python3.6 單文件爬蟲 斷點續存 普通版 文件續存方式

免責聲明！

python3.6 單文件爬蟲斷點續存普通版文件續存方式