It is not very polished yet, but it can scrape a complete novel. Writing the results into a database will come later, probably after this week, so don't pay too much attention to the comments; they are all debugging leftovers. This run scrapes book No. 1150 on Biquge (筆趣閣). To download a different book, just change the number passed to get_txt(): look up the book you want and swap in its ID. The ID is the number in the book's URL, e.g. http://www.qu.la/book/1150/ has ID 1150.
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
import time
# Since there is too much scraped data to keep in flat files, it will
# eventually be stored in MySQL; mysql.connector is the library used to
# connect to the database. It is only needed once the commented-out
# database code below is enabled.
# import mysql.connector

# con = mysql.connector.connect(
#     user='root',
#     password='123456',
#     host='localhost',
#     port='3306',
#     database='test11'
# )
# insertSql = "INSERT INTO spider (id, title, txt_section, section_name, section_text) VALUES (%s, %s, %s, %s, %s)"
# cursor = con.cursor()

req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    # Session cookie captured while debugging; it is likely stale.
    'Cookie': '__cfduid=d577ccecf4016421b5e2375c5b446d74c1499765327; UM_distinctid=15d30fac6beb80-0bdcc291c89c17-9383666-13c680-15d30fac6bfa28; CNZZDATA1261736110=1277741675-1499763139-null%7C1499763139; tanwanhf_9821=1; Hm_lvt_5ee23c2731c7127c7ad800272fdd85ba=1499612614,1499672399,1499761334,1499765328; Hm_lpvt_5ee23c2731c7127c7ad800272fdd85ba=1499765328; tanwanpf_9817=1; bdshare_firstime=1499765328088',
    'Host': 'www.qu.la',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://www.qu.la/book/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
}

req_url_base = 'http://www.qu.la/book/'  # base URL for all novels

# Novel download function.
# txt_id: novel ID
# Fields of the txt dict:
#   id:           novel ID
#   title:        novel title
#   first_page:   address of the first chapter page
#   txt_section:  current chapter address
#   section_name: chapter title
#   section_text: chapter body text
#   section_ct:   number of chapter pages
def get_txt(txt_id):
    txt = {}
    txt['title'] = ''
    txt['id'] = str(txt_id)
    try:
        # print("Enter the ID of the novel to download:")
        # txt['id'] = input()
        req_url = req_url_base + txt['id'] + '/'  # build the novel's URL from its ID
        print("Novel ID: " + txt['id'])
        # Fetch the novel's table-of-contents page.
        # Note: the request headers must be passed as headers=, not params=.
        res = requests.get(req_url, headers=req_header)
        soups = BeautifulSoup(res.text, "html.parser")
        # Novel title
        txt['title'] = soups.select('#wrapper .box_con #maininfo #info h1')[0].text
        txt['author'] = soups.select('#wrapper .box_con #maininfo #info p')
        # Most recent update time
        txt['update'] = txt['author'][2].text
        # Title of the most recently updated chapter
        txt['lately'] = txt['author'][3].text
        # Author
        txt['author'] = txt['author'][0].text
        # Synopsis
        txt['intro'] = soups.select('#wrapper .box_con #maininfo #intro')[0].text.strip()
        print("ID: " + '{0:0>8} '.format(txt['id']) + "Novel \"" + txt['title'] + "\": download started.")
        print("Looking for the first chapter page...")
        # All chapter links of the novel
        first_page = soups.select('#wrapper .box_con #list dl dd a')
        # Total number of chapter pages
        section_ct = len(first_page)
        # Address of the first chapter page
        first_page = first_page[0]['href'].split('/')[3]
        print("Number of chapter pages: " + str(section_ct))
        print("First chapter found: " + first_page)
        # Chapter page currently being downloaded
        txt_section = first_page
        # Open the output file and write the novel's metadata
        fo = open('{0:0>8}-{1}.txt.download'.format(txt['id'], txt['title']), "ab+")
        fo.write((txt['title'] + "\r\n").encode('UTF-8'))
        fo.write((txt['author'] + "\r\n").encode('UTF-8'))
        fo.write((txt['update'] + "\r\n").encode('UTF-8'))
        fo.write((txt['lately'] + "\r\n").encode('UTF-8'))
        fo.write(("******* Synopsis *******\r\n").encode('UTF-8'))
        fo.write(("\t" + txt['intro'] + "\r\n").encode('UTF-8'))
        fo.write(("*************************\r\n").encode('UTF-8'))
        # Main loop: write each chapter in turn
        while True:
            try:
                # Request the current chapter page
                r = requests.get(req_url + str(txt_section), headers=req_header)
                soup = BeautifulSoup(r.text, "html.parser")
                # Chapter title
                section_name = soup.select('#wrapper .content_read .box_con .bookname h1')[0]
                section_text = soup.select('#wrapper .content_read .box_con #content')[0]
                # Remove useless <script> tags
                for ss in section_text.select("script"):
                    ss.decompose()
                # Chapter body text
                section_text = re.sub(r'\s+', '\r\n\t', section_text.text).strip('\r\n')
                # Address of the next chapter
                txt_section = soup.select('#wrapper .content_read .box_con .bottem2 #A3')[0]['href']
                # On the last chapter the "next" link points back at the table
                # of contents ('./'); when that happens, the download is done.
                if txt_section == './':
                    print("ID: " + '{0:0>8} '.format(txt['id']) + "Novel \"" + txt['title'] + "\": download complete.")
                    break
                # Write the chapter title, then the chapter body, in binary mode
                fo.write(('\r' + section_name.text + '\r\n').encode('UTF-8'))
                fo.write(section_text.encode('UTF-8'))
                print(txt['title'] + ' chapter: ' + section_name.text + ' downloaded')
            except:
                # txt_section is unchanged, so the loop retries the same chapter;
                # pause briefly so a persistent failure does not hammer the site.
                print("ID: " + '{0:0>8} '.format(txt['id']) + "Novel \"" + txt['title'] + "\": chapter download failed, retrying.")
                time.sleep(1)
        fo.close()
        # Rename the finished file from .txt.download to .txt
        os.rename('{0:0>8}-{1}.txt.download'.format(txt['id'], txt['title']),
                  '{0:0>8}-{1}.txt'.format(txt['id'], txt['title']))
        # try:
        #     cursor.execute(insertSql % (id, title, txt_section, section_name, section_text))
        #     con.commit()
        # except Exception as err:
        #     print(err)
        #     con.rollback()
        # con.close()
        # cursor.close()
    except:
        # On failure, append the error to download.log and also print it
        fo_err = open('download.log', "ab+")
        try:
            fo_err.write(('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + "Novel \"" + txt['title'] + "\": download failed.\r\n").encode('UTF-8'))
            print('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + "Novel \"" + txt['title'] + "\": download failed.")
            # Mark the partial file as failed
            os.rename('{0:0>8}'.format(txt['id']) + '-' + txt['title'] + '.txt.download',
                      '{0:0>8}'.format(txt['id']) + '-' + txt['title'] + '.txt.error')
        except:
            fo_err.write(('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + " download failed.\r\n").encode('UTF-8'))
            print('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + " download failed.")
        finally:
            # Close the log file
            fo_err.close()

# The ID of the novel to download; how to find an ID is explained above.
get_txt(1150)
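As the intro says, picking a different book is just a matter of changing the number passed to get_txt(). A minimal usage sketch, with the extra IDs purely hypothetical for illustration:

for book_id in (1150, 1151, 1152):  # hypothetical IDs; look yours up in the book's URL
    get_txt(book_id)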
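The database step is still commented out above. As a rough idea of what the planned per-chapter insert could look like once it is wired in, here is a minimal sketch using mysql.connector. The spider table layout, database name, and credentials are all taken from the commented-out debug code, not from a finished implementation; adjust them to your own setup:

# A sketch of the planned MySQL step, assuming a table
# spider(id, title, txt_section, section_name, section_text)
# as implied by the commented-out debug code; not the final version.
import mysql.connector

con = mysql.connector.connect(
    user='root',
    password='123456',   # credentials from the debug snippet; change them
    host='localhost',
    port=3306,
    database='test11'
)
cursor = con.cursor()

insert_sql = ("INSERT INTO spider (id, title, txt_section, section_name, section_text) "
              "VALUES (%s, %s, %s, %s, %s)")

def save_chapter(txt_id, title, txt_section, section_name, section_text):
    # Insert one chapter; roll back on failure so a bad row
    # does not leave the transaction half-applied.
    try:
        cursor.execute(insert_sql, (txt_id, title, txt_section, section_name, section_text))
        con.commit()
    except Exception as err:
        print(err)
        con.rollback()

Passing the values as a tuple to cursor.execute() lets the driver parameterize the query, which avoids the string-interpolation bug in the commented-out insertSql % (...) line and protects against injection from odd characters in chapter titles.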