It is not very polished yet, but it can scrape a complete novel. Writing the results into a database will come later, probably after this week, so don't pay too much attention to the comments; they are all debugging leftovers. This run scrapes book No. 1150 on Biquge (筆趣閣). To download a different book, just change the number passed to get_txt(): look up the book you want and swap in its ID. The ID is the number in the book's URL, e.g. http://www.qu.la/book/1150/ has ID 1150.
# coding:utf-8
import requests
from bs4 import BeautifulSoup
import re
import os
import time
# Since there is too much scraped data to keep in flat files, it will
# eventually be stored in MySQL; mysql.connector is the library used to
# connect to the database. It is only needed once the commented-out
# database code below is enabled.
# import mysql.connector

# con = mysql.connector.connect(
#     user='root',
#     password='123456',
#     host='localhost',
#     port='3306',
#     database='test11'
# )
# insertSql = "INSERT INTO spider (id, title, txt_section, section_name, section_text) VALUES (%s, %s, %s, %s, %s)"
# cursor = con.cursor()

req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    # Session cookie captured while debugging; it is likely stale.
    'Cookie': '__cfduid=d577ccecf4016421b5e2375c5b446d74c1499765327; UM_distinctid=15d30fac6beb80-0bdcc291c89c17-9383666-13c680-15d30fac6bfa28; CNZZDATA1261736110=1277741675-1499763139-null%7C1499763139; tanwanhf_9821=1; Hm_lvt_5ee23c2731c7127c7ad800272fdd85ba=1499612614,1499672399,1499761334,1499765328; Hm_lpvt_5ee23c2731c7127c7ad800272fdd85ba=1499765328; tanwanpf_9817=1; bdshare_firstime=1499765328088',
    'Host': 'www.qu.la',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://www.qu.la/book/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'
}

req_url_base = 'http://www.qu.la/book/'  # base URL for all novels

# Novel download function.
# txt_id: novel ID
# Fields of the txt dict:
#   id:           novel ID
#   title:        novel title
#   first_page:   address of the first chapter page
#   txt_section:  current chapter address
#   section_name: chapter title
#   section_text: chapter body text
#   section_ct:   number of chapter pages
def get_txt(txt_id):
    txt = {}
    txt['title'] = ''
    txt['id'] = str(txt_id)
    try:
        # print("Enter the ID of the novel to download:")
        # txt['id'] = input()
        req_url = req_url_base + txt['id'] + '/'  # build the novel's URL from its ID
        print("Novel ID: " + txt['id'])
        # Fetch the novel's table-of-contents page.
        # Note: the request headers must be passed as headers=, not params=.
        res = requests.get(req_url, headers=req_header)
        soups = BeautifulSoup(res.text, "html.parser")
        # Novel title
        txt['title'] = soups.select('#wrapper .box_con #maininfo #info h1')[0].text
        txt['author'] = soups.select('#wrapper .box_con #maininfo #info p')
        # Most recent update time
        txt['update'] = txt['author'][2].text
        # Title of the most recently updated chapter
        txt['lately'] = txt['author'][3].text
        # Author
        txt['author'] = txt['author'][0].text
        # Synopsis
        txt['intro'] = soups.select('#wrapper .box_con #maininfo #intro')[0].text.strip()
        print("ID: " + '{0:0>8} '.format(txt['id']) + "Novel \"" + txt['title'] + "\": download started.")
        print("Looking for the first chapter page...")
        # All chapter links of the novel
        first_page = soups.select('#wrapper .box_con #list dl dd a')
        # Total number of chapter pages
        section_ct = len(first_page)
        # Address of the first chapter page
        first_page = first_page[0]['href'].split('/')[3]
        print("Number of chapter pages: " + str(section_ct))
        print("First chapter found: " + first_page)
        # Chapter page currently being downloaded
        txt_section = first_page
        # Open the output file and write the novel's metadata
        fo = open('{0:0>8}-{1}.txt.download'.format(txt['id'], txt['title']), "ab+")
        fo.write((txt['title'] + "\r\n").encode('UTF-8'))
        fo.write((txt['author'] + "\r\n").encode('UTF-8'))
        fo.write((txt['update'] + "\r\n").encode('UTF-8'))
        fo.write((txt['lately'] + "\r\n").encode('UTF-8'))
        fo.write(("******* Synopsis *******\r\n").encode('UTF-8'))
        fo.write(("\t" + txt['intro'] + "\r\n").encode('UTF-8'))
        fo.write(("*************************\r\n").encode('UTF-8'))
        # Main loop: write each chapter in turn
        while True:
            try:
                # Request the current chapter page
                r = requests.get(req_url + str(txt_section), headers=req_header)
                soup = BeautifulSoup(r.text, "html.parser")
                # Chapter title
                section_name = soup.select('#wrapper .content_read .box_con .bookname h1')[0]
                section_text = soup.select('#wrapper .content_read .box_con #content')[0]
                # Remove useless <script> tags
                for ss in section_text.select("script"):
                    ss.decompose()
                # Chapter body text
                section_text = re.sub(r'\s+', '\r\n\t', section_text.text).strip('\r\n')
                # Address of the next chapter
                txt_section = soup.select('#wrapper .content_read .box_con .bottem2 #A3')[0]['href']
                # On the last chapter the "next" link points back at the table
                # of contents ('./'); when that happens, the download is done.
                if txt_section == './':
                    print("ID: " + '{0:0>8} '.format(txt['id']) + "Novel \"" + txt['title'] + "\": download complete.")
                    break
                # Write the chapter title, then the chapter body, in binary mode
                fo.write(('\r' + section_name.text + '\r\n').encode('UTF-8'))
                fo.write(section_text.encode('UTF-8'))
                print(txt['title'] + ' chapter: ' + section_name.text + ' downloaded')
            except:
                # txt_section is unchanged, so the loop retries the same chapter;
                # pause briefly so a persistent failure does not hammer the site.
                print("ID: " + '{0:0>8} '.format(txt['id']) + "Novel \"" + txt['title'] + "\": chapter download failed, retrying.")
                time.sleep(1)
        fo.close()
        # Rename the finished file from .txt.download to .txt
        os.rename('{0:0>8}-{1}.txt.download'.format(txt['id'], txt['title']),
                  '{0:0>8}-{1}.txt'.format(txt['id'], txt['title']))
        # try:
        #     cursor.execute(insertSql % (id, title, txt_section, section_name, section_text))
        #     con.commit()
        # except Exception as err:
        #     print(err)
        #     con.rollback()
        # con.close()
        # cursor.close()
    except:
        # On failure, append the error to download.log and also print it
        fo_err = open('download.log', "ab+")
        try:
            fo_err.write(('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + "Novel \"" + txt['title'] + "\": download failed.\r\n").encode('UTF-8'))
            print('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + "Novel \"" + txt['title'] + "\": download failed.")
            # Mark the partial file as failed
            os.rename('{0:0>8}'.format(txt['id']) + '-' + txt['title'] + '.txt.download',
                      '{0:0>8}'.format(txt['id']) + '-' + txt['title'] + '.txt.error')
        except:
            fo_err.write(('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + " download failed.\r\n").encode('UTF-8'))
            print('[' + time.strftime('%Y-%m-%d %X', time.localtime()) + "] ID: " + '{0:0>8} '.format(
                txt['id']) + " download failed.")
        finally:
            # Close the log file
            fo_err.close()

# The ID of the novel to download; how to find an ID is explained above.
get_txt(1150)
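As the intro says, picking a different book is just a matter of changing the number passed to get_txt(). A minimal usage sketch, with the extra IDs purely hypothetical for illustration:

for book_id in (1150, 1151, 1152):  # hypothetical IDs; look yours up in the book's URL
    get_txt(book_id)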
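The database step is still commented out above. As a rough idea of what the planned per-chapter insert could look like once it is wired in, here is a minimal sketch using mysql.connector. The spider table layout, database name, and credentials are all taken from the commented-out debug code, not from a finished implementation; adjust them to your own setup:

# A sketch of the planned MySQL step, assuming a table
# spider(id, title, txt_section, section_name, section_text)
# as implied by the commented-out debug code; not the final version.
import mysql.connector

con = mysql.connector.connect(
    user='root',
    password='123456',   # credentials from the debug snippet; change them
    host='localhost',
    port=3306,
    database='test11'
)
cursor = con.cursor()

insert_sql = ("INSERT INTO spider (id, title, txt_section, section_name, section_text) "
              "VALUES (%s, %s, %s, %s, %s)")

def save_chapter(txt_id, title, txt_section, section_name, section_text):
    # Insert one chapter; roll back on failure so a bad row
    # does not leave the transaction half-applied.
    try:
        cursor.execute(insert_sql, (txt_id, title, txt_section, section_name, section_text))
        con.commit()
    except Exception as err:
        print(err)
        con.rollback()

Passing the values as a tuple to cursor.execute() lets the driver parameterize the query, which avoids the string-interpolation bug in the commented-out insertSql % (...) line and protects against injection from odd characters in chapter titles.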