網絡爬蟲&起點中文網完本榜500部小說

本文轉載自查看原文 2019-04-08 08:44 502

# 網絡爬蟲爬取起點中文網完本榜小說500部
# 四步，分步操作，不易出錯
#  所需要獲取的數據：書名、作者、網址、類型、主要介紹、作品信息

from urllib.request import *  #導入所有的request，urllib相當於一個文件夾，用到它里面的方法request
from lxml.etree import *  #調用包
import pickle #
import time
import pickle,fake_useragent
# 第一步，將25頁起點完本榜的每部小說的名字和相對應的鏈接寫入txt文件中

# arr=[]
# url0='https://www.qidian.com/rank/fin?page='
# urls=[ url0+str(i) for i in range(1,26)]
#
#
# def aa(link):
#     time.sleep(1)
#     print("正在爬取:%s"%link)   #提示信息可以實時看到爬取信息
#     with urlopen(link) as html:  # 在html中打開爬取的數據
#         text = html.read().decode("utf-8")# 讀取並且解碼數據
#         doc =HTML(text)       #解析html  etree這是lxml中的方法
#     url=doc.xpath("//div[@class='book-img-text']/ul/li/div[@class='book-mid-info']/h4/a/@href")
#     name=doc.xpath("//div[@class='book-img-text']/ul/li/div[@class='book-mid-info']/h4/a/text()")
#
#     arr.append(list(zip(name,url))) #用append方法將爬取數據添加到數組arr
# for link in urls:
#     aa(link)
# print(arr)
# with open("完本榜.txt",'wb') as f: #打開本地文件“完本榜.txt”以寫的方式，二進制
#     pickle.dump(arr,f)     #pickle包


# 第二步，將每部小說鏈接內的作者、類型、主要介紹、作品信息分別獲取到並寫入txt1文件中

# with open('完本榜.txt','rb') as f:
#     arr1 = pickle.load(f)
# lists = []
# for arr2 in arr1:
#     for name,url in arr2:
#         url='https:'+url
#         lists.append(url)
#
# print(lists)
# #設置隨機 User-Agent 請求頭（讓網站不認為你在爬取數據）
# ua = fake_useragent.UserAgent()
# header = {
#     'User-Agent':ua.random
# }
# list2 = []
# def spider(url):
#     time.sleep(1)
#     # print("正在爬取:%s"%url)   #提示信息可以實時看到爬取信息
#     req = Request(url,headers=header)
#     with urlopen(req) as html:
#         text = html.read().decode()
#     doc =HTML(text)
#     # 作者
#     pl1 = doc.xpath("//span/a[@class='writer']/text()")
#     # 類型
#     # print(pl1)
#
#     pl2 = doc.xpath("//p/a[@class='red']/text()")
#     # 主要介紹
#     # print(pl2)
#     #
#     pl3 = doc.xpath("//p[@class='intro']/text()")
#     # 作品信息
#     # print(pl3)
#     #
#     pl4 = doc.xpath("//div[@class='book-info-detail']/div[@class='book-intro']/p/text()")
#     # print(pl4)
#
#     list2.append(list(zip(pl1, pl2,pl3,pl4)))  # 用append方法將爬取數據添加到數組list2
#     print(list2)
# for url in lists:
#     mm=spider(url)
# with open("完本榜1.txt",'wb') as f: #以二進制寫入方式打開本地文件“完本榜1.txt”
#     pickle.dump(list2,f)     #pickle包


# 第三步，將txt文件寫入表格xls中

# import xlwt#（寫入）
# wb=xlwt.Workbook()  #創建表格對象
# ws=wb.add_sheet("完本榜")
# with open("完本榜.txt",'rb') as f:
#     arr6=pickle.load(f)
# index=0
# for arr7 in arr6:
#     for name,url in arr7:
#         #序號
#         ws.write(index,0,index+1)
#         # title
#         ws.write(index,1,name)
#         ws.write(index,2,url)
#         index+=1
# wb.save("完本榜.xls")


# 第四步  將txt文件寫入xls1中
import xlwt  # third-party writer for legacy .xls workbooks

# Step four: export the pickled per-book detail records into an .xls sheet.
wb = xlwt.Workbook()
ws = wb.add_sheet("完本榜1")

# The pickle file is read as a list of batches; each batch is iterated as
# 4-item records (author, genre, intro, details per the step-two comments).
with open("完本榜1.txt", "rb") as pickle_file:
    batches = pickle.load(pickle_file)

# Flatten the batches so that every record lands on its own consecutive row.
records = (record for batch in batches for record in batch)
for row, (author, genre, intro, details) in enumerate(records):
    # Columns 0-2 are left unwritten (the serial-number write was disabled);
    # the four fields go into columns 3-6, in order.
    for offset, value in enumerate((author, genre, intro, details)):
        ws.write(row, 3 + offset, value)

wb.save("完本榜1.xls")

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找： [Python爬蟲]起點中文網小說排行榜 ｜ 爬蟲實踐-爬取起點中文網小說信息 ｜ 爬蟲實戰——起點中文網小說的爬取 ｜ 起點中文網月票榜爬取及數據分析 ｜ Python3爬取起點中文網閱讀量信息，解決文字反爬~~~附源代碼 ｜ VitePress中文網 ｜ UIkit中文網 getuikit.work ｜ 油猴中文網/腳本貓 ｜ 【python爬蟲】爬取當當網TOP500圖書暢銷榜 ｜ jmockit中文網 expectations 入門