Source code: https://gitee.com/Black-sky-cloud/python-spider/tree/master/bqg_Spider
Prebuilt exe download: https://www.lanzouw.com/iKz7gxdhsne (password: 8d9f)
If you would rather not download anything, just copy the code below:
"""
這個爬蟲腳本可以再筆趣閣中搜索相應的小說並爬取
"""
import requests
import time
from prettytable import PrettyTable
from lxml import etree
headers = {
    # Spoof the User-Agent so basic anti-bot checks accept the requests
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
}
def search():
    """
    Search for the requested title and print the matching books.
    :return:
    """
    se = requests.Session()
    se.get("https://www.biqugeq.com/")
    name = input("請輸入你要查找的書名: \t")
    url = "https://www.biqugeq.com/search/?ie=gbk&siteid=xszww.com&q=" + name
    # Fetch and parse the search-results page
    res = etree.HTML(se.get(url, headers=headers).text)
    bookIndex = output(res)
    Save(se, bookIndex, name)
def output(res):
    """
    Pull the result list out of the HTML and print it as a table.
    :return: catalogue URL of the selected book
    """
    cssLi = res.xpath('//div[@class="l bd"]/ul')[0]
    bookNames = cssLi.xpath("li/span[2]/a/text()")
    authors = cssLi.xpath("li/span[4]/text()")
    table = PrettyTable(['序號', '書名', '作者名'])
    for i in range(len(bookNames)):
        table.add_row([i + 1, bookNames[i], authors[i]])
    table.align['書名'] = "c"
    print(table)
    num = input("請輸入圖書序號開始下載: \t")
    return "https://www.biqugeq.com" + cssLi.xpath("li[" + num + "]/span[2]/a/@href")[0]
class Save:
    """
    Given the catalogue URL, fetch every chapter page and save it to disk.
    """
    pageDict = {}

    def __init__(self, session, url, book):
        self.session = session
        self.url = url
        self.book = book
        self.path = input("請輸入你要保存的位置路徑, 輸入0或按回車 默認保存到D盤根目錄: \t")
        pageList = self.getHeadHtml()
        for i in pageList:
            self.save(self.getText(i), self.book)
            time.sleep(2)
    def getHeadHtml(self):
        """
        Collect the request URL of every chapter listed on the catalogue page.
        :return: list of chapter URLs
        """
        # Request the catalogue page
        res = self.session.get(self.url, headers=headers)
        # Set the response encoding; the site serves GBK pages
        res.encoding = "gbk"
        # Parse the returned HTML
        etreeHtml = etree.HTML(res.text)
        # Prefix for building absolute chapter URLs
        urlHead = "https://www.biqugeq.com"
        # Relative chapter links; the first 12 entries are skipped
        urlNoHeadList = etreeHtml.xpath('//div[@class="listmain"]/dl/dd/a/@href')[12:]
        pageList = []
        for i in urlNoHeadList:
            # Join the prefix with each relative link
            pageList.append(urlHead + i)
        return pageList
    def getText(self, href):
        # Request the chapter page
        res = self.session.get(href, headers=headers)
        # Set the response encoding; the site serves GBK pages
        res.encoding = "gbk"
        # Parse the returned HTML
        etreeHtml = etree.HTML(res.text)
        # Chapter title
        pageName = etreeHtml.xpath('//div[@class="content"]/h1/text()')[0]
        # Chapter body text
        pageTextList = etreeHtml.xpath('//div[@id="content"]/text()')
        pageText = ""
        for i in pageTextList:
            # Strip full-width spaces, newlines and the site's embedded ad lines
            pageText += (
                i.replace("\u3000", "")
                .replace("\n", "")
                .replace("(https://www.biqumo.com/0_269/2243417.html)", "")
                .replace("請記住本書首發域名:https://www.biqumo.com。筆趣閣手機版閱讀網址:https://m.biqumo.com", "")
                .replace("(https://www.biqumo.com/2_2784/57553374.html)", "")
            )
        return [pageName, pageText]
    def save(self, page, bookName):
        # path = input("請輸入你要保存的位置路徑, 輸入0或按回車 默認保存到D盤根目錄: \t")
        if self.path == "0" or self.path == "":
            # "0" or an empty answer means: use the default location on drive D
            savePath = "D://" + bookName + ".txt"
        else:
            savePath = self.path + "/" + bookName + ".txt"
        pageName = page[0]
        pageText = page[1]
        print("開始保存 {}".format(pageName))
        with open(savePath, "a", encoding="utf8") as f:
            f.write(pageName)
            f.write("\n\n")
            f.write(pageText)
            f.write("\n\n")
if __name__ == '__main__':
    search()
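For reference, the non-interactive core of the flow looks roughly like this when run at the bottom of the same script. The catalogue URL and book title here are only placeholders (in a real run they come from output() and the user's input), and Save.__init__ will still prompt for a save path:

se = requests.Session()
se.get("https://www.biqugeq.com/")  # mirror search(): hit the home page first with the shared Session
catalogueUrl = "https://www.biqugeq.com/xx_xxxx/"  # placeholder; use the URL returned by output()
Save(se, catalogueUrl, "某小說")  # fetches every chapter and appends it to <path>/某小說.txt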
Note: this spider was written purely as practice; it will be taken down if it infringes on anyone's rights.