# Practice exercise: use requests + BeautifulSoup to scrape a novel and save it to drive D.
# Note: scraping is slow and the source server drops connections easily.
# -*- coding:UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import re
"""
獲取書籍目錄
"""
def getBookContents(url):
req = requests.get(url=url)
req.encoding = "gb2312"
html = req.text
dv_bf = BeautifulSoup(html, "html5lib")
dv = dv_bf.find("div", class_="listmain")
# dvs = dv_bf.find_all("div", class_="listmain")
a_bf = BeautifulSoup(str(dv), "html5lib")
a = a_bf.find_all("a")
book_contents_list = []
i = 0
for content in a[13:]:
book_title = content.string
book_url = content.get("href")
try:
# 數據清洗 獲取標題"章"字索引 若沒有則出現異常 不記錄數據
book_title_index = str(book_title).index("章", 0)
# print(book_title_index)
# 通過index切片 獲取新的章節標題
new_book_title = book_title[book_title_index + 1:]
# print(new_book_title)
# 去除標題含有的空格
i = i + 1
new_book_titles = "第{}章".format(i) + new_book_title.lstrip()
new_book_url = "http://www.biqukan.com{}".format(book_url)
#print(new_book_titles, new_book_url)
# 一組數據設置為字典類型
contenets = {new_book_titles: new_book_url}
# 存放到list
book_contents_list.append(contenets)
except:
# 通過異常捕捉,出現異常是沒有找到"章"字符索引
print("*****************不是正文章節節點,不予記錄****************")
print("原標題=", book_title)
print("原鏈接=", new_book_url)
return book_contents_list
"""
通過文章鏈接地址獲取章節內容
"""
def getConnect(url):
target = 'http://www.biqukan.com/1_1094/5403177.html'
req = requests.get(url=url)
req.encoding = 'gb2312'
html = req.text
div_bf = BeautifulSoup(html, "html5lib")
div = div_bf.find("div", id="content")
# 去除script
[s.extract() for s in div('script')]
# print(div.text)
return div.text
"""
將小說內容寫入到文件
"""
def saveData(filepath, text):
with open(filepath, mode="w", encoding="UTF-8") as f:
f.writelines(text)
f.write('\n\n')
if __name__ == '__main__':
    # Crawl the table of contents, then download and save every chapter.
    chapters = getBookContents("http://www.biqukan.com/1_1094")
    for chapter in chapters:
        # Each entry is a single-pair dict: {chapter title: chapter URL}.
        target_path = "d:\\123\\"
        chapter_url = ""
        for title in chapter.keys():
            target_path = target_path + title
            chapter_url = chapter[title]
            body = getConnect(chapter_url)
            saveData(target_path, body)