Practice: use requests and BeautifulSoup to scrape a novel and save it to the D drive.
It runs fairly slowly, and requests to the server are easily interrupted.
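The interruptions can be softened with timeouts and automatic retries. Below is a minimal sketch, not part of the original script, using requests' standard Session/HTTPAdapter machinery; make_session is a hypothetical helper, and the retry counts and timeout are assumed values:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session():
    # Hypothetical helper: retry up to 3 times with exponential backoff
    # on transient server errors instead of failing on the first drop
    retry = Retry(total=3, backoff_factor=1,
                  status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    session.mount("http://", HTTPAdapter(max_retries=retry))
    session.mount("https://", HTTPAdapter(max_retries=retry))
    return session

# Usage: replace requests.get(url=url) in the script below with
# session.get(url, timeout=10) so a hung connection cannot block forever.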
# -*- coding:UTF-8 -*-
import requests
from bs4 import BeautifulSoup


"""Fetch the book's table of contents."""
def getBookContents(url):
    req = requests.get(url=url)
    req.encoding = "gb2312"
    html = req.text
    dv_bf = BeautifulSoup(html, "html5lib")
    dv = dv_bf.find("div", class_="listmain")
    a_bf = BeautifulSoup(str(dv), "html5lib")
    a = a_bf.find_all("a")
    book_contents_list = []
    i = 0
    # Skip the first 13 links: the "latest chapters" block at the top of
    # the listmain div, which duplicates entries from the real chapter list
    for content in a[13:]:
        book_title = content.string
        book_url = content.get("href")
        try:
            # Data cleaning: locate the "章" character in the title;
            # index() raises ValueError if it is absent, so non-chapter
            # links fall through to the except branch and are not recorded
            book_title_index = str(book_title).index("章", 0)
            # Slice off everything up to and including "章"
            new_book_title = book_title[book_title_index + 1:]
            # Renumber the chapter and strip leading whitespace
            i = i + 1
            new_book_titles = "第{}章".format(i) + new_book_title.lstrip()
            new_book_url = "http://www.biqukan.com{}".format(book_url)
            # Store each title/URL pair as a single-entry dict
            contents = {new_book_titles: new_book_url}
            book_contents_list.append(contents)
        except ValueError:
            # No "章" in the title: not a real chapter entry, skip it
            print("***** Not a chapter link, not recorded *****")
            print("original title =", book_title)
            print("original link =", book_url)
    return book_contents_list


"""Fetch the chapter body from a chapter URL."""
def getConnect(url):
    req = requests.get(url=url)
    req.encoding = 'gb2312'
    html = req.text
    div_bf = BeautifulSoup(html, "html5lib")
    div = div_bf.find("div", id="content")
    # Remove embedded <script> tags before extracting the text
    [s.extract() for s in div('script')]
    return div.text


"""Write one chapter's text to a file."""
def saveData(filepath, text):
    with open(filepath, mode="w", encoding="UTF-8") as f:
        f.write(text)
        f.write('\n\n')


if __name__ == '__main__':
    book_list = getBookContents("http://www.biqukan.com/1_1094")
    for li in book_list:
        filepath = "d:\\123\\"
        connecturl = ""
        # Each list item is a single-entry dict: {chapter title: chapter URL}
        for aa in li.keys():
            filepath = filepath + aa
            connecturl = li[aa]
        text = getConnect(connecturl)
        saveData(filepath, text)
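One follow-up note: chapter titles scraped from the page can contain characters that Windows forbids in file names, which would make open() inside saveData fail. A minimal sketch of a guard; safe_filename is a hypothetical helper not in the original script, and the ".txt" extension is an assumption:

import re

def safe_filename(title):
    # Hypothetical helper: strip the characters Windows forbids
    # in file names (\ / : * ? " < > |) from a chapter title
    return re.sub(r'[\\/:*?"<>|]', "", title)

# Possible use in the main loop:
#     filepath = "d:\\123\\" + safe_filename(aa) + ".txt"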