I worked through the Python 3 web-scraping tutorial on w3cschool (Python3 爬蟲實戰教程_w3cschool).
This is my first scraper. The practice site is 筆趣閣 (http://www.ibiqu.net/); they scrape other people's content anyway ^_^!
I'm posting the source code here for fellow beginners to reference. The code is a bit messy: there are no functions (no def) and no real optimization (a rough function-based split is sketched after the script below).
Two of the imported libraries need to be installed separately:
pip install beautifulsoup4
pip install requests
BeautifulSoup manual (Chinese): http://beautifulsoup.readthedocs.io/zh_CN/latest/
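If you have never used BeautifulSoup before, here is a tiny sketch of the three calls the script below leans on (find_all, .get('href') and .string); the HTML snippet is made up purely to try out the API:

from bs4 import BeautifulSoup

# made-up sample HTML, only for trying the calls
sample = '<dl><dd><a href="/book/1.html">第一章</a></dd></dl>'
soup = BeautifulSoup(sample, 'html.parser')
for a in soup.find_all('a'):         # every <a> tag in the snippet
    print(a.get('href'), a.string)   # -> /book/1.html 第一章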
from bs4 import BeautifulSoup
import requests
import re
import time

if __name__ == '__main__':
    t = 1
    while t == 1:
        bookname = input('請輸入要下載的書名:')
        # search page of the site
        target = 'http://www.ibiqu.net//modules/article/search.php?searchkey=' + bookname
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('a')
        x = 1                                                 # "not found" flag
        for a in texts:
            if a.string == bookname:
                url_a = a.get('href')
                target2 = 'http://www.ibiqu.net' + url_a      # table-of-contents page
                req2 = requests.get(url=target2)
                html2 = req2.text
                # keep only the chapter list between "正文" and </dl>
                q = re.search('正文', html2).end() + 5
                h = re.search('</dl>', html2).start()
                m = html2[q:h]
                bf2 = BeautifulSoup(m, 'html.parser')
                texts2 = bf2.find_all('a')
                print('本書共找到' + str(len(texts2)) + '個章節')
                n = int(input('請輸入開始下載章節(阿拉伯數字):'))  # chapter counter
                path = 'D:/pydown/' + bookname + '.txt'
                f = open(path, mode='a', encoding='utf-8')    # output file
                for a in texts2[n - 1:]:
                    url_b = a.get('href')
                    name_b = a.string                         # chapter title
                    f.write(name_b + '\n')                    # write chapter title
                    target3 = 'http://www.ibiqu.net' + url_b  # chapter page
                    req3 = requests.get(url=target3)
                    html3 = req3.text
                    bf3 = BeautifulSoup(html3, 'html.parser')
                    d = bf3.find_all('div', id='content')
                    p0 = d[0]
                    p1 = p0.find_all('p')
                    print('開始寫入' + name_b)
                    for p in p1:
                        if p.string:                          # skip empty paragraphs
                            f.write(p.string + '\n')          # write chapter text
                    n += 1
                    if n % 500 == 0:                          # flush to disk every 500 chapters
                        f.close()
                        f = open(path, mode='a', encoding='utf-8')
                        print('************緩存清理完成!************')
                    time.sleep(2)   # pause two seconds so we do not hammer their server
                    # y = input('1跳出 >>>')   # optional manual break, left disabled
                    # if y:
                    #     break
                print('下載結束!')
                f.close()                                     # close the file
                x = 0                                         # book was found
        if x:
            print('找不到此書,請重新輸入正確書名!')
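Since I said above that I didn't write any def, here is a rough sketch of how the same logic could be split into functions. It is untested against the site; find_book and download_chapter are names I made up for illustration, but the URLs and selectors are the same ones used in the script:

import requests
from bs4 import BeautifulSoup

BASE = 'http://www.ibiqu.net'

def find_book(bookname):
    """Return the href of the book's index page, or None if not found."""
    url = BASE + '//modules/article/search.php?searchkey=' + bookname
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for a in soup.find_all('a'):
        if a.string == bookname:
            return a.get('href')
    return None

def download_chapter(href, f):
    """Fetch one chapter page and append its paragraphs to the open file f."""
    soup = BeautifulSoup(requests.get(BASE + href).text, 'html.parser')
    content = soup.find_all('div', id='content')[0]
    for p in content.find_all('p'):
        if p.string:                 # skip empty paragraphs
            f.write(p.string + '\n')

The main loop would then only handle user input, slicing the chapter list and opening/closing the output file, which makes it easier to add things like retries or a different output directory later.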