利用Python爬蟲爬取目標小說並保存到本地
小說地址: http://book.zongheng.com/showchapter/749819.html (目錄地址)
通過小說目錄獲取小說所有章節對應的URL地址,然後逐個訪問解析得到每一章節小說的內容,最後保存到本地文件內
文章中的代碼只是第一個版本,可以自行優化
例如:使用IP代理池防止IP地址被封禁
使用多線程對小說章節內容進行爬取可以提高爬取效率,降低運行時間
構建更加詳細的requests請求頭
代碼還有諸多不足,歡迎指導
import urllib
import urllib.request
from urllib.parse import urljoin

import bs4
import lxml
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request; the default library
# User-Agent strings are commonly rejected by web servers.
_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/52.0.2743.116 Safari/537.36'
    ),
}

# Seconds to wait for a server response before giving up, so that a stalled
# connection cannot hang the whole run.
_TIMEOUT = 10


def getMuLu(Html):
    """Return the chapter URLs listed on a novel's table-of-contents page.

    The page is searched for elements with class ``volume-list``; every
    ``<a>`` found inside an ``<li>`` of such an element contributes its
    ``href``.

    Args:
        Html: URL of the table-of-contents page.

    Returns:
        A list of chapter URLs in page order. Relative links are resolved
        against ``Html``; anchors without an ``href`` are skipped.

    Raises:
        requests.RequestException: If the page cannot be fetched, the
            request times out, or the server answers with an HTTP error
            status.
    """
    response = requests.get(Html, headers=_HEADERS, timeout=_TIMEOUT)
    # Fail loudly instead of parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    chapter_urls = []
    for volume in soup.find_all(class_='volume-list'):
        for item in volume.find_all('li'):
            for anchor in item.find_all('a'):
                href = anchor.get('href')
                if href:
                    # urljoin leaves absolute URLs untouched.
                    chapter_urls.append(urljoin(Html, href))
    return chapter_urls


def _parse_chapter(page):
    """Return the title and paragraphs of one chapter page as plain text."""
    soup = BeautifulSoup(page, 'html.parser')

    # Read the text straight from the matched elements; going through
    # str(list) would leak the list's "[", "]" and ", " into the title.
    lines = [tag.get_text() for tag in soup.find_all(class_='title_txtbox')]
    for container in soup.find_all(class_='content'):
        lines.extend(p.get_text() for p in container.find_all('p'))

    # One line per title/paragraph keeps the saved text readable.
    return '\n'.join(lines)


def getText(TextUrl):
    """Download every chapter and return its text.

    Args:
        TextUrl: Iterable of chapter page URLs, e.g. the result of
            ``getMuLu``.

    Returns:
        A list with one string per chapter, in the order of ``TextUrl``.
        Each string holds the chapter title (elements with class
        ``title_txtbox``) followed by the text of every ``<p>`` inside
        elements with class ``content``, one per line.

    Raises:
        urllib.error.URLError: If a chapter page cannot be fetched.
        UnicodeDecodeError: If a chapter page is not valid UTF-8.
    """
    chapters = []
    for url in TextUrl:
        request = urllib.request.Request(url, headers=_HEADERS)
        with urllib.request.urlopen(request, timeout=_TIMEOUT) as response:
            page = response.read().decode('utf-8')
        chapters.append(_parse_chapter(page))
    return chapters


def BaoCunText(WenBen, FileName='MiMiShiMing.txt'):
    """Append every chapter to a UTF-8 text file, one chapter per line break.

    Args:
        WenBen: Iterable of chapter texts, e.g. the result of ``getText``.
        FileName: Path of the output file. The file is opened in append
            mode, so running the script twice writes the novel twice.
    """
    # The context manager closes the file even if a write fails.
    with open(FileName, 'a', encoding='utf-8') as output:
        # Chapters are reported 1-based so the first one is chapter 1.
        for number, chapter in enumerate(WenBen, start=1):
            output.write(str(chapter))
            output.write('\n')
            print("第{}章寫入成功".format(number))
    print("寫入完成")


if __name__ == '__main__':
    # Entry point: fetch the chapter list, download each chapter, save all.
    url = 'http://book.zongheng.com/showchapter/749819.html'
    MuLuLianjie = getMuLu(url)
    XiaoShuo = getText(MuLuLianjie)
    BaoCunText(XiaoShuo)