本人純python小白一枚!目前剛自學python爬蟲三個禮拜(python語法一個禮拜,爬蟲兩星期),以后還會繼續深入,因為它真的是一門“面向小白”、容易入門而且還十分有趣的腳本語言。
廢話不多說,先介紹代碼功能
支持輸入小說名或者作者名兩種方式進行爬取,因為網站排行榜小說數目比較龐大,使用單一主線程爬取速度過慢,所以import了threading模塊進行多線程crawl,實測排行榜上小說兩分鍾可以找完。
先上鏈接:https://m.37zw.net/top/allvisit_1/
貼上效果圖
還支持通過輸入書名查找,就不演示了,原理相似,代碼附上,有興趣歡迎互相交流!!
# -*- coding: utf-8 -*-
"""Multithreaded crawler for the most-visited novel ranking on m.37zw.net.

A pool of worker threads scans the ranking pages for a novel, either by
exact title or by author name, then downloads every chapter of the chosen
book into a local text file.
"""
import os
import re
from queue import Queue, Empty
from threading import Thread

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://m.37zw.net'
# Ranking pages look like https://m.37zw.net/top/allvisit_<page>/
RANK_URL = BASE_URL + '/top/allvisit_'
# Directory the finished .txt files are written to (raw string: the original
# relied on '\迅' not being an escape sequence, which is fragile).
SAVE_DIR = r'D:\迅雷下載\書籍類'
# Worker-thread count; raise it for faster crawling if the CPU and the site
# allow it.
THREAD_NUM = 66


def creatqueue():
    """Build a queue holding every ranking-page number ('1' .. last).

    Fetches the first ranking page and parses its "第1/N頁" pager text to
    discover how many pages exist.

    Returns:
        Queue[str]: page numbers as strings, in ascending order.
    """
    queue_page = Queue()
    # The original post shipped a placeholder ('這里是鏈接'); this is the
    # actual ranking URL the article describes.
    start_url = RANK_URL + '1/'
    response = requests.get(start_url)
    response.encoding = response.apparent_encoding
    match = re.search(r'第1/(.*?)頁', response.text)
    pages = int(match.group(1))
    for page in range(1, pages + 1):
        queue_page.put(str(page))
    return queue_page


def _clean_chapter(text):
    """Strip site watermarks and layout artifacts from raw chapter text."""
    cleaned = re.sub('三七中文 www.37zw.net', '', text)
    return (cleaned.replace('o', '。')
                   .replace('()', '')
                   .replace(' ', ' ')
                   .replace(' ', '\n\n')
                   .replace('[三七中文手機版 m.37zw.c。m]', ''))


def _download_book(book_name, book_url):
    """Download every chapter of *book_name* starting from its index page.

    Appends each chapter (centered title + cleaned body) to
    SAVE_DIR/<book_name>.txt.  Shared by both search classes, which
    previously carried two identical copies of this code.
    """
    print("{:^120}".format("%s正在下載,請耐心等待..." % book_name))
    r = requests.get(book_url)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    # The <select> pager enumerates every index page of the chapter list.
    pages = soup.select('body > div:nth-child(6) > span.middle > select > option')
    session = requests.Session()  # reuse one connection for all requests
    out_path = os.path.join(SAVE_DIR, '%s.txt' % book_name)
    with open(out_path, 'a+', encoding='utf-8') as f:
        for page in pages:
            r1 = session.get(BASE_URL + page['value'])
            r1.encoding = r1.apparent_encoding
            index = BeautifulSoup(r1.text, 'lxml')
            chapters = index.select('body > div.cover > ul > li > a')
            for chapter in chapters:
                r2 = session.get(BASE_URL + chapter['href'])
                r2.encoding = r2.apparent_encoding
                h2 = BeautifulSoup(r2.text, 'lxml')
                title = h2.select_one('div#nr_title')
                print(title.text, '\t', '正在解析...')
                body = h2.select_one('div#nr1')
                ch_new = _clean_chapter(body.text)
                print(ch_new)
                f.write(title.text.center(30, ' '))
                f.write(ch_new)
                print('下載完畢!')


class Search_By_Name(Thread):
    """Worker thread that scans ranking pages for an exact book title.

    The first worker to find the title appends to the shared *stop_thread*
    list (used as a "found" flag) so the other workers exit, then downloads
    the book.

    NOTE(review): assigning the book title to ``self.name`` overrides
    ``Thread.name``; kept as-is for interface compatibility.
    """

    def __init__(self, name, tname, queue_page, stop_thread, print_once):
        super(Search_By_Name, self).__init__()
        self.name = name                # book title being searched for
        self.tname = tname              # human-readable worker label
        self.queue_page = queue_page    # shared queue of page numbers
        self.book_url = ''
        self.stop_thread = stop_thread  # shared list: non-empty == found
        self.print_once = print_once    # shared list: not-found printed once

    def run(self):
        print('%s上線了, 拼命搜刮中...' % self.tname)
        while True:
            if self.stop_thread:        # another worker already found it
                break
            try:
                page = self.queue_page.get(True, 5)
            except Empty:               # queue drained; nothing left to scan
                break
            url = RANK_URL + page
            print('正在查找第%d頁...' % int(page))
            if self.search_name(url):
                print(('%s在第%d頁被%s找到了!'
                       % (self.name, int(page), self.tname)).center(120, '-'))
                self.down_book()
                break
            if self.queue_page.empty() and not self.print_once:
                self.print_once.append(1)
                break
        print('搜刮完畢,%s下線了' % self.tname)

    def search_name(self, url):
        """Return True and record the book URL if this page lists the title."""
        r = requests.get(url)
        r.encoding = r.apparent_encoding
        pattern = re.compile(
            '<a href="#">.*?</a><a href="(.*?)" class="blue">%s</a>' % self.name)
        result = pattern.search(r.text)
        if result is not None:
            self.stop_thread.append(1)  # signal the other workers to stop
            self.book_url = BASE_URL + result.group(1) + 'index_1.html'
            print(self.book_url)
            return True

    def down_book(self):
        """Download the book this worker located."""
        _download_book(self.name, self.book_url)


class Search_By_Author(Thread):
    """Worker thread that scans ranking pages for all books by one author.

    Every hit is appended to the shared *book_url* / *book_name* lists;
    unlike the by-title search, workers keep scanning until the queue is
    drained because one author may have several books.
    """

    def __init__(self, aname, tname, queue_page, book_url, book_name):
        super(Search_By_Author, self).__init__()
        self.aname = aname            # author name being searched for
        self.tname = tname            # human-readable worker label
        self.queue_page = queue_page  # shared queue of page numbers
        self.book_url = book_url      # shared list of found book URLs
        self.book_name = book_name    # shared list of found book titles

    def run(self):
        print('%s上線了, 拼命搜刮中...' % self.tname)
        while True:
            try:
                page = self.queue_page.get(True, 10)
            except Empty:             # queue drained; nothing left to scan
                break
            url = RANK_URL + page
            print('正在查找第%d頁...' % int(page))
            self.search_author(url, int(page))
        print('搜刮完畢,%s下線了' % self.tname)

    def search_author(self, url, page):
        """Record every book on this ranking page credited to the author."""
        r = requests.get(url)
        r.encoding = r.apparent_encoding
        pattern = re.compile(
            '<p class="line"><a href="#">.*?</a><a href="(.*?)" class="blue">(.*?)</a>/%s</p>'
            % self.aname)
        for bok_url, bok_name in pattern.findall(r.text):
            self.book_url.append(BASE_URL + bok_url + 'index_1.html')
            self.book_name.append(bok_name)
            print('------%s在第%d頁找到%s的——《%s》------'
                  % (self.tname, page, self.aname, bok_name))

    def down_book(self, n):
        """Download the n-th (0-based) book collected during the search."""
        _download_book(self.book_name[n], self.book_url[n])


def creatS_B_Nthread(name, queue_page):
    """Spawn the by-title worker pool, wait for it, and report the result.

    Args:
        name: exact book title to search for.
        queue_page: queue of ranking-page numbers produced by creatqueue().
    """
    stop_thread = []
    print_once = []
    tlist = []
    for i in range(1, THREAD_NUM + 1):
        t = Search_By_Name(name, '%d號搜書蟲' % i, queue_page,
                           stop_thread, print_once)
        tlist.append(t)
        t.start()
    for t in tlist:
        t.join()
    end_print(queue_page, print_once)


def creatS_B_Athread(aname, queue_page):
    """Spawn the by-author worker pool, list the hits, and offer downloads.

    Args:
        aname: author name to search for.
        queue_page: queue of ranking-page numbers produced by creatqueue().
    """
    book_url = []
    book_name = []
    tlist = []
    for i in range(1, THREAD_NUM + 1):
        t = Search_By_Author(aname, '%d號搜書蟲' % i, queue_page,
                             book_url, book_name)
        tlist.append(t)
        t.start()
    for t in tlist:
        t.join()

    if queue_page.empty():
        if not book_name:
            print('------Too low!!! 這個網站沒有%s的書------' % aname)
        else:
            print(('搜書蟲們共為你找到%d本%s的書'
                   % (len(book_name), aname)).center(120, '-'))
            for i, bname in enumerate(book_name):
                print(('%d: 《%s》' % (i + 1, bname)).center(120, ' '))
            # Let the user download up to len(book_name) books; 0 quits.
            for _ in range(len(book_name)):
                n = int(input('輸入序號下載對應書籍(輸入0取消下載並結束運行):'))
                if n == 0:
                    break
                # The original constructed a throwaway thread object just to
                # call down_book; call the shared downloader directly.
                _download_book(book_name[n - 1], book_url[n - 1])


def end_print(queue_page, print_once):
    """Print the not-found notice exactly once after all workers finish."""
    if print_once:
        print('------Too low!!! 這個網站沒有這部小說------'.center(120, '-'))


def main():
    """Entry point: build the page queue, then dispatch on search mode."""
    queue_page = creatqueue()
    way = int(input('輸入數字選擇對應搜書方式:(按書名查找: 1 ; 按作者方式: 2)'))
    if way == 1:
        name = input('輸入要查找的書名:')
        creatS_B_Nthread(name, queue_page)
    if way == 2:
        aname = input('輸入要查找的作者:')
        creatS_B_Athread(aname, queue_page)

    print('結束')


if __name__ == '__main__':
    main()