# 原文 (original article): https://www.jianshu.com/p/06ae2373f560
import threading  # thread support for concurrent crawl/parse workers
import queue      # thread-safe FIFO queues shared between the workers
import requests
from lxml import etree
import time
import random
import json

concurrent = 3  # number of crawl (download) threads
conparse = 3    # number of parse threads


class Parse(threading.Thread):
    """Parser thread.

    Pulls raw HTML pages off the shared data queue, extracts one record per
    joke entry and appends each record as a JSON line to the shared output
    file. Terminates once every crawler thread has died AND the data queue
    has been drained.
    """

    def __init__(self, number, data_list, req_thread, f):
        """
        :param number: thread id, used only in log messages
        :param data_list: queue.Queue of raw HTML strings awaiting parsing
        :param req_thread: list of Crawl threads; their liveness is polled to
                           decide when no more pages can arrive
        :param f: open text-mode file object shared by all parser threads
        """
        super(Parse, self).__init__()
        self.number = number
        self.data_list = data_list
        self.req_thread = req_thread
        self.f = f
        self.is_parse = True  # keep pulling from the data queue while True

    def run(self):
        print('啟動%d號解析線程' % self.number)
        while True:
            # Shutdown detection: if ANY crawler is still alive, more pages
            # may yet arrive, so break out of the for-loop and keep parsing.
            # The for/else fires only when NO crawler is alive; then an empty
            # data queue means there is nothing left to do.
            for t in self.req_thread:
                if t.is_alive():
                    break
            else:
                if self.data_list.qsize() == 0:
                    self.is_parse = False
            if self.is_parse:
                try:
                    # Block up to 3s so we re-check the crawlers periodically
                    # instead of hanging forever on an empty queue.
                    data = self.data_list.get(timeout=3)
                except queue.Empty:
                    # Timed out: no page available right now; loop and re-test
                    # the shutdown condition.
                    data = None
                if data is not None:
                    self.parse(data)
            else:
                break  # crawlers done and queue drained -> exit thread
        print('退出%d號解析線程' % self.number)

    def parse(self, data):
        """Extract every joke entry from one HTML page and write each one as
        a JSON line to the shared output file.

        :param data: raw HTML of one list page
        """
        html = etree.HTML(data)
        # One <div> per joke entry on the list page.
        duanzi_div = html.xpath('//div[@id="content-left"]/div')
        for duanzi in duanzi_div:
            # Author nickname (strip embedded newlines).
            nick = duanzi.xpath('./div//h2/text()')[0].replace('\n', '')
            # Age is absent for anonymous authors; default to 0.
            age_nodes = duanzi.xpath('.//div[@class="author clearfix"]/div/text()')
            age = age_nodes[0] if age_nodes else 0
            # Gender is encoded in the icon's CSS class ('women...' = female);
            # '中' marks entries with no gender icon at all.
            gender_nodes = duanzi.xpath('.//div[@class="author clearfix"]/div/@class')
            if gender_nodes:
                gender = '女' if 'women' in gender_nodes[0] else '男'
            else:
                gender = '中'
            # Joke text, "funny" vote count and comment count.
            content = duanzi.xpath('.//div[@class="content"]/span[1]/text()')[0].strip()
            good_num = duanzi.xpath('./div//span[@class="stats-vote"]/i/text()')[0]
            common_num = duanzi.xpath('./div//span[@class="stats-comments"]//i/text()')[0]
            item = {
                'nick': nick,
                'age': age,
                'gender': gender,
                'content': content,
                'good_num': good_num,
                'common_num': common_num,
            }
            # NOTE(review): several parser threads share this file object;
            # a single write() call is atomic enough in CPython for
            # line-oriented output, so records do not interleave mid-line.
            self.f.write(json.dumps(item, ensure_ascii=False) + '\n')


class Crawl(threading.Thread):
    """Crawler thread: pops URLs from the request queue, downloads each page
    and pushes the response body onto the data queue for the parsers."""

    def __init__(self, number, req_list, data_list):
        """
        :param number: thread id, used only in log messages
        :param req_list: queue.Queue of URLs to fetch
        :param data_list: queue.Queue receiving downloaded HTML strings
        """
        super(Crawl, self).__init__()
        self.number = number
        self.req_list = req_list
        self.data_list = data_list
        # Browser-like UA so the site serves the normal HTML page.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36'
        }

    def run(self):
        print('啟動采集線程%d號' % self.number)
        # Drain the request queue; the thread dies when no URLs remain.
        while self.req_list.qsize() > 0:
            url = self.req_list.get()
            print('%d號線程采集:%s' % (self.number, url))
            # Randomized delay to avoid hammering the server.
            time.sleep(random.randint(1, 3))
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                self.data_list.put(response.text)


def main():
    """Wire up the queues, start crawler and parser threads, and wait for
    both pools to finish before the output file is closed."""
    req_list = queue.Queue()   # URLs waiting to be downloaded
    data_list = queue.Queue()  # downloaded pages waiting to be parsed
    # Context manager guarantees the file is closed even if a join is
    # interrupted (the original open()/close() pair could leak the handle).
    with open('duanzi.json', 'w', encoding='utf-8') as f:
        # Enqueue the 13 list-page URLs.
        for i in range(1, 13 + 1):
            req_list.put('https://www.qiushibaike.com/8hr/page/%d/' % i)
        # Start the crawler pool.
        req_thread = []
        for i in range(concurrent):
            t = Crawl(i + 1, req_list, data_list)
            t.start()
            req_thread.append(t)
        # Start the parser pool (parsers watch req_thread to know when to stop).
        parse_thread = []
        for i in range(conparse):
            t = Parse(i + 1, data_list, req_thread, f)
            t.start()
            parse_thread.append(t)
        for t in req_thread:
            t.join()
        for t in parse_thread:
            t.join()


if __name__ == '__main__':
    main()