# 在多線程中,數據是共享的;如何在多個線程之間安全地通信,是首先要考慮的問題。
#線程間的通信
import time
import threading
from threading import RLock
detail_url_list = []  # shared between the list-page producer and the workers
lock = RLock()  # guards removal of URLs from detail_url_list


def get_detail_html(url=None, delay=2):
    """Simulate crawling one article detail page taken from the shared list.

    One pending URL is popped from the module-level ``detail_url_list``
    while holding ``lock``.  The simulated download then runs with the lock
    released, so several worker threads can crawl at the same time.

    A plain ``for url in detail_url_list`` loop was the first idea, but
    crawling the list page is faster than crawling detail pages, so the
    detail pages are meant to be handled by several threads, one URL each.

    Args:
        url: Ignored; the URL is always taken from ``detail_url_list``.
            Kept (and made optional) so existing callers keep working.
        delay: Seconds to sleep to simulate the download.

    Returns:
        None.  Returns immediately, printing nothing, when no URL is pending.
    """
    # Only the check-and-pop needs the lock; ``with`` guarantees the release
    # even if something inside raises.
    with lock:
        if not detail_url_list:
            return
        url = detail_url_list.pop()
    print('get detail html started')
    time.sleep(delay)
    print('get detail html end')
def get_detail_url(url):
    """Simulate crawling the article list page.

    Sleeps four seconds, then adds twenty detail-page URLs to the shared
    module-level ``detail_url_list``.  The ``url`` argument is not used.
    """
    print('get detail url started')
    time.sleep(4)
    template = 'http://projectsedu.com/{id}'
    for article_id in range(20):
        # The list is only mutated, never rebound, so no ``global`` is needed.
        detail_url_list.append(template.format(id=article_id))
    print('get detail url end')
#需求是:把從文章列表頁爬取到的url,交給爬取文章詳情頁的線程去處理。
#這個時候,就涉及到線程間的資源通信
#第一種方法就是 共享變量(共享變量其實就是全局變量,給各個函數調用)
#具體方法如下:
if __name__ == '__main__':
    # Fill the shared list first: each worker pops a single URL, so a worker
    # that runs before the producer has finished finds nothing to take.
    producer = threading.Thread(
        target=get_detail_url, args=('http://bolezaixian.com',)
    )
    producer.start()
    producer.join()

    # get_detail_html declares a ``url`` parameter, so supply a value for it.
    workers = [
        threading.Thread(target=get_detail_html, args=('',))
        for _ in range(10)
    ]
    for worker in workers:
        worker.start()
    # Wait for every worker so the script ends only after all crawls finish.
    for worker in workers:
        worker.join()
# 共享變量也是要加鎖的。
import threading
from threading import Lock
#把共享變量存在settings配置文件中
import settings
import time
lock = Lock()  # makes "is there a URL? then take it" one atomic step


def get_detail_html():
    """Consume URLs from ``settings.detail_list_url`` and simulate crawling.

    Loops indefinitely: takes one URL at a time from the shared list and
    sleeps two seconds to simulate downloading the detail page.  While the
    list is empty the thread backs off briefly instead of spinning.  The
    loop ends only if an unexpected exception is raised, which is printed.
    """
    detail_url_list = settings.detail_list_url
    while True:
        try:
            # Checking the length and popping must happen under the lock;
            # otherwise two workers can both see one remaining URL and the
            # slower one pops from an empty list.
            with lock:
                url = detail_url_list.pop() if detail_url_list else None
            if url is None:
                # Nothing to do yet: sleep a little rather than burn CPU in
                # a tight loop while waiting for the producer.
                time.sleep(0.1)
                continue
            print('get detail html started')
            time.sleep(2)
            print('get detail html end')
        except Exception as e:
            print(e)
            print('線程已運行完了')
            break
def get_detail_url():
    """Simulate crawling the article list page.

    Grabs the shared list from ``settings.detail_list_url``, waits four
    seconds, then appends twenty detail-page URLs to it.
    """
    shared_urls = settings.detail_list_url
    print('get detail url started')
    time.sleep(4)
    template = 'http://projectsedu.com/{id}'
    for article_id in range(20):
        shared_urls.append(template.format(id=article_id))
    print('get detail url end')
if __name__ == '__main__':
    began_at = time.time()
    # Launch ten detail-page workers, then the single list-page producer.
    for _ in range(10):
        threading.Thread(target=get_detail_html).start()
    producer = threading.Thread(target=get_detail_url)
    producer.start()
    # Only the producer is waited for before the elapsed time is reported.
    producer.join()
    elapsed = time.time() - began_at
    print('total_time:{}'.format(elapsed))
#通過queue的方式進行線程間同步通信
# -----------------------------------------------------------------------------------------------------------------
from queue import Queue
import time
import threading
def get_detail_html(queue, delay=2):
    """Consume URLs from *queue* forever, simulating a detail-page crawl.

    Args:
        queue: A ``queue.Queue`` of detail-page URLs shared with the producer.
        delay: Seconds to sleep to simulate downloading one page.
    """
    while True:
        # get() blocks for as long as the queue is empty.
        url = queue.get()
        try:
            print('get detail html started')
            time.sleep(delay)
            print('get detail html end')
        finally:
            # Each get() has to be matched by one task_done(); otherwise
            # Queue.join() never sees the item as finished.
            queue.task_done()
def get_detail_url(queue):
    """Simulate crawling the article list page, over and over.

    Each round sleeps two seconds and then puts twenty detail-page URLs on
    *queue*.  The loop never ends.
    """
    url_template = "https://projectsedu.com/{id}"
    while True:
        print('get detail url started')
        time.sleep(2)
        for article_id in range(20):
            queue.put(url_template.format(id=article_id))
        print('get detail url end')
if __name__ == "__main__":
    # Always bound the queue with maxsize so it cannot grow without limit
    # and exhaust memory.
    detail_url_queue = Queue(maxsize=1000)
    thread_detail_url = threading.Thread(
        target=get_detail_url, args=(detail_url_queue,)
    )
    for _ in range(10):
        html_thread = threading.Thread(
            target=get_detail_html, args=(detail_url_queue,)
        )
        html_thread.start()
    # Start the producer; without it nothing is ever put on the queue.
    thread_detail_url.start()
    # task_done() is not called here: it must be called by a consumer, once
    # per item it took with get().  Calling it more times than items were put
    # raises ValueError.
    #
    # join() blocks until every item put so far has been marked done with
    # task_done(); it returns at once if nothing has been enqueued yet.
    detail_url_queue.join()
# qsize()方法返回隊列的大小;empty()方法判斷隊列是否為空,如果為空,get()會阻塞在那裡;full()方法判斷隊列是否已滿,如果已滿,put()方法會阻塞在那裡。