# 直接上代碼,每行代碼後面都有註釋。
"""Scrape free proxies from xicidaili.com, verify them, and store them in Redis.

The script runs top to bottom:

1. Download listing pages 2-5 and collect every ``<td>`` cell whose text
   starts with a digit.
2. Pair the collected cells into ``{"http": "ip:port"}`` mappings.
3. Probe every proxy concurrently (one thread per proxy).
4. Push the proxies that answered onto the Redis list ``myIP``.
"""
import json
import random
import re
import socket
import threading
import time
import urllib
import urllib.request

import redis

# Replace the host with your own Redis server address.
# ``encoding`` is the non-deprecated spelling of the old ``charset`` argument.
r = redis.Redis(host='192.168.60.112', port=6379, db=0, encoding='utf-8')

# Captures the text of every <td> cell whose content starts with a digit.
# Compiled once (raw string, so ``\d`` is not an invalid string escape).
TD_PATTERN = re.compile(r'<td>(\d.*?)</td>')

# Seconds a proxy may take to answer before it is considered dead.
CHECK_TIMEOUT = 5

# --- Step 1: scrape the proxy listing pages --------------------------------
ip_totle = []
for page in range(2, 6):
    url = 'http://www.xicidaili.com/nn/%s' % page
    # Send a browser-like User-Agent with the listing request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(request) as response:
        content = response.read().decode('utf-8')
    print('get page', page)
    ip_totle.extend(TD_PATTERN.findall(content))
    # Pause 1 or 2 seconds between page requests.
    time.sleep(random.choice(range(1, 3)))

# --- Step 2: normalise the scraped cells -----------------------------------
# The cells are consumed in groups of four: IP, port, and two further cells
# that are not used here (the original author's notes label them speed and
# verification time -- not verifiable from this script alone).  zip() stops
# at the shorter slice, so a trailing incomplete group is skipped instead of
# raising IndexError.
proxys = [
    {"http": host + ':' + port}
    for host, port in zip(ip_totle[0::4], ip_totle[1::4])
]

# Proxies that passed verification; appended to by test() under ``lock``.
valid_proxys = []
# Serialises printing and updates of ``valid_proxys`` across worker threads.
lock = threading.Lock()


def test(i):
    """Check whether ``proxys[i]`` can fetch the probe URL.

    Prints the outcome and, on success, records the proxy in
    ``valid_proxys``.  Any exception raised while connecting or reading is
    reported as a failed proxy rather than propagated, so a single bad
    proxy cannot kill its worker thread with a traceback.

    The request timeout comes from the process-wide socket default that the
    script sets once before starting the worker threads.

    Args:
        i: Index into the module-level ``proxys`` list.
    """
    url = "https://www.baidu.com/"
    proxy = proxys[i]
    address = proxy["http"]
    # ProxyHandler picks a proxy by URL scheme.  The probe URL is https, so
    # an "https" entry is required -- with only "http" the request would
    # bypass the proxy and every proxy would look healthy.
    handler = urllib.request.ProxyHandler({"http": address, "https": address})
    opener = urllib.request.build_opener(handler)
    opener.addheaders = [("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)")]
    try:
        # Use this thread's own opener.  install_opener() would replace the
        # single global opener and make concurrent threads use each other's
        # proxies.
        with opener.open(url) as res:
            res.read()
    except Exception as e:
        with lock:
            print(proxy, e)
    else:
        with lock:
            print(proxy, 'is OK')
            valid_proxys.append(proxy)


# --- Step 3: verify all proxies concurrently -------------------------------
# The default socket timeout is process-wide, so set it once here instead of
# from inside every worker thread.
socket.setdefaulttimeout(CHECK_TIMEOUT)

threads = [threading.Thread(target=test, args=[i]) for i in range(len(proxys))]
for thread in threads:
    thread.start()
# Block the main thread until every worker has finished.
for thread in threads:
    thread.join()

# --- Step 4: store the verified proxies in Redis ---------------------------
# redis-py only accepts bytes/str/int/float values, so each proxy mapping is
# serialised to a JSON string and pushed as its own list element.  LPUSH needs
# at least one value, hence the guard.
# NOTE(review): only proxies that passed verification are stored; confirm that
# consumers of 'myIP' do not expect the unverified list.
if valid_proxys:
    r.lpush('myIP', *(json.dumps(proxy) for proxy in valid_proxys))