Web scraping has long been an important use of Python, and many websites deploy anti-scraping measures against it. One of them, outright banning any IP address that visits too frequently, is a "hurt the enemy a thousand, lose eight hundred of your own" tactic adopted by many sites; proxy IPs are a way to avoid getting caught by it.
Scraping the IPs and testing their validity
With the page structure analyzed, we can start scraping the IPs. Using the third-party requests and BeautifulSoup4 libraries makes the scraping very convenient; the code is as follows:
import requests
from iptools import header, dict2proxy
from bs4 import BeautifulSoup as Soup


def parse_items(items):
    # List holding one dict of IP information per table row
    ips = []
    for item in items:
        tds = item.find_all('td')
        # Pull the IP, port and type from the corresponding columns
        ip, port, _type = tds[1].text, int(tds[2].text), tds[5].text
        ips.append({'ip': ip, 'port': port, 'type': _type})
    return ips


def check_ip(ip):
    try:
        proxy = dict2proxy(ip)
        url = 'https://www.ipip.net/'
        r = requests.get(url, headers=header, proxies=proxy, timeout=5)
        r.raise_for_status()
    except Exception:
        return False
    else:
        return True


def get_proxies(index):
    # Listing-page URL; the index fills in the page number
    url = 'http://zhimaruanjian.com/%d' % index
    r = requests.get(url, headers=header)
    r.encoding = r.apparent_encoding
    r.raise_for_status()
    soup = Soup(r.text, 'lxml')
    # The first <tr> is the table header row, so drop it
    items = soup.find_all('tr')[1:]
    ips = parse_items(items)
    good_proxies = []
    for ip in ips:
        if check_ip(ip):
            good_proxies.append(ip)
    return good_proxies
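As a quick usage sketch (assuming the listing URL pattern above resolves to a real proxy-list page), the function can be called for a single page index and the surviving proxies printed:

# Usage sketch: relies on the get_proxies() defined above.
if __name__ == '__main__':
    proxies = get_proxies(1)
    for p in proxies:
        print(p['type'], p['ip'], p['port'])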
As mentioned above, I test validity by requesting an IP-lookup site directly, so the IPs collected are essentially guaranteed to be usable.
Writing to a JSON file
The collected IPs can be stored in a JSON file. The json module is easy to use: open a file and write the list to it with the dump method.
import json


def write_to_json(ips):
    # Dump the list of ip-info dicts to disk as pretty-printed JSON
    with open('proxies.json', 'w', encoding='utf-8') as f:
        json.dump(ips, f, indent=4)
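Reading the proxies back is just as simple. A minimal sketch (assuming the proxies.json file produced above) that loads the list and turns the first entry into the proxies mapping requests expects:

import json

def read_from_json(path='proxies.json'):
    # Load the list of ip-info dicts written by write_to_json
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

proxies = read_from_json()
first = proxies[0]
# Same shape that dict2proxy builds: {'http': 'http://1.2.3.4:8080', 'https': ...}
proxy_url = '%s://%s:%s' % (first['type'], first['ip'], first['port'])
requests_proxies = {'http': proxy_url, 'https': proxy_url}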
Writing to MongoDB
Once the data is in a database, retrieving and manipulating it becomes very convenient.
from pymongo import MongoClient as Client


def write_to_mongo(ips):
    client = Client(host='localhost', port=27017)
    db = client['proxies_db']
    coll = db['proxies']
    # Check before inserting so the same IP is not stored twice
    for ip in ips:
        if coll.count_documents({'ip': ip['ip']}) == 0:
            coll.insert_one(ip)
    client.close()
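For completeness, here is a minimal sketch of pulling one stored proxy back out and using it; the database and collection names match the ones above, and pick_proxy is a hypothetical helper rather than part of the original script:

import requests
from pymongo import MongoClient as Client

def pick_proxy():
    # Grab any one proxy document from the collection written above
    client = Client(host='localhost', port=27017)
    doc = client['proxies_db']['proxies'].find_one()
    client.close()
    if doc is None:
        return None
    url = '%s://%s:%s' % (doc['type'], doc['ip'], doc['port'])
    return {'http': url, 'https': url}

proxy = pick_proxy()
if proxy:
    r = requests.get('https://www.ipip.net/', proxies=proxy, timeout=5)
    print(r.status_code)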
After writing, the data can be inspected with RoboMongo.
Using multiple threads
Import the threading module and wrap Thread in a small subclass to arrive at the final code.
get_proxies.py

import json
import time
import threading

import requests
from bs4 import BeautifulSoup as Soup
from pymongo import MongoClient as Client

from proxies_get.iptools import header, dict2proxy


def parse_items(items):
    # List holding one dict of IP information per table row
    ips = []
    for item in items:
        tds = item.find_all('td')
        # Pull the IP, port and type from the corresponding columns
        ip, port, _type = tds[1].text, int(tds[2].text), tds[5].text.lower()
        ips.append({'ip': ip, 'port': port, 'type': _type})
    return ips


def check_ip(ip, good_proxies):
    try:
        pro = dict2proxy(ip)
        url = 'https://www.ipip.net/'
        r = requests.get(url, headers=header, proxies=pro, timeout=5)
        r.raise_for_status()
        print(r.status_code, ip['ip'])
    except Exception:
        pass
    else:
        good_proxies.append(ip)


def write_to_json(ips):
    with open('proxies.json', 'w', encoding='utf-8') as f:
        json.dump(ips, f, indent=4)


def write_to_mongo(ips):
    """Write the proxies into MongoDB."""
    client = Client(host='localhost', port=27017)
    db = client['proxies_db']
    coll = db['proxies']
    # Check before inserting so the same IP is not stored twice
    for ip in ips:
        if coll.count_documents({'ip': ip['ip']}) == 0:
            coll.insert_one(ip)
    client.close()


class GetThread(threading.Thread):
    """A thin wrapper around Thread that scrapes one listing page."""

    def __init__(self, args):
        threading.Thread.__init__(self, args=args)
        self.good_proxies = []

    def run(self):
        # Listing-page URL; the index fills in the page number
        url = 'http://zhimaruanjian.com/%d' % self._args[0]
        # Fetch the listing page
        r = requests.get(url, headers=header)
        r.encoding = r.apparent_encoding
        r.raise_for_status()
        soup = Soup(r.text, 'lxml')
        # The first <tr> is the table header row, so drop it
        items = soup.find_all('tr')[1:]
        ips = parse_items(items)
        threads = []
        for ip in ips:
            # Check each IP in its own thread
            t = threading.Thread(target=check_ip, args=[ip, self.good_proxies])
            t.start()
            time.sleep(0.1)
            threads.append(t)
        [t.join() for t in threads]

    def get_result(self):
        return self.good_proxies


if __name__ == '__main__':
    # The main function also uses multiple threads, one per listing page
    threads = []
    for i in range(1, 30):
        t = GetThread(args=[i])
        t.start()
        time.sleep(10)
        threads.append(t)
    [t.join() for t in threads]
    for t in threads:
        proxies = t.get_result()
        write_to_mongo(proxies)

iptools.py

header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/64.0.3282.186 Safari/537.36'}


def dict2proxy(dic):
    # Build the proxies mapping that requests expects from an ip-info dict
    s = dic['type'] + '://' + dic['ip'] + ':' + str(dic['port'])
    return {'http': s, 'https': s}
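If spawning one Thread per IP feels heavy, the validity checks could also be run on a fixed-size thread pool. This is only an alternative sketch using concurrent.futures, not what the script above does:

from concurrent.futures import ThreadPoolExecutor

def check_all(ips, workers=20):
    # Run check_ip concurrently on a bounded pool instead of one Thread per IP
    good_proxies = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for ip in ips:
            pool.submit(check_ip, ip, good_proxies)
    return good_proxies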
Summary
There is nothing particularly hard about this free proxy IP crawler; the only catch is that the server is a bit weak and readily returns 503, so the request rate has to be throttled. Free proxies also tend to hurt the results you get from them, so a commercial proxy IP service can be used instead for more stability and reliability.
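One simple way to throttle is to retry with a growing pause whenever the listing page answers 503; a rough sketch (the retry count and delay values are arbitrary):

import time
import requests

def get_with_backoff(url, headers, retries=3, delay=5):
    # Retry the request with an increasing pause when the server answers 503
    for attempt in range(retries):
        r = requests.get(url, headers=headers)
        if r.status_code != 503:
            return r
        time.sleep(delay * (attempt + 1))
    return r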