寫了個爬蟲代理ip的腳本給大家使用
一、代碼
import requests
from lxml.html import etree
# Fetch the first page of the kuaidaili free-proxy listing and build one
# single-entry dict per table row, mapping the text of column 4 to "ip:port".
url = 'http://www.kuaidaili.com/free/'
# Without a timeout requests waits forever on a stalled connection.
rp = requests.get(url, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty proxy list.
rp.raise_for_status()
rp_html = etree.HTML(rp.text)

# XPath expressions for table columns 1 (IP), 2 (port) and 4 (HTTP/HTTPS).
ip_xpath = '//*[@id="list"]/table/tbody/tr/td[1]/text()'
port_xpath = '//*[@id="list"]/table/tbody/tr/td[2]/text()'
http_or_https_xpath = '//*[@id="list"]/table/tbody/tr/td[4]/text()'

# Evaluate each expression; every call returns the matching text nodes.
ip_list = rp_html.xpath(ip_xpath)
port_list = rp_html.xpath(port_xpath)
http_or_https_list = rp_html.xpath(http_or_https_xpath)

# Combine the three columns row by row; zip stops at the shortest list.
list_zip = zip(ip_list, port_list, http_or_https_list)
# NOTE(review): requests looks proxies up by lowercase URL scheme
# ("http"/"https"); if the page lists the protocol in upper case, lowercase
# the key before passing one of these dicts as `proxies=` -- confirm against
# the live page.
proxy_list = [
    {http_or_https: f'{ip}:{port}'}
    for ip, port, http_or_https in list_zip
]
print(proxy_list)
#proxy_list就是結果啦,你們可以用random模塊隨機選一個進行後續的爬取
#一頁不夠嘛?那我們就爬十頁
#先看規則
'''
第一頁:https://www.kuaidaili.com/free/inha/1/
第二頁: https://www.kuaidaili.com/free/inha/2/
後面就不用說了吧
'''
http://www.kuaidaili.com/free/
這個ip代理網站不錯哈