import requests
from lxml import etree
import time
import random
import csv


def test_ip(ip_address):
    '''
    Test whether each proxy IP actually works.
    :param ip_address: list of proxies mappings, e.g. {'http': 'http://1.2.3.4:8080'}
    '''
    url = 'http://icanhazip.com/'
    headers = {
        # request headers
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    ip_pool = []
    for ip_test in ip_address:
        # print(ip_test)
        try:
            response = requests.get(url=url, headers=headers, proxies=ip_test, timeout=5)
            # status_code is an int, so compare against 200, not the string '200'
            if response.status_code == 200:
                ip_pool.append(ip_test)
        except requests.RequestException:
            # proxy is dead or too slow; skip it
            pass
        # throttle requests whether the proxy worked or not
        time.sleep(random.randint(2, 8))
    print(ip_pool)
    files_save(ip_pool)


def files_save(ip_list):
    '''
    Save the working proxy IPs to a CSV file.
    :param ip_list: list of verified proxies mappings
    :return:
    '''
    # newline='' prevents the csv module from writing blank rows on Windows
    with open('./代理ip.csv', 'a+', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # one proxies mapping per row
        for ip in ip_list:
            writer.writerow([ip])


def get_page_data(nums):
    '''
    Fetch the proxy listings from the xicidaili pages.
    :param nums: number of pages to crawl
    :return:
    '''
    ip_list = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0',
    }
    for i in range(1, nums + 1):
        url = "https://www.xicidaili.com/nn/{}".format(i)
        response = requests.get(url=url, headers=headers)
        page_data = etree.HTML(response.text)
        # https rows only:
        # https_infos = page_data.xpath(".//tr[@class='odd']")
        # http rows only:
        # http_infos = page_data.xpath(".//tr[@class='']")
        page_infos = page_data.xpath(".//tr[@class='odd']|.//tr[@class='']")
        for info in page_infos:
            ip_dict = {}
            ip_address = info.xpath(".//td[2]/text()")[0]
            ip_port = info.xpath(".//td[3]/text()")[0]
            ip_type = info.xpath(".//td[6]/text()")[0].lower()
            # build a requests-style proxies mapping, e.g. {'http': 'http://1.2.3.4:8080'}
            ip_dict[ip_type] = ip_type + '://' + ip_address + ':' + ip_port
            ip_list.append(ip_dict)
    # print(ip_list)
    test_ip(ip_list)


if __name__ == '__main__':
    '''
    When crawling proxy IPs, remember to:
      - test whether each IP actually works
      - limit the crawl speed
    URL analysis:
        page    url
        1       https://www.xicidaili.com/nn/
        2       https://www.xicidaili.com/nn/2
        3       https://www.xicidaili.com/nn/3
    '''
    # nums = int(input("Enter the number of pages to crawl >> "))
    nums = 2
    get_page_data(nums)
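
# --- Usage sketch (not part of the original script; the address below is a
# made-up placeholder) ---
# Each row saved by files_save() is a proxies mapping such as
# {'http': 'http://1.2.3.4:8080'}, which can be passed straight back to
# requests when reused:
#
#     proxies = {'http': 'http://1.2.3.4:8080'}  # hypothetical proxy
#     resp = requests.get('http://icanhazip.com/', proxies=proxies, timeout=5)
#     print(resp.text.strip())  # prints the proxy's IP when it is working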