制作ip地址池


从网上的免费代理列表(http://www.xicidaili.com/nn/1)中筛选出可用、存活时间长、速度快的代理。

 1 from bs4 import BeautifulSoup
 2 import re,time,requests
 3 from requests.exceptions import ReadTimeout,HTTPError,RequestException,ConnectionError
 4 from selenium import webdriver
 5 from selenium.common.exceptions import TimeoutException
 6 def estimate_time(ip_test_date,ip_rest_time):#rest_time+test_date-current_time
 7     new_ip_test_date='20'+ip_test_date+':00'
 8     time_stamp = time.mktime(time.strptime(new_ip_test_date, '%Y-%m-%d %H:%M:%S'))
 9     d_ip_rest_time=re.findall('\d',ip_rest_time)
10     a_ip_rest_time=re.findall('\D',ip_rest_time)
11     if a_ip_rest_time[0]=='分钟':
12         da_ip_rest_time=int(d_ip_rest_time[0])*60
13     elif a_ip_rest_time[0]=='小时':
14         da_ip_rest_time = int(d_ip_rest_time[0]) * 3600
15     else:
16         da_ip_rest_time = int(d_ip_rest_time[0]) * 86400
17     result_time=time_stamp+da_ip_rest_time-time.time()
18     return result_time
def status_code(ip_type, ip_add, ip_port, timeout=2):
    """Probe a proxy by fetching Baidu's home page through it.

    Args:
        ip_type: Proxy scheme, ``'http'`` or ``'https'`` (lower case).
        ip_add: Proxy IP address.
        ip_port: Proxy port.
        timeout: Seconds to wait for the probe request (default 2).

    Returns:
        The HTTP status code of the response (int), or the string
        ``'Timeout'`` if the request failed for any requests-level reason
        (timeout, connection error, HTTP error, ...).
    """
    proxies = {
        "%s" % ip_type: "%s://%s:%s" % (ip_type, ip_add, ip_port)
    }
    # requests picks the proxy entry by the scheme of the target URL. With a
    # fixed https:// URL an "http" entry was never selected, so the request
    # went out directly and the check passed without touching the proxy.
    test_url = "%s://www.baidu.com" % ip_type
    try:
        response = requests.get(test_url, proxies=proxies, timeout=timeout)
    except RequestException:
        # ReadTimeout, HTTPError and ConnectionError all derive from
        # RequestException, so one handler covers every case.
        return 'Timeout'
    return response.status_code
# ---------------------------------------------------------------------------
# Scrape the proxy listing, keep only fast, long-lived, reachable proxies and
# print them grouped by type.
# ---------------------------------------------------------------------------

# Minimum width (percent) required of the "bar_inner fast" bars. Two bars are
# expected per row; going by the original names they are speed and connect time.
MIN_BAR_WIDTH = 79
# Minimum remaining lifetime, in seconds, for a proxy to be kept.
MIN_REMAINING_SECONDS = 1020

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# `options=` is the supported keyword; `chrome_options=` is deprecated.
browser = webdriver.Chrome(options=chrome_options)
try:
    browser.get("http://www.xicidaili.com/nn")
    html = browser.page_source
except TimeoutException:
    print('Time Out')
    # Without the page there is nothing to parse; stop instead of failing
    # later with a NameError on `html`.
    raise SystemExit(1)
finally:
    # Always release the browser process, even when loading failed.
    browser.quit()

dict_http = {}
dict_https = {}
soup = BeautifulSoup(html, 'lxml')

# Compile the row-parsing patterns once, outside the loop.
cell_pattern = re.compile('<td>(.*?)</td>', re.S)
bar_pattern = re.compile('<div class="bar_inner fast" style="width:(.*?)%">', re.S)
place_pattern = re.compile('<a href.*?">(.*?)</a>')

# Walk the actual <tr> rows rather than a fixed number of even indices, so a
# shorter page or different whitespace between rows cannot raise IndexError.
rows = soup.tbody.find_all('tr', recursive=False) if soup.tbody else []
for row in rows:
    need_jiexi = str(row)

    # Attribute-less <td> cells, in order: ip, port, place, type,
    # survival time, last-check date. The header row has none and is skipped.
    items = cell_pattern.findall(need_jiexi)
    if len(items) < 6:
        continue

    ip_place_list = place_pattern.findall(items[2])
    if len(ip_place_list) != 1:
        continue
    ip_place = ip_place_list[0]

    # Both bars must be present and BOTH must exceed the threshold
    # (previously only the second value was compared).
    items2 = bar_pattern.findall(need_jiexi)
    if len(items2) != 2:
        continue
    ip_speed, ip_connect_time = items2
    if not (int(ip_speed) > MIN_BAR_WIDTH and int(ip_connect_time) > MIN_BAR_WIDTH):
        continue

    ip_rest_time = items[4]
    ip_test_date = items[5]
    if estimate_time(ip_test_date, ip_rest_time) <= MIN_REMAINING_SECONDS:
        continue

    ip_type = items[3].lower()
    if ip_type not in ('http', 'https'):
        # Other proxy types are not handled; do not file them under https.
        continue
    ip_add = items[0]
    ip_port = items[1]

    # The network probe is the most expensive filter, so it runs last.
    if status_code(ip_type, ip_add, ip_port) != 200:
        continue

    target = dict_http if ip_type == 'http' else dict_https
    name = 'ip_address_%d' % (len(target) + 1)
    target[name] = [ip_add, ip_port, ip_place]

print('http:', dict_http)
print('https:', dict_https)
代码
1 http: {'ip_address_1': ['27.209.19.71', '61202', '山东淄博'], 'ip_address_2': ['14.112.76.68', '61234', '广东惠州市惠东县'], 'ip_address_3': ['122.114.31.177', '808', '河南郑州'], 'ip_address_4': ['61.135.217.7', '80', '北京'], 'ip_address_5': ['116.55.77.81', '61202', '云南丽江'], 'ip_address_6': ['223.246.238.147', '61202', '安徽宿州'], 'ip_address_7': ['58.216.202.149', '8118', '江苏常州'], 'ip_address_8': ['182.247.75.106', '61202', '云南'], 'ip_address_9': ['39.78.30.207', '61202', '山东'], 'ip_address_10': ['119.191.31.22', '61202', '山东潍坊'], 'ip_address_11': ['125.127.79.4', '61202', '浙江台州市温岭'], 'ip_address_12': ['123.161.153.59', '40435', '河南许昌'], 'ip_address_13': ['121.237.53.10', '61202', '江苏南京'], 'ip_address_14': ['218.66.149.224', '8118', '福建厦门'], 'ip_address_15': ['218.4.46.45', '61202', '江苏苏州'], 'ip_address_16': ['113.128.10.120', '61234', '山东济南'], 'ip_address_17': ['14.112.76.201', '61234', '广东惠州市惠东县'], 'ip_address_18': ['117.64.238.30', '61202', '安徽合肥'], 'ip_address_19': ['113.121.240.77', '61234', '山东德州'], 'ip_address_20': ['180.136.56.33', '61202', '广西桂林'], 'ip_address_21': ['113.122.34.190', '61234', '山东菏泽'], 'ip_address_22': ['125.121.118.86', '6666', '浙江杭州'], 'ip_address_23': ['112.248.7.102', '61234', '山东枣庄'], 'ip_address_24': ['58.209.38.75', '8118', '江苏苏州']}
2 https: {}
输出形式如下

纯100%自己做的,累死我了


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM