从网上的免费代理列表(http://www.xicidaili.com/nn/1)中筛选出可用、存活时间长、速度快的代理。

"""Scrape the xicidaili free-proxy list and keep proxies that look usable.

A row is kept only if both "fast" bars are wide enough, the estimated
remaining lifetime is long enough, and a test request sent through the proxy
returns HTTP 200.  Surviving proxies are printed grouped by type.
"""
import re
import time

import requests
from bs4 import BeautifulSoup
from requests.exceptions import ReadTimeout, HTTPError, RequestException, ConnectionError
from selenium import webdriver
from selenium.common.exceptions import TimeoutException

PROXY_LIST_URL = "http://www.xicidaili.com/nn"
# Both bar widths (in percent) must be strictly greater than this.
MIN_BAR_WIDTH = 79
# Minimum estimated remaining lifetime, in seconds.
MIN_REMAINING_SECONDS = 1020

# Seconds per lifetime unit; any other unit is treated as days (see below).
_UNIT_SECONDS = {'分钟': 60, '小时': 3600}
_SECONDS_PER_DAY = 86400

# Compiled once instead of on every row.
_LIFETIME_PATTERN = re.compile(r'\s*(\d+)\s*(\D+)')
_TD_PATTERN = re.compile('<td>(.*?)</td>', re.S)
_BAR_PATTERN = re.compile('<div class="bar_inner fast" style="width:(.*?)%">', re.S)
_PLACE_PATTERN = re.compile('<a href.*?">(.*?)</a>')


def estimate_time(ip_test_date, ip_rest_time):
    """Return ``test_date + rest_time - now`` in seconds.

    Args:
        ip_test_date: Timestamp text in the form ``'YY-MM-DD HH:MM'``.
        ip_rest_time: Lifetime text: an integer followed by a unit, e.g.
            ``'15分钟'``, ``'3小时'`` or ``'2天'``.  Units other than minutes
            and hours are counted as days, as the original ``else`` branch did.

    Returns:
        Seconds as a float; negative when the computed moment is in the past.

    Raises:
        ValueError: If either argument cannot be parsed.
    """
    new_ip_test_date = '20' + ip_test_date + ':00'
    # NOTE(review): mktime interprets the timestamp in the machine's local
    # timezone -- confirm that this matches the timezone used by the site.
    time_stamp = time.mktime(time.strptime(new_ip_test_date, '%Y-%m-%d %H:%M:%S'))

    # Capture the whole number and the whole unit.  The previous per-character
    # findall never matched a two-character unit and used only the first digit.
    match = _LIFETIME_PATTERN.match(ip_rest_time)
    if match is None:
        raise ValueError('unrecognised lifetime: %r' % (ip_rest_time,))
    amount = int(match.group(1))
    unit = match.group(2).strip()
    da_ip_rest_time = amount * _UNIT_SECONDS.get(unit, _SECONDS_PER_DAY)

    return time_stamp + da_ip_rest_time - time.time()


def status_code(ip_type, ip_add, ip_port):
    """Send a test request through the proxy and report the outcome.

    Args:
        ip_type: Lower-case proxy type, ``'http'`` or ``'https'``.
        ip_add: Proxy IP address.
        ip_port: Proxy port.

    Returns:
        The integer HTTP status code, or the string ``'Timeout'`` if the
        request failed for any reason.
    """
    # Requests picks the proxy by the scheme of the URL being fetched, so the
    # test URL must use the same scheme as the key registered here; otherwise
    # the request bypasses the proxy entirely.  The proxy itself is addressed
    # over plain HTTP (HTTPS traffic is tunnelled through it with CONNECT).
    proxies = {ip_type: "http://%s:%s" % (ip_add, ip_port)}
    try:
        response = requests.get("%s://www.baidu.com" % ip_type,
                                proxies=proxies, timeout=2)
    except RequestException:
        # ReadTimeout, HTTPError and ConnectionError are all subclasses.
        return 'Timeout'
    return response.status_code


def _fetch_page_source(url):
    """Return the rendered HTML of ``url`` via headless Chrome, or None on timeout."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # NOTE(review): newer Selenium releases expect ``options=`` instead of
    # ``chrome_options=`` -- confirm against the installed version.
    browser = webdriver.Chrome(chrome_options=chrome_options)
    try:
        browser.get(url)
        return browser.page_source
    except TimeoutException:
        return None
    finally:
        # Always release the browser process, even when loading fails.
        browser.quit()


def _parse_row(row_html):
    """Return ``(ip_type, ip_add, ip_port, ip_place)`` for a good row, else None."""
    items = _TD_PATTERN.findall(row_html)
    if len(items) < 6:
        # Header rows (and anything malformed) carry no plain <td> cells.
        return None

    ip_place_list = _PLACE_PATTERN.findall(items[2])
    if len(ip_place_list) != 1:
        return None

    bars = _BAR_PATTERN.findall(row_html)
    if len(bars) != 2:
        return None

    try:
        # Both bars must clear the threshold (the old expression only
        # compared the second one).
        if not (int(bars[0]) > MIN_BAR_WIDTH and int(bars[1]) > MIN_BAR_WIDTH):
            return None
        if not estimate_time(items[5], items[4]) > MIN_REMAINING_SECONDS:
            return None
    except ValueError:
        return None

    return items[3].lower(), items[0], items[1], ip_place_list[0]


html = _fetch_page_source(PROXY_LIST_URL)
if html is None:
    print('Time Out')
    # Nothing to parse; stop here instead of failing later on an undefined name.
    raise SystemExit(1)

dict_http = {}
dict_https = {}
soup = BeautifulSoup(html, 'lxml')
table_body = soup.tbody
# Walk whatever rows are present rather than assuming exactly 100 of them.
rows = table_body.find_all('tr') if table_body is not None else []
for row in rows:
    parsed = _parse_row(str(row))
    if parsed is None:
        continue
    ip_type, ip_add, ip_port, ip_place = parsed
    # The live check is the slowest step, so it runs last.
    if status_code(ip_type, ip_add, ip_port) != 200:
        continue
    target = dict_http if ip_type == 'http' else dict_https
    name = 'ip_address_%d' % (len(target) + 1)
    target[name] = [ip_add, ip_port, ip_place]

print('http:', dict_http)
print('https:', dict_https)

运行结果:
http: {'ip_address_1': ['27.209.19.71', '61202', '山东淄博'], 'ip_address_2': ['14.112.76.68', '61234', '广东惠州市惠东县'], 'ip_address_3': ['122.114.31.177', '808', '河南郑州'], 'ip_address_4': ['61.135.217.7', '80', '北京'], 'ip_address_5': ['116.55.77.81', '61202', '云南丽江'], 'ip_address_6': ['223.246.238.147', '61202', '安徽宿州'], 'ip_address_7': ['58.216.202.149', '8118', '江苏常州'], 'ip_address_8': ['182.247.75.106', '61202', '云南'], 'ip_address_9': ['39.78.30.207', '61202', '山东'], 'ip_address_10': ['119.191.31.22', '61202', '山东潍坊'], 'ip_address_11': ['125.127.79.4', '61202', '浙江台州市温岭'], 'ip_address_12': ['123.161.153.59', '40435', '河南许昌'], 'ip_address_13': ['121.237.53.10', '61202', '江苏南京'], 'ip_address_14': ['218.66.149.224', '8118', '福建厦门'], 'ip_address_15': ['218.4.46.45', '61202', '江苏苏州'], 'ip_address_16': ['113.128.10.120', '61234', '山东济南'], 'ip_address_17': ['14.112.76.201', '61234', '广东惠州市惠东县'], 'ip_address_18': ['117.64.238.30', '61202', '安徽合肥'], 'ip_address_19': ['113.121.240.77', '61234', '山东德州'], 'ip_address_20': ['180.136.56.33', '61202', '广西桂林'], 'ip_address_21': ['113.122.34.190', '61234', '山东菏泽'], 'ip_address_22': ['125.121.118.86', '6666', '浙江杭州'], 'ip_address_23': ['112.248.7.102', '61234', '山东枣庄'], 'ip_address_24': ['58.209.38.75', '8118', '江苏苏州']}
https: {}
纯100%自己做的,累死我了