Scraping proxy IPs with Python3 requests and verifying their availability


import requests
from lxml import etree


# Save the validated proxies to a local file
def write_proxy(proxies):
    print(proxies)
    # Open the file once and append every proxy, one per line
    with open("ip_proxy.txt", "a+") as f:
        for proxy in proxies:
            print("Writing:", proxy)
            f.write(proxy + "\n")
    print("All proxies saved!")


# Parse the page and extract the proxy IPs it lists
def get_proxy(html):
    selector = etree.HTML(html)
    proxies = []
    # Proxy rows on xicidaili.com carry class 'odd' or an empty class
    for each in selector.xpath("//tr[@class='odd'] | //tr[@class='']"):
        # The IP address sits in the second cell, the port in the third
        ip = each.xpath("./td[2]/text()")[0]
        port = each.xpath("./td[3]/text()")[0]
        # Join into "ip:port" form and collect
        proxies.append(ip + ":" + port)
    # Report how many proxies this page yielded
    print(len(proxies))
    test_proxies(proxies)


# Verify each harvested proxy by fetching Baidu through it and
# checking the response status code.
def test_proxies(proxies):
    url = "http://www.baidu.com/"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    normal_proxies = []
    count = 1
    for proxy in proxies:
        print("Testing proxy #%s..." % count)
        count += 1
        try:
            # Prefix the scheme so the proxy URL is valid for requests
            response = requests.get(url, headers=header,
                                    proxies={"http": "http://" + proxy},
                                    timeout=1)
            if response.status_code == 200:
                print("Proxy works:", proxy)
                normal_proxies.append(proxy)
            else:
                print("Proxy returned a bad status:", proxy)
        except Exception:
            print("Proxy is unreachable:", proxy)
    write_proxy(normal_proxies)


# Fetch a listing page and hand the HTML to the parser
def get_html(url):
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
    }
    response = requests.get(url, headers=header)
    get_proxy(response.text)


if __name__ == "__main__":
    # Crawl the first three listing pages in turn
    base_url = "http://www.xicidaili.com/nn/%s"
    for i in range(1, 4):
        url = base_url % i
        get_html(url)
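Validating proxies sequentially is slow, since every dead proxy costs a full timeout before the next one is tried. Below is a minimal threaded sketch of the same check using concurrent.futures from the standard library; check_proxy and test_proxies_concurrently are hypothetical names introduced here, and the 1-second timeout mirrors the original test_proxies.

import requests
from concurrent.futures import ThreadPoolExecutor

URL = "http://www.baidu.com/"
HEADER = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
}

# Hypothetical helper: return the proxy if Baidu answers 200 through it, else None
def check_proxy(proxy):
    try:
        response = requests.get(URL, headers=HEADER,
                                proxies={"http": "http://" + proxy},
                                timeout=1)
        if response.status_code == 200:
            return proxy
    except Exception:
        pass
    return None

# Probe up to 20 proxies at once; dead proxies now overlap their
# timeouts instead of paying for them one after another.
def test_proxies_concurrently(proxies, workers=20):
    with ThreadPoolExecutor(max_workers=workers) as pool:
        results = list(pool.map(check_proxy, proxies))
    return [p for p in results if p is not None]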


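Once ip_proxy.txt has been written, the saved proxies can be reused for real requests. A minimal usage sketch follows; http://httpbin.org/ip is an arbitrary test endpoint chosen here because it echoes the caller's IP, so a working proxy should appear in the response instead of your own address.

import random
import requests

# Load the proxies saved by write_proxy (one "ip:port" per line)
with open("ip_proxy.txt") as f:
    proxies = [line.strip() for line in f if line.strip()]

# Pick one at random and route a request through it
proxy = random.choice(proxies)
response = requests.get(
    "http://httpbin.org/ip",
    proxies={"http": "http://" + proxy},
    timeout=5,
)
print(response.text)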