Python爬蟲 | IP池的使用

本文轉載自查看原文 2019-09-06 12:37 1849 爬蟲

一、簡介

- 爬蟲中為什么需要使用代理

　　一些網站會有相應的反爬蟲措施，例如很多網站會檢測某一段時間某個IP的訪問次數，如果訪問頻率太快以至於看起來不像正常訪客，它可能就會禁止這個IP的訪問。所以我們需要設置一些代理IP，每隔一段時間換一個代理IP，就算IP被禁止，依然可以換個IP繼續爬取。

- 代理的分類：

　　正向代理：代理客戶端獲取數據。正向代理是為了保護客戶端防止被追究責任。

　　反向代理：代理服務器提供數據。反向代理是為了保護服務器或負責負載均衡。

- 免費代理ip提供網站

　　http://www.goubanjia.com/

　　西刺代理

　　快代理

- 匿名度：

　　- 透明：知道是代理ip，也會知道你的真實ip

　　- 匿名：知道是代理ip，不會知道你的真實ip

　　- 高匿：不知道是代理ip，不會知道你的真實ip

- 類型:

　　- http:只能請求http開頭的url

　　- https:只能請求https開頭的url

示例

import random

import requests


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
url = 'https://www.baidu.com/s?wd=ip'

# Candidate proxies. requests selects the proxy whose key matches the scheme
# of the requested URL, so every proxy is registered for both "http" and
# "https"; with only an "http" key the https:// URL above would be fetched
# directly, without any proxy.
proxy_list = [
    {'http': '112.115.57.20:3128', 'https': '112.115.57.20:3128'},
    {'http': '121.41.171.223:3128', 'https': '121.41.171.223:3128'},
]

# Pick one proxy at random for this request.
proxy = random.choice(proxy_list)

# The timeout keeps a dead proxy from hanging the script forever.
page_text = requests.get(url=url, headers=headers, proxies=proxy, timeout=10).text

with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

print('over!')

二、IP池

1、免費IP池

　　從西刺代理上面爬取IP，迭代測試能否使用，建立一個自己的代理IP池，隨時更新用來抓取網站數據

import requests
from lxml import etree
import time
import random
from fake_useragent import UserAgent


class GetProxyIP(object):
    """Scrape candidate proxies from xicidaili.com and keep the working ones.

    Every listing page is downloaded through ``self.proxies``. Each ip/port
    row found on a page is probed against Baidu and, when the probe answers
    with HTTP 200, appended to ``proxies.txt`` as ``ip:port``.
    """

    def __init__(self):
        # "{}" is replaced by the page number in main(); without the
        # placeholder every iteration would request the same first page.
        self.url = 'https://www.xicidaili.com/nn/{}'
        # Proxy used to download the listing pages themselves.
        self.proxies = {
            'http': 'http://163.204.247.219:9999',
            'https': 'http://163.204.247.219:9999'}

    def get_random_ua(self):
        """Return a random User-Agent string produced by fake_useragent."""
        return UserAgent().random

    def get_ip_file(self, url):
        """Download one listing page and test every proxy row found on it.

        A page that cannot be fetched is reported and skipped so that a
        single network error does not abort the whole crawl.

        Args:
            url: Address of the listing page to scrape.
        """
        headers = {'User-Agent': self.get_random_ua()}
        try:
            response = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5)
        except requests.RequestException as e:
            print(url, 'Failed', e)
            return

        html = response.content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        # Base xpath: one <tr> node per proxy entry.
        tr_list = parse_html.xpath('//tr')

        # The first row is skipped, as in the original (presumably the table header).
        for tr in tr_list[1:]:
            ip_cells = tr.xpath('./td[2]/text()')
            port_cells = tr.xpath('./td[3]/text()')
            # Rows without both cells cannot describe a proxy; skip them
            # instead of failing with IndexError.
            if not ip_cells or not port_cells:
                continue
            self.test_proxy_ip(ip_cells[0].strip(), port_cells[0].strip())

    def test_proxy_ip(self, ip, port):
        """Probe ``ip:port`` and append it to proxies.txt when it works.

        Args:
            ip: Proxy host as a string.
            port: Proxy port as a string.
        """
        proxies = {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }
        test_url = 'http://www.baidu.com/'
        try:
            res = requests.get(url=test_url, proxies=proxies, timeout=8)
        except requests.RequestException:
            print(ip, port, 'Failed')
            return

        if res.status_code == 200:
            print(ip, ":", port, 'Success')
            with open('proxies.txt', 'a') as f:
                f.write(ip + ':' + port + '\n')

    def main(self):
        """Crawl listing pages 1 to 1000, pausing 5-10 seconds between pages."""
        for i in range(1, 1001):
            url = self.url.format(i)
            self.get_ip_file(url)
            # Random delay between pages.
            time.sleep(random.randint(5, 10))


if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script.
    GetProxyIP().main()

從IP池中取IP，也就是在爬蟲程序中從文件隨機獲取代理IP

import random
import requests


class BaiduSpider(object):
    """Fetch the Baidu home page through a proxy picked at random from proxies.txt."""

    def __init__(self):
        self.url = 'http://www.baidu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        # Attempt counter: get_html() stops once it exceeds 3.
        self.flag = 1

    def get_proxies(self):
        """Return a requests-style proxies dict for one random pool entry.

        Returns:
            A dict with "http" and "https" keys built from one ``ip:port``
            line of proxies.txt.

        Raises:
            IndexError: if proxies.txt holds no ``ip:port`` line.
        """
        with open('proxies.txt', 'r') as f:
            # strip() rather than [:-1]: the last line may have no trailing
            # newline, in which case [:-1] would cut off a port digit.
            # Lines without ":" (e.g. blank lines) are ignored.
            candidates = [line.strip() for line in f if ':' in line]
        ip, port = random.choice(candidates).split(':')[:2]
        return {
            'http': 'http://{}:{}'.format(ip, port),
            'https': 'https://{}:{}'.format(ip, port),
        }

    def get_html(self):
        """Print the page HTML, retrying with a new random proxy on failure.

        At most three attempts are made in total; the count is kept in
        ``self.flag`` and therefore persists across calls.
        """
        while self.flag <= 3:
            proxies = self.get_proxies()
            try:
                html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
            except requests.RequestException:
                print('Retry')
                self.flag += 1
                continue
            print(html)
            return


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.get_html()

2.收費代理API

寫一個獲取收費開放API代理的接口

import requests
from fake_useragent import UserAgent

ua = UserAgent()                        # User-Agent generator from fake_useragent
# One random User-Agent is chosen here and reused for every request below.
headers = {'User-Agent': ua.random}
useragent = headers['User-Agent']


def ip_test(ip):
    """Return True if the proxy ``ip`` can fetch Baidu with HTTP 200.

    Args:
        ip: Proxy address in ``host:port`` form.

    Returns:
        True when the request through the proxy returns status 200.
        False for blank or malformed entries, for any request failure,
        and for any other status code.
    """
    ip_port = ip.strip().split(':')
    # Blank lines and entries without a port cannot be used as a proxy.
    if len(ip_port) < 2 or not ip_port[0] or not ip_port[1]:
        return False

    url = 'http://www.baidu.com/'
    proxies = {
        'http': 'http://{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'https://{}:{}'.format(ip_port[0], ip_port[1]),
    }
    try:
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    except requests.RequestException:
        # A dead or refusing proxy is simply "not usable"; it must not
        # abort the caller's loop over the remaining candidates.
        return False
    return res.status_code == 200


# Fetch proxy IPs from the API
def get_ip_list():
    """Download a batch of proxies from the Kuaidaili API and save the working ones.

    Each ``ip:port`` line in the API response is checked with ip_test();
    entries that pass are appended to proxy_ip.txt, one per line.
    """
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dev.kdlapi.com/api/getproxy/?orderid=946562662041898&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=2'
    html = requests.get(api_url, timeout=10).content.decode('utf-8', 'ignore')
    # splitlines() handles either line-ending style; blank lines (such as
    # the one after a trailing newline) are dropped before testing.
    ip_port_list = [line.strip() for line in html.splitlines() if line.strip()]

    # Open the output file once instead of once per candidate.
    with open('proxy_ip.txt', 'a') as f:
        for ip in ip_port_list:
            if ip_test(ip):
                f.write(ip + '\n')
                # Flush so already-verified proxies survive a later crash.
                f.flush()


if __name__ == '__main__':
    # Run the proxy download only when this file is executed as a script.
    get_ip_list()

3.私密代理

1、語法結構

　　用戶名和密碼由代理服務商在提供API_URL時一併給出，並不是你在該網站註冊、登錄所用的賬號和密碼。

# General form: the key is the URL scheme, the value is a proxy URL of the
# shape scheme://username:password@IP:port.
proxies = {
'協議':'協議://用戶名:密碼@IP:端口號'
}
# The same template written out for both the http and https schemes.
proxies = {
    'http':'http://用戶名:密碼@IP:端口號',
    'https':'https://用戶名:密碼@IP:端口號'
}
# Concrete example with the placeholders filled in.
# NOTE(review): these look like real credentials hard-coded in the text — confirm they are safe to publish.
proxies = {
    'http': 'http://309435365:szayclhp@106.75.71.140:16816',
    'https':'https://309435365:szayclhp@106.75.71.140:16816',
}

# 獲取私密代理的接口
import requests
from fake_useragent import UserAgent

ua = UserAgent()  # User-Agent generator from fake_useragent
# One random User-Agent is chosen here and reused for every request below.
headers = {'User-Agent': ua.random}
useragent = headers['User-Agent']


def ip_test(ip):
    """Return True if the authenticated proxy ``ip`` can fetch the test page.

    The proxy URL is built with the username and password embedded in the
    code, and the test request goes to a CSDN blog page.

    Args:
        ip: Proxy address in ``host:port`` form.

    Returns:
        True when the request through the proxy returns status 200.
        False for blank or malformed entries, for any request failure,
        and for any other status code.
    """
    ip_port = ip.strip().split(':')
    # Blank lines and entries without a port cannot be used as a proxy.
    if len(ip_port) < 2 or not ip_port[0] or not ip_port[1]:
        return False

    url = 'https://blog.csdn.net/qq_34218078/article/details/90901602/'
    proxies = {
        'http': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
        'https': 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1]),
    }

    try:
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    except requests.RequestException as e:
        # A dead or refusing proxy is "not usable"; report it and let the
        # caller continue with the remaining candidates.
        print(e)
        print("錯誤")
        return False

    if res.status_code == 200:
        print("OK")
        return True

    print(res.status_code)
    print("錯誤")
    return False


# Fetch proxy IPs from the API
def get_ip_list():
    """Download a batch of private proxies from the Kuaidaili API and save the working ones.

    Each ``ip:port`` line in the API response is checked with ip_test();
    entries that pass are appended to proxy_ip.txt, one per line.
    """
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=986603271748760&num=1000&signature=z4a5b2rpt062iejd6h7wvox16si0f7ct&pt=1&sep=2'
    html = requests.get(api_url, timeout=10).content.decode('utf-8', 'ignore')
    # splitlines() handles either line-ending style; blank lines (such as
    # the one after a trailing newline) are dropped before testing.
    ip_port_list = [line.strip() for line in html.splitlines() if line.strip()]

    # Open the output file once instead of once per candidate.
    with open('proxy_ip.txt', 'a') as f:
        for ip in ip_port_list:
            if ip_test(ip):
                f.write(ip + '\n')
                # Flush so already-verified proxies survive a later crash.
                f.flush()


if __name__ == '__main__':
    # Run the proxy download only when this file is executed as a script.
    get_ip_list()

思路：

寫一個類；
get_ip() requests請求接口，得到ip和port；
test_ip() 請求某一網站，根據狀態碼，或用 in 判斷響應中是否包含某一內容，來判斷此ip是否可用，返回True或False即可；
save_ip()測試成功后保存;

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python網絡爬蟲(session與ip代理池) 【python3】如何建立爬蟲代理ip池爬蟲IP代理池爬蟲關於ip管理池的應用 python爬蟲實戰（三）--------搜狗微信文章（IP代理池和用戶代理池設定----scrapy） python爬蟲requests使用代理ip Python 爬蟲入門（二）—— IP代理使用 python爬蟲——建立IP池，將可用IP存放到redis Python爬蟲代理池反爬蟲之搭建IP代理池