1. Crawler module overview
The crawler module collects the handful of free proxies that some websites publish openly. Very few of the proxies scraped this way turn out to be usable: grab 20 free proxies from a site's front page and maybe 1 or 2 survive testing, because free proxies tend to be short-lived and are certainly less stable than paid ones.
Since the usable yield from a single site is so small, the remaining lever is quantity, i.e. raise the number of proxy sites we crawl. If one site yields about 20 free proxies at a 10% usable rate, that is 2 usable proxies; crawl 10 sites and we get roughly 20.
Put that way it sounds easy: add a few dozen proxy sites and the usable count becomes quite respectable. In practice there is more to it. First, you have to find that many proxy sites that actually expose some free proxies and can be scraped, and you have to pay the cost of writing scraper code for each of them. Second, you have to consider how stable each site is, or you may finish the code only to find the site has suddenly gone down...
This is just one approach to scraping free proxies that I happen to know. Whatever the final result and however many proxies end up usable, it is only one of the strategies for coping with anti-crawling measures; what ultimately matters is whether it meets our needs.
2. Implementation approach
To crawl multiple proxy websites we inevitably have to maintain a separate piece of code for each site, with each one kept independent.
After writing two or three of these site scrapers I noticed, with some dismay, quite a bit of duplicated code; for instance the requests call gets rewritten for every single site.
So the idea is to factor the requests logic out into one place, which both cuts down the duplication and makes it easy to plug in new proxy sites later.
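Conceptually, the split described above looks like the sketch below (a simplified illustration of the pattern, not the final code; the real implementation in section 3 adds retries, fake headers, logging and proxy validation, and the site URL here is a hypothetical placeholder):
import requests

class Base:
    """Shared request logic; subclasses only provide a url and a parse() method."""
    url = ''

    def crawl(self):
        html = requests.get(self.url, timeout=10).text
        yield from self.parse(html)

class SomeProxySite(Base):
    url = 'https://example.com/free-proxy-list'  # hypothetical proxy website

    def parse(self, html):
        # site-specific extraction logic lives here
        yield '1.2.3.4:8080'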
Following that line of thought, let's get down to business.
The directory structure below covers the crawler module, i.e. the entire scraping logic.
proxypool # project name
│ context.py # project environment setup -- for running from the Windows command line
│ getter.py # crawler module (entry point) # 3.3
│ __init__.py
├─crawler # the whole proxy-site crawling module
│ │ base.py # generic request/crawl base class # 3.1
│ │ __init__.py
│ │
│ ├─proxysite # proxy websites; each .py file in this directory maintains one site
│ │ │ proxy_89ip.py
│ │ │ proxy_ip3366.py
│ │ │ proxy_ipihuan.py
│ │ │ proxy_seofangfa.py
│ │ │ proxy_shenjidaili.py
│ │ │ __init__.py # returns this directory's absolute path, the argument pkgutil needs # 3.4.3
├─untils # other modules
│ │ parse.py # proxy validation helpers
│ │ loggings.py # logging wrapper class # 3.4.1
│ │ __init__.py
├─...
The # numbers after the comments refer to the outline section numbers below.
3. Code implementation
Environment: Python 3.9.1, Redis 3.5.3
Third-party dependencies: requests, fake_headers, retrying, loguru, pyquery
3.1 Generic request/crawl base class
import requests
from fake_headers import Headers
from retrying import retry
from proxypool.untils.parse import is_valid_proxy
from requests.exceptions import ConnectionError

try:
    from proxypool.untils.loggings import Logging
    logging = Logging()
except ImportError:
    from loguru import logger as logging

Exceptions = (
    TimeoutError,
    AssertionError,
    ConnectionError,
)


class Base(object):
    """
    A generic base class for requesting and scraping proxy websites.
    Instance variables:
        - url:            # the URL to crawl, i.e. the proxy website
        - proxies:        # proxies for the request (can be set per subclass, i.e. in each proxy-site scraper)
        - isvalid = True  # marks whether the proxy website is usable; if False, the site is skipped
                          # (never requested) the next time the program starts
    Decorator:
        - @retry(...):    # retry settings for requesting the proxy website
            :param retry_on_result: retry condition, triggered when website_response returns None
            :param stop_max_attempt_number: retry at most 2 times
            :param wait_exponential_multiplier: minimum wait time
            :param wait_exponential_max: maximum wait time; there is a formula behind it, see the retrying docs
    """
    url = ""
    proxies = None
    isvalid = True

    def __init__(self):
        # suppress the insecure-request warnings caused by verify=False
        requests.packages.urllib3.disable_warnings()
        self.logger = logging

    @retry(stop_max_attempt_number=2, retry_on_result=lambda x: x is None,
           wait_exponential_multiplier=1000,
           wait_exponential_max=10000)
    def website_response(self, url, **kwargs):
        """
        A generic request method.
        Args:
            - url:     # the proxy-website URL to crawl
            - kwargs:  # per-request customisation
        Other variables:
            - headers  # anti-crawling disguise; if the fake_headers package cannot be installed
                       # (its source may be blocked in some regions), build a headers dict by hand.
              Example:
              headers = {'Accept': '*/*', 'Connection': 'keep-alive',
                         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4; '
                                       'rv:52.7.3) Gecko/20100101 Firefox/52.7.3',
                         'DNT': '1', 'Referer': 'https://google.com', 'Pragma': 'no-cache'}
            - proxies  # route the request through a proxy, e.g. a local one:
              proxies = {
                  'http': 'http://127.0.0.1:1080',
                  'https': 'https://127.0.0.1:1080',
              }
        """
        try:
            headers = Headers(headers=True).generate()
            kwargs.setdefault('timeout', 10)
            kwargs.setdefault('verify', False)  # verify defaults to True
            kwargs.setdefault('headers', headers)
            # whether to go through a proxy when crawling the proxy website
            if self.proxies is not None:
                kwargs.setdefault('proxies', self.proxies)
            res = requests.get(url, **kwargs)
            # an HTTP 200 response means the proxy website is considered healthy
            if res.status_code == 200:
                res.encoding = 'utf-8'
                return res.text
        except Exceptions:
            return None

    @logging.catch
    def crawl(self):
        """
        Scrape one proxy website.
        1. Call self.website_response and bind the response text to html.
        2. Call the parse method written in the subclass, i.e. the scraping logic maintained per site.
        3. Check each proxy with is_valid_proxy; only valid ones pass, otherwise None is returned.
        4. Yield the valid proxies.
        """
        url = self.url
        self.logger.info(f'Request URL: {url}')
        html = self.website_response(url)
        for proxy in self.parse(html):
            proxy = is_valid_proxy(proxy)
            if proxy is not None:
                self.logger.info(f"Fetching proxy: {proxy} from {url}")
                yield proxy
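On the retry timing: as far as I understand the retrying package's docs, the exponential wait is computed as 2 ** attempt_number * wait_exponential_multiplier milliseconds, capped at wait_exponential_max. A tiny illustration of that formula (an approximation for reference, not code from the project):
def exponential_wait_ms(attempt_number, multiplier=1000, maximum=10000):
    # retrying-style exponential backoff: 2^n * multiplier, capped at maximum
    return min(2 ** attempt_number * multiplier, maximum)

print([exponential_wait_ms(n) for n in range(1, 6)])  # [2000, 4000, 8000, 10000, 10000]
With stop_max_attempt_number=2, only a single retry actually happens here.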
3.2 Proxy website scraper classes
Only some of the proxy websites are shown below; to add more later, just follow the same template.
If you want to run one of these files directly and see output, comment out the final yield in the Base.crawl method in base.py.
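Alternatively, you can leave base.py untouched and drive the generator from the subclass's __main__ block instead, for example with proxy_89ip from 3.2.1 below:
if __name__ == '__main__':
    test = proxy_89ip()
    for proxy in test.crawl():  # iterating the generator makes the logging lines inside crawl() run
        print(proxy)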
3.2.1 www.89ip.cn
# proxypool/crawler/proxysite/proxy_89ip.py
from pyquery import PyQuery as pq
from proxypool.crawler.base import Base


class proxy_89ip(Base):
    url = 'https://www.89ip.cn/index_1.html'

    def parse(self, html):
        doc = pq(html)
        hosts = doc('.layui-table td:nth-child(1)').text().split(' ')
        ports = doc('.layui-table td:nth-child(2)').text().split(' ')
        for host, port in zip(hosts, ports):
            yield f'{host.strip()}:{port.strip()}'


if __name__ == '__main__':
    test = proxy_89ip()
    test.crawl()
3.2.2 www.ip3366.net
# proxypool/crawler/proxysite/proxy_ip3366.py
from pyquery import PyQuery as pq
from proxypool.crawler.base import Base


class proxy_ip3366(Base):
    url = 'http://www.ip3366.net/?stype=1&page=1'

    def parse(self, html):
        doc = pq(html)
        hosts = doc('.table td:nth-child(1)').text().split(' ')
        ports = doc('.table td:nth-child(2)').text().split(' ')
        for host, port in zip(hosts, ports):
            yield f'{host.strip()}:{port.strip()}'


if __name__ == '__main__':
    test = proxy_ip3366()
    test.crawl()
3.2.3 ip.ihuan.me
# proxypool/crawler/proxysite/proxy_ipihuan.py
from pyquery import PyQuery as pq
from proxypool.crawler.base import Base


class proxy_ipihuan(Base):
    url = 'https://ip.ihuan.me/'
    isvalid = False

    def parse(self, html):
        doc = pq(html)
        hosts = doc('.table td:nth-child(1)').text().split(' ')
        ports = doc('.table td:nth-child(2)').text().split(' ')
        for host, port in zip(hosts, ports):
            yield f'{host.strip()}:{port.strip()}'


if __name__ == '__main__':
    test = proxy_ipihuan()
    test.crawl()
3.2.4 proxy.seofangfa.com
# proxypool/crawler/proxysite/proxy_seofangfa.py
from pyquery import PyQuery as pq
from proxypool.crawler.base import Base


class proxy_seofangfa(Base):
    url = 'https://proxy.seofangfa.com/'
    # proxies = {
    #     'http': 'http://127.0.0.1:1080',
    #     'https': 'https://127.0.0.1:1080',
    # }

    def parse(self, html):
        doc = pq(html)
        hosts = doc('.table td:nth-child(1)').text().split(' ')
        ports = doc('.table td:nth-child(2)').text().split(' ')
        for host, port in zip(hosts, ports):
            yield f'{host.strip()}:{port.strip()}'


if __name__ == '__main__':
    test = proxy_seofangfa()
    test.crawl()
3.2.5 shenjidaili.com
# proxypool/crawler/proxysite/proxy_shenjidaili.py
from pyquery import PyQuery as pq
from proxypool.crawler.base import Base


class proxy_shenjidaili(Base):
    url = 'http://www.shenjidaili.com/product/open/'
    isvalid = False

    def parse(self, html):
        doc = pq(html)
        proxies = doc('.table td:nth-child(1)').text().split(' ')
        for proxy in proxies:
            yield f'{proxy}'


if __name__ == '__main__':
    test = proxy_shenjidaili()
    test.crawl()
When debugging the code from 3.2.1, remember to comment out the final yield in the parent Base.crawl method (or iterate the generator as shown above), otherwise running the file directly returns nothing.
Running proxy_89ip.py produces output like the following:
3.3 Crawler module (entry point)
# proxypool/getter.py
import context
import pkgutil
import inspect
from proxypool.crawler.base import Base
from proxypool.crawler.proxysite import crawlerPath
from loguru import logger


def get_classes():
    """
    Load all callable objects found under the given directory.
    return: all class objects defined in the proxysite package
    """
    classes = []
    for loader, name, is_pkg in pkgutil.walk_packages([crawlerPath]):
        # get the module object
        module = loader.find_module(name).load_module(name)
        for _class, _class_object in inspect.getmembers(module, callable):
            # filter the callables, keeping only the useful ones:
            # subclasses of Base (excluding Base itself) that are marked valid
            if inspect.isclass(_class_object) and issubclass(_class_object, Base) \
                    and _class_object is not Base and _class_object.isvalid:
                classes.append(_class_object)
    return classes


classes = get_classes()


class Getter(object):
    def __init__(self):
        self.classes = [cls() for cls in classes]
        self.in_storage_count = 0
        self.logger = logger

    @logger.catch
    def run(self):
        if len(self.classes):
            for cls in self.classes:
                self.logger.info(f'Get the proxy instance object: {cls}')
                for proxy in cls.crawl():
                    # .... write code
                    # .... save proxy to local or redis
                    # .........
                    self.logger.info(f"Fetched proxy successfully: {proxy}")


if __name__ == '__main__':
    test = Getter()
    test.run()
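A side note on get_classes: loader.find_module(name).load_module(name) is deprecated in newer Python 3 releases (it still works on 3.9, just with a DeprecationWarning). A rough equivalent using importlib, assuming the same proxysite package layout, could look like this sketch:
import pkgutil
import inspect
import importlib

from proxypool.crawler.base import Base
from proxypool.crawler.proxysite import crawlerPath


def get_classes():
    """Import every module under proxysite by its dotted name and collect the Base subclasses."""
    classes = []
    prefix = 'proxypool.crawler.proxysite.'
    for _, name, _ in pkgutil.iter_modules([crawlerPath]):
        module = importlib.import_module(prefix + name)
        for _, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, Base) and obj is not Base and obj.isvalid:
                classes.append(obj)
    return classes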
The output of getter.py looks like the following:
3.4 Other modules
3.4.1 Logging wrapper class
# proxypool/untils/loggings.py
import sys
import time
from loguru import logger
from pathlib import Path

# whether to enable writing log files
OPEN_LOG = True


class Logging(object):
    """
    Logging wrapper (singleton).
    """
    _instance = None
    _log = OPEN_LOG

    def __new__(cls, *arg, **kwargs):
        if cls._instance is None:
            cls._instance = object.__new__(cls, *arg, **kwargs)
        return cls._instance

    def __init__(self):
        if self._log:
            self.log()

    def info(self, msg):
        return logger.info(msg)

    def debug(self, msg):
        return logger.debug(msg)

    def error(self, msg):
        return logger.error(msg)

    def exception(self, msg):
        return logger.exception(msg)

    @classmethod
    def catch(cls, func):
        @logger.catch
        def decorator(*args, **kwargs):
            return func(*args, **kwargs)
        return decorator

    def log(self):
        """
        Write log files under a log/ directory in the directory the project is run from.
        """
        if self._log:
            t = time.strftime('%Y_%m_%d')
            present_path = sys.path[0]
            p = Path(present_path).resolve()
            log_path = p.joinpath('log')
            logger.add(f'{log_path}/crawl_{t}.log',
                       level='ERROR',  # only record ERROR level and above
                       enqueue=True,
                       rotation='00:00',
                       retention='1 months',
                       compression='tar.gz',
                       encoding='utf-8',
                       backtrace=True)
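For reference, a minimal usage sketch of this class (run from the project root). Note that __new__ caches the instance, but __init__ still runs on every Logging() call, so the file sink registered in log() is added once per instantiation:
from proxypool.untils.loggings import Logging

log = Logging()
print(log is Logging())                   # True -- the same instance is reused
log.info('printed to the console only')   # below the ERROR threshold of the file sink
log.error('also appended to log/crawl_<date>.log')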
3.4.2 Proxy format validation
# proxypool/untils/parse.py
try:
    from proxypool.untils.loggings import Logging
    logging = Logging()
except ImportError:
    from loguru import logger as logging

Exceptions = (
    ValueError,
    AssertionError
)


def bytes_convert_string(data):
    """
    Convert bytes to str.
    Example: b'123' ---> '123'
    """
    if data is None:
        return None
    elif isinstance(data, bytes):
        return data.decode('utf8')


def is_valid_proxy(ip_port):
    """
    Validate the format of a proxy.
    :param ip_port: a string of the form {ip}:{port}
    Examples:
        valid proxy:   27.191.60.60:3256
        invalid proxy: 299.299.299.299:123 or 1.2.4.8:66666
    """
    if ip_port is None:
        return
    elif isinstance(ip_port, str):
        try:
            ip_port_list = ip_port.split(':')
            if len(ip_port_list) == 2:
                port = ip_port_list.pop()
                if not port.isdigit():
                    return
                assert 1 <= int(port) <= 65535
                ip_list = ip_port_list
                ip_str = ",".join(ip_list)
                li = ip_str.split('.')
                if len(li) == 4:
                    _ip = [int(s) for s in li if 0 < int(s) <= 254]
                    if len(_ip) == 4:
                        return ip_port
        except Exceptions:  # e.g. int('a') --> ValueError
            logging.error(f'ip not valid -- {ip_port}')


if __name__ == '__main__':
    by = b'27.191.60.60:325611'
    ip = bytes_convert_string(by)
    is_valid_proxy(ip)
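The hand-rolled octet check above also rejects addresses with a 0 or 255 in any octet (for example 10.0.0.1). If stricter IP validation is ever wanted, the standard-library ipaddress module can take over that part; a sketch, not part of the original project:
import ipaddress


def is_valid_proxy_v2(ip_port):
    """Return ip_port if it looks like '<IPv4>:<port>', otherwise None."""
    if not isinstance(ip_port, str) or ip_port.count(':') != 1:
        return None
    ip, _, port = ip_port.partition(':')
    try:
        ipaddress.IPv4Address(ip)  # raises AddressValueError for malformed IPs
        if port.isdigit() and 1 <= int(port) <= 65535:
            return ip_port
    except ipaddress.AddressValueError:
        pass
    return None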
3.4.3 __init__.py
# proxypool/crawler/proxysite/__init__.py
import os.path

# the absolute path of the proxysite directory, passed to pkgutil as the place to scan
crawlerPath = os.path.dirname(__file__)

__all__ = ["crawlerPath"]
3.4.4 context.py
# proxypool/context.py
import sys
from pathlib import Path

# put the project's parent directory at the front of sys.path so that
# `proxypool.*` imports resolve when a module is run directly from the command line
sys.path.insert(0, str(Path(sys.path[0], '..').resolve()))