import random
import requests
from fake_useragent import UserAgent
from retrying import retry
import hashlib  # message digest (md5)
import queue  # queues
import re  # regular expressions
from urllib import robotparser  # parse a site's robots.txt file
from urllib.parse import urlparse, urljoin, urldefrag  # parse urls
from threading import Thread  # multithreading
from datetime import datetime
import time
from day03 import mongo_cache
MAX_DEP = 2  # maximum crawl depth
def get_robots(url):
    '''
    Parse the site's robots.txt file
    :param url:
    :return:
    '''
    rp = robotparser.RobotFileParser()
    # the leading slash resolves robots.txt against the site root
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    return rp
def save_url(html_content,url_str):
    '''
    Save the downloaded content to disk
    :param html_content:
    :param url_str:
    :return:
    '''
    # md5-based file naming, kept as an alternative to the url-based name below:
    # md5 = hashlib.md5()
    # md5.update(html_content)
    # file_path = './download/' + md5.hexdigest() + '.html'
    file_path = './download/' + gen_html_name(url_str) + '.html'
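    # note: the './download' directory must already exist; open() will not create it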
with open(file_path,'wb') as f:
f.write(html_content)
def gen_html_name(url_str):
    '''
    Use the last segment of the url path as the file name
    '''
    path = urlparse(url_str).path
    path_array = path.split('/')
    return path_array[-1]
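# e.g. gen_html_name('http://www.runoob.com/css3/css3-tutorial.html') returns 'css3-tutorial.html'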
def extractor_url_lists(html_content):
    '''
    Extract the other links contained in a page
    :param html_content:
    :return:
    '''
url_regex = re.compile('<a[^>]+href=["\'](.*?)["\']',re.IGNORECASE)
return url_regex.findall(html_content)
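# For example, extractor_url_lists('<a href="/css3/css3-border.html">Border</a>')
# returns ['/css3/css3-border.html']; the relative link here is an illustrative value,
# later completed into an absolute url by CrawlerCommon.nomalize().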
class CrawlerCommon(Thread):
    '''
    A general-purpose crawler covering the basic crawling workflow plus a few counter-measures against anti-crawling techniques
    '''
def __init__(self,init_url):
super(CrawlerCommon,self).__init__()
        __ua = UserAgent()  # random User-Agent generator
        self.seed_url = init_url  # seed url where crawling starts
        self.crawler_queue = queue.Queue()  # a FIFO queue gives BFS; a LIFO queue would give DFS
        self.crawler_queue.put(init_url)  # enqueue the seed url
        self.visited = {init_url: 0}  # the seed url starts at crawl depth 0
        self.rp = get_robots(init_url)  # robots.txt parser
        self.headers = {'User-Agent': __ua.random}  # pick a random user-agent
        self.link_regex = '(index|view)'  # filter pattern for extracted urls
        self.throttle = Throttle(2.0)  # download throttle with a 2-second delay
        self.mcache = mongo_cache.MongoCache()  # initialize the Mongo cache
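        # mongo_cache.MongoCache is used like a dict in this file: `url in self.mcache`,
        # `self.mcache[url]` and `self.mcache[url] = content` (see save_result and run below)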
@retry(stop_max_attempt_number=3)
def retry_download(self,url_str,data,method,proxies):
        '''
        Download with retries, implemented with the retrying decorator
        :param url_str:
        :param data:
        :param method:
        :param proxies:
        :return:
        '''
if method == 'POST':
result = requests.post(url_str,data=data,headers = self.headers,proxies=proxies)
else:
result = requests.get(url_str,headers=self.headers,timeout=3,proxies=proxies)
        assert result.status_code == 200  # assert that the response status code is 200
return result.content
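    # Note: the assert above raises AssertionError on a non-200 response, and the retrying
    # decorator treats any exception as a failed attempt, so each request is tried at most
    # 3 times (stop_max_attempt_number=3) before the error propagates to download().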
    def download(self, url_str, data=None, method='GET', proxies=None):
        '''
        The method that actually performs the download
        '''
        print('download url is :::::', url_str)
        try:
            # a random proxy could be plugged in here, e.g.:
            # ip = random.choice(['27.155.84.233:8081','61.135.217.7:80','183.47.40.35:8088','123.244.148.5:60230'])
            # proxies = {"http": ip}
            # print(proxies)
            result = self.retry_download(url_str, data, method, proxies)
        except Exception as e:
            print(e)
            result = None
        return result
def nomalize(self,url_str):
        '''
        Complete a relative link into an absolute url
        :param url_str:
        :return:
        '''
real_url,_ = urldefrag(url_str)
return urljoin(self.seed_url,real_url)
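    # For example, with seed_url = 'http://www.runoob.com/css3/css3-tutorial.html',
    # nomalize('/css3/css3-border.html#top') returns
    # 'http://www.runoob.com/css3/css3-border.html' (urldefrag drops the fragment;
    # the relative link is an illustrative value).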
def save_result(self,html_content,url_str):
        '''
        Save the result to the database, checking first whether the content already exists
        :param html_content: the downloaded binary content
        :param url_str: url of the downloaded page
        :return: None
        '''
if url_str not in self.mcache:
self.mcache[url_str] = html_content
else:
data_from_mongo = self.mcache[url_str]
            # initialize the md5 algorithm
            md5_func_mongo = hashlib.md5()
            md5_func_download = hashlib.md5()
            # md5 digest of the record stored in the database
            md5_func_mongo.update(data_from_mongo)
            mongo_md5_str = md5_func_mongo.hexdigest()
            # md5 digest of the downloaded data
            md5_func_download.update(html_content)
            download_md5_str = md5_func_download.hexdigest()
            # compare the downloaded content with the database copy
if download_md5_str != mongo_md5_str:
self.mcache[url_str] = html_content
def run(self):
        '''
        Main method that carries out the crawl
        :return:
        '''
        while not self.crawler_queue.empty():
            url_str = self.crawler_queue.get()
            # check the rules in the robots.txt file
            if self.rp.can_fetch(self.headers['User-Agent'], url_str):
                self.throttle.wait(url_str)
                depth = self.visited[url_str]
                if depth < MAX_DEP:
                    # download the page
                    html_content = self.download(url_str)
                    # store the result and extract new links only if the download succeeded
                    if html_content is not None:
                        self.mcache[url_str] = html_content
                        save_url(html_content, url_str)
                        # extract all links from the page
                        url_list = extractor_url_lists(html_content.decode('utf-8'))
                        # keep only the links we want to crawl
                        filter_urls = [link for link in url_list if re.search('/(css3)', link)]
                        for url in filter_urls:
                            # complete the link
                            real_url = self.nomalize(url)
                            # skip links that have already been visited
                            if real_url not in self.visited:
                                self.visited[real_url] = depth + 1
                                self.crawler_queue.put(real_url)
            else:
                print('robots.txt forbids downloading:', url_str)
class RandomProxy(object):
    '''
    Random proxy pool
    '''
def __init__(self):
self.proxies = []
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0"
        }
def crawl_proxies(self):
        '''
        Crawl and collect proxies
        :return:
        '''
        # placeholder addresses; a real implementation would scrape a proxy list site
        self.proxies.append('192.168.1.1')
        self.proxies.append('192.168.1.2')
def verify_proxies(self):
        '''
        Check whether each proxy is still usable
        :return:
        '''
        invalid_ip = []
        for ip_str in self.proxies:
            proxies = {'http': ip_str}
            try:
                r = requests.get('http://www.baidu.com', proxies=proxies, headers=self.headers, timeout=3)
                if r.status_code != 200:
                    invalid_ip.append(ip_str)
            except requests.RequestException:
                # a dead proxy raises an exception instead of returning a status code
                invalid_ip.append(ip_str)
        for remove_ip in invalid_ip:
            self.proxies.remove(remove_ip)
def get_one_proxy(self):
return random.choice(self.proxies)
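# A sketch of how RandomProxy could be wired into CrawlerCommon.download (an assumption,
# not enabled in this file; compare the commented-out proxy lines inside download()):
# proxy_pool = RandomProxy()
# proxy_pool.crawl_proxies()
# proxy_pool.verify_proxies()
# result = crawler.download(url_str, proxies={'http': proxy_pool.get_one_proxy()})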
class Throttle(object):
    '''
    Download throttle (rate limiter)
    '''
def __init__(self,delay):
self.domains = {}
self.delay = delay
def wait(self,url_str):
        domain = urlparse(url_str).netloc  # take the domain part (netloc) of the url
        last_down = self.domains.get(domain)  # time of the last download from this domain
        if self.delay > 0 and last_down is not None:
            # subtract the last download time from the current time to get the interval
            # between the two downloads, then subtract that interval from the delay.
            # if the result is positive, sleep; otherwise download the next link right away
            sleep_sec = self.delay - (datetime.now() - last_down).seconds
            if sleep_sec > 0:
                time.sleep(sleep_sec)
        self.domains[domain] = datetime.now()  # store the current time keyed by domain
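    # For example, with delay=2.0: if the previous download from www.runoob.com finished
    # about 1 second ago, wait() sleeps roughly 1 more second before the next request to
    # that domain; requests to other domains are not delayed.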
if __name__ == "__main__":
crawler = CrawlerCommon('http://www.runoob.com/css3/css3-tutorial.html')
crawler.run()
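    # Since CrawlerCommon subclasses Thread, crawler.start() followed by crawler.join()
    # could be used instead to run the crawl in a background thread; calling run()
    # directly, as above, executes it synchronously in the main thread.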