在scrapy爬蟲項目中,經常遇到爬取數據時報錯卻無法及時處理,導致數據爬取不完整,只能事後查看log才能發現報錯。
首先寫一個簡單的郵件發送模塊
"""
@file: emailHandler.py
"""
# Email service wrapper
import smtplib
from email.mime.text import MIMEText
from email.utils import formataddr


class EmailHandler(object):
    """Send plain-text alert e-mails through QQ or 163 SMTP over SSL."""

    def __init__(self, user, password, type=0):
        """
        :param user: str, sender mailbox address (login name)
        :param password: str, SMTP authorization code issued by QQ/163
        :param type: int, 0 for QQ mail, 1 for 163 mail
        :raises ValueError: if *type* is neither 0 nor 1
        """
        # SMTP-over-SSL endpoints.
        # FIX: the original used port 25 for 163, but 25 is the plain-SMTP
        # port — smtplib.SMTP_SSL cannot negotiate TLS there and the
        # connection fails. 163's SMTP-over-SSL port is 465, same as QQ's.
        self.__QQ = {'smtp': 'smtp.qq.com', 'port': 465}
        self.__163 = {'smtp': 'smtp.163.com', 'port': 465}
        self.user = user
        self.password = password
        if type == 0:
            conf = self.__QQ
        elif type == 1:
            conf = self.__163
        else:
            # Fail fast instead of leaving self.server unset (the original
            # would raise AttributeError only later, inside send_mail).
            raise ValueError('type must be 0 (QQ) or 1 (163)')
        self.server = smtplib.SMTP_SSL(conf['smtp'], conf['port'])
        self.server.login(self.user, self.password)

    def send_mail(self, To, subject, content):
        """
        :param To: str, recipient mailbox address
        :param subject: str, mail subject
        :param content: str, mail body (plain UTF-8 text)
        :return: bool, True on success, False on failure
        """
        try:
            msg = MIMEText(content, 'plain', 'utf-8')
            msg['From'] = formataddr(['spider郵件報警系統', self.user])
            msg['To'] = formataddr(['', To])
            msg['Subject'] = subject

            self.server.sendmail(self.user, To, msg.as_string())
            print("【%s】郵件發送成功" % subject)
            return True
        except Exception:
            # Best-effort: report failure to the console, never crash the spider.
            print("【%s】郵件發送失敗,請檢查信息" % subject)
            return False
需要指定以下幾個參數
# Mail account settings consumed by the alert sender.
MAIL_CONFIG = {
    'user': 'xxxxx',              # sender mailbox account
    'password': 'xxxx',           # SMTP authorization code for the sender
    'to_add': 'xxx',              # recipient mailbox address
    'mail_title': 'scrapy_標題',  # subject line of the alert mail
}
本項目中主要使用的 pydispatch模塊 綁定信號的方式發送郵件(代碼片段)
from pydispatch import dispatcher

# Sentinel signal object: fired when the spider hits an error.
err_spider = object()


def __init__(self):
    # Counter so that at most one alert mail is sent per spider run.
    self.mail_count = 0
    # Route err_spider signals to self.send_mail.
    dispatcher.connect(self.send_mail, signal=err_spider)
    super(xxx, self).__init__()


def send_mail(self, error):
    """Send an alert mail to the configured mailbox when the spider errors."""
    if self.mail_count >= 1:
        return  # already alerted once — stay quiet
    mailer = EmailHandler(mail_conf.get('user', ''),
                          mail_conf.get('password', ''))
    mailer.send_mail(mail_conf.get('to_add', ''),
                     mail_conf.get('mail_title', ''),
                     'spider出現錯誤請及時查看\r%s' % error)
    self.mail_count += 1
準備工作已經完成,接下來就是在scrapy爬取數據出現問題時,調用這個模塊向指定郵箱發送郵件(代碼片段)。
# List-page data
def parse(self, response):
    # Entries on the list page.
    data_lists = response.xpath('//div[@id="listbox30"]/div')
    # (item field name, xpath relative to the entry div) — extracted in order.
    field_paths = (
        ('img', 'div[2]/div/a/img/@data-src'),                          # cover image
        ('introduction', 'div[1]/div/div[2]/text()'),                   # summary
        ('source', 'div[1]/div/div[3]/div[1]/span[1]/a/text()'),        # source
        ('release_time', 'div[1]/div/div[3]/div[1]/span[3]/text()'),    # publish time
        ('read_count', 'div[1]/div/div[3]/div[2]/span[2]/text()'),      # read count
        ('comment_count', 'div[1]/div/div[3]/div[2]/span[5]/text()'),   # comment count
    )
    try:
        # The last div holds the pagination controls, so skip it here.
        for data in data_lists[:-1]:
            item = WangdaitianyanItem()
            item['title'] = data.xpath('div[1]/div/div[1]/a/@title').extract_first()
            log.msg('[info] 正在爬取【%s】' % (item['title']), level=log.INFO)
            for key, path in field_paths:
                item[key] = data.xpath(path).extract_first()
            # Follow the detail page, e.g. //news.p2peye.com/article-513444-1.html
            url = data.xpath('div[1]/div/div[1]/a/@href').extract_first()
            yield scrapy.Request(url='http:%s' % url,
                                 callback=self.details_page,
                                 meta={'item': item})
        # Pagination: if no "next page" link is found, stop requesting.
        try:
            next_page = data_lists[-1].xpath(
                'div/a[contains(@title,"下一頁")]/@href').extract_first()
            yield scrapy.Request(url='https://news.p2peye.com%s' % next_page,
                                 callback=self.parse)
        except Exception as e:
            pass  # deliberate best-effort: missing next page ends the crawl
    except Exception as e:
        # Any scraping error triggers the alert-mail signal with the traceback.
        dispatcher.send(signal=err_spider, error=traceback.format_exc())
當爬蟲出現問題時會以郵件的形式發送到郵箱