想要做個新聞文本識別分類的項目,就先寫了個爬取百度新聞的爬蟲。
環境:win7 32 bit python3.4 若干第三方庫
可以實現的功能:定期按照百度新聞的分類抓取新聞的標題,所屬類別及文本內容,並自動存入數據庫(MySQL),同時發郵件到自己郵箱提醒。
缺陷:因新聞來源不同,網頁編碼不同,會出現少量的亂碼現象;存入數據庫未添加自動去重功能(自己手動去重其實也並不難,所以沒去研究這個)
STEP1: creat_dbtable.py 連接數據庫並創建表(也可直接通過 MySQL 操作完成)
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 6 23:31:33 2016

@author: Administrator

Create (or re-create) the `news` table in the MySQL `test` database.

The table holds one row per scraped article:
    class -- news category name (short, non-null)
    title -- article headline
    text  -- plain-text article body
"""
import MySQLdb

# Open the database connection.  passwd is a placeholder -- replace it
# with your own MySQL password before running.
db = MySQLdb.Connect(host="localhost", user="root", passwd='你的密碼',
                     db="test", use_unicode=True, charset="utf8")
try:
    cursor = db.cursor()
    # Drop any previous version of the table so the script is re-runnable.
    cursor.execute("DROP TABLE IF EXISTS news")
    # Create the data table.
    sql = """CREATE TABLE news(
             class VARCHAR(10) NOT NULL,
             title VARCHAR(100),
             text VARCHAR(15000)
             )"""
    cursor.execute(sql)
finally:
    # Always release the connection, even if table creation fails.
    db.close()
在MySQL看到表已經生成:
step2:為了了解每次的抓取情況,寫一個 send_email.py 來實現發送郵件的功能,這個模塊會在 spider 主文件中被調用。
NOTE:要往自己的郵箱發送郵件,需先在相應郵箱開啟 SMTP 服務並獲取一個授權碼(password)才可以,這方面的網上教程比較多,之後有空會補充。
# coding:utf-8
"""Send a plain-text status report to the author's mailbox via QQ SMTP.

NOTE: QQ mail requires an app-specific authorization password obtained
from the mailbox settings -- the normal login password will not work.
"""
from email.header import Header
from email.mime.text import MIMEText
from email.utils import parseaddr, formataddr
import smtplib


def _format_addr(s):
    """Return "Name <addr>" with the display name RFC 2047-encoded as UTF-8."""
    name, addr = parseaddr(s)
    return formataddr((Header(name, 'utf-8').encode(), addr))


def send_ms(T):
    """E-mail the report text *T* (str) as a UTF-8 plain-text message.

    Raises smtplib.SMTPException on login/delivery failure; the SMTP
    connection is closed in all cases.
    """
    from_addr = "1021550072@qq.com"
    password = 'your-password'  # app-specific authorization code, see NOTE above
    to_addr = '1021550072@qq.com'
    smtp_server = 'smtp.qq.com'

    msg = MIMEText(T, 'plain', 'utf-8')
    msg['From'] = _format_addr('Anyone')
    msg['To'] = _format_addr('Echo')
    msg['Subject'] = Header('The New Report', 'utf-8').encode()

    server = smtplib.SMTP_SSL(smtp_server, 465, timeout=10)
    try:
        server.set_debuglevel(0)
        server.login(from_addr, password)
        server.sendmail(from_addr, [to_addr], msg.as_string())
    finally:
        # Quit even if login/sendmail raised, so the SSL socket is released.
        server.quit()

# send_ms(T)
step3:創建spider.py文件,實現具體功能。
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 6 21:24:27 2016

@author: Administrator

Scrape Baidu news by category: collect each category's article titles
and URLs, fetch the article text, store (class, title, text) rows into
the MySQL `news` table, then e-mail a summary of the run.
"""
import re
import time

import requests
from bs4 import BeautifulSoup
import MySQLdb

import send_email

start = time.time()

# Open the database connection (replace passwd with your own).
db = MySQLdb.Connect(host="localhost", user="root", passwd='password',
                     db="test", use_unicode=True, charset="utf8")
cursor = db.cursor()

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36"}

# Extracts the declared charset from an HTML <meta> tag, e.g.
# <meta http-equiv=... charset=gb2312">.  Compiled once, reused per page.
_CHARSET_PA = re.compile(r'charset=(.*?)">')

# Strips ASCII letters/digits/punctuation and whitespace escapes from the
# scraped page text, leaving mostly CJK article content.
_CLEAN_PA = re.compile(r"[A-Za-z0-9\[\`\~\!\@\#\$\ \^\"\-\+\_\\&\\n\\t\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\!\@\#\\\&\*\%]")


def get_head_data():
    """Fetch and return the Baidu news portal front page HTML (gbk)."""
    head_url = 'http://internet.baidu.com/'
    data = requests.get(head_url, headers=headers)
    data.encoding = 'gbk'
    return data.text


def get_class(head_data):
    """Return {category title: category URL} parsed from the front page.

    The slices reproduce the original hand-tuned trimming of non-category
    links; set() de-duplicates (order is therefore unspecified).
    """
    title_href = {}
    pa = re.compile(r'<a href="(http.*?.com/).*?>.*?(\w+)</a></li>')
    ma = re.findall(pa, head_data)[1:-7]
    ma = list(set(ma))[:-1]
    for href, title in ma:
        title_href[title] = href
    return title_href


def get_class_data(class_url):
    """Return {article title: article URL} for one category page.

    The page's own declared charset is honoured so mixed-encoding
    sources decode correctly.  Titles shorter than 11 chars and
    download links are filtered out.
    """
    resp = requests.get(class_url, headers=headers)
    resp.encoding = _CHARSET_PA.findall(resp.text)[0]
    soup = BeautifulSoup(resp.text, 'lxml')
    anchors = soup.findAll('a', {'target': '_blank'})
    class_data = {}
    for a in anchors:
        title = a.get_text()
        href = a.get('href')
        if len(title) > 10 and '下載' not in title:
            class_data[title] = href
    return class_data


def get_news_text(href):
    """Fetch one article and return its roughly-cleaned text, or None.

    Best-effort: any fetch/parse failure yields None rather than
    aborting the whole crawl.
    """
    try:
        data = requests.get(href, headers=headers)
        data.encoding = _CHARSET_PA.findall(data.text)[0]
        raw = BeautifulSoup(data.text, 'lxml').get_text()
        return _CLEAN_PA.sub("", raw)
    except Exception:
        # Narrowed from a bare except: still best-effort, but no longer
        # swallows KeyboardInterrupt/SystemExit.
        return None


head_data = get_head_data()
title_href = get_class(head_data)

count = 0
for class_title, class_href in dict(title_href).items():
    print(class_title)
    try:
        class_data = get_class_data(class_href)
    except Exception:
        # A broken category page should not kill the whole run.
        continue
    for news_title, news_url in class_data.items():
        text = get_news_text(news_url)
        sql = """INSERT INTO news\
 SET class=%s, title=%s, text=%s"""
        try:
            cursor.execute(sql, (class_title, news_title, text))
            db.commit()
            count += 1
        except MySQLdb.Error:
            # e.g. over-long title/text; discard this row and keep the
            # connection usable for the next one.
            db.rollback()

db.close()

end = time.time()
total_time = end - start
T1 = '本次抓取耗時%s' % str(total_time)
T2 = ' & 本次共抓取%s條新聞' % str(count)
T = T1 + T2
send_email.send_ms(T)
數據庫存儲情況:
郵件詳情:
REMARK:關於windows定時任務,請參考這篇教程。
這是我自己計划任務的設置和運行情況