Most of us have heard of the famous crawling framework Scrapy. It is a wrapper built on top of the Twisted framework, so everything runs on asynchronous calls and crawling is fast. Below is a quick walk-through of how a Scrapy project is put together.
First, install Scrapy. On Python 3.x the installation can fail because the Twisted dependency is not compatible with the interpreter version in use; in that case install Twisted separately from a pre-built wheel and then run pip3 install scrapy, which should now succeed.
If you are working on Windows, remember to install pywin32 as well (pip3 install pypiwin32).
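Once the installation finishes, a quick sanity check from the Python interpreter confirms that both packages import cleanly (a minimal check, nothing project-specific):

import scrapy
import twisted

print(scrapy.__version__)
print(twisted.__version__)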
With the installation done we can create a new project. In a terminal, change to the directory where you keep your crawler projects (for example, e:\python\spider) and run
scrapy startproject ArticleSpider
to create the project. Once the project exists, generate a spider from the default template with
scrapy genspider jobbole blog.jobbole.com
where the first argument is the spider name and the second is the domain it will crawl.
That completes the project setup.
The project directory structure is as follows.
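A sketch of the typical layout that startproject and genspider generate:

ArticleSpider/
    scrapy.cfg
    ArticleSpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            jobbole.py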
scrapy.cfg is the top-level control file and normally does not need to be touched
settings.py holds the project configuration
pipelines.py contains the logic for storing the scraped data
middlewares.py holds the middlewares
items.py defines the data types of the scraped fields
the spiders folder holds the spider files
the spider file (jobbole.py) is created automatically when the template is generated; it is mainly responsible for parsing the fields and URLs out of the pages
Now let's get to work: we will parse every article under http://blog.jobbole.com/all-posts/ into fields such as title, date, URL, and so on, and then save the parsed data into a remote MySQL database.
First, configure the domain and the start URL.
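The genspider template produces a spider class with the name, allowed domain, and a start URL; after pointing start_urls at the all-posts listing it looks roughly like this (a sketch of the generated class, not shown in full):

import scrapy

class JobboleSpider(scrapy.Spider):
    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']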
Then define the crawl rules and the fields to parse.
Here CSS selectors are used to extract the fields; the same thing could be done with XPath (jobbole.py):
from urllib import parse
from scrapy.http import Request

# inside the spider class
def parse(self, response):
    # collect the links to the individual articles on the listing page
    post_nodes = response.css('#archive .floated-thumb .post-thumb a')
    for post_node in post_nodes:
        # cover image URL of the article
        image_url = post_node.css('img::attr(src)').extract_first("")
        post_url = post_node.css('a::attr(href)').extract_first("")
        yield Request(url=parse.urljoin(response.url, post_url),
                      meta={'front_image_url': image_url},
                      callback=self.parse_detail)
    # grab the link to the next page so the spider keeps crawling the following pages automatically
    next_url = response.css('.next.page-numbers::attr(href)').extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)
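parse.urljoin is used so that both absolute and relative hrefs resolve correctly against the page currently being parsed; a quick illustration (example values only):

from urllib import parse

# an absolute href comes back unchanged
parse.urljoin('http://blog.jobbole.com/all-posts/', 'http://blog.jobbole.com/110287/')
# -> 'http://blog.jobbole.com/110287/'

# a relative href is resolved against the current page URL
parse.urljoin('http://blog.jobbole.com/all-posts/', 'page/2/')
# -> 'http://blog.jobbole.com/all-posts/page/2/'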
In items.py we define the data to scrape: the fields, their types and formats, and any default handling.
import scrapy
import datetime
import re
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join


class ArticleItemLoader(ItemLoader):
    # change the default output behaviour: return a single value instead of a list
    default_output_processor = TakeFirst()


def date_convert(value):
    # convert create_date into a date object (module-level function, same level as the classes)
    try:
        create_date = datetime.datetime.strptime(value, '%Y/%m/%d').date()
    except Exception as e:
        create_date = datetime.datetime.now().date()
    return create_date


def get_nums(value):
    # pull the number out of strings such as the praise count (module-level)
    match_re = re.match(r'.*?(\d+).*', value)
    if match_re:
        nums = int(match_re.group(1))
    else:
        nums = 0
    return nums


def remove_tag_comment(value):
    # drop the comment count that shows up among the tags (module-level)
    if '評論' in value:
        return ''
    else:
        return value


def return_value(value):
    # identity processor that overrides the default TakeFirst, so the image URL stays a list;
    # Scrapy's images pipeline expects a list of URLs and raises an error otherwise (module-level)
    return value


class JobBoleArticleItem(scrapy.Item):
    # title, date, url, url_obj_id, image url, image path, praise count, bookmark count,
    # comment count, tags, body
    title = scrapy.Field()
    create_date = scrapy.Field(
        # convert the raw date string into a date object
        input_processor=MapCompose(date_convert)
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_tag_comment),
        # join the tag list into a comma-separated string on output
        output_processor=Join(',')
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        # build the SQL statement and parameters used to insert this item into the database
        # (method of the item class)
        insert_sql = '''
            insert into jobbole_article(title, url, url_object_id, create_date, fav_nums, front_image_url,
                front_image_path, praise_nums, comment_nums, tags, content)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE content=VALUES(content)
        '''
        front_image_url = ""
        # there may be several images; the list has to be flattened to a string before the insert,
        # so only the first image URL is stored
        if self['front_image_url']:
            front_image_url = self['front_image_url'][0]
        params = (
            self["title"], self["url"], self["url_object_id"], self["create_date"], self["fav_nums"],
            front_image_url, self["front_image_path"], self["praise_nums"], self["comment_nums"],
            self["tags"], self["content"]
        )
        return insert_sql, params
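Since the processors are plain module-level functions, they can be sanity-checked on their own before being wired into the item (example inputs only):

from ArticleSpider.items import date_convert, get_nums, remove_tag_comment

print(date_convert('2015/07/22'))    # datetime.date(2015, 7, 22)
print(get_nums('2 收藏'))             # 2
print(remove_tag_comment('3 評論'))   # '' (comment counts are filtered out of the tags)
print(remove_tag_comment('職場'))     # ordinary tags pass through unchanged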
In jobbole.py we then define how each of these fields is parsed:
# imports needed at the top of jobbole.py (the second path assumes the util package
# was created inside the ArticleSpider package)
from ArticleSpider.items import JobBoleArticleItem, ArticleItemLoader
from ArticleSpider.util.common import get_md5

# inside the spider class
def parse_detail(self, response):
    # extracting every field one by one with CSS selectors (not recommended: the code gets
    # long and hard to maintain)
    # title = response.css('.entry-header h1::text').extract()[0]
    # create_date = response.css('p.entry-meta-hide-on-mobile::text').extract()[0].strip().replace('·', '').strip()
    # praise_nums = response.css('.vote-post-up h10::text').extract()[0]
    # fav_nums = response.css('.bookmark-btn::text').extract()[0]
    # match_re = re.match('.*?(\d+).*', fav_nums)
    # if match_re:
    #     fav_nums = int(match_re.group(1))
    # else:
    #     fav_nums = 0
    # comment_nums = response.css('a[href="#article-comment"] span::text').extract()[0]
    # match_re = re.match('.*?(\d+).*', comment_nums)
    # if match_re:
    #     comment_nums = int(match_re.group(1))
    # else:
    #     comment_nums = 0
    #
    # content = response.css('div.entry').extract()[0]
    # tag_list = response.css('p.entry-meta-hide-on-mobile a::text').extract()
    # tag_list = [element for element in tag_list if not element.strip().endswith('評論')]
    # tags = ','.join(tag_list)
    #
    # article_item['url_object_id'] = get_md5(response.url)
    # article_item['title'] = title
    # article_item['url'] = response.url
    # try:
    #     create_date = datetime.datetime.strptime(create_date, '%Y/%m/%d').date()
    # except Exception as e:
    #     create_date = datetime.datetime.now()
    #
    # article_item['create_date'] = create_date
    # article_item['front_image_url'] = [front_image_url]
    # article_item['praise_nums'] = praise_nums
    # article_item['comment_nums'] = comment_nums
    # article_item['fav_nums'] = fav_nums
    # article_item['tags'] = tags
    # article_item['content'] = content
    # yield article_item

    # collect the values of each field through an ItemLoader instead
    article_item = JobBoleArticleItem()
    front_image_url = response.meta.get("front_image_url", "")
    # build the loader defined in items.py
    item_loader = ArticleItemLoader(item=JobBoleArticleItem(), response=response)
    # some fields come from CSS selectors on the page, others straight from the response
    item_loader.add_css("title", ".entry-header h1::text")
    item_loader.add_value("url", response.url)
    item_loader.add_value("url_object_id", get_md5(response.url))
    item_loader.add_css("create_date", "p.entry-meta-hide-on-mobile::text")
    item_loader.add_value("front_image_url", [front_image_url])
    item_loader.add_css("praise_nums", ".vote-post-up h10::text")
    item_loader.add_css("comment_nums", "a[href='#article-comment'] span::text")
    item_loader.add_css("fav_nums", ".bookmark-btn::text")
    item_loader.add_css("tags", "p.entry-meta-hide-on-mobile a::text")
    item_loader.add_css("content", "div.entry")
    item_loader.add_value('front_image_path', '')
    # fill the item with the collected values
    article_item = item_loader.load_item()
    # yield turns this method into a generator, so items keep flowing out as pages are parsed
    yield article_item
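Before (or while) writing parse_detail, the CSS selectors are easiest to verify interactively with scrapy shell against a single article page, for example:

scrapy shell http://blog.jobbole.com/110287/
>>> response.css('.entry-header h1::text').extract_first()
>>> response.css('p.entry-meta-hide-on-mobile::text').extract_first()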
Under the project, create a util package and add a common.py file to it:
import hashlib

def get_md5(url):
    # hash the URL into a fixed-length hex digest; url_object_id is stored as the table's primary key
    if isinstance(url, str):
        url = url.encode('utf8')
    m = hashlib.md5()
    m.update(url)
    return m.hexdigest()
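get_md5 always returns the same 32-character digest for the same URL, which is what makes url_object_id usable as a deduplicating primary key; a quick check (assuming the package is importable as ArticleSpider.util.common):

from ArticleSpider.util.common import get_md5

digest = get_md5('http://blog.jobbole.com/all-posts/')
print(digest)        # a 32-character hex string, stable for the same URL
print(len(digest))   # 32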
In pipelines.py we define how the data is stored (MySQLdb is used for the inserts here; SQLAlchemy could be tried in the same way):
import codecs, json
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors


class ArticleImagePipeline(ImagesPipeline):
    # take the local path of the downloaded image and save it on the item
    def item_completed(self, results, item, info):
        if "front_image_url" in item:
            for ok, value in results:
                image_file_path = value["path"]
            item["front_image_path"] = image_file_path
        return item


class JsonExportPipeline(object):
    # export the items to a JSON file
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding='utf8', ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


class MysqlTwistedPipeline(object):
    # asynchronous inserts: with adbapi the spider does not wait for each insert to commit,
    # so already-crawled data is never blocked by slow database writes, and no manual commit is needed
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparams)
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        insert_sql, params = item.get_insert_sql()
        print(insert_sql, params)
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        print(failure)


class MysqlPipeline(object):
    # synchronous writes to MySQL: the spider may block while waiting on the database
    def __init__(self):
        self.conn = MySQLdb.connect('192.168.0.106', 'root', 'root', 'article_spider', charset="utf8", use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            VALUES (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item["title"], item["url"], item["create_date"], item["fav_nums"]))
        self.conn.commit()
        return item
In settings.py, configure the pipelines and the database connection information:
# Obey robots.txt rules
# set to False so requests are not filtered out by robots.txt
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    # the smaller the number, the higher the priority
    # 'ArticleSpider.pipelines.JsonExportPipeline': 2,
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'ArticleSpider.pipelines.ArticleImagePipeline': 1,
    'ArticleSpider.pipelines.MysqlTwistedPipeline': 2,
}

# which item field holds the image URLs; create an images folder under the project for the downloads
# ("import os" must be added at the top of settings.py)
IMAGES_URLS_FIELD = 'front_image_url'
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, 'images')

# database on the virtual machine (its character set must be utf8, otherwise saving content will fail)
MYSQL_HOST = '192.168.17.54'
MYSQL_DBNAME = 'article'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'oldboy'
To test the spider, create a main.py file under the project root:
import os
import sys
from scrapy.cmdline import execute
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy","crawl","jobbole"])
Database table structure design.
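The table itself was designed separately; a sketch of a definition consistent with get_insert_sql above (column types and lengths are assumptions, adjust to your own data) could be run once with MySQLdb like this:

import MySQLdb

create_table_sql = """
CREATE TABLE jobbole_article (
    title            VARCHAR(200) NOT NULL,
    create_date      DATE,
    url              VARCHAR(300),
    url_object_id    VARCHAR(50) NOT NULL PRIMARY KEY,
    front_image_url  VARCHAR(300),
    front_image_path VARCHAR(200),
    praise_nums      INT NOT NULL DEFAULT 0,
    comment_nums     INT NOT NULL DEFAULT 0,
    fav_nums         INT NOT NULL DEFAULT 0,
    tags             VARCHAR(200),
    content          LONGTEXT NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect('192.168.17.54', 'root', 'oldboy', 'article', charset='utf8', use_unicode=True)
conn.cursor().execute(create_table_sql)
conn.close()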
After a successful test run, query the table
select title, tags from jobbole_article;
to see whether the data actually arrived.