Environment and tools: Python 2.7, Scrapy
Target site: http://www.XXXX.com/tag/333.html — crawl all of the bunny-girl images; the "recommended" links at the bottom of each page need to be filtered out.
Workflow: analyse the site, enable ITEM_PIPELINES so the images are downloaded and recorded in the database, enable the thumbnail settings, then move the images into per-tag folders.
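For reference, the project layout assumed by the code below looks roughly like this (the directory tree is an assumption inferred from the MyPicSpider module paths; the spider file name pic_spider.py is hypothetical):

MyPicSpider/
    scrapy.cfg
    MyPicSpider/
        settings.py
        items.py
        pipelines.py
        spiders/
            pic_spider.py   # contains the PicSpider class shown under spider.py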
-----settings.py
## ignore robots.txt
ROBOTSTXT_OBEY = False
## default download delay
DOWNLOAD_DELAY = 3
## disable cookies
COOKIES_ENABLED = False
## enable the item pipelines
ITEM_PIPELINES = {
    'MyPicSpider.pipelines.MyImagesPipeline': 300,
    'MyPicSpider.pipelines.MysqlPipeline': 400
}
## image download path
IMAGES_STORE = 'G:\\www\\scrapy_rpo\\pic\\meinv\\rabbit\\'
## drop images smaller than this
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
## thumbnail sizes
IMAGES_THUMBS = {
    'big': (270, 270),
}
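With these settings, Scrapy's ImagesPipeline first saves every downloaded file under IMAGES_STORE using the SHA1 hash of the image url as the file name, and each thumbnail defined in IMAGES_THUMBS goes into a thumbs subfolder named after its key. The custom pipeline below then moves both copies into a per-tag folder. Roughly (the hash is illustrative):

G:\www\scrapy_rpo\pic\meinv\rabbit\
    full\80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg
    thumbs\big\80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg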
------items.py
import scrapy


class PicspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    tag = scrapy.Field()
    image_urls = scrapy.Field()
    images_data = scrapy.Field()
    img_path = scrapy.Field()
    img_big_path = scrapy.Field()
    file_path = scrapy.Field()
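As a quick sketch of how these fields flow through the project (values are illustrative): the spider fills tag, file_path and image_urls; MyImagesPipeline reads image_urls and writes img_path back onto the item; MysqlPipeline then reads tag and img_path.

from MyPicSpider.items import PicspiderItem
from scrapy.utils.project import get_project_settings

item = PicspiderItem()
item['tag'] = u'rabbit-girl'                                   # illustrative tag taken from the page title
item['image_urls'] = ['http://www.xxxx.com/ent/a/1/1.jpg']     # illustrative urls for ImagesPipeline to download
item['file_path'] = get_project_settings().get('IMAGES_STORE') + item['tag']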
----pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import datetime
import shutil

import scrapy
import pymysql
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
# import the project settings
from scrapy.utils.project import get_project_settings

# conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test', charset="UTF8")
# cursor = conn.cursor()


class MyImagesPipeline(ImagesPipeline):
    # read the image download path from the project settings
    img_store = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        '''one request per image url'''
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x["path"] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        file_path = item['file_path']  # per-tag directory the images are moved into
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        print image_paths  # e.g. full/80dd7db02e4da4e63f05d9d49c1092fc7fdcb43e.jpg
        pic_list = []
        for v in image_paths:
            pic_name = v.replace('full/', '')
            pic_small_name = pic_name.replace('.jpg', '') + '_s.jpg'
            pic_big_name = pic_name.replace('.jpg', '') + '_b.jpg'
            # move the original image from the default download path to the tag directory
            shutil.move(self.img_store + 'full\\' + pic_name, file_path + "\\" + pic_name)
            # move the thumbnails (the small thumbnail is currently disabled)
            # shutil.move(self.img_store + 'thumbs\\small\\' + pic_name, file_path + "\\" + pic_small_name)
            shutil.move(self.img_store + 'thumbs\\big\\' + pic_name, file_path + "\\" + pic_big_name)
            # (original path, big-thumbnail path) as stored in the database
            img_path_dict = ('picture/meinv/rabbit/' + item['tag'] + "/" + pic_name,
                             'picture/meinv/rabbit/' + item['tag'] + "/" + pic_big_name)
            pic_list.append(img_path_dict)
        item["img_path"] = pic_list
        return item


# write the image records to MySQL
class MysqlPipeline(object):
    def __init__(self):
        self.conn = pymysql.Connection(host="localhost", user="root", passwd="root",
                                       db='test1', charset="UTF8")
        # create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # assemble the rows to insert
        rows = []
        datetime_now = datetime.datetime.now()
        datetime_str = '{0}-{1}-{2} {3}:{4}:{5}'.format(
            datetime_now.year, datetime_now.month, datetime_now.day,
            datetime_now.hour, datetime_now.minute, datetime_now.second)
        # add the tag as a new type if it does not exist yet
        result = self.cursor.execute(
            u"select id from network_type where RESOURCETYPE ='p' and TYPENAME='{0}'".format(item['tag']))
        if result == 0:
            self.cursor.execute(
                "insert into network_type(PID,RESOURCETYPE,TYPENAME)values(%s,%s,%s) ",
                (2415, 'p', item['tag']))
            typeid = self.cursor.lastrowid
            self.conn.commit()
        else:
            # tag_id = self.cursor.fetchall()
            # typeid = tag_id[0][0]
            return False
        types = ',' + str(typeid) + ','
        # continue numbering from the current highest id
        self.cursor.execute('select id from network_picture order by cast(id as SIGNED INTEGER) desc limit 0,1')
        old_id = self.cursor.fetchone()
        if old_id:
            id_n = str(int(old_id[0]) + 1)
        else:
            id_n = str(1)
        for v in item['img_path']:
            path1 = v[0]
            path2 = v[1]
            self.cursor.execute(
                u'select id from network_picture where FILEPATH="{0}" and fileScalPath="{1}"'.format(path1, path2))
            data = self.cursor.fetchone()
            if data:
                print u'this record already exists'
            else:
                rows.append((str(id_n), '', path1, '', types, 0, datetime_str, path2))
                id_n = int(id_n) + 1
        print rows
        self.cursor.executemany(
            "insert into network_picture(ID,NAME,FILEPATH,FILESIZE,TYPES,STATUS,DATETIME,fileScalPath)"
            "values(%s,%s,%s,%s,%s,%s,%s,%s)", rows)
        self.conn.commit()
        return item
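One small hardening note: the duplicate check above interpolates the paths straight into the SQL string. pymysql's execute() also accepts a parameter tuple, so an equivalent, injection-safe version of that check could look like this sketch (same table and columns as above):

self.cursor.execute(
    'select id from network_picture where FILEPATH=%s and fileScalPath=%s',
    (path1, path2))
data = self.cursor.fetchone()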
----spider.py
# -*- coding: utf-8 -*-
import scrapy, os, urllib2
from scrapy.linkextractors import LinkExtractor  # LinkExtractor filters and follows links; it has many more features worth reading up on
from scrapy.spiders import CrawlSpider, Rule     # CrawlSpider template and the Rule class
from MyPicSpider.items import PicspiderItem      # the item defined in items.py
# import the project settings
from scrapy.utils.project import get_project_settings
from bs4 import BeautifulSoup
import time, pymysql

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
conn = pymysql.Connection(host="localhost", user="root", passwd="root", db='test1', charset="UTF8")
# create a cursor
cursor = conn.cursor()


class PicSpider(CrawlSpider):  # extends the CrawlSpider template; a plain spider would extend Spider
    name = 'pic'  # spider name, run with: $ scrapy crawl pic
    allowed_domains = ['www.xxxx.com']  # crawl scope
    start_urls = ['http://www.xxxx.com/tag/333.html']  # initial url
    # With follow=True the spider keeps following matching pages: it scans the start page for
    # detail-page urls and the next pagination page, follows each pagination page in turn,
    # and hands every detail page to the callback for scraping.
    rules = (
        # crawl the index pages and follow the pagination links found on them
        Rule(LinkExtractor(allow=r'/tag/[0-9]*_[0-9]*.html'), follow=True),
        # crawl the detail pages and pass the downloaded response to parse_item
        Rule(LinkExtractor(allow=r'http://www.xxxx.com/ent/[a-z]*/[0-9]*/[0-9]*.html'),
             callback='parse_item', follow=False),
    )

    # callback for the detail pages
    def parse_item(self, response):
        start_url = response.url
        item = PicspiderItem()
        tag_name = response.xpath('//h1[@class="articleV4Tit"]/text()').extract()[0]
        # cursor.execute(u'select id from network_type where PID=258 AND TYPENAME="{0}" limit 0,1'.format(tag_name))
        # old_id = cursor.fetchone()
        # if old_id:
        #     exit()
        name = u'兔'
        if name not in tag_name:
            print u'---- this belongs to another category ----'
            return False
        li_list = response.xpath('//ul[@class="articleV4Page l"]/li').extract()
        srcs = []
        for v in range(1, (len(li_list) - 3)):
            if v == 1:
                url_s = start_url
            else:
                url_s = start_url.replace('.html', '') + '_' + str(v) + '.html'
            try:
                request = urllib2.Request(url_s, headers=headers)
                html = urllib2.urlopen(request, timeout=200).read()
            except urllib2.URLError, err:
                print err, 'bad url: ' + url_s
            obj = BeautifulSoup(html, 'html.parser')
            try:
                pic_url = obj.find('center').find('img')['src']
            except:
                print u'---- first extraction method failed ----'
                try:
                    pic_url = obj.find('div', {'id': 'picBody'}).find('img')['src']
                except:
                    print u'---- second extraction method failed ----'
                    try:
                        pic_url = obj.find('p', attrs={"style": "text-align: center"}).find('img')['src']
                    except:
                        print u'---- third extraction method failed ----'
            srcs.append(pic_url)
        item['tag'] = tag_name
        item['file_path'] = '%s%s' % (get_project_settings().get('IMAGES_STORE'), tag_name)
        item['image_urls'] = srcs
        return item
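When the xpath selectors stop matching (page layouts like this change often), it is convenient to test them interactively with scrapy shell before re-running the crawl; a quick sketch, with the detail-page url left as a placeholder for one matched by the second Rule:

$ scrapy shell "<detail page url matched by the second Rule>"
>>> response.xpath('//h1[@class="articleV4Tit"]/text()').extract()
>>> response.xpath('//ul[@class="articleV4Page l"]/li').extract()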
------I still don't fully understand how Scrapy handles request deduplication; if anyone knows, please let me know. Thanks.