閑來無事,做了一個小爬蟲項目。
爬蟲主程序:
import scrapy
from ..items import MeiziItem


class MztSpider(scrapy.Spider):
    """Crawl meizitu.com: front-page tags -> paginated tag pages -> albums -> photos."""
    name = 'mzt'
    allowed_domains = ['meizitu.com']
    start_urls = ['http://meizitu.com/']

    def parse(self, response):
        """Collect every tag link on the start page and follow each one."""
        tags = response.xpath(".//*[@class='tags']/span/a")
        for tag in tags:
            item = MeiziItem()
            item['tag_href'] = tag.xpath(".//@href").extract()[0]
            item['tag_name'] = tag.xpath(".//@title").extract()[0]
            yield scrapy.Request(url=item['tag_href'],
                                 meta={'item': item},
                                 callback=self.parse_page)

    def parse_page(self, response):
        """Work out how many listing pages this tag has and request each one."""
        item = response.meta['item']
        # Pagination buttons at the bottom of the tag page.
        page_lists = response.xpath(".//*[@id='wp_page_numbers']/ul/li")
        # Button captions decide how many of them are real page numbers:
        # some pagers start with a "首頁" (home) button, others with "1".
        page_list = page_lists.xpath('.//text()')
        if len(page_lists) > 0:
            if page_list[0].extract() == '首頁':
                page_num = len(page_lists) - 3
            else:
                page_num = len(page_lists) - 2
        else:
            page_num = 1

        # Build the URL prefix that a page number gets appended to.
        if '_' in item['tag_href']:
            index = item['tag_href'][::-1].index('_')
            href_pre = item['tag_href'][:-index]
        else:
            href_pre = item['tag_href'].split('.html')[0]
            if page_num > 1:
                href_pre += '_'

        for i in range(1, page_num + 1):
            # copy() so every request carries its own item: the original
            # reused one mutable item across concurrent requests, letting
            # later iterations clobber fields of earlier ones.
            page_item = item.copy()
            if page_num == 1:
                href = href_pre + '.html'
            else:
                href = href_pre + str(i) + '.html'
            page_item['page_list'] = href
            yield scrapy.Request(url=page_item['page_list'],
                                 meta={'item': page_item},
                                 callback=self.parse_album)

    def parse_album(self, response):
        """Extract every album on a listing page and follow its link."""
        albums = response.xpath(".//*[@class='pic']")
        for album in albums:
            album_item = response.meta['item'].copy()  # per-album copy, same reason as above
            album_item['album_href'] = album.xpath(".//a/@href").extract()[0]
            album_item['album_name'] = album.xpath(".//a/img/@alt").extract()[0]
            yield scrapy.Request(url=album_item['album_href'],
                                 meta={'item': album_item},
                                 callback=self.parse_img)

    def parse_img(self, response):
        """Yield one completed item per photo in the album."""
        img_list = response.xpath(".//*/p/img")
        for idx, img in enumerate(img_list, start=1):
            img_item = response.meta['item'].copy()  # per-photo copy
            img_title = img.xpath(".//@alt").extract()[0]
            if img_title == '':
                # Fall back to "<album>_<position>" when alt text is empty.
                # The original looped over range(1, len(img_list + 1)) — a
                # TypeError (list + int) — and always kept the last index.
                img_title = img_item['album_name'] + '_' + str(idx)
            img_item['img_title'] = img_title
            img_item['img_urls'] = img.xpath(".//@src").extract()
            img_item['img_src'] = img_item['img_urls'][0]
            yield img_item
items 的設置:
import scrapy


class MeiziItem(scrapy.Item):
    """Item carrying one photo plus the tag/page/album path that led to it."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Tag name (title text of the tag link on the front page)
    tag_name = scrapy.Field()
    # Tag URL
    tag_href = scrapy.Field()
    # Per-page URL inside a tag (tag URL with a page number appended)
    page_list = scrapy.Field()
    # Album name
    album_name = scrapy.Field()
    # Album URL
    album_href = scrapy.Field()
    # Photo title
    img_title = scrapy.Field()
    # Photo URL (first entry of img_urls)
    img_src = scrapy.Field()
    # All photo URLs — consumed by the ImagesPipeline for downloading
    img_urls = scrapy.Field()
輸出管道:
print('正在爬取...')
print('老濕機,請耐心等待喲...')


class MeiziPipeline(object):
    """Console pipeline: prints every scraped field of an item, then passes it on."""

    # (label, item key) pairs, printed in this fixed order.
    _FIELDS = (
        ('標簽名稱:', 'tag_name'),
        ('標簽鏈接:', 'tag_href'),
        ('頁碼:', 'page_list'),
        ('圖片專輯名稱:', 'album_name'),
        ('圖片專輯鏈接:', 'album_href'),
        ('照片標題:', 'img_title'),
        ('照片鏈接:', 'img_src'),
        ('照片鏈接集合:', 'img_urls'),
    )

    def process_item(self, item, spider):
        """Print each field with its label and return the item unchanged."""
        for label, key in self._FIELDS:
            print(label, item[key])
        print('----------------')
        return item
保存到本地的管道:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


class MztImagesPipeline(ImagesPipeline):
    """Download pipeline: fetches every URL in item['img_urls'] to disk."""

    def get_media_requests(self, item, info):
        """One download request per collected image URL."""
        return (scrapy.Request(url) for url in item['img_urls'])

    def item_completed(self, results, item, info):
        """Drop the item when none of its images downloaded successfully."""
        succeeded = [data['path'] for ok, data in results if ok]
        if not succeeded:
            raise DropItem("該Item沒有圖片")
        return item
settings 的設置,往 settings.py 加入:
BOT_NAME = 'meizi'

SPIDER_MODULES = ['meizi.spiders']
NEWSPIDER_MODULE = 'meizi.spiders'

# ImagesPipeline settings
# NOTE: the original r'G:\\mzt' combined a raw string with a doubled
# backslash, producing the path G:\\mzt; a raw string needs only one.
IMAGES_STORE = r'G:\mzt'    # directory where downloaded images are stored
IMAGES_EXPIRES = 90         # skip re-downloading images newer than 90 days
IMAGES_MIN_HEIGHT = 100     # discard images shorter than 100 px
IMAGES_MIN_WIDTH = 100      # discard images narrower than 100 px
爬取的最終結果
本來想把這些圖片分門別類地保存,然而不太會,所有的圖片全保存在了一個文件夾下面。