直接上代碼,順便在這里記錄,時間20190906.
剛開始爬的是貝殼網,發現有反爬蟲,我也不會繞,就換了鏈家網;順便發現原來中文也可以做變量。
spider.py
# -*- coding: utf-8 -*-
import scrapy

from beike.items import BeikeItem


class BeikewSpider(scrapy.Spider):
    """Crawl second-hand housing listings from su.lianjia.com.

    Yields one :class:`BeikeItem` per listing, carrying the listing
    title (``name``), unit price (``單價`` — also a demo that Chinese
    identifiers are legal in Python 3), total price, neighbourhood
    (``xiaoqu``) and location (``local``).
    """
    name = 'beikew'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://su.lianjia.com/ershoufang/']
    page = 1  # current result page; start_urls already covers page 1

    def parse(self, response):
        """Extract every listing on this page, then queue the next page."""
        li_list = response.xpath('//*[@id="content"]/div[1]/ul/li')
        for li in li_list:
            item = BeikeItem()
            item['name'] = li.xpath('./div[1]/div[1]/a/text()').extract_first()
            # '單價' key kept as-is: it is part of the item schema in items.py
            item['單價'] = li.xpath('./div[1]/div[6]/div[2]/span/text()').extract_first()
            item['totalprice'] = li.xpath('./div[1]/div[6]/div[1]/span/text()').extract_first()
            item['xiaoqu'] = li.xpath('./div[1]/div[2]/div/a/text()').extract_first()
            item['local'] = li.xpath('./div[1]/div[3]/div/a/text()').extract_first()
            yield item

        # BUG FIX: the original condition was `self.page <= 50`, which
        # queued pg2..pg51 — 51 pages in total — although the comment
        # claimed 50 pages.  `< 50` crawls exactly 50 pages (1..50).
        if self.page < 50:  # page count is easy to adjust here
            self.page += 1
            yield scrapy.Request(
                url='https://su.lianjia.com/ershoufang/pg' + str(self.page),
                callback=self.parse,
            )
item.py
import scrapy


class BeikeItem(scrapy.Item):
    """One second-hand housing listing scraped from Lianjia."""

    name = scrapy.Field()        # listing title
    單價 = scrapy.Field()         # unit price (per square metre)
    totalprice = scrapy.Field()  # total asking price
    xiaoqu = scrapy.Field()      # residential compound / neighbourhood
    local = scrapy.Field()       # district / location
settings.py
# Scrapy settings for the beike project — only the values changed from
# the generated defaults are listed here.
BOT_NAME = 'beike'

SPIDER_MODULES = ['beike.spiders']
NEWSPIDER_MODULE = 'beike.spiders'

# BUG FIX: FEED_EXPORT_ENCODING was assigned twice in the original
# ('utf-8' immediately overwritten by 'gb18030'); only the last
# assignment ever took effect, so the dead 'utf-8' line is removed.
# gb18030 keeps CSV output readable in Excel on Chinese-locale Windows;
# switch back to 'utf-8' if that is not a concern.
FEED_EXPORT_ENCODING = 'gb18030'

# Be polite to the site: honour robots.txt and wait 1 s between requests.
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 1
只用到了3個py文件,其他的都是命令生成的,保持默認。
執行結果: