# Identifier string; presumably the Scrapy spider's `name` attribute
# (the enclosing class header is not visible here -- TODO confirm).
name = "doubanzufang"
# Single listing-page URL to start from; the `start=50` offset is hard-coded.
start_urls = [
    "https://www.douban.com/group/tianhezufang/discussion?start=50",
]
def parse(self, response):
    """Yield one detail-page request for each topic link on the listing page.

    For every ``.olt>tr>td.title`` cell in ``response`` a new ``ZufangItem``
    is created, its ``title`` and ``url`` are read from the cell's
    ``a[href]`` element, and a ``scrapy.Request`` for that URL is yielded
    with the item attached as ``meta['item']`` and ``self.parse_detail`` as
    the callback.

    Cells without an ``a[href]`` are skipped. A missing ``title`` attribute
    is stored as ``None``.
    """
    for node in response.css(".olt>tr>td.title"):
        url = node.css("a[href]::attr(href)").extract_first()
        if url is None:
            # No link in this cell, so there is no detail page to request.
            continue

        # Build a fresh item for every row. ``meta`` only holds a reference
        # to the item, so a single instance shared by all requests would be
        # overwritten by later rows while earlier requests are still pending,
        # leaving every result with the last row's title and url.
        item = ZufangItem()
        item['title'] = node.css("a[href]::attr(title)").extract_first()
        item['url'] = url
        yield scrapy.Request(url, meta={'item': item}, callback=self.parse_detail)
def parse_detail(self, response):
    """Add the detail-page fields to the item carried in ``response.meta``.

    Reads the item stored under ``response.meta['item']``, sets its
    ``poster``, ``image`` and ``information`` keys from three XPath queries
    run against ``response``, and yields the item.

    Each key receives the first match of its query. A query that matches
    nothing stores ``None`` instead of raising ``IndexError``, so a page
    lacking one of the elements still produces an item with the remaining
    fields filled in.
    """
    item = response.meta['item']
    # (item key, XPath query) pairs, applied in order.
    field_queries = (
        ('poster', ".//*[@id='content']/div/div[1]/div[1]/div[2]/h3/span[1]/a/text()"),
        ('image', ".//*[@id='content']/div/div[1]/div[1]/div[1]/a/img/@src"),
        ('information', ".//*[@id='link-report']/div[1]/p[1]/text()"),
    )
    for key, query in field_queries:
        item[key] = response.xpath(query).extract_first()
    yield item
已確認 item['title']、item['url'] 都能獲取到不止一個 str。
問題:
1:為什麼我輸出得到的 item['title']、item['url'] 永遠是最後一個?yield 請求時,meta 不應該每次都把當前的 item 傳遞過去嗎?
2:我想實現獲取圖1的標題、作者以及圖2的內容,把它們保存在同一個 item 中返回,怎麼實現?
問題1:item 是類似字典的可變對象;在 for 循環外只創建了一個 item,每次循環都在修改同一個對象,而 meta 傳遞的只是它的引用。請求的回調是在之後才執行的,等 parse_detail 執行時該 item 已被後面的循環覆蓋,所以 url、title 永遠都是最後一次賦的值。
問題2:將 item = ZufangItem() 放在 for 循環內,讓每個請求都帶有各自獨立的 item,即可解決。
引用自:https://blog.csdn.net/q810935819/article/details/82082969