scrapy遞歸爬取網頁
爬取網易新聞的五個分類下的標題和正文,結合selenium
# -*- coding: utf-8 -*-
import scrapy
from wangyiPro.items import WangyiproItem
from selenium import webdriver
class WangyiSpider(scrapy.Spider):
    """Crawl news titles and article text from five sections of news.163.com.

    The URLs of the section pages are recorded in ``urls`` and a shared
    Selenium browser is exposed as ``bro`` so that a downloader middleware
    can look both up on the spider and re-render those pages in the browser.
    """

    name = 'wangyi'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']

    # Positions, inside the home page navigation list, of the five sections
    # that are crawled.
    section_indexes = (3, 4, 6, 7, 8)

    # Created in the class body, so the browser is instantiated only once and
    # is shared by everything that uses this spider.
    bro = webdriver.Chrome(executable_path='chromedriver.exe')
    # Ends up holding the URLs of the five section pages.
    urls = []

    def parse(self, response):
        """Collect the section URLs from the home page and request each one.

        Positions that are missing from the navigation list, or entries that
        carry no link, are logged and skipped instead of aborting the crawl.
        """
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        for index in self.section_indexes:
            if index >= len(li_list):
                self.logger.warning('navigation entry %d not found on %s', index, response.url)
                continue
            new_url = li_list[index].xpath('./a/@href').extract_first()
            if not new_url:
                # scrapy.Request rejects a missing URL, so skip this entry.
                self.logger.warning('navigation entry %d has no link', index)
                continue
            self.urls.append(new_url)
            # Request the section page itself.
            yield scrapy.Request(url=new_url, callback=self.parse_news)

    def parse_news(self, response):
        """Parse one section page: extract each title and follow its article link."""
        for div in response.xpath('//div[@class="ndi_main"]/div'):
            title = div.xpath('./div/div[1]/h3/a/text()').extract_first()
            news_detail_url = div.xpath('./div/div[1]/h3/a/@href').extract_first()
            if not news_detail_url:
                # Not every child div necessarily wraps an article link.
                continue
            # The item travels with the request so the detail callback can
            # add the article text to it.
            item = WangyiproItem()
            item['title'] = title
            yield scrapy.Request(url=news_detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        """Extract the article text and emit the completed item."""
        item = response.meta['item']
        content = response.xpath('//div[@id="endText"]//text()').extract()
        item['content'] = ''.join(content)
        yield item

    def closed(self, spider):
        """Quit the shared browser once the crawl is over.

        NOTE(review): Scrapy passes the close reason as the single argument;
        the parameter name is kept as in the original for compatibility.
        """
        print('爬蟲整體結束!!!')
        self.bro.quit()
結合selenium,在middlewares.py文件中
from scrapy import signals
from scrapy.http import HtmlResponse
from time import sleep
class WangyiproDownloaderMiddleware(object):
    """Downloader middleware that re-renders selected pages in a browser.

    Responses whose request URL is listed in ``spider.urls`` are replaced by
    the page source obtained from the spider's Selenium browser
    (``spider.bro``) after scrolling to the bottom a few times. Every other
    response passes through untouched.
    """

    # Number of scroll-to-bottom passes and the pause (seconds) after each.
    scroll_rounds = 4
    scroll_pause = 1

    def process_request(self, request, spider):
        """Let every request continue through the downloader unchanged."""
        return None

    def process_response(self, request, response, spider):
        """Swap in a browser-rendered response for the spider's listed URLs.

        Spiders that define no ``urls`` or ``bro`` attribute are left alone,
        so enabling this middleware does not break them.
        """
        urls = getattr(spider, 'urls', None)
        bro = getattr(spider, 'bro', None)
        if not urls or bro is None or request.url not in urls:
            return response

        # Load the page in the browser object defined on the spider class.
        bro.get(request.url)
        for _ in range(self.scroll_rounds):
            bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
            sleep(self.scroll_pause)

        # Page source as it stands after the scrolling above.
        page_text = bro.page_source
        # Wrap it in a fresh response tied to the original request.
        return HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
scrapy請求傳參
1.爬取www.id97.com電影網,將一級頁面中的電影名稱,類型,評分以及二級頁面中的上映時間,導演,片長進行爬取。
爬蟲文件:
# -*- coding: utf-8 -*-
import scrapy
from moviePro.items import MovieproItem
class MovieSpider(scrapy.Spider):
    """Crawl movie names from the list pages and descriptions from detail pages.

    Demonstrates passing data between callbacks: the partially filled item is
    attached to the detail request through ``meta`` and completed in
    ``detail_parse``.
    """

    name = 'movie'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.4567tv.tv/frim/index1.html']
    # Generic URL template; it only applies to pages other than the first.
    url = 'https://www.4567tv.tv/frim/index1-%d.html'
    # Number of the next list page to request.
    page = 2

    def parse(self, response):
        """Parse one list page, then queue the next list page (up to page 5).

        Yields a detail-page request per movie entry. Entries without a link
        are skipped, because the detail URL cannot be built for them.
        """
        for li in response.xpath('/html/body/div[1]/div/div/div/div[2]/ul/li'):
            name = li.xpath('./div/a/@title').extract_first()
            href = li.xpath('./div/a/@href').extract_first()
            if not href:
                # extract_first() returned nothing; concatenating it with the
                # site prefix would raise TypeError.
                continue
            detail_url = 'https://www.4567tv.tv' + href
            item = MovieproItem()
            item['name'] = name
            # The dict given as ``meta`` is handed to the callback through
            # ``response.meta``.
            yield scrapy.Request(url=detail_url, callback=self.detail_parse, meta={'item': item})

        if self.page <= 5:
            new_url = self.url % self.page
            self.page += 1
            yield scrapy.Request(url=new_url, callback=self.parse)

    def detail_parse(self, response):
        """Parse a detail page and emit the item completed with its description."""
        # The item attached by ``parse`` arrives through response.meta.
        item = response.meta['item']
        desc = response.xpath('/html/body/div[1]/div/div/div/div[2]/p[5]/span[2]/text()').extract_first()
        item['desc'] = desc
        yield item
items文件:
import scrapy
class MovieproItem(scrapy.Item):
    """Item holding the two values collected for one movie."""

    # Movie name, filled in from the list page.
    name = scrapy.Field()
    # Movie description, filled in from the detail page.
    desc = scrapy.Field()
scrapy手動請求發送
# -*- coding: utf-8 -*-
import scrapy
class PostdemoSpider(scrapy.Spider):
    """Send the start URLs as POST requests instead of the default GETs."""

    name = 'postDemo'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://fanyi.baidu.com/sug']

    def start_requests(self):
        """Issue one form POST per start URL.

        This overrides the parent-class method, which would send a plain GET
        request for every element of ``start_urls``.
        """
        for target in self.start_urls:
            # FormRequest is what sends a POST request by hand.
            yield scrapy.FormRequest(
                url=target,
                callback=self.parse,
                formdata={'kw': 'cat'},
            )

    def parse(self, response):
        """Print the body text of the response."""
        print(response.text)
scrapy中間件設置UA及代理池
注意要在settings文件中將中間件打開
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
import random
#批量攔截所有的請求和響應
class MiddlewearproDownloaderMiddleware(object):
    """Downloader middleware providing a User-Agent pool and a proxy pool.

    Every outgoing request gets a random User-Agent. A request whose download
    raised an exception is re-sent through a randomly chosen proxy, at most
    ``max_proxy_retries`` times.
    """

    # User-Agent pool.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    # Proxy pools ("host:port"), one per request scheme.
    PROXY_http = [
        '153.180.102.104:80',
        '195.208.131.189:56055',
    ]
    PROXY_https = [
        '120.83.49.90:9000',
        '95.189.112.214:35508',
    ]
    # Upper bound on how often one request is re-sent through a proxy, so a
    # permanently failing URL cannot be retried forever.
    max_proxy_retries = 3

    def process_request(self, request, spider):
        """Disguise the request with a randomly chosen User-Agent.

        ``request`` is the intercepted request and ``spider`` the running
        spider instance. Returns None so processing continues normally.
        """
        print('this is process_request!!!')
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        return None

    def process_response(self, request, response, spider):
        """Pass every response through unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Re-send a failed request through a random proxy.

        Returns the amended request so it is scheduled again, or None once
        the request has already been retried ``max_proxy_retries`` times.
        """
        print('this is process_exception!!!!')
        retries = request.meta.get('proxy_retry_times', 0)
        if retries >= self.max_proxy_retries:
            return None
        request.meta['proxy_retry_times'] = retries + 1

        # Pick the pool by request scheme; anything that is not plain http
        # uses the https pool.
        if request.url.split(':')[0] == 'http':
            proxy_scheme, pool = 'http', self.PROXY_http
        else:
            proxy_scheme, pool = 'https', self.PROXY_https
        # The proxy meta value is given as a full URL, scheme included.
        request.meta['proxy'] = '%s://%s' % (proxy_scheme, random.choice(pool))
        # The request was already seen by the scheduler; without this flag
        # the duplicate filter would drop the retry.
        request.dont_filter = True
        # Hand the amended request back so it is sent again.
        return request