Using selenium + webdriver in Scrapy to fetch page source and crawl jianshu.com
Some of the data on Jianshu is rendered by JavaScript, so the response returned by a normal request does not contain it.
We therefore use selenium + webdriver to obtain the fully rendered page source.
1. Define the data to scrape
import scrapy


class JianshuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    author_img = scrapy.Field()
    time = scrapy.Field()
    read_count = scrapy.Field()
    subjects = scrapy.Field()
2. Use selenium + webdriver in the downloader middleware
from scrapy import signals
from scrapy.http.response.html import HtmlResponse
from selenium import webdriver
# explicit wait
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumDownloaderMiddleware:
    def __init__(self):
        # Load the Chrome driver. If chromedriver.exe sits in the same directory as python.exe,
        # executable_path can be omitted, i.e. self.driver = webdriver.Chrome() is enough.
        self.driver = webdriver.Chrome(executable_path=r"D:\python\chromedriver.exe")

    def process_request(self, request, spider):
        print("-" * 40)
        print(id(self))  # debug: confirm a single middleware instance is reused
        print("-" * 40)
        self.driver.get(request.url)
        try:
            while True:
                WebDriverWait(self.driver, 3).until(lambda x: x.find_element_by_class_name("H7E3vT"))
                # locate the "show more" button
                # show_more = self.driver.find_element_by_xpath("//div[@class='H7E3vT']")
                show_more = self.driver.find_element_by_class_name("H7E3vT")
                show_more.click()
        except Exception:
            print("No 'show more' button found")
        # get the page source
        html = self.driver.page_source
        # Use url=self.driver.current_url rather than url=request.url: a redirect may have
        # changed the URL.
        response = HtmlResponse(url=self.driver.current_url, body=html, request=request, encoding="utf-8")
        # Returning a response here hands it straight back to the Scrapy engine,
        # so the request is never passed on to the downloader.
        return response
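Note that the middleware above never quits the Chrome instance, so the browser stays open after the crawl finishes. One possible way to clean up is to hook Scrapy's spider_closed signal from the middleware; the lines below are a minimal sketch of such an addition (not part of the original code) that would go inside the same class:

class SeleniumDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # Create the middleware and subscribe to the spider_closed signal.
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # Quit Chrome when the spider finishes, so no browser process is left behind.
        self.driver.quit()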
3. Write the spider that parses the data
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from scrapylearn.jianshu.jianshu.items import JianshuItem


class JianshuspiderSpider(CrawlSpider):
    name = 'jianshuspider'
    allowed_domains = ['jianshu.com']
    start_urls = ['http://jianshu.com/']

    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        title = response.xpath("//h1[@class='_1RuRku']/text()").get()
        author = response.xpath("//span[@class='FxYr8x']/a/text()").get()
        author_img = response.xpath("//img[@class='_13D2Eh']/@src").get()
        time = response.xpath("//div[@class='s-dsoj']/time/text()").get()
        read_count = response.xpath("//div[@class='s-dsoj']/span[2]/text()").get().split()[1].replace(",", "")
        subjects = ",".join(response.xpath("//div[@class='_2Nttfz']/a/span/text()").getall())
        yield JianshuItem(title=title, author=author, author_img=author_img, time=time, read_count=read_count,
                          subjects=subjects)
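The Rule's allow pattern is intended to match Jianshu article URLs, whose paths are /p/ followed by a 12-character lowercase hex-style slug. A quick way to sanity-check the pattern outside Scrapy (the sample URLs below are made up for illustration):

import re

pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')

# A typical article link should match; the homepage should not.
print(bool(pattern.search("https://www.jianshu.com/p/0c1f2a3b4d5e")))  # True
print(bool(pattern.search("https://www.jianshu.com/")))                # False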
4. Save the data to MySQL
import pymysql


class JianshuPipeline:
    def __init__(self):
        self.conn = pymysql.connect(
            host='localhost',
            port=3307,
            user='root',
            password='1612480331',
            database='houses',
            charset='utf8'
        )

    def process_item(self, item, spider):
        print("=" * 40)
        print(id(self))  # debug: confirm a single pipeline instance is reused
        print("=" * 40)
        # create a cursor object
        cursor = self.conn.cursor()
        sql = "insert into jianshu values (%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql, (
            item["title"], item["author"], item["author_img"], item["time"], item["read_count"], item["subjects"]))
        self.conn.commit()
        cursor.close()
        return item

    # called when the spider closes
    def close_spider(self, spider):
        self.conn.close()
        print("Spider finished")
5. Configure settings.py
DOWNLOADER_MIDDLEWARES = {
    # 'jianshu.middlewares.JianshuDownloaderMiddleware': 543,
    'jianshu.middlewares.SeleniumDownloaderMiddleware': 1,
}

ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuPipeline': 300,
}

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
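With these settings in place, the crawl can be started from the project directory with scrapy crawl jianshuspider (the name defined in the spider above); the middleware renders each page in Chrome, the spider parses the article fields, and the pipeline writes them to MySQL.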