1. Logging in to a website with cookies
import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login'
    allowed_domains = ['xxx.com']
    start_urls = ['https://www.xxx.com/xx/']
    # Cookies copied from a logged-in browser session, as a dict of name/value pairs
    cookies = {}

    def start_requests(self):
        # Attach the saved cookies to every initial request so the site
        # treats the spider as already logged in
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=self.cookies, callback=self.parse)

    def parse(self, response):
        # Save the page so the login can be verified by inspecting the HTML
        with open("01login.html", "wb") as f:
            f.write(response.body)
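The cookies dict is easiest to obtain by copying the Cookie request header from the browser's developer tools after logging in by hand. A minimal sketch of converting that header string into the dict Scrapy expects (the header value in the usage comment is a made-up example):

def cookie_header_to_dict(header):
    # Turn an 'a=1; b=2' style Cookie header into {'a': '1', 'b': '2'}
    return dict(
        pair.strip().split("=", 1)
        for pair in header.split(";")
        if "=" in pair
    )

# Hypothetical usage:
# cookies = cookie_header_to_dict("sessionid=abc123; csrftoken=xyz789")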
2. Logging in with a POST request, parsing the login parameters from the page manually
import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login_code'
    allowed_domains = ['xxx.com']
    # 1. The login page
    start_urls = ['https://www.xxx.com/login/']

    def parse(self, response):
        # 2. Log in programmatically: extract the hidden form fields from the page
        login_url = 'https://www.xxx.com/login'
        formdata = {
            "username": "xxx",
            "pwd": "xxx",
            "formhash": response.xpath("//input[@id='formhash']/@value").extract_first(),
            "backurl": response.xpath("//input[@id='backurl']/@value").extract_first(),
        }
        # 3. Send the login request as a POST
        yield scrapy.FormRequest(login_url, formdata=formdata, callback=self.parse_login)

    def parse_login(self, response):
        # 4. Visit the target (members-only) page with the logged-in session
        member_url = "https://www.xxx.com/member"
        yield scrapy.Request(member_url, callback=self.parse_member)

    def parse_member(self, response):
        with open("02login.html", 'wb') as f:
            f.write(response.body)
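It is worth checking in parse_login that the login actually succeeded before requesting the member page; otherwise the spider just saves an error page. A minimal sketch, assuming the site echoes the logged-in username somewhere in the response body (adjust the check to the real site):

    def parse_login(self, response):
        # Assumption: a successful login echoes the username in the response body
        if "xxx" not in response.text:
            self.logger.error("login appears to have failed: %s", response.url)
            return
        yield scrapy.Request("https://www.xxx.com/member", callback=self.parse_member)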
3. Logging in with a POST request, extracting the login parameters from the page automatically
import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login_code2'
    allowed_domains = ['xxx.com']
    # 1. The login page
    start_urls = ['https://www.xxx.com/login/']

    def parse(self, response):
        # 2. Log in programmatically: from_response takes the target URL from the
        #    form's action attribute, so only the credentials need to be supplied
        formdata = {
            "username": "xxx",
            "pwd": "xxx",
        }
        # 3. Send the login request as a POST
        yield scrapy.FormRequest.from_response(
            response,
            formxpath="//*[@id='login_pc']",
            formdata=formdata,
            method="POST",  # force POST, overriding the form's default method
            callback=self.parse_login,
        )

    def parse_login(self, response):
        # 4. Visit the target (members-only) page with the logged-in session
        member_url = "https://www.xxx.com/member"
        yield scrapy.Request(member_url, callback=self.parse_member)

    def parse_member(self, response):
        with open("03login.html", 'wb') as f:
            f.write(response.body)
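FormRequest.from_response pre-fills every input inside the form matched by formxpath, including hidden fields such as formhash and backurl, which is why only the credentials appear in formdata here. One way to check which form and fields it picks up is an interactive session with scrapy shell (the URL and XPath are the placeholders used above):

# scrapy shell "https://www.xxx.com/login/"
>>> from scrapy.http import FormRequest
>>> req = FormRequest.from_response(response, formxpath="//*[@id='login_pc']")
>>> req.url    # taken from the form's action attribute
>>> req.body   # the url-encoded fields that would be posted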