1. Sending POST requests with scrapy
By default, the scrapy framework sends GET requests. From the source:
The start_requests method of the scrapy.Spider parent class:
def start_requests(self):
    cls = self.__class__
    if method_is_overridden(cls, Spider, 'make_requests_from_url'):
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated; it "
            "won't be called in future Scrapy releases. Please "
            "override Spider.start_requests method instead (see %s.%s)." % (
                cls.__module__, cls.__name__
            ),
        )
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
    else:
        for url in self.start_urls:
            yield Request(url, dont_filter=True)
So, to send a POST request, we need to override the parent class's start_requests method in our own spider file.
See the code below:
Example: using a spider to send a POST request to Baidu Translate
import scrapy


class PosttestSpider(scrapy.Spider):
    name = 'postTest'
    # allowed_domains = ['www.qiubai.com']
    start_urls = ['http://www.qiubai.com/']

    def start_requests(self):
        url = 'https://fanyi.baidu.com/sug'
        data = {
            "kw": "hello"
        }
        # FormRequest issues a POST with the dict form-encoded in the body
        yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        print(response.text)
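FormRequest form-encodes the dict. For an API that expects a JSON body instead, a plain scrapy.Request with method='POST' also works; a minimal sketch (the spider name and the httpbin.org/post echo endpoint are placeholders, not part of the original example):

import json

import scrapy


class JsonPostSpider(scrapy.Spider):
    name = 'jsonPost'

    def start_requests(self):
        url = 'https://httpbin.org/post'  # placeholder echo endpoint
        payload = {"kw": "hello"}
        # Serialize the payload ourselves and set the Content-Type header
        yield scrapy.Request(url=url, method='POST',
                             body=json.dumps(payload),
                             headers={'Content-Type': 'application/json'},
                             callback=self.parse)

    def parse(self, response):
        print(response.text)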
2. Cookies
Requests sent by the scrapy framework keep and resend cookies by default.
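This behavior is controlled by built-in Scrapy settings. COOKIES_ENABLED is True by default, which is what makes the login below stick across requests, and COOKIES_DEBUG is handy when a login silently fails:

# settings.py
COOKIES_ENABLED = True   # default: the cookie middleware stores and resends cookies
COOKIES_DEBUG = True     # not default: logs every Cookie / Set-Cookie header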
Example: using a spider to log in to Douban and fetch the personal home page
# -*- coding: utf-8 -*-
import scrapy
from cookieDemo.utils.YMD import YDMHttp


class DoubanSpider(scrapy.Spider):
    name = 'douban'
    # allowed_domains = ['www.douban.com']
    start_urls = ['https://www.douban.com/']

    def parse(self, response):
        img_code_url = response.xpath("//*[@id='captcha_image']/@src").extract_first()
        captcha_id = response.xpath("//*[@id='lzform']/fieldset/div[3]/div/input[2]/@value").extract_first()
        print(img_code_url)
        print(captcha_id)
        # Pass captcha_id along to the next callback via meta
        yield scrapy.Request(url=img_code_url, callback=self.parse_img,
                             meta={"captcha_id": captcha_id})

    def parse_img(self, response):
        with open("code.png", "wb") as f:
            f.write(response.body)
        # Send the login request
        login_url = "https://www.douban.com/accounts/login"
        img_code = self.get_code("code.png")
        captcha_id = response.meta.get("captcha_id")  # read the value passed via meta
        data = {
            'redir': 'https://movie.douban.com/',
            "source": "movie",
            "form_email": "glh0220@qq.com",
            "form_password": "goulonghui371379.",
            "captcha-solution": img_code,
            "captcha-id": captcha_id,
            'login': '登录',
        }
        yield scrapy.FormRequest(url=login_url, formdata=data, callback=self.parse_login)

    def parse_login(self, response):
        people_url = "https://www.douban.com/people/186597252/"
        yield scrapy.Request(url=people_url, callback=self.get_people_page)

    def get_people_page(self, response):
        with open("people.html", "w", encoding="utf-8") as f:
            f.write(response.text)
        print("over...............................")

    def get_code(self, img_path):
        # YunDaMa (cloud captcha-solving service) username
        username = 'EksYiQiang'
        # Password
        password = 'xyq19990113'
        # Software ID, required developer parameter; found in the developer
        # console under "My Software"
        appid = 6041
        # Software key, required developer parameter; found in the developer
        # console under "My Software"
        appkey = 'c9f0265f96d9e97118aeb8eff629da64'
        # Image file
        filename = img_path
        # Captcha type, e.g. 1004 means 4 alphanumeric characters. Pricing
        # differs by type; see http://www.yundama.com/price.html for all types.
        codetype = 3000
        # Timeout in seconds
        timeout = 60
        if username == 'username':
            print('Please configure the parameters before testing')
            return
        else:
            # Initialize the client
            yundama = YDMHttp(username, password, appid, appkey)
            # Log in to YunDaMa
            uid = yundama.login()
            print('uid: %s' % uid)
            # Query the account balance
            balance = yundama.balance()
            print('balance: %s' % balance)
            # Start recognition: image path, captcha type ID, timeout (seconds)
            cid, result = yundama.decode(filename, codetype, timeout)
            print('cid: %s, result: %s' % (cid, result))
            return result
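One note on the meta trick above: in Scrapy 1.7 and later, cb_kwargs is the dedicated channel for passing values between callbacks, delivering them as plain keyword arguments. A minimal sketch of how the two callbacks above would change (same img_code_url and captcha_id values as in parse):

    def parse(self, response):
        img_code_url = response.xpath("//*[@id='captcha_image']/@src").extract_first()
        captcha_id = response.xpath("//*[@id='lzform']/fieldset/div[3]/div/input[2]/@value").extract_first()
        # cb_kwargs (Scrapy 1.7+) replaces the meta dict for callback arguments
        yield scrapy.Request(url=img_code_url, callback=self.parse_img,
                             cb_kwargs={"captcha_id": captcha_id})

    def parse_img(self, response, captcha_id):
        # captcha_id arrives as a normal parameter instead of response.meta
        with open("code.png", "wb") as f:
            f.write(response.body)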
3. Proxy operations
Example: searching "ip" on Baidu displays your current IP, which makes it a handy way to test the proxy setup
import scrapy


class IpdemoSpider(scrapy.Spider):
    name = 'IPdemo'
    allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.baidu.com/s?wd=ip']

    def parse(self, response):
        with open("ip.html", "w", encoding="utf-8") as f:
            f.write(response.text)
        print("over")
Downloader middleware (middlewares.py):
class MyProxyMiddleware(object):
    def process_request(self, request, spider):
        # request here is the request object intercepted by the middleware.
        # This hook sees every outgoing request and can modify it, e.g.
        # spoof its User-Agent or rewrite its URL.
        # Route the request through a proxy server:
        request.meta["proxy"] = "https://151.106.15.12:1080"
settings.py:
DOWNLOADER_MIDDLEWARES = {
    # 'ipDemo.middlewares.IpdemoDownloaderMiddleware': 543,
    'ipDemo.middlewares.MyProxyMiddleware': 543,
}
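A single hard-coded proxy is fragile. A common variant keeps a small pool and picks one per request, matching the proxy scheme to the request URL; a minimal sketch, where the pool and its addresses are placeholders of my own, not part of the original example:

import random


class RotatingProxyMiddleware(object):
    # Placeholder proxy pools; swap in live proxies of your own
    HTTP_POOL = ['http://151.106.15.12:1080']
    HTTPS_POOL = ['https://151.106.15.12:1080']

    def process_request(self, request, spider):
        # Match the proxy scheme to the request URL, then pick one at random
        pool = self.HTTPS_POOL if request.url.startswith('https') else self.HTTP_POOL
        request.meta["proxy"] = random.choice(pool)

Register it in DOWNLOADER_MIDDLEWARES exactly like MyProxyMiddleware above.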