Scrapy 驗證碼登錄程序,登錄頁面:
https://accounts.douban.com/login
# -*- coding: utf-8 -*-
"""Scrapy spider that logs in to Douban, with manual captcha entry.

The spider fetches the login page, downloads the captcha image (if the
page shows one) so the operator can read it, asks for the solution on the
console, submits the login form and prints a piece of the resulting page.
"""
import urllib

import scrapy

try:  # Python 3 keeps urlretrieve in urllib.request
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    urlretrieve = urllib.urlretrieve

try:  # Python 2: raw_input returns the typed text without evaluating it
    read_input = raw_input
except NameError:  # Python 3: input() already behaves that way
    read_input = input

# Where the captcha image is stored for the operator to look at.
CAPTCHA_PATH = "./captcha.jpg"


class MydoubanSpider(scrapy.Spider):
    """Log in to Douban through its web login form."""

    name = "mydouban_"

    def __init__(self, *args, **kwargs):
        """Set the start URL and the request headers used for the login POST.

        ``*args`` / ``**kwargs`` are forwarded to ``scrapy.Spider`` so that
        spider arguments given on the command line do not raise TypeError.
        """
        super(MydoubanSpider, self).__init__(*args, **kwargs)
        self.start_urls = ['https://accounts.douban.com/login']
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"}

    def parse(self, response):
        """Request the login page again, this time tagged with a cookiejar.

        Returns a one-element list holding the request whose response is
        handled by ``Login``.
        """
        return [scrapy.Request("https://accounts.douban.com/login",
                               callback=self.Login,
                               meta={"cookiejar": 1})]

    def Login(self, response):
        """Fill in and submit the login form found in ``response``.

        If the page contains a captcha image it is saved to CAPTCHA_PATH and
        the operator is asked to type its solution; otherwise the form is
        submitted without a captcha field.

        Returns a one-element list holding the form request whose response
        is handled by ``get_content``.
        """
        captcha = response.xpath("//img[@id='captcha_image']/@src").extract()

        data = {
            "form_email": "user",
            "form_password": "psaaword",
            # "redir": "https://www.douban.com/people/151968962/",  # URL to redirect to after login
        }

        if captcha:
            # Manual captcha entry: download the image, then block on the
            # console until the operator has typed the solution.
            # urljoin makes a relative image path work as well.
            urlretrieve(response.urljoin(captcha[0]), filename=CAPTCHA_PATH)
            # Only send the captcha field when there is a captcha; the
            # prompt names the file that was actually written.
            data["captcha-solution"] = read_input('查看captcha.jpg,有驗證碼請輸入:')

        return [scrapy.FormRequest.from_response(
            response,
            headers=self.headers,
            # Keep using the cookiejar assigned in parse().
            meta={"cookiejar": response.meta["cookiejar"]},
            formdata=data,
            callback=self.get_content,
        )]

    def get_content(self, response):
        """Print a completion message and text taken from the global nav bar."""
        print("完成登錄.........")
        test = response.xpath('//*[@id="db-global-nav"]/div/div[1]/ul/li[2]/a/span[1]//text()').extract()
        print(''.join(test))
豆瓣的登錄程序
github完整代碼鏈接地址: https://github.com/sea1234/myyangzhengma