- 刚开始学习selenium动态网页的爬虫,就想着自己做个实战练习练习,然后就准备爬取马蜂窝旅游网重庆的全部旅游景点,本来以为不是特别难,没想到中间还是出现了很多问题,包括重写下载中间件,加cookies,selenium动态刷新下一页网页后提取到的数据仍然是前一页的数据,提取元素的方法选择,子页面跳转,selenium动作链等,折磨了很久,但是还是没有放弃,花了3天做完这个项目,下面记录一下遇到的问题和解决方法。
- 动态网页加载问题
- 首先分清楚网页的数据是否是动态加载的:在浏览器中右键查看网页源代码,按Ctrl+F搜索想要抓取的数据是否在网页源代码中,如果不在,则该网页是动态加载的,需要使用selenium进行爬取,并在spider中编写构造函数__init__来初始化webdriver对象
-
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By  # locator strategies used in the examples below

# Configure Chrome so the target site is less likely to detect automation.
option = ChromeOptions()
option.add_argument('--ignore-certificate-errors')
option.add_argument('--ignore-ssl-errors')
option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])

# Replace the placeholder with the path to your chromedriver.exe (copy the
# driver into the project, right-click it and choose "Copy Path").
bro = webdriver.Chrome(executable_path=r'chromedriver.exe路径', options=option)

# `url` is the page to open; define it before running this snippet.
bro.get(url)

# Locate an element:  bro.find_element(By.<STRATEGY>, value)
#   e.g.              bro.find_element(By.LINK_TEXT, 'text')
# Act on an element:  bro.find_element(By.<STRATEGY>, value).click()

# Close the browser.
bro.quit()
def __init__(self, *args, **kwargs):
    """Set up the request headers, the Selenium browser and the shared item.

    Extra positional/keyword arguments are forwarded to the base class so the
    constructor keeps working when it is given arguments (for example spider
    arguments).  Calling it with no arguments behaves as before.

    Attributes set:
        headers: User-Agent and Cookie headers sent with the detail-page requests.
        bro: the Chrome WebDriver used to render the dynamically loaded pages.
        item: a VocationItem instance reused while parsing.
    """
    super().__init__(*args, **kwargs)

    # The cookie keeps the account from being blocked.  For it to take effect,
    # COOKIES_ENABLED must be True in settings and the downloader middleware's
    # process_request must attach the cookie as well.
    # NOTE(review): this is a session cookie copied from a browser; it expires
    # and has to be refreshed by hand.
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36',
        'Cookie': (
            'SECKEY_ABVK=hUAgUzjagDt7tRAoeBixuHARq3o5gtYSbMcKcAkM2Ho%3D; '
            'BMAP_SECKEY=adS1Ht6D0s1kWECRhDaf4vSf6OhvVYklxDSAiZ_3W0fIGZJ8rWr9TbzVPPYVaIW5ObgotD3EzPQrdL2XdiXldciYniNJWqvUHZ8Wk_ri0IuuKOY9h0aB4i09OHC30d-kbWCSrrEQe40grf1Gj9izw6SGB5cmzIjIenxaZzpq8lmEDDU5Kvl7gAMUQauc7TUC; '
            'mfw_uuid=62357780-7ac5-d4dc-9a8e-5a02aa298353; '
            '_r=baidu; '
            '_rp=a%3A2%3A%7Bs%3A1%3A%22p%22%3Bs%3A18%3A%22www.baidu.com%2Flink%22%3Bs%3A1%3A%22t%22%3Bi%3A1647671168%3B%7D; '
            'oad_n=a%3A5%3A%7Bs%3A5%3A%22refer%22%3Bs%3A21%3A%22https%3A%2F%2Fwww.baidu.com%22%3Bs%3A2%3A%22hp%22%3Bs%3A13%3A%22www.baidu.com%22%3Bs%3A3%3A%22oid%22%3Bi%3A1026%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222022-03-19+14%3A26%3A08%22%3B%7D; '
            '__jsluid_h=01784e2b1c452421aa25034fbbde3ed9; '
            '__mfwothchid=referrer%7Cwww.baidu.com; '
            '__omc_chl=; '
            '__mfwc=referrer%7Cwww.baidu.com; '
            'uva=s%3A307%3A%22a%3A4%3A%7Bs%3A13%3A%22host_pre_time%22%3Bs%3A10%3A%222022-03-19%22%3Bs%3A2%3A%22lt%22%3Bi%3A1647671169%3Bs%3A10%3A%22last_refer%22%3Bs%3A180%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKZtwUSmw3x4cyZcTJdfrzYa8Pr4pEgDbvJU1Pv7yOxRPpeRIeoKj_rydoZuVdCf0_IXBx40vQyB-xiuXsf_AyQ1y3t3mO4En4c5USvOZ_ya%26wd%3D%26eqid%3Df0f5303d000ee843000000036235777b%22%3Bs%3A5%3A%22rhost%22%3Bs%3A13%3A%22www.baidu.com%22%3B%7D%22%3B; '
            '__mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1647671169%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A13%3A%22www.baidu.com%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; '
            '__mfwuuid=62357780-7ac5-d4dc-9a8e-5a02aa298353; '
            'UM_distinctid=17fa0dad264324-08a77f4b1aeaeb-9771539-e1000-17fa0dad26543a; '
            '__omc_r=; '
            'PHPSESSID=cdbbvncvrd5rqepai636p6pos7; '
            'Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1647743622,1647851451,1647908302,1648001811; '
            'bottom_ad_status=0; '
            '__jsl_clearance=1648007349.914|0|yoFqmnWY6O7Msv1j5KemUKE3POE%3D; '
            '__mfwa=1647671168836.14813.17.1648001810109.1648007353609; '
            'CNZZDATA30065558=cnzz_eid%3D2058254581-1647670157-null%26ntime%3D1648003704; '
            '__mfwb=b20cf490195f.2.direct; '
            '__mfwlv=1648009103; '
            '__mfwvn=13; '
            '__mfwlt=1648009103; '
            'Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1648009104; '
            'ariaDefaultTheme=undefined'
        ),
    }

    # Chrome options that hide the usual automation markers.
    option = ChromeOptions()
    option.add_argument('--ignore-certificate-errors')
    option.add_argument('--ignore-ssl-errors')
    option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])

    # NOTE(review): `executable_path` is deprecated in Selenium 4 — confirm the
    # installed Selenium version still accepts it.
    self.bro = webdriver.Chrome(executable_path=r'E:/爬虫/vocation/vocation/spiders/chromedriver.exe', options=option)
    self.item = VocationItem()
- 子页面跳转
- 本实例进行旅游景点名称,详细介绍,电话,游览用时,门票,开放时间,交通字段的爬取,由于除了第一个字段以外其他字段都需要点击主页面的标题进行跳转到子页面才能获取其他信息,所以需要编写回调函数,而主页面数据是动态加载的但是子页面的数据不是动态加载的,所以需要重写下载中间件对不同请求的response进行处理
- 在主页面获取景点的title和详情页href,在获取到每一个旅游景点的详情页的href后就调用回调函数进行详情页信息的爬取,注意需要设置等待时间,否则会因为网页数据还未加载出来就操作而出错。
- 关于元素定位不使用xpath,因为在点击每一页之后页面都会刷新,有些标签会随之而改变,会报selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element等错误,就算不报错也会导致页面更新后数据不更新,爬取到的还是第一页的数据,所以可以定位CLASS_NAME来获取WebElement对象,根据对象的属性来进行操作
-
# WebElement attributes

# Import the module.
from selenium import webdriver

# Create a browser object.
driver = webdriver.Firefox()
# URL to visit.
url = "https://www.douban.com/"
# Open the URL in the browser.
driver.get(url)

# Locate Douban's search box by its name attribute.
# (find_element_by_name was removed in newer Selenium releases; the
# find_element(strategy, value) form is used instead.)
elem = driver.find_element('name', "q")
# Print the WebElement object itself.
print(elem)
# Print the element's tag name.
print(elem.tag_name)
# Print the element's `parent` attribute.
print(elem.parent)
# Print the value of one of the element's attributes.
print(elem.get_attribute('type'))

# Locate the "Douban 7.0" text via XPath.
elem_1 = driver.find_element('xpath','//p[@class="app-title"]')
# Print the element's text content.
print(elem_1.text)

# Quit the browser.
driver.quit()

# Summary:
#   tag_name              -> the tag name of the element.
#   parent                -> per the Selenium API docs this is the WebDriver
#                            instance the element was found from, not the
#                            parent DOM node.
#   get_attribute('type') -> the value of the named attribute; the name in the
#                            parentheses can be changed.
#   text                  -> the text content of the element.
-
# WebElement operations
import time

from selenium import webdriver


def test_start_selenium():
    """Demonstrate send_keys, click, clear and submit on Baidu's search box."""
    # Start the browser driver and open Baidu.
    driver = webdriver.Firefox()
    try:
        url = "https://www.baidu.com/"
        driver.get(url)
        # find_element_by_id was removed in newer Selenium releases; use the
        # find_element(strategy, value) form instead.
        input_el = driver.find_element('id', "kw")
        time.sleep(3)
        # Type the query.
        input_el.send_keys("老友记")
        # Click the "Baidu search" button.
        input_e2 = driver.find_element('xpath','//input[@type="submit"]')
        input_e2.click()
        time.sleep(3)
        # Clear what was typed into the search box.
        input_el.clear()
        time.sleep(3)
        input_el.send_keys("西游记")
        time.sleep(3)
        # Submit the form.
        input_el.submit()
    finally:
        # Always close the browser, even when a step above fails.
        driver.quit()


test_start_selenium()

# Summary:
#   send_keys(""): type text into the element
#   click():       click the element
#   clear():       clear the element's content
#   submit():      submit the form
- 编写回调函数
- 调用回调函数时出现521错误 (<521.......>HTTP status code is not handled or not allowed),是因为发送request请求时没有加上cookies或cookies已过期,需要重新获取cookie。获取网页cookie的方法:在目标网页打开浏览器开发者工具(F12或Fn+F12),选择Network——>选择Doc——>在请求头中找到Cookie,复制后粘贴到__init__构造函数的headers中即可。
-
def parse(self, response):
    """Walk the listing pages in the Selenium browser and request each detail page.

    For up to 20 listing pages, reads the text of the 'scenic-list' element,
    treats each of its first 15 non-blank lines as an attraction title, looks
    up the link with that text and yields a request for its href.  The title
    travels to ``detail_parse`` through the request meta.

    Args:
        response: the response for the start URL (the listing itself is read
            from ``self.bro``, not from this object).

    Yields:
        scrapy.Request objects whose callback is ``self.detail_parse``.
    """
    for _ in range(20):
        listing_text = str(self.bro.find_element(By.CLASS_NAME, 'scenic-list').text)
        # Skip blank lines and cap at 15 entries; slicing (instead of indexing
        # 0..14) avoids an IndexError on pages that list fewer attractions.
        titles = [line for line in listing_text.split('\n') if line.strip()][:15]
        for title in titles:
            self.item['title'] = title
            detail_url = self.bro.find_element(By.LINK_TEXT, title).get_attribute('href')
            # The item is passed along as the request meta so the callback can
            # read the title.
            yield scrapy.Request(str(detail_url), callback=self.detail_parse, meta=self.item, headers=self.headers)

        # find_elements returns an empty list instead of raising, so the loop
        # ends cleanly when there is no "next page" button.
        next_buttons = self.bro.find_elements(By.CLASS_NAME, 'pg-next')
        if not next_buttons:
            break
        next_buttons[0].click()
        # Give the next page time to load before reading it.
        sleep(2)
- 重写下载中间件
- 本例中主页面数据为动态加载,需要用selenium对该response进行处理,将数据加载到网页中再返回,所以需要重写下载中间件中的process_response函数;另外也需要重写其中的process_request函数,为请求加上cookie
- 重写下载中间件后需要修改settings.py中的相应参数并取消注释
# Keep cookie handling enabled; the cookies themselves are attached in the
# downloader middleware's process_request.
COOKIES_ENABLED = True
-
# Enable the project's custom downloader middleware (543 is its order value).
DOWNLOADER_MIDDLEWARES = { 'vocation.middlewares.VocationDownloaderMiddleware': 543, }
-
class VocationDownloaderMiddleware(object):
    """Downloader middleware that attaches cookies and renders start pages with Selenium.

    ``process_request`` sets the request's cookies from ``RAW_COOKIE``.
    ``process_response`` replaces the response of every start URL with the page
    source rendered by the spider's browser (``spider.bro``); all other
    responses pass through untouched.
    """

    # NOTE(review): session cookie copied from a browser; it expires and must
    # be refreshed by hand.
    RAW_COOKIE = (
        'SECKEY_ABVK=hUAgUzjagDt7tRAoeBixuHARq3o5gtYSbMcKcAkM2Ho%3D; '
        'BMAP_SECKEY=adS1Ht6D0s1kWECRhDaf4vSf6OhvVYklxDSAiZ_3W0fIGZJ8rWr9TbzVPPYVaIW5ObgotD3EzPQrdL2XdiXldciYniNJWqvUHZ8Wk_ri0IuuKOY9h0aB4i09OHC30d-kbWCSrrEQe40grf1Gj9izw6SGB5cmzIjIenxaZzpq8lmEDDU5Kvl7gAMUQauc7TUC; '
        'mfw_uuid=62357780-7ac5-d4dc-9a8e-5a02aa298353; '
        '_r=baidu; '
        '_rp=a%3A2%3A%7Bs%3A1%3A%22p%22%3Bs%3A18%3A%22www.baidu.com%2Flink%22%3Bs%3A1%3A%22t%22%3Bi%3A1647671168%3B%7D; '
        'oad_n=a%3A5%3A%7Bs%3A5%3A%22refer%22%3Bs%3A21%3A%22https%3A%2F%2Fwww.baidu.com%22%3Bs%3A2%3A%22hp%22%3Bs%3A13%3A%22www.baidu.com%22%3Bs%3A3%3A%22oid%22%3Bi%3A1026%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222022-03-19+14%3A26%3A08%22%3B%7D; '
        '__jsluid_h=01784e2b1c452421aa25034fbbde3ed9; '
        '__mfwothchid=referrer%7Cwww.baidu.com; '
        '__omc_chl=; '
        '__mfwc=referrer%7Cwww.baidu.com; '
        'uva=s%3A307%3A%22a%3A4%3A%7Bs%3A13%3A%22host_pre_time%22%3Bs%3A10%3A%222022-03-19%22%3Bs%3A2%3A%22lt%22%3Bi%3A1647671169%3Bs%3A10%3A%22last_refer%22%3Bs%3A180%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DKZtwUSmw3x4cyZcTJdfrzYa8Pr4pEgDbvJU1Pv7yOxRPpeRIeoKj_rydoZuVdCf0_IXBx40vQyB-xiuXsf_AyQ1y3t3mO4En4c5USvOZ_ya%26wd%3D%26eqid%3Df0f5303d000ee843000000036235777b%22%3Bs%3A5%3A%22rhost%22%3Bs%3A13%3A%22www.baidu.com%22%3B%7D%22%3B; '
        '__mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1647671169%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A13%3A%22www.baidu.com%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; '
        '__mfwuuid=62357780-7ac5-d4dc-9a8e-5a02aa298353; '
        'UM_distinctid=17fa0dad264324-08a77f4b1aeaeb-9771539-e1000-17fa0dad26543a; '
        '__omc_r=; '
        'PHPSESSID=cdbbvncvrd5rqepai636p6pos7; '
        'Hm_lvt_8288b2ed37e5bc9b4c9f7008798d2de0=1647743622,1647851451,1647908302,1648001811; '
        'bottom_ad_status=0; '
        '__jsl_clearance=1648007349.914|0|yoFqmnWY6O7Msv1j5KemUKE3POE%3D; '
        '__mfwa=1647671168836.14813.17.1648001810109.1648007353609; '
        'CNZZDATA30065558=cnzz_eid%3D2058254581-1647670157-null%26ntime%3D1648003704; '
        '__mfwb=b20cf490195f.2.direct; '
        '__mfwlv=1648009103; '
        '__mfwvn=13; '
        '__mfwlt=1648009103; '
        'Hm_lpvt_8288b2ed37e5bc9b4c9f7008798d2de0=1648009104; '
        'ariaDefaultTheme=undefined'
    )

    @staticmethod
    def _parse_cookie_header(raw):
        """Turn a ``name=value; name=value`` cookie header into a dict.

        Only the first '=' separates name from value, so values that contain
        '=' are kept whole; an entry without '=' maps to an empty string and
        empty entries are skipped.
        """
        cookies = {}
        for pair in raw.split(';'):
            pair = pair.strip()
            if not pair:
                continue
            name, _, value = pair.partition('=')
            cookies[name] = value
        return cookies

    def process_request(self, request, spider):
        """Attach the cookies to *request*; return None so processing continues."""
        request.cookies = self._parse_cookie_header(self.RAW_COOKIE)
        return None

    def process_response(self, request, response, spider):
        """Return a Selenium-rendered response for start URLs, else *response* as is."""
        if request.url not in spider.start_urls:
            return response

        bro = spider.bro
        bro.get(request.url)
        # Wait for the dynamically loaded content before grabbing the source.
        sleep(2)
        page_text = bro.page_source
        return HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
- 编写子页面处理函数
- 其中有些字段可能是空的导致抓取错位,所以尽量选择标签值来定位元素
-
def detail_parse(self, response):
    """Fill in the detail-page fields of an attraction and yield the item.

    The item is the response's meta mapping, which already carries 'title'
    from the listing page.  Every field is read with ``.get()``, so a field
    missing from the page is stored as None.

    Args:
        response: the detail-page response; must expose ``meta`` and ``xpath``.

    Yields:
        The meta mapping with 'introduction', 'phone', 'time', 'traffic',
        'ticket' and 'open_time' added.
    """
    item = response.meta

    introduction = response.xpath('/html/body/div[2]/div[3]/div[2]/div[1]/text()').get()
    # .get() returns None when the node is absent; only strip real text.
    item['introduction'] = introduction.strip() if introduction is not None else None

    # Prefer locating by tag attributes (class) over purely positional paths:
    # optional fields would otherwise shift and be read into the wrong slot.
    item['phone'] = response.xpath('/html/body/div[2]/div[3]/div[2]/ul/li[@class="tel"]/div[2]/text()').get()
    item['time'] = response.xpath('/html/body/div[2]/div[3]/div[2]/ul/li[@class="item-time"]/div[@class="content"]/text()').get()
    item['traffic'] = response.xpath('/html/body/div[2]/div[3]/div[2]/dl[1]/dd/text()').get()
    item['ticket'] = response.xpath('/html/body/div[2]/div[3]/div[2]/dl[2]/dd/div/text()').get()
    item['open_time'] = response.xpath('/html/body/div[2]/div[3]/div[2]/dl[3]/dd/text()').get()
    yield item
- 关闭网页
-
def close(self, spider):
    """Quit the Selenium browser held in ``self.bro``."""
    browser = self.bro
    browser.quit()
-
- 至此全部实验结束,再回头看时好像没有多少问题,但是在遇到问题、在网上又搜索不到解决办法的时候真的会很崩溃。