餓了麼外賣網站是一個通過 AJAX 動態加載內容的網站
Version1:直接頁面提取
# coding=utf-8
# Version 1: download the ele.me place page with requests and query the
# static HTML with lxml XPath expressions.
# The coding line above is needed on Python 2 for the non-ASCII literal
# printed below; it takes effect when this snippet is saved as its own script.
import sys

import requests
from lxml import etree

if sys.version_info[0] == 2:
    # Python 2 only: switch the implicit str/unicode codec to UTF-8 so the
    # Chinese shop names can be printed. `reload` is not a builtin on
    # Python 3, so the hack is skipped there.
    reload(sys)
    sys.setdefaultencoding('utf-8')

url = 'https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232'
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
print(response.status_code)

# The original slept for 10 s here. The response body is already complete
# once requests.get returns, so waiting cannot change what is parsed below.
html = response.content
selector = etree.HTML(html)
rez = selector.xpath('//*[@class="place-rstbox clearfix"]')
print('haha %s' % (rez,))  # the original run printed an empty list here

for box in rez:
    # The leading "." makes each query relative to the current container.
    # A bare "//" would search the whole document on every iteration.
    names = box.xpath('.//*[@class="rstblock-title"]/text()')
    print(names)  # the original referenced an undefined `name` here
    # The remaining fields are extracted for reference but not printed.
    msales = box.xpath('.//*[@class="rstblock-monthsales"]/text()')
    tip = box.xpath('.//*[@class="rstblock-cost"]/text()')
    stime = box.xpath('.//*[@class="rstblock-logo"]/span/text()')
    print(u'店名')
    # With relative queries each container lists only its own shops, so the
    # original trailing `break` is no longer needed.
    for shop_name in names:
        print(shop_name)
問題:XPath //*[@class="place-rstbox clearfix"] 執行時沒有報錯,但 rez 輸出為空列表([])。原因是商家列表由 AJAX 在頁面加載後動態生成,requests 取回的靜態 HTML 中並不包含這些節點
Version2:通過接口提取
geohash=ws101hcw982&latitude=22.52721&longitude=113.95232:位置信息參數及參數值
terminal=web:渠道信息
extras[]=activities和offset=0未知
# Version 2: read the restaurant list from the JSON endpoint that the page
# itself calls, instead of scraping rendered HTML.
import requests

url = ('https://www.ele.me/restapi/shopping/restaurants'
       '?extras[]=activities&geohash=ws101hcw982&latitude=22.52721'
       '&limit=30&longitude=113.95232&offset=0&terminal=web')
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, timeout=30)
print(resp.status_code)
# Stop on 4xx/5xx responses; otherwise an error payload would be iterated
# below as if it were the restaurant list.
resp.raise_for_status()

# resp.json() decodes the body itself rather than relying on the text
# encoding guessed for resp.text.
Jdata = resp.json()
for shop in Jdata:
    name = shop['name']
    # The remaining fields are extracted for reference but not printed.
    msales = shop['recent_order_num']
    stime = shop['order_lead_time']
    tip = shop['description']
    phone = shop['phone']
    print(name)
輸出:原以為把 limit 改為 100 就可以一次提取 100 條商家信息,然而接口每次最多只返回 30 條
Version3:通過selenium提取
# Version 3: render the page in a headless browser so the dynamically
# loaded restaurant list exists in the DOM, then read the first restaurant.
import time

from selenium import webdriver
import selenium.webdriver.support.ui as ui

driver = webdriver.PhantomJS(executable_path=r"C:\Python27\phantomjs.exe")
# Alternative browser: driver = webdriver.Chrome()
try:
    driver.get('https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232')
    # Fixed pause before the screenshot so the page has time to render.
    time.sleep(10)
    # NOTE(review): Selenium documents screenshots as PNG data; the .jpg
    # file name is kept unchanged from the original -- confirm it is intended.
    driver.get_screenshot_as_file("E:\\Elm_ok.jpg")

    # Block for up to 10 s until the restaurant list container is present.
    wait = ui.WebDriverWait(driver, 10)
    wait.until(lambda drv: drv.find_element_by_xpath('//div[@class="place-rstbox clearfix"]'))

    # find_element_by_xpath returns a single element, so only one
    # restaurant is read here.
    name = driver.find_element_by_xpath('//*[@class="rstblock-title"]').text
    msales = driver.find_element_by_xpath('//*[@class="rstblock-monthsales"]').text
    tip = driver.find_element_by_xpath('//*[@class="rstblock-cost"]').text
    stime = driver.find_element_by_xpath('//*[@class="rstblock-logo"]/span').text
    print(name)  # the original run printed one restaurant name
finally:
    # Always shut the browser down, even if the wait above times out;
    # otherwise the PhantomJS process is left running.
    driver.quit()
注:find_element_by_xpath 只返回第一個匹配的元素,因此每個字段只提取到一條;要提取全部商家需改用 find_elements_by_xpath(見下方改進版)
改進版
#coding=utf-8
# Improved version: scroll the rendered page repeatedly so more restaurants
# are loaded, then read every matching element instead of only the first.
import time

from selenium import webdriver
import selenium.webdriver.support.ui as ui


def execute_times(times):
    """Scroll the page to the bottom ``times + 1`` times.

    Uses the module-level ``driver``. After each scroll the function sleeps
    for 5 seconds before scrolling again.
    """
    for _ in range(times + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)


driver = webdriver.PhantomJS(executable_path=r"C:\Python27\phantomjs.exe")
# Alternative browser: driver = webdriver.Chrome()
try:
    driver.get('https://www.ele.me/place/ws101hcw982?latitude=22.52721&longitude=113.95232')
    time.sleep(10)

    # Block for up to 10 s until the restaurant list container is present.
    wait = ui.WebDriverWait(driver, 10)
    wait.until(lambda drv: drv.find_element_by_xpath('//div[@class="place-rstbox clearfix"]'))

    execute_times(20)

    # find_elements_* returns a list with every match, not just the first.
    name = driver.find_elements_by_xpath('//*[@class="rstblock-title"]')
    msales = driver.find_elements_by_xpath('//*[@class="rstblock-monthsales"]')
    tip = driver.find_elements_by_xpath('//*[@class="rstblock-cost"]')
    stime = driver.find_elements_by_xpath('//*[@class="rstblock-logo"]/span')
    # Printing these lists directly shows WebElement objects, not their text.
    print(type(tip))  # a list
    print(len(name))  # 120 in the original run
    for title in name:
        print(title.text)
finally:
    # Always shut the browser down, even if a step above raises; otherwise
    # the PhantomJS process is left running.
    driver.quit()
說明:通過execute_times函數,滾動條每下移一次,休息5s,從而使頁面加載更多的商家信息
輸出: