原理:
變量__EVENTVALIDATION
和__VIEWSTATE
表示現在頁面的狀態,其值一般存儲在當前頁面上。
變量__EVENTTARGET
會被作為第一個參數傳入js方法__doPostBack(eventTarget, eventArgument)
,表示是哪一個控件被觸發;第二個參數eventArgument為空時,說明該控件是被點擊觸發的。
我們在請求頁面時將上面3個變量作為請求參數加入POST請求,服務器收到請求後會解析變量,響應請求,返回頁面。
# -*- coding: utf-8 -*-
import urllib
import urllib2
from lxml import html

# Walk an ASP.NET WebForms result list page by page.
#
# Each rendered page carries hidden __VIEWSTATE / __EVENTVALIDATION inputs.
# To "click" the pager's next button we POST those two values back together
# with __EVENTTARGET set to the button's control name, then repeat with the
# hidden values taken from the page we just received.

# Browser-like User-Agent sent with every request.
user_agent = r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.2669.400 QQBrowser/9.6.10990.400'
headers = {r'User-Agent': user_agent}
# Search-result page; the Type parameter is the %u-escaped category name.
url = r"http://www.jnta.gov.cn/InfoSeach.aspx?CMID=9&Type=%u666f%u70b9%u4fe1%u606f&KeyWord="
# Control name of the pager's "next page" button, posted as __EVENTTARGET.
next_button = r'ctl00$ContentPlaceHolder1$ShowListSeach2$DDPager$ctl02$ctl00'
# How many "next page" postbacks to issue after the first page.
page_count = 4

# Cookie-aware opener: HTTPCookieProcessor stores the cookies the server sets
# and replays them on later requests, instead of copying the raw Set-Cookie
# header (attributes included, or None when absent) into a request header.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())


def _fetch(request):
    """Open *request* through the shared opener and return the raw body.

    The response object is always closed, even if reading fails.
    """
    response = opener.open(request)
    try:
        return response.read()
    finally:
        response.close()


def _hidden_field(tree, field_id):
    """Return the ``value`` attribute of the element with id *field_id*.

    Raises:
        IndexError: if the page contains no such element (same exception
            type as a bare ``[0]`` lookup, but with a message naming the
            missing field).
    """
    matches = tree.cssselect('#' + field_id)
    if not matches:
        raise IndexError('hidden field %r not found in the page' % field_id)
    return matches[0].get('value')


# Fetch the first page with a plain GET (no request body).
page_html = _fetch(urllib2.Request(url, headers=headers))
print(page_html)

# Form fields that stay the same for every postback.
values = {
    '__EVENTTARGET': next_button,
    '__EVENTARGUMENT': '',
    '__LASTFOCUS': '',
    '__VIEWSTATEGENERATOR': 'EBDD162D',
    'ctl00$tbKeyWord': '',
    'ctl00$ContentPlaceHolder1$Seach1$tbSeachKeyWord': '',
    'ctl00$ContentPlaceHolder1$Seach1$ddlModel': '9',
    # Same category as the Type parameter in url (U+666F U+70B9 U+4FE1
    # U+606F); the earlier literal had U+9EDE as its second character and
    # therefore did not match the category the page was requested with.
    'ctl00$ContentPlaceHolder1$Seach1$ddlType': '景点信息',
}

for _ in range(page_count):
    # The state tokens change with every response, so re-read them from the
    # page we currently hold before asking for the next one.
    page_tree = html.fromstring(page_html.decode('utf-8'))
    values['__VIEWSTATE'] = _hidden_field(page_tree, '__VIEWSTATE')
    values['__EVENTVALIDATION'] = _hidden_field(page_tree, '__EVENTVALIDATION')
    # Supplying a body makes urllib2 send this request as a POST.
    page_html = _fetch(urllib2.Request(url, urllib.urlencode(values), headers))
    print(page_html)