# -*- coding:utf-8 -*-
import urllib.request
from urllib import request, parse
import os
import re
import urllib
import urllib.request
from urllib import parse, request

# Walk-through of fetching Baidu pages with urllib: a plain GET with a custom
# User-Agent, a request carrying url-encoded form data, building (but not
# sending) a POST request, and finally scraping one block out of a search
# result page with a regular expression.

url = 'http://www.baidu.com/'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
# Named `params` rather than `dict` so the builtin `dict` is not shadowed.
params = {'wd': 'word'}
# urlopen() requires the request body as bytes, not str.
data = bytes(parse.urlencode(params), encoding='utf8')

# 1) Simple GET with custom headers; the context manager closes the
#    connection once the body has been read.
req = request.Request(url=url, headers=headers)
with request.urlopen(req) as response:
    page = response.read()

# 2) Same URL with form data attached. HTTP method names are case-sensitive,
#    so the method must be spelled 'GET' (the original sent 'Get').
req = request.Request(url=url, data=data, headers=headers, method='GET')
response = request.urlopen(req)
# The body is not used; close immediately so the socket is not leaked when
# `response` is rebound further down.
response.close()

# 3) Build a POST request and add a header after construction. The request is
#    only constructed here, never sent. The URL needs an explicit scheme:
#    Request() raises ValueError("unknown url type") for 'www.baidu.com'.
req = request.Request(url='http://www.baidu.com', data=data, method='POST')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')

# 4) Fetch a search-result page; the keyword (empty here) is percent-encoded
#    before being appended to the query string.
url = 'http://www.baidu.com/s?wd=' + parse.quote('')
req = urllib.request.Request(url)
with urllib.request.urlopen(req) as response:
    html = response.read()

# re.S lets '.' match newlines so the pattern can span the whole table.
p = re.compile(r'<table width="30%".+?</table>', re.S)
# HTML_ad holds the HTML of the entire promoted-results (ad) section.
HTML_ad = p.search(html.decode('utf-8'))
# search() returns None when nothing matches; only then is there no group().
if HTML_ad is not None:
    HTML_ad = HTML_ad.group()
# pyquery:
from pyquery import PyQuery

# Fetch a page with pyquery and select every <div> element from it.

headerss = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# Named `params` rather than `dict` so the builtin `dict` is not shadowed.
params = {'wd': 'word'}
# NOTE(review): the original snippet used `urls` without defining it anywhere
# in view; assumed to be the Baidu home page -- confirm the intended target.
urls = 'http://www.baidu.com/'

# The original called `pq(...)`, but only `PyQuery` is imported, so the call
# raised NameError; call the imported class directly.
d = PyQuery(url=urls, data=params, headers=headerss)
# Select all <div> elements of the fetched document.
p = d('div')
。