Python 爬取网页数据的方法

本文为转载文章（查看原文） 2019-11-04 20:40 325 python

"""
Basic usage of urllib.request.urlopen
"""
# Three ways of calling urllib.request.urlopen.  Examples 1 and 2 are kept as
# commented-out reference snippets; only example 3 runs.
import urllib.request

# Example 1 (disabled): the simplest form, a request URL without parameters.
# response = urllib.request.urlopen("https://www.scetc.edu.cn")
# html = response.read().decode("utf-8")
# print(html)

# Example 2 (disabled): sending parameters with the request.
# The raw parameters live in a dict; urllib.parse.urlencode converts them to
# a form-encoded string, which must then be encoded to bytes before it can be
# passed as ``data``.  Supplying ``data`` makes urlopen send a POST request.
# Needs ``import urllib.parse``.
# key_value = {'kw': '胡歌'}
# data = urllib.parse.urlencode(key_value).encode('utf-8')
# response = urllib.request.urlopen("http://tieba.baidu.com/f?", data=data)
# html = response.read().decode('utf-8')
# print(html)

# Example 3: request with a timeout.
# ``timeout`` is the number of seconds to wait for blocking operations such
# as the connection attempt before giving up.
# The ``with`` block closes the connection as soon as the body has been read.
with urllib.request.urlopen("http://www.scetc.cn", timeout=5) as response:
    html = response.read().decode('utf-8')

print(html)

"""
Three commonly used methods of the HTTPResponse object: geturl(), getcode() and info()
"""

# Fetch a page and read the response metadata.  The ``with`` block closes the
# underlying connection once the three values have been collected.
with urllib.request.urlopen("https://www.tmall.com") as response:
    # URL of the resource actually retrieved (may differ from the requested
    # URL when the server redirected the request).
    back_url = response.geturl()
    # HTTP status code of the response.
    back_code = response.getcode()
    # Meta-information of the response, i.e. its headers.
    back_info = response.info()

print("响应的url:", back_url)
print("响应的状态码：", back_code)
print("响应的信息：", back_info)

"""
Building a Request object
"""

import urllib.parse
import urllib.request

# Request headers: the User-Agent string identifies the client as MSIE 9.
header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT6.1; Trident/5.0)"}

# Request parameters.  urlencode produces a form-encoded str; encoding it to
# UTF-8 yields the bytes object that ``data`` requires.
params = {"news_id": 174, "page": 1}
data = urllib.parse.urlencode(params).encode('utf-8')

# Bundle the target URL, the body and the headers into one Request object.
# Because ``data`` is supplied, the request is sent as a POST.
url = "http://www.scetc.cn/index!detail"
request = urllib.request.Request(url, data=data, headers=header)

# Connection type: further headers can be added after construction.
# NOTE(review): urllib's HTTP handler appears to send ``Connection: close``
# regardless of this header - confirm whether it has any effect.
request.add_header("Connection", "keep-alive")

# Once everything is wrapped in the Request object, urlopen only needs that
# object.  The ``with`` block closes the connection after the body is read.
with urllib.request.urlopen(request) as response:
    html = response.read().decode('utf-8')

print(html)

# Send a request through a randomly chosen HTTP proxy.
# ``random`` is imported here because random.choice is used below and no
# import of it exists elsewhere in this script.
import random

# Candidate proxy servers, each in the {scheme: "host:port"} mapping form that
# ProxyHandler accepts.
proxy_list = [
    {"http": "124.88.67.81:80"},
    {"http": "127.88.67.81:80"},
    {"http": "121.82.67.81:80"},
    {"http": "124.55.67.81:80"},
    {"http": "124.56.67.81:80"},
    {"http": "124.78.67.81:80"},
]

# Pick one proxy at random.
ran_proxy = random.choice(proxy_list)
# Handler that applies the chosen proxy.
httpproxy_handler = urllib.request.ProxyHandler(ran_proxy)
# Opener built around that handler; only requests opened through this opener
# use the proxy (it is not installed globally).
opener = urllib.request.build_opener(httpproxy_handler)

# Build the Request object with the same MSIE 9 User-Agent header.
header = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT6.1; Trident/5.0)"}
request = urllib.request.Request('http://www.scetc.net', headers=header)

# Send the request via the opener and read the response body; the ``with``
# block closes the connection afterwards.
with opener.open(request) as response:
    html = response.read().decode('utf-8')

print(html)

免责声明！

本站转载的文章仅供个人学习借鉴使用，本站不对其版权承担任何法律责任。如果侵犯了您的隐私权益，请发送邮件至本站邮箱 yoyou2525@163.com 联系删除。

猜您在找：python爬取网页数据；Python：将爬取的网页数据写入Excel文件中；C# 爬取网页数据；curl——爬取网页数据；Python 爬虫爬取多页数据；python爬虫教程：实例讲解Python爬取网页数据；python3下scrapy爬虫（第八卷：循环爬取网页多页数据）；ASP.NET Core网页数据爬取笔记；如何使用python爬取网页动态数据；使用webdriver+urllib爬取网页数据（模拟登陆，过验证码）