2 The GET and POST methods of requests


The GET method of requests

1 Searching for a keyword on Baidu and saving the current result page

import requests

keywords = input('Enter a keyword >>> ').strip()
response = requests.get(
    'https://www.baidu.com/s?',
    params={
        'wd': keywords,
        'pn': 20
    },
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
)

if response.status_code == 200:
    with open('b.html', 'wt', encoding='utf-8') as f:
        f.write(response.text)
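
requests URL-encodes the params dict into the query string for you. Continuing from the snippet above, a quick way to see this is to print response.url and compare it with a URL built manually via urllib.parse.urlencode (the two may differ if the server redirects); this is just an illustrative check, not part of the original example:

from urllib.parse import urlencode

# the final URL requests actually requested (after any redirects)
print(response.url)
# roughly what requests builds from the params dict above
print('https://www.baidu.com/s?' + urlencode({'wd': keywords, 'pn': 20}))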

2 A GET request to Zhihu

import requests

response = requests.get('https://www.zhihu.com',
                        headers={
                            'Referer': 'https://www.zhihu.com/',
                            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
                        }
                        )

with open('c.html', 'wt', encoding='utf-8') as f:
    f.write(response.text)

print(response.status_code)
print(response.text)

 

3 A GET request to GitHub

import requests

response = requests.get(url='https://github.com/',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',

                            'Cookie': '_octo=GH1.1.1333562301.1559296277; _ga=GA1.2.392559115.1559296287; has_recent_activity=1; _gat=1; tz=Asia%2FShanghai; _device_id=0dcf09aab9c4d288aaa33f26fecd1309; user_session=Yp-WRUHkznMCmRXO6-WsL8QRfVCau3k7gQ56zIZHMHfVTRCB; __Host-user_session_same_site=Yp-WRUHkznMCmRXO6-WsL8QRfVCau3k7gQ56zIZHMHfVTRCB; logged_in=yes; dotcom_user=andygouyong; _gh_sess=TTFoakY4c0ZtcHVMc2wrdjJiMmtSejhvN0VsVnhqU01PdW9yL01CMFNHYjZOaUNGUTFmNjlQK0o5NXFmVU40L1AzeUxCV2x0VHBka2VkR3ZBRUtxVnU2YUJPTUM0T3RWM0E5OVJtSklJTmswMXl6WS9lY3lrMGYvd1FoU0NnNVNla0lrZE13TzlIekhoRDA5a1JHcXBIeDNBUXlLZnoxVkd5elNNRmdCUHVZbGttREtyd2JDUWcxS1ZaZFpJZ3pnWUx1Z2p3MEppTGZOZkVMWEMrQ01HRGJxcU5kMWJPa3V5d001OHVsNElaWUowYitYYlFxeDgxNXd4YVdlZEJ5bFViVFdtTCtGQTFHYWZWTjFiSzhodVBPNXdQLzMxSkx3ZkJCeFpUdWJQdzR2dkRhcFhTeTUvZkROczZpWC9GMlVaZjgzTmxhWG5wakh1WnpDOFZpdzZ3PT0tLVFZRmowSjkva3RGY3dqaU15b0VHTkE9PQ%3D%3D--4508766204caae7d9c3ecc0c6e7c0fc8ae887a7f'
                        }
                        )
print(response.status_code)
print(response.text)
with open('d.html', 'wt', encoding='utf-8') as f:
    f.write(response.text)
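
The example above stuffs the whole Cookie string into a raw header; requests can also take cookies as a dict through the cookies= keyword, which is easier to build and inspect. A minimal sketch (the cookie names and values below are placeholders, not a working session):

import requests

# cookies= takes a dict; the names/values here are placeholders only
response = requests.get(
    'https://github.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    },
    cookies={'logged_in': 'yes', 'user_session': 'your-session-id'}
)
print(response.status_code)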

The POST method of requests (simulating a GitHub login)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import re, requests
import time

# First fetch the login page to obtain the authenticity_token
# The request URL is 'https://github.com/login'
# The request method is GET

r1 = requests.get('https://github.com/login',
                  headers={
                      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
                  }

                  )

authenticity_token = re.findall('name="authenticity_token" value="(.*?)"', r1.text, re.S)[0]

r1_cookies = r1.cookies.get_dict()
print(authenticity_token)
print(r1_cookies)

# Submit the form data to complete the login
# Request method: POST
# URL: https://github.com/session
# Request headers:
# Referer: https://github.com/login
# User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36

# cookies=r1_cookies
# Request body (form data):
#     commit: Sign in
#     utf8: ✓
#     authenticity_token: qGeaCNP3aTAb5B13GiLwYrrO9uth09TU9Wm0CnXBg3cNQowPJJDHHMj0BXjziy1M6uuQVpEScoa9SzubrXDNMg==
#     login: your GitHub username
#     password: your GitHub password

r2 = requests.post(
    # the request URL
    'https://github.com/session',
    # the request headers
    headers={
        'Referer': 'https://github.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    },
    # the cookies obtained from the first GET request
    cookies=r1_cookies,
    # the request body - pay close attention here
    data={
        'commit': 'Sign in',
        'utf8': '',
        # authenticity_token is the value extracted from the login page in the GET request above
        'authenticity_token': authenticity_token,
        'login': 'your GitHub username',
        'password': 'did you really think I would paste my password here?'
    },
    allow_redirects=True
)

with open('e.html', 'wt', encoding='utf-8') as f:
    f.write(r2.text)
print(r2.status_code)
print('Repositories' in r2.text)
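
As an alternative to copying r1's cookies into the POST by hand, a requests.Session carries cookies between requests automatically. A minimal sketch of the same login flow, under the same assumptions as above (form fields as listed, credentials to be filled in by you):

import re
import requests

session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
})

# GET the login page; the session stores the returned cookies for us
login_page = session.get('https://github.com/login')
token = re.findall('name="authenticity_token" value="(.*?)"', login_page.text, re.S)[0]

# POST the form; the cookies from the previous request are sent automatically
r = session.post('https://github.com/session', data={
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'your GitHub username',
    'password': 'your GitHub password',
})
print(r.status_code)
print('Repositories' in r.text)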

3 Crawling Pear Video (pearvideo.com)

Without further ado, here is the code:

import requests
import re
import os
from threading import Thread

ppth = os.path.dirname(__file__)


def get_index_page(url):
    # send a request to the target page
    response = requests.get(url)
    # if the status code is 200, the request succeeded
    if response.status_code == 200:
        return response.text


def parse_index_page(htmll):
    url = re.findall('class="vervideo-bd".*?href="(.*?)"', htmll, re.S)
    return url


def get_detail_page(url):
    movie_text = requests.get(url).text
    return movie_text


def parse_detail_page(text):
    movie_mp4 = re.findall('srcUrl="(.*?)"', text, re.S)
    title = re.findall('<h1 class="video-tt">(.*?)</h1>', text, re.S)
    # print(title)
    if movie_mp4:
        # print(movie_mp4[0])
        return {'title': title[0], 'movie': movie_mp4[0]}


def download(movie_mp4):
    print(movie_mp4)
    title = movie_mp4['title']
    movie_url = movie_mp4['movie']
    response = requests.get(movie_url)
    if response.status_code == 200:
        # strip quote characters that are not allowed in file names
        title = title.replace('"', ' ').replace("'", ' ').strip()
        print(title)
        filename = ppth + '/Download/' + title + '.mp4'
        with open(filename, 'wb') as f:
            f.write(response.content)


# Earlier single-threaded version, kept for reference:
# def main():
#     # base URL
#     base_url = 'https://www.pearvideo.com/category_{page}'
#     for i in range(5):
#         # fetch five category pages
#         url = base_url.format(page=i)
#         # fetch the page's HTML
#         htmll = get_index_page(url)
#         # parse out the video page paths
#         video_num = parse_index_page(htmll)
#         for j in video_num:
#             # build the full URL of each video page
#             url_end = base_url[0:26] + j
#             # print(url_end)
#             # parse the video page to get the .mp4 URL
#             movie_text = get_detail_page(url_end)
#             # this is a dict: {'title': ..., 'movie': ...}
#             movie_mp4 = parse_detail_page(movie_text)
#             # print(movie_mp4)
#             if movie_mp4:
#                 download(movie_mp4)

def main(base_url, i):
    # build the URL for this category page
    url = base_url.format(page=i)
    # fetch the page's HTML
    htmll = get_index_page(url)
    # parse out the video page paths
    video_num = parse_index_page(htmll)
    for j in video_num:
        # build the full URL of each video page
        url_end = base_url[0:26] + j
        # print(url_end)
        # parse the video page to get the .mp4 URL
        movie_text = get_detail_page(url_end)
        # this is a dict: {'title': ..., 'movie': ...}
        movie_mp4 = parse_detail_page(movie_text)
        # print(movie_mp4)
        if movie_mp4:
            download(movie_mp4)

if __name__ == '__main__':

    # base URL
    base_url = 'https://www.pearvideo.com/category_{page}'
    for i in range(5):
        t = Thread(target=main, args=(base_url, i))
        t.start()
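
Starting one bare Thread per page works, but a thread pool bounds the number of workers and waits for them all to finish. A minimal sketch with concurrent.futures, reusing the same main function defined above:

from concurrent.futures import ThreadPoolExecutor

base_url = 'https://www.pearvideo.com/category_{page}'
# at most 5 worker threads; the with-block waits until every page is done
with ThreadPoolExecutor(max_workers=5) as pool:
    for i in range(5):
        pool.submit(main, base_url, i)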

4 The response object

1 Attributes of response

import requests
respone = requests.get('http://www.jianshu.com')
# attributes of the response object
print(respone.text)
print(respone.content)

print(respone.status_code)
print(respone.headers)
print(respone.cookies)
print(respone.cookies.get_dict())
print(respone.cookies.items())

print(respone.url)
print(respone.history)

print(respone.encoding)

# To close the connection explicitly: response.close()
from contextlib import closing
with closing(requests.get('xxx', stream=True)) as response:
    for line in response.iter_content():
        pass

2 Encoding issues

# Encoding issues
import requests,re


response=requests.get(
    'https://www.autohome.com.cn/shanghai/',
    headers={
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
)
# autohome.com.cn returns a page encoded as GB2312, while requests defaults to ISO-8859-1; without setting the encoding to gbk the Chinese text comes out garbled

response.encoding='gbk'
print(response.text)
with open('f.html', 'wt', encoding='gbk') as f:
    f.write(response.text)
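
If you would rather not hard-code 'gbk', response.apparent_encoding asks requests to guess the encoding from the body itself (a chardet-based guess, so it costs a little extra CPU); this is a small optional addition to the example above:

# let requests guess the encoding from the body instead of hard-coding it
response.encoding = response.apparent_encoding
print(response.encoding)
print(response.text[:100])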

3 Fetching binary data

import requests
response=requests.get('https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg')
with open('a.jpg', 'wb') as f:
    # use .content when writing binary data
    f.write(response.content)

4 When the data is very large, it can easily blow up memory; in that case, write it piece by piece

import requests
response=requests.get('https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg')
with open('a.jpg', 'wb') as f:
    # use iter_content when writing binary data in chunks
    for line in response.iter_content():
        f.write(line)
# Writing the file piece by piece like this avoids the memory problem above
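
One thing worth adding: unless stream=True is passed, requests downloads the whole body up front anyway, so iter_content only helps with the write, not the download. A minimal sketch of a genuinely streamed download (the chunk size is an arbitrary choice, and using the response as a context manager assumes a reasonably recent requests version):

import requests

url = 'https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg'
# stream=True defers fetching the body until we iterate over it
with requests.get(url, stream=True) as response:
    with open('a.jpg', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)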

5 Parsing JSON

# Parsing JSON
import requests
response = requests.get('http://httpbin.org/get')

import json
res1 = json.loads(response.text)  # the cumbersome way

res2 = response.json()  # get the parsed JSON directly


print(res1 == res2)  # True
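
Going the other way, the json= keyword serializes a dict into the request body and sets the Content-Type header for you. A small sketch against the same httpbin test service (the payload values are arbitrary):

import requests

# json= serializes the dict to JSON and sets Content-Type: application/json
response = requests.post('http://httpbin.org/post', json={'name': 'egon', 'age': 18})
print(response.json()['json'])  # httpbin echoes the parsed JSON body back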

 

5 Advanced usage of response

1、SSL Cert Verification

# Certificate verification (most sites use https)
import requests
respone = requests.get('https://www.12306.cn')  # for an https request the certificate is checked first; if it is invalid, an error is raised and the program stops


# Improvement 1: suppress the error, but a warning is still printed
import requests
respone = requests.get('https://www.12306.cn', verify=False)  # skip certificate verification; prints a warning, returns 200
print(respone.status_code)


# Improvement 2: suppress the error and silence the warning
import requests
from requests.packages import urllib3
urllib3.disable_warnings()  # silence the warning
respone = requests.get('https://www.12306.cn', verify=False)
print(respone.status_code)

# Improvement 3: supply a certificate
# Many sites use https but can be accessed without a client certificate; in most cases carrying one is optional
# Zhihu, Baidu and the like work either way
# Some sites make it mandatory: for example, certain users only gain access to a particular site after obtaining a certificate
import requests
respone = requests.get('https://www.12306.cn',
                       cert=('/path/server.crt',
                             '/path/key'))
print(respone.status_code)

 

2、Using proxies

# Official docs: http://docs.python-requests.org/en/master/user/advanced/#proxies

# Proxy setup: the request is sent to the proxy first, and the proxy forwards it for you (getting your IP banned is a common reason to do this)
import requests
proxies = {
    # a proxy with credentials; the part before the @ is username:password
    # (a dict cannot hold two 'http' keys, so this variant is shown commented out)
    # 'http': 'http://egon:123@localhost:9743',
    'http': 'http://localhost:9743',
    'https': 'https://localhost:9743',
}
respone = requests.get('https://www.12306.cn',
                       proxies=proxies)

print(respone.status_code)



# SOCKS proxies are also supported; install with: pip install requests[socks]
import requests
proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port'
}
respone = requests.get('https://www.12306.cn',
                       proxies=proxies)

print(respone.status_code)

 

3、Timeout settings

# Timeout settings
# Two forms of timeout: a float or a tuple
# timeout=0.1        # timeout for receiving data
# timeout=(0.1, 0.2) # 0.1 is the connect timeout, 0.2 is the read timeout

import requests
respone=requests.get('https://www.baidu.com',
                     timeout=0.0001)

4、Authentication settings

# Official docs: http://docs.python-requests.org/en/master/user/authentication/

# Basic auth: some sites pop up a dialog (much like alert) asking for a username and password; you cannot get the HTML until you log in
# Under the hood the credentials are simply joined into a request header:
#         r.headers['Authorization'] = _basic_auth_str(self.username, self.password)
# Most sites do not use this default scheme and implement their own
# In that case you need to write your own function, analogous to _basic_auth_str, following the site's scheme,
# and put the resulting string into the request header:
#         r.headers['Authorization'] = func('.....')

# Here is the default scheme; most sites will not use it
import requests
from requests.auth import HTTPBasicAuth
r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
print(r.status_code)

# HTTPBasicAuth can be abbreviated to the following form
import requests
r = requests.get('xxx', auth=('user', 'password'))
print(r.status_code)
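
To see what _basic_auth_str produces, the same header can be built by hand: base64-encode 'user:password' and prefix it with 'Basic'. A minimal sketch, illustrated against httpbin's basic-auth endpoint (used here purely for demonstration):

import base64
import requests

def basic_auth_str(username, password):
    # same idea as requests' internal _basic_auth_str: base64("user:password")
    token = base64.b64encode((username + ':' + password).encode('utf-8')).decode('ascii')
    return 'Basic ' + token

r = requests.get('http://httpbin.org/basic-auth/user/password',
                 headers={'Authorization': basic_auth_str('user', 'password')})
print(r.status_code)  # 200 when the header matches what HTTPBasicAuth would send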

5、Exception handling

# Exception handling
import requests
from requests.exceptions import *  # see requests.exceptions for the available exception types

try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('===:')
# except ConnectionError:  # network unreachable
#     print('-----')
# except Timeout:
#     print('aaaaa')

except RequestException:
    print('Error')

6、Uploading files

import requests
files = {'file': open('a.jpg', 'rb')}
respone = requests.post('http://httpbin.org/post', files=files)
print(respone.status_code)
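
If you need to control the filename and content type sent in the multipart body, the value can be a tuple of (filename, file object, content type) instead of a bare file object; a minimal sketch:

import requests

# (filename, file object, content type) for the multipart field
files = {'file': ('a.jpg', open('a.jpg', 'rb'), 'image/jpeg')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)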

 

