The GET method of requests
1 Searching Baidu for a keyword and fetching the resulting Baidu page
import requests

keywords = input('Enter keywords >>> ').strip()
response = requests.get(
    'https://www.baidu.com/s?',
    # The params dict is url-encoded into the query string for you
    params={
        'wd': keywords,
        'pn': 20
    },
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
)
if response.status_code == 200:
    with open('b.html', 'wt', encoding='utf-8') as f:
        f.write(response.text)
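As a quick sanity check, response.url shows the final url after the params dict has been encoded into the query string; a small usage sketch (the keyword here is just an example):

import requests

response = requests.get('https://www.baidu.com/s?', params={'wd': 'python', 'pn': 20})
# The params are appended for you: https://www.baidu.com/s?wd=python&pn=20
print(response.url)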
2 A GET request to Zhihu
import requests

response = requests.get(
    'https://www.zhihu.com',
    headers={
        'Referer': 'https://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
)
with open('c.html', 'wt', encoding='utf-8') as f:
    f.write(response.text)
print(response.status_code)
print(response.text)
3 A GET request to GitHub
import requests

response = requests.get(
    url='https://github.com/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # This Cookie header was copied from a logged-in browser session
        'Cookie': '_octo=GH1.1.1333562301.1559296277; _ga=GA1.2.392559115.1559296287; has_recent_activity=1; _gat=1; tz=Asia%2FShanghai; _device_id=0dcf09aab9c4d288aaa33f26fecd1309; user_session=Yp-WRUHkznMCmRXO6-WsL8QRfVCau3k7gQ56zIZHMHfVTRCB; __Host-user_session_same_site=Yp-WRUHkznMCmRXO6-WsL8QRfVCau3k7gQ56zIZHMHfVTRCB; logged_in=yes; dotcom_user=andygouyong; _gh_sess=TTFoakY4c0ZtcHVMc2wrdjJiMmtSejhvN0VsVnhqU01PdW9yL01CMFNHYjZOaUNGUTFmNjlQK0o5NXFmVU40L1AzeUxCV2x0VHBka2VkR3ZBRUtxVnU2YUJPTUM0T3RWM0E5OVJtSklJTmswMXl6WS9lY3lrMGYvd1FoU0NnNVNla0lrZE13TzlIekhoRDA5a1JHcXBIeDNBUXlLZnoxVkd5elNNRmdCUHVZbGttREtyd2JDUWcxS1ZaZFpJZ3pnWUx1Z2p3MEppTGZOZkVMWEMrQ01HRGJxcU5kMWJPa3V5d001OHVsNElaWUowYitYYlFxeDgxNXd4YVdlZEJ5bFViVFdtTCtGQTFHYWZWTjFiSzhodVBPNXdQLzMxSkx3ZkJCeFpUdWJQdzR2dkRhcFhTeTUvZkROczZpWC9GMlVaZjgzTmxhWG5wakh1WnpDOFZpdzZ3PT0tLVFZRmowSjkva3RGY3dqaU15b0VHTkE9PQ%3D%3D--4508766204caae7d9c3ecc0c6e7c0fc8ae887a7f'
    }
)
print(response.status_code)
print(response.text)
with open('d.html', 'wt', encoding='utf-8') as f:
    f.write(response.text)
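Rather than pasting the whole Cookie header by hand, the same cookies can be handed to requests as a dict via the cookies parameter; a minimal sketch using two of the fields from the header above:

import requests

# Two of the cookie fields from the header above, passed as a dict
cookies = {
    'user_session': 'Yp-WRUHkznMCmRXO6-WsL8QRfVCau3k7gQ56zIZHMHfVTRCB',
    'logged_in': 'yes',
}
response = requests.get('https://github.com/', cookies=cookies)
print(response.status_code)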
The POST method of requests (simulating a GitHub login)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests

# First GET the login page to obtain the authenticity_token.
# The login form is then POSTed to https://github.com/session
r1 = requests.get(
    'https://github.com/login',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
)
authenticity_token = re.findall('name="authenticity_token" value="(.*?)"', r1.text, re.S)[0]
r1_cookies = r1.cookies.get_dict()
print(authenticity_token)
print(r1_cookies)

# Submit the form data to complete the login
# Method: POST
# URL: https://github.com/session
# Headers:
#   Referer: https://github.com/login
#   User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36
#   cookies=r1_cookies
# Form data:
#   commit: Sign in
#   utf8: ✓
#   authenticity_token: <the value scraped from the login page>
#   login: your GitHub username
#   password: your GitHub password
r2 = requests.post(
    # The url being requested
    'https://github.com/session',
    # The request headers
    headers={
        'Referer': 'https://github.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    },
    # The cookies from the first request
    cookies=r1_cookies,
    # The request body; pay special attention here
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        # The authenticity_token scraped from the login page in the GET request above
        'authenticity_token': authenticity_token,
        'login': 'your GitHub username',
        'password': 'did you really think I would paste my password here?'
    },
    allow_redirects=True
)
with open('e.html', 'wt', encoding='utf-8') as f:
    f.write(r2.text)
print(r2.status_code)
print('Repositories' in r2.text)
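For comparison, requests.Session carries cookies across requests automatically, so you do not have to pass r1_cookies around by hand; a minimal sketch of the same two-step flow, assuming the form field names observed above:

import re
import requests

session = requests.Session()  # persists cookies between requests automatically
session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'

r1 = session.get('https://github.com/login')
token = re.findall('name="authenticity_token" value="(.*?)"', r1.text, re.S)[0]

r2 = session.post('https://github.com/session', data={
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': token,
    'login': 'your GitHub username',
    'password': 'your GitHub password',
})
print(r2.status_code, 'Repositories' in r2.text)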
3 Crawling Pearvideo
Without further ado, here is the code Brother Yong wrote
import os
import re
import requests
from threading import Thread

ppth = os.path.dirname(__file__)


def get_index_page(url):
    # Send a request to the target site
    response = requests.get(url)
    # A 200 status code means the request succeeded
    if response.status_code == 200:
        return response.text


def parse_index_page(htmll):
    # Parse the relative links of the videos out of the index page
    url = re.findall('class="vervideo-bd".*?href="(.*?)"', htmll, re.S)
    return url


def get_detail_page(url):
    movie_text = requests.get(url).text
    return movie_text


def parse_detail_page(text):
    # Pull the .mp4 address and the title out of the detail page
    movie_mp4 = re.findall('srcUrl="(.*?)"', text, re.S)
    title = re.findall('<h1 class="video-tt">(.*?)</h1>', text, re.S)
    if movie_mp4:
        return {'title': title[0], 'movie': movie_mp4[0]}


def download(movie_mp4):
    print(movie_mp4)
    title = movie_mp4['title']
    movie_url = movie_mp4['movie']
    response = requests.get(movie_url)
    if response.status_code == 200:
        # Strip characters that are not allowed in file names
        title = title.replace('"', ' ').replace("'", ' ').replace('?', ' ').strip()
        print(title)
        filename = ppth + '/Download/' + title + '.mp4'
        with open(filename, 'wb') as f:
            f.write(response.content)


# (An earlier single-threaded main() looped over the five category pages in
# sequence; it was replaced by the threaded version below.)

def main(base_url, i):
    # Build the url of one category page
    url = base_url.format(page=i)
    # Fetch the page's html
    htmll = get_index_page(url)
    # Parse out the relative video links
    video_num = parse_index_page(htmll)
    for j in video_num:
        # base_url[0:26] is 'https://www.pearvideo.com/'; join it with the relative link
        url_end = base_url[0:26] + j
        # Fetch the detail page so the .mp4 address can be parsed out of it
        movie_text = get_detail_page(url_end)
        # A dict of the form {'title': ..., 'movie': ...}
        movie_mp4 = parse_detail_page(movie_text)
        if movie_mp4:
            download(movie_mp4)


if __name__ == '__main__':
    # The base url of the category pages
    base_url = 'https://www.pearvideo.com/category_{page}'
    # One thread per category page, five pages in total
    for i in range(5):
        t = Thread(target=main, args=(base_url, i))
        t.start()
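If you would rather bound the number of worker threads than start one Thread per page, the standard library's thread pool does the same job; a minimal sketch, assuming it replaces the if __name__ == '__main__': block above:

from concurrent.futures import ThreadPoolExecutor

base_url = 'https://www.pearvideo.com/category_{page}'
# At most 3 category pages are crawled concurrently; tune max_workers to taste
with ThreadPoolExecutor(max_workers=3) as pool:
    for i in range(5):
        pool.submit(main, base_url, i)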
4 The response object
1 Attributes of response
import requests

response = requests.get('http://www.jianshu.com')
# response attributes
print(response.text)                 # the body decoded as text
print(response.content)              # the raw bytes
print(response.status_code)
print(response.headers)
print(response.cookies)
print(response.cookies.get_dict())
print(response.cookies.items())
print(response.url)
print(response.history)              # the redirect history
print(response.encoding)

# Closing the connection: response.close()
from contextlib import closing
with closing(requests.get('xxx', stream=True)) as response:
    for line in response.iter_content():
        pass
2 Encoding issues
# Encoding issues
import requests

response = requests.get(
    'https://www.autohome.com.cn/shanghai/',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
)
# The autohome page is gb2312-encoded, while requests defaults to ISO-8859-1;
# without setting the encoding to gbk the Chinese text comes out garbled
response.encoding = 'gbk'
print(response.text)
with open('f.html', 'wt', encoding='gbk') as f:
    f.write(response.text)
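When you do not know the page encoding in advance, requests can guess it from the bytes of the body via response.apparent_encoding (backed by chardet); a minimal sketch:

import requests

response = requests.get('https://www.autohome.com.cn/shanghai/')
# Guess the encoding from the content instead of trusting the headers
response.encoding = response.apparent_encoding
print(response.encoding)
print(response.text[:200])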
3 Fetching binary data
import requests

response = requests.get('https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg')
with open('a.jpg', 'wb') as f:
    # Use content when writing binary data
    f.write(response.content)
4 When the data is too large, it is easy to blow up memory; in that case:
import requests

response = requests.get('https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg')
with open('a.jpg', 'wb') as f:
    # Use content for binary data, iterating over it piece by piece
    for line in response.iter_content():
        f.write(line)
# Writing the file piece by piece like this neatly avoids the problem above
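Note that without stream=True the whole body is still downloaded into memory before iter_content walks over it; to truly stream a large download, pass stream=True and a chunk size — a minimal sketch (the 1024-byte chunk size is an arbitrary choice):

import requests

response = requests.get(
    'https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg',
    stream=True  # defer downloading the body until it is iterated over
)
with open('a.jpg', 'wb') as f:
    for chunk in response.iter_content(chunk_size=1024):
        f.write(chunk)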
5 Parsing JSON
# Parsing JSON
import json
import requests

response = requests.get('http://httpbin.org/get')
res1 = json.loads(response.text)  # too much trouble
res2 = response.json()            # fetch the JSON data directly
print(res1 == res2)               # True
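The counterpart for sending is the json= parameter of requests.post, which serializes a dict and sets the Content-Type header for you; a minimal sketch against httpbin (the payload is just an example):

import requests

response = requests.post('http://httpbin.org/post', json={'name': 'egon', 'age': 18})
# httpbin echoes the parsed body back under the 'json' key
print(response.json()['json'])  # {'name': 'egon', 'age': 18}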
5 Advanced usage of requests
1、SSL Cert Verification

# Certificate verification (most sites use https)
import requests

response = requests.get('https://www.12306.cn')
# For an ssl request the certificate is checked first; if it is invalid,
# an error is raised and the program terminates

# Improvement 1: suppress the error, though a warning is still printed
import requests

response = requests.get('https://www.12306.cn', verify=False)  # skip verification; warns but returns 200
print(response.status_code)

# Improvement 2: suppress both the error and the warning
import requests
from requests.packages import urllib3
urllib3.disable_warnings()  # silence the warning
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)

# Improvement 3: supply a certificate
# Many https sites can be visited without a certificate; in most cases
# carrying one is optional (Zhihu, Baidu, and so on)
# Some sites make it mandatory: only designated users who hold the
# certificate are allowed to access them
import requests

response = requests.get('https://www.12306.cn',
                        cert=('/path/server.crt', '/path/key'))
print(response.status_code)
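Separate from the client certificate shown above, verify also accepts the path of a CA bundle for servers whose certificate your system store does not trust; a minimal sketch (the bundle path is hypothetical):

import requests

# Verify the server against a custom CA bundle instead of the system store
response = requests.get('https://www.12306.cn', verify='/path/to/ca-bundle.pem')
print(response.status_code)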
2、Using a proxy

# Official docs: http://docs.python-requests.org/en/master/user/advanced/#proxies
# Proxy setup: the request is sent to the proxy first, and the proxy forwards
# it on your behalf (getting an ip banned is a common thing)
import requests

proxies = {
    # A proxy with a username and password; they come before the @ sign:
    # 'http': 'http://egon:123@localhost:9743',
    # (note: a dict can hold only one 'http' entry, so pick one)
    'http': 'http://localhost:9743',
    'https': 'https://localhost:9743',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)

# socks proxies are supported too; install with: pip install requests[socks]
import requests

proxies = {
    'http': 'socks5://user:pass@host:port',
    'https': 'socks5://user:pass@host:port',
}
response = requests.get('https://www.12306.cn', proxies=proxies)
print(response.status_code)
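To confirm that the proxy is actually being used, you can ask httpbin which origin ip it sees; a minimal sketch (the proxy address is the placeholder from above):

import requests

proxies = {'http': 'http://localhost:9743'}
# httpbin echoes back the ip the request arrived from
response = requests.get('http://httpbin.org/ip', proxies=proxies)
print(response.json())  # {'origin': '<the proxy ip>'}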
3、Timeout settings

# Timeout settings
# Two kinds of timeout: a float or a tuple
# timeout=0.1        -> the timeout for receiving data
# timeout=(0.1, 0.2) -> 0.1 is the connect timeout, 0.2 the read timeout
import requests

response = requests.get('https://www.baidu.com', timeout=0.0001)
4、Authentication settings

# Official docs: http://docs.python-requests.org/en/master/user/authentication/
# Authentication: some sites pop up a box (much like a js alert) asking for a
# username and password when you visit; the html cannot be fetched until you log in
# Under the hood it is just another request header:
# r.headers['Authorization'] = _basic_auth_str(self.username, self.password)
# Most sites do not use this default scheme but roll their own
# In that case we have to follow the site's scheme and write our own
# function similar to _basic_auth_str, then put the resulting string
# into the request header:
# r.headers['Authorization'] = func('.....')

# Here is the default scheme; most sites do not use it
import requests
from requests.auth import HTTPBasicAuth

r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
print(r.status_code)

# HTTPBasicAuth can be abbreviated to the following form
import requests

r = requests.get('xxx', auth=('user', 'password'))
print(r.status_code)
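For a site with its own scheme, that custom header logic can be packaged as a subclass of requests.auth.AuthBase and passed to auth=; a minimal sketch (the token scheme here is hypothetical):

import requests
from requests.auth import AuthBase


class TokenAuth(AuthBase):
    """Attach a hypothetical custom token header to every request."""

    def __init__(self, token):
        self.token = token

    def __call__(self, r):
        # r is the outgoing PreparedRequest; set the header and return it
        r.headers['Authorization'] = 'Token ' + self.token
        return r


r = requests.get('http://httpbin.org/get', auth=TokenAuth('secret123'))
print(r.status_code)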
5、Exception handling

# Exception handling
import requests
from requests.exceptions import *  # see requests.exceptions for the available exception types

try:
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('===:')
# except ConnectionError:  # the network is unreachable
#     print('-----')
# except Timeout:
#     print('aaaaa')
except RequestException:
    print('Error')
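For http error codes (4xx/5xx) rather than network failures, response.raise_for_status() turns a bad status into a requests.exceptions.HTTPError that can be caught the same way; a minimal sketch:

import requests
from requests.exceptions import HTTPError

r = requests.get('http://httpbin.org/status/404')
try:
    r.raise_for_status()  # raises HTTPError because the status is 404
except HTTPError as e:
    print('Bad status:', e)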
6、Uploading files

import requests

files = {'file': open('a.jpg', 'rb')}
response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)
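To control the file name and content type the server sees, requests also accepts a (filename, fileobj, content_type) tuple per field; a minimal sketch:

import requests

with open('a.jpg', 'rb') as fp:
    files = {'file': ('a.jpg', fp, 'image/jpeg')}
    response = requests.post('http://httpbin.org/post', files=files)
print(response.status_code)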