Day05,requests和selenium模塊


一、requests基於POST請求

#1.requests的GET與POST用法的區別:

'''
    
    GET請求: (HTTP默認的請求方法就是GET)
     * 沒有請求體
     * 數據必須在1K之內!
     * GET請求數據會暴露在瀏覽器的地址欄中

    GET請求常用的操作:
       1. 在瀏覽器的地址欄中直接給出URL,那么就一定是GET請求
       2. 點擊頁面上的超鏈接也一定是GET請求
       3. 提交表單時,表單默認使用GET請求,但可以設置為POST


    POST請求
    (1). 數據不會出現在地址欄中
    (2). 數據的大小沒有上限
    (3). 有請求體
    (4). 請求體中如果存在中文,會使用URL編碼!

    !!!requests.post()用法與requests.get()完全一致,特殊的是requests.post()有一個data參數,用來存放請求體數據!      
      
'''

#2、發送post請求,模擬瀏覽器的登錄行為

import requests
import re


'''
URL: https://github.com/login
GET
Set-Cookie: _gh_sess=UkJNSE92NXNRMjhOajlvdXhEQ0JNQUFqZFdWNUJGWmZMTXNLNFVqV3RYLzdCdHM1NWR3VnVHV0tTYzlpb0xjRDh6RzgzaDdTb01ZMThKZEtYSW14WktIYnhVTFBXaHN1YW1WVC84YStoMURBa1NFZlJNbXJIb3F2bDZZWDFLYVVtZHB0Y3htYk9USzhBOWdYTXZHTE5FMU9SWkhwdjhSUE8wbHFwUHBZOXdpb3AyaVpVUFpxby8vUHBJY2pzeXR1WWx2UEhJOUZUamY3QUpvZ1lvc2dEVlV0UFRDL2U5c3RPYmF5RHJtN0t4UFFFaW14UGhOQ3dKdWU3NEpHSExZKytWbmoxOXQ4bjZ4NUt2ajJ0N2xRaW44MWlQZ1o2Q0Urbjc1N2FEdUI1MDlscjRLK3dMMTRTMHhXbVlNWUs1eU9lVG92SzNIb1FmdWRtL2hzbGkwQnJ0UjBWYzlNcCszNWdoVmFaUXdZeVNYVXFucWxITlcyM0ZVNGsrd0t0RjlSakFBZGV5SjA0NVNLdVlSZXJuYVFmV0NrM1hGVytFQ2phSXcxdXZDM2J6NTZVSnQ2WkljTnhPNG9NYTlobllhRExScDhlQ04xM1l6Zi9jL0p5UkdHRkc3S2Z3RUxSMUdvcEhCazBXWkYwdkE9LS1VbmpDRUwwZGw0amJVOEgwT2ZQaGlBPT0%3D--4562a6af2d21508e2f522a5983004b4b2bb5983c; path=/; secure; HttpOnly
Cookie: _ga=GA1.2.1697930951.1554622929; _octo=GH1.1.1498701842.1560392375; _device_id=22a0ddb58979d9c97ffafeb3113e2567; user_session=TZzfoXANu224u6MsNzS5Z3GSasDsMIBvZC4cvOknaNRrWRJe; __Host-user_session_same_site=TZzfoXANu224u6MsNzS5Z3GSasDsMIBvZC4cvOknaNRrWRJe; logged_in=no; tz=Asia%2FShanghai; has_recent_activity=1; _gat=1; _gh_sess=U3B3L1FZUzFTY3JCRGZqamZvMGxoakhlYys3U29CK1BVZnlwWDVFRXhHNnZ5ZnRvbWNyNVlzQmdiZXRXVGxlQUZObWlPd2dXODdkTVUydGFaNW5YRm03YlhRVEU2bWFiRWlmeE00K25rd2FzOHlYTUUxQkF2R0JpS3ZpdlhxWE1FdFMrdW4zS2VtWmtrc2xLdmpZVnA3NHZjd3R4d3pxMEM4QWh1Ykk1U2JCYmpvYWZycDJaZHRDdjFDT3JYeHJOMDgrbDl3Q2FOZlZIQUlOUXdSaGJQajVVeHF5VjF0Q21FWkpoTlRSWkR2b2haeUlGNmpkdnZOUUc3dG9ydldTaDhlYllZQWlrZkVnNE0rQ1AxNHBrMVNkbFd0M0NSdjRHYW5PalV0LzJqN0w5OE9ncUVzRThBZzAyUjJXUFVFZ3ZjNEhucGh0dVhlYkFWWks3T2JkOEc4TFdpK1VwbStFOVVrMnQrbGo2S3Q1YmtLcS9ZdSt6Skt1aXkrcldRRTVDZzV5Zkljc1M2dVVBWTRpMjNYbmRBeHdJTW5FLzZFZ2xrd3RNWWJTK1p3NnhCWGlGblBzcENtOFJHcUY5S3hGMmFHK0dTRE5lR1FvNVlLVXBPTEdJcE5BT3BWQWhaSW92aDFwRGdyNnI3cE09LS05L01jajZmUGVEcEl6cjJSYTBpaXVnPT0%3D--ae43dcfa7904aed3f423eb6715b139c831a68a5a
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36


'''

# --- Simulate a browser login to GitHub ---
# Step 1: GET the login page to collect the session cookies and the CSRF
# token ("authenticity_token") hidden in the login form.
headers = {
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}

response = requests.get(url='https://github.com/login',headers=headers)

# Cookies issued by the login page; they must be sent back with the POST.
login_cookies = response.cookies.get_dict()
# Extract the CSRF token from the hidden form field.  Fail with a clear
# message if the page layout changed, instead of an opaque IndexError.
tokens = re.findall('<input type="hidden" name="authenticity_token" value="(.*?)" />', response.text, re.S)
if not tokens:
    raise RuntimeError('authenticity_token not found on the GitHub login page')
authenticity_token = tokens[0]

print(authenticity_token)

headers2 = {
'Referer': 'https://github.com/login',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
}

# Build the form body expected by GitHub's /session endpoint.
# NOTE(review): credentials are hard-coded; move them to environment
# variables or a config file before sharing this script.
form_data = {
"commit": "Sign in",
"utf8": "✓",
"authenticity_token": authenticity_token,
"login": "tankjam",
"password": "kermit46709394",
"webauthn-support": "unsupported",
}

# Step 2: POST to the session endpoint, carrying the headers, the form
# body and the cookies collected from the login page.
response2 = requests.post(url='https://github.com/session', data=form_data, headers=headers2, cookies=login_cookies)
print(response2.status_code)
# print(response2.text)
# Persist the response for inspection.  Bug fix: f.write() was not indented
# inside the `with` block, which raised an IndentationError.
with open('github.html', 'w', encoding='utf-8') as f:
    f.write(response2.text)

  

二、response響應對象

import requests

# Demo: commonly used attributes of the requests Response object.
response = requests.get('https://baidu.com')
print(response.status_code)  # HTTP status code of the response
print(response.url)  # final URL of the request (after any redirects)
print(response.encoding)  # encoding guessed from the response headers
response.encoding = 'utf-8'  # override the encoding before reading .text
print(response.text)  # body decoded to str using response.encoding
print(response.content)  # raw body as bytes
print(response.headers)  # response headers (case-insensitive mapping)
print(response.history)  # redirect responses that led to this one
# Cookies can be read as a jar object, a plain dict, or (name, value) pairs.
print(response.cookies)  # RequestsCookieJar object
print(response.cookies.get_dict())  # cookies converted to a plain dict
print(response.cookies.items())  # cookies as a list of (name, value) tuples
print(response.encoding)
print(response.elapsed)  # time elapsed between sending the request and the response

import requests

# Download a video with a streaming GET so the whole file never has to be
# held in memory at once.
url = 'https://vd3.bdstatic.com/mda-ic4pfhh3ex32svqi/hd/mda-ic4pfhh3ex32svqi.mp4?auth_key=1557973824-0-0-bfb2e69bb5198ff65e18065d91b2b8c8&bcevod_channel=searchbox_feed&pd=wisenatural&abtest=all.mp4'
response = requests.get(url, stream=True)  # stream=True defers downloading the body

# Bug fixes: the original printed response.content first, which forced the
# entire body into memory and defeated stream=True, and iterated with
# iter_content() (1 byte per chunk), which is extremely slow.
with open('love_for_GD.mp4', 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)

  

三、requests高級用法

'''
'''
'''
證書驗證(大部分網站都是https)
'''
import requests
# # 如果是ssl請求,首先檢查證書是否合法,不合法則報錯,程序終止
# response = requests.get('https://www.xiaohuar.com')
# print(response.status_code)

# 改進1:去掉報錯,但是會報警告
# import requests
# response = requests.get('https://www.xiaohuar.com', verify=False)
# # 不驗證證書,報警告,返回200
# print(response.status_code)

# 改進2:去掉報錯,並且去掉警報信息
# import requests
# import urllib3
# urllib3.disable_warnings()  # 關閉警告
# response = requests.get('https://www.xiaohuar.com', verify=False)
# print(response.status_code)

# 改進3:加上證書
# 很多網站都是https,但是不用證書也可以訪問,大多數情況都是可以攜帶也可以不攜帶證書
# 知乎\百度等都是可帶可不帶
# 有硬性要求的,則必須帶,比如對於定向的用戶,拿到證書后才有權限訪問某個特定網站
# import requests
# import urllib3
# # urllib3.disable_warnings()  # 關閉警告
# # 偽代碼
# response = requests.get(
#     'https://www.xiaohuar.com',
#     # verify=False,
#     # /path/server.crt證書的存放目錄, /path/key
#     cert=('/path/server.crt', '/path/key'))
# print(response.status_code)


'''
超時設置
'''

# 超時設置
# 兩種超時:float or tuple
# timeout=0.1  # 代表接收數據的超時時間
# # timeout=(0.1,0.2)  # 0.1代表鏈接超時  0.2代表接收數據的超時時間
#

# import requests
# response = requests.get('https://www.baidu.com',
#                         timeout=0.0001
# print(response.elapsed)
# print(response.status_code)

'''
代理設置:先發送請求給代理,然后由代理幫忙發送(封ip是常見的事情)
'''
# import requests
# proxies={
#     # 帶用戶名密碼的代理,@符號前是用戶名與密碼
#     'http':'http://tank:123@localhost:9527',
#     # 'http':'http://localhost:9527',
#     # 'https':'https://localhost:9527',
# }
# response=requests.get('https://www.12306.cn',
#                      proxies=proxies)
#
# print(response.status_code)
#
'''
爬取西刺免費代理:
    1.訪問西刺免費代理頁面
    2.通過re模塊解析並提取所有代理
    3.通過ip測試網站對爬取的代理進行測試
    4.若test_ip函數拋出異常代表代理作廢,否則代理有效
    5.利用有效的代理進行代理測試

<tr class="odd">
      <td class="country"><img src="//fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>
      <td>112.85.131.99</td>
      <td>9999</td>
      <td>
        <a href="/2019-05-09/jiangsu">江蘇南通</a>
      </td>
      <td class="country">高匿</td>
      <td>HTTPS</td>
      <td class="country">
        <div title="0.144秒" class="bar">
          <div class="bar_inner fast" style="width:88%">

          </div>
        </div>
      </td>
      <td class="country">
        <div title="0.028秒" class="bar">
          <div class="bar_inner fast" style="width:97%">

          </div>
        </div>
      </td>

      <td>6天</td>
      <td>19-05-16 11:20</td>
    </tr>
re:
    <tr class="odd">(.*?)</td>.*?<td>(.*?)</td>

'''
import requests
import re
import time

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}


def get_index(url):
    """Fetch *url* with the shared browser-like headers.

    Sleeps one second before each request to avoid hammering the site.
    Returns the requests Response object.
    """
    time.sleep(1)
    return requests.get(url, headers=HEADERS)


def parse_index(text):
    """Yield 'ip:port' strings scraped from a xicidaili listing page.

    *text* is the HTML of one listing page; each <tr class="odd"> row
    contributes one candidate proxy address.
    """
    pattern = '<tr class="odd">.*?<td>(.*?)</td>.*?<td>(.*?)</td>'
    for host, port in re.findall(pattern, text, re.S):
        yield f'{host}:{port}'

def test_ip(ip):
    """Probe *ip* ('host:port') as an HTTPS proxy.

    Returns the ip string when the probe site answers 200 through the
    proxy; returns None when the proxy fails (dead proxies make requests
    raise, which is caught below).
    """
    print('測試ip: %s' % ip)
    try:
        proxies = {
            'https': ip
        }

        # Echo site used purely to verify the proxy can relay a request.
        ip_url = 'https://www.ipip.net/'

        # A live proxy relays the request within the 1s timeout; a dead one
        # raises (ConnectTimeout / ProxyError / ...).
        response = requests.get(ip_url, headers=HEADERS, proxies=proxies, timeout=1)

        if response.status_code == 200:
            print(f'有用的ip:{ip}')
            return ip

    # Bug fix: only network-level failures mean "proxy is bad".  Catching
    # requests.RequestException instead of bare Exception no longer hides
    # unrelated programming errors.
    except requests.RequestException as e:
        print(e)

# # 使用代理爬取nba
def spider_nba(good_ip):
    """Fetch the NBA China homepage through the validated proxy *good_ip*."""
    proxies = {
        'https': good_ip
    }
    response = requests.get('https://china.nba.com/', headers=HEADERS, proxies=proxies)
    print(response.status_code)
    print(response.text)


if __name__ == '__main__':
    # Walk every listing page of xicidaili, validate each scraped proxy,
    # and exercise the working ones against the NBA site.
    base_url = 'https://www.xicidaili.com/nn/{}'

    for page in range(1, 3677):
        index_response = get_index(base_url.format(page))

        # Parse one listing page into candidate 'ip:port' strings.
        for candidate in parse_index(index_response.text):
            # Keep only proxies that actually relay a request.
            working_ip = test_ip(candidate)
            if working_ip:
                # Proxy confirmed working — use it for the real crawl.
                spider_nba(working_ip)



'''
認證設置
'''
import requests
# 通過訪問github的api來測試
# url = 'https://api.github.com/user'
# HEADERS = {
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
# }

# 測試1,失敗返回401
# response = requests.get(url, headers=HEADERS)
# print(response.status_code)  # 401
# print(response.text)
'''
打印結果:
    {
      "message": "Requires authentication",
      "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
    }
'''
#
# # 測試2,通過requests.auth內的HTTPBasicAuth進行認證,認證成功返回用戶信息
# from requests.auth import HTTPBasicAuth
# response = requests.get(url, headers=HEADERS, auth=HTTPBasicAuth('tankjam', 'kermit46709394'))
# print(response.text)
#

# 測試3,通過requests.get請求內的auth參數默認就是HTTPBasicAuth,認證成功返回用戶信息
# response = requests.get(url, headers=HEADERS, auth=('tankjam', 'kermit46709394'))
# print(response.text)


'''
上傳文件
'''
# import requests

# 上傳文本文件
# files1 = {'file': open('user.txt', 'rb')}
# # files參數是POST請求固定參數
# response = requests.post('http://httpbin.org/post', files=files1)
# print(response.status_code)  # 200
# print(response.text)  # 200

# 上傳圖片文件
# files2 = {'jpg': open('一拳.jpg', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files2)
# print(response.status_code)  # 200
# print(response.text)  # 200
#
# 上傳視頻文件
# files3 = {'movie': open('love_for_GD.mp4', 'rb')}
# response = requests.post('http://httpbin.org/post', files=files3)
# print(response.status_code)  # 200
# print(response.text)  # 200

  

四、selenium基本使用

''''''
'''
selenium模塊講解
一 什么是selenium?
    最初是一個自動化測試工具。可以使用它幫我們驅動瀏覽器
    自動去執行某些自定義好的操作。例如在頁面中執行JS代碼、
    跳過登錄驗證。可以使用selenium幫我們實現爬蟲。
    
二 為什么要使用selenium?
    1、優點:
        使用requests模塊登錄需要分析大量的復雜通信流程,使用selenium
    可以輕松跳過登錄驗證。
    
    2、缺點:
        瀏覽器會加載css、js、圖片、視頻...數據,爬蟲效率相比requests模塊要低。
        
三 如何使用selenium?
    下載selenium模塊:
        pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple selenium
    下載瀏覽器驅動:
        http://npm.taobao.org/mirrors/chromedriver/2.38/
'''

# First steps with selenium
from selenium import webdriver  # drives the browser

# ActionChains builds composite mouse/keyboard gestures, e.g. dragging the
# puzzle piece of a slide-to-verify captcha.
from selenium.webdriver import ActionChains

# Locator strategies: By.ID, By.CSS_SELECTOR, By.CLASS_NAME, ...
from selenium.webdriver.common.by import By

from selenium.webdriver.common.keys import Keys  # keyboard key constants (ENTER, ...)

# Used together with WebDriverWait below; EC is an alias for expected_conditions.
from selenium.webdriver.support import expected_conditions as EC

# Explicit waits for specific elements to appear on the page.
from selenium.webdriver.support.wait import WebDriverWait
import time

# Launch Chrome via its driver executable.
# webdriver.Chrome(r'absolute path to chromedriver.exe')
# chrome = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')  # pass chromedriver.exe's absolute path

# Here chromedriver.exe is expected in the Python interpreter's Scripts
# folder, so no path argument is needed.

# chrome is the driver object that controls the browser.
chrome = webdriver.Chrome()

'''
實例1
'''
# 若try出現異常
# try:
#     # 往tank博客主頁發送get請求
#     # chrome.get('https://www.cnblogs.com/kermitjam/')
#
#     # 參數1: 驅動對象  參數2: 等待時間
#     wait = WebDriverWait(chrome, 10)
#
#     # 1、訪問百度
#     chrome.get('https://www.baidu.com/')
#
#     # 2、查找input輸入框
#     input_tag = wait.until(
#         # 調用EC的presence_of_element_located()
#         EC.presence_of_element_located(
#             # 此處可以寫一個元組
#             # 參數1: 查找屬性的方式
#             # 參數2: 屬性的名字
#             (By.ID, "kw")
#         )
#     )
#     input_tag = wait.until(EC.presence_of_element_located((By.ID, "kw")))
#
#     # 3、搜索一拳超人
#     input_tag.send_keys('一拳超人')
#
#     # 4、按鍵盤回車鍵
#     input_tag.send_keys(Keys.ENTER)
#
#     time.sleep(3)
#
# # 無論發生什么都會關閉瀏覽器
# finally:
#     # 關閉瀏覽器
#     chrome.close()


'''
實例2
'''
try:
    # (demo of visiting a blog homepage, kept for reference)
    # chrome.get('https://www.cnblogs.com/kermitjam/')

    # arg 1: driver object; arg 2: maximum wait time in seconds
    wait = WebDriverWait(chrome, 10)

    # 1. open the JD.com homepage
    chrome.get('https://www.jd.com/')

    # 2. wait until the search input box (id="key") is present
    input_tag = wait.until(EC.presence_of_element_located((By.ID, "key")))

    # 3. type the search query into the box
    input_tag.send_keys('唐詩三百首')

    # 4. wait for the search button, located by its class name
    search_button = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'button')))
    # 5. click the search button
    search_button.click()

    time.sleep(3)

# close the browser no matter what happened above
finally:
    # NOTE(review): close() only closes the current window; quit() would also
    # end the driver session — confirm which is intended here.
    chrome.close()

  

五、selenium之基本選擇器

# from selenium import webdriver  # 用來驅動瀏覽器的
# import time
#
# '''
# 隱式等待
# '''
# # 獲取驅動對象、
# driver = webdriver.Chrome()
#
# try:
#     # 顯式等待: 等待某個元素加載
#     # 參數1: 驅動對象  參數2: 等待時間
#     # wait = WebDriverWait(chrome, 10)
#
#     driver.get('https://china.nba.com/')
#
#     # 隱式等待: 等待頁面所有元素加載
#     driver.implicitly_wait(10)
#     news_tag = driver.find_element_by_class_name('nav-news')
#     # 獲取標簽對象
#     print(news_tag)
#     # 獲取標簽的名字
#     print(news_tag.tag_name)
#
#
#     time.sleep(10)
#
# finally:
#     driver.close()


from selenium import webdriver  # 用來驅動瀏覽器的
import time

'''
===============所有方法===================
    element是查找一個標簽
    elements是查找所有標簽

    1、find_element_by_link_text  通過鏈接文本去找
    2、find_element_by_id 通過id去找
    3、find_element_by_class_name
    4、find_element_by_partial_link_text
    5、find_element_by_name
    6、find_element_by_css_selector
    7、find_element_by_tag_name
'''
# Create the driver object (launches Chrome).
driver = webdriver.Chrome()

try:

    # Open the Baidu homepage.
    driver.get('https://www.baidu.com/')
    # Implicit wait: applies to every element lookup below (up to 10s each).
    driver.implicitly_wait(10)

    # 1. find_element_by_link_text - match an <a> tag by its full link text
    # (kept for reference; superseded by the partial-text lookup below)
    # send_tag = driver.find_element_by_link_text('登錄')
    # send_tag.click()

    # 2. find_element_by_partial_link_text - match an <a> tag by a substring
    login_button = driver.find_element_by_partial_link_text('登')
    login_button.click()
    time.sleep(1)

    # 3. find_element_by_class_name - locate by class attribute
    login_tag = driver.find_element_by_class_name('tang-pass-footerBarULogin')
    login_tag.click()
    time.sleep(1)

    # 4. find_element_by_name - locate by name attribute
    username = driver.find_element_by_name('userName')
    username.send_keys('15622792660')
    time.sleep(1)

    # 5. find_element_by_id - locate by id attribute
    password = driver.find_element_by_id('TANGRAM__PSP_10__password')
    password.send_keys('*******')
    time.sleep(1)

    # 6. find_element_by_css_selector - locate with a CSS selector
    # (here the login submit button is selected by its id)
    login_submit = driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit')
    # driver.find_element_by_css_selector('.pass-button-submit')
    login_submit.click()

    # 7. find_element_by_tag_name - locate by tag name (first match on the page)
    div = driver.find_element_by_tag_name('div')
    print(div.tag_name)

    time.sleep(10)

finally:
    driver.close()

  

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM