Python常用模塊之requests，urllib和re

本文轉載自查看原文 2018-11-05 14:01 643 ===爬蟲===

urllib

Python標准庫中提供了：urllib等模塊以供Http請求，但是，它的 API 太渣了。

它需要巨量的工作，甚至包括各種方法覆蓋，來完成最簡單的任務，

下面是簡單的使用urllib來進行請求數據的方法

import urllib.request

f=urllib.request.urlopen('http://www.webxml.com.cn//webservices/qqOnlineWebService.asmx/qqCheckOnline?qqCode=424662508')
result = f.read().decode('utf-8')
 
# 或者 
import urllib.request

# Build an explicit Request object first so that headers or data can be
# attached to it before the request is sent.
req = urllib.request.Request("http://www.baidu.com")
# In Python 3 urlopen lives in urllib.request, not in the bare urllib package.
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))

我們更推薦大家使用第二種方法，兩種方法請求的結果都一樣，只不過第二種中間多了一個request對象，為啥要這樣子，因為在構建請求時還需要加入好多內容，因此通過構建一個request，服務器響應請求得到應答，這樣顯得邏輯上清晰明確

比如說加入User-Agent參數到請求頭，就可以使用如下方式

import urllib.request

req = urllib.request.Request('http://www.example.com/')
# Send a browser-like User-Agent header with the request.
req.add_header("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0")
r = urllib.request.urlopen(req)
# Read the body from the response object returned by urlopen above.
result = r.read().decode('utf-8')

Python官方文檔：https://docs.python.org/3.5/library/urllib.request.html#module-urllib.request

Requests

Requests 是使用 Apache2 Licensed 許可證的基於Python開發的HTTP 庫，其在Python內置模塊的基礎上進行了高度的封裝，

從而使得進行網絡請求時，變得美好了許多，而且使用Requests可以輕而易舉的完成瀏覽器可有的任何操作

1.安裝模塊

pip3 install requests

2.簡單使用

import requests

r = requests.get('http://www.shangzekai.xyz')
# print is a function in Python 3, so every call needs parentheses.
print(type(r))
print(r.status_code)  # status code returned by the server
print(r.encoding)  # encoding used for the response
print(r.text)  # response body as text
print(r.cookies)  # cookies attached to the response

3.基本方法

3.1 get請求

# 1、無參數實例
import requests
ret = requests.get('https://github.com/timeline.json')

print(ret.url)
print(ret.text)
 
# 2、有參數實例
import requests
payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.get("http://httpbin.org/get", params=payload)
 
print(ret.url)
print(ret.text)

# 3、解析json
import requests
import json
response = requests.get("http://httpbin.org/get")
print(type(response.text))
print(response.json())
print(json.loads(response.text))
print(type(response.json()))

# 4、添加headers
import requests
response = requests.get("https://www.zhihu.com/explore")
print(response.text)
此時，我們需要加上一些頭信息，來模擬瀏覽器的登錄
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
response = requests.get("https://www.zhihu.com/explore", headers=headers)
print(response.text)

3.2 Post 請求

# 1、基本POST實例
import requests

# Passing a dict through data= sends it form-encoded
# (Content-Type: application/x-www-form-urlencoded).
payload = {'key1': 'value1', 'key2': 'value2'}
ret = requests.post("http://httpbin.org/post", data=payload)

print(ret.text)
# Inspect further attributes of the same response object.
print(type(ret.headers), ret.headers)
print(type(ret.cookies), ret.cookies)
print(type(ret.url), ret.url)
print(type(ret.history), ret.history)
 
# 2、發送請求頭和數據實例
import json
import requests
 
url = 'http://httpbin.org/post'
payload = {'some': 'data'}
headers = {'content-type': 'application/json'}
# 當headers為application/json的時候，請求示例如下：
ret = requests.post(url, data=json.dumps(payload), headers=headers)
print(ret.text)

　　注意

　　get和post請求兩者的區別在於，get請求方法參數只有params，而沒有data參數，而post請求中兩者都是有的

請求狀態碼

生成User-Agent

from fake_useragent import UserAgent

ua = UserAgent()

headers = {
    'User-Agent' : ua.random
}

4 模擬登陸

# 1. 首先如何獲取cookie
import requests
response = requests.get('http://www.baidu.com')
print(response.cookies)
for key,value in response.cookies.items():
    print(key + '=' + value)
# 2. 會話登錄
import requests

# Ask httpbin to set a cookie; the route is /cookies/set/{name}/{value},
# so the cookie needs a name ("number") as well as a value.
requests.get('http://www.httpbin.org/cookies/set/number/123456789')
# Then ask the site which cookies it received from us.
res = requests.get('http://www.httpbin.org/cookies')
print(res.text)
打印，我們發現沒有數據，因為上面的兩行代碼，就相當於兩個瀏覽器進行訪問，因此不可能獲取到第一次cookie訪問的信息
因此，我們采用如下的方法進行模擬登陸
# A Session keeps cookies between requests, so both calls must go through
# the session object rather than the module-level requests.get.
s = requests.Session()
# httpbin route: /cookies/set/{name}/{value}
s.get('http://www.httpbin.org/cookies/set/number/123456789')
res = s.get('http://www.httpbin.org/cookies')
print(res.text)
最終，我們采用上述的方法，獲取到了最終的cookie的值，獲取該值之后，我們可以拿着個cookie來進行登錄了

5) SSL設置

import requests
from requests.packages import urllib3

# Silence the warning that requests emits for unverified HTTPS requests.
urllib3.disable_warnings()
# verify=False only matters for HTTPS, so the URL has to use https://.
res = requests.get('https://www.12306.cn', verify=False)
print(res.status_code)
import requests
response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))
print(response.status_code)

* requests其他請求

requests.get(url, params=None, **kwargs)
requests.post(url, data=None, json=None, **kwargs)
requests.put(url, data=None, **kwargs)
requests.head(url, **kwargs)
requests.delete(url, **kwargs)
requests.patch(url, data=None, **kwargs)
requests.options(url, **kwargs)
 
# 以上方法均是在此方法的基礎上構建
requests.request(method, url, **kwargs)

6) 代理設置

import requests
proxies = {
  "http": "http://127.0.0.1:9743",
  "https": "https://127.0.0.1:9743",
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)

有密碼的設置

import requests

# requests selects the proxy by the scheme of the requested URL, so an
# https:// target needs an "https" entry as well. The credentials are
# embedded in the proxy URL itself.
proxies = {
    "http": "http://user:password@127.0.0.1:9743/",
    "https": "http://user:password@127.0.0.1:9743/",
}
response = requests.get("https://www.taobao.com", proxies=proxies)
print(response.status_code)

7)、超時時間設置

import requests
from requests.exceptions import Timeout

try:
    # A single float is used for both the connect and the read timeout.
    response = requests.get("http://httpbin.org/get", timeout=0.5)
    print(response.status_code)
except Timeout:
    # Timeout is the common base class of ConnectTimeout and ReadTimeout,
    # so a timeout in either phase is reported here.
    print('Timeout')

8)、異常處理 (導入模塊)

import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    response = requests.get("http://httpbin.org/get", timeout = 0.5)
    print(response.status_code)
except ReadTimeout:
    print('Timeout')
except ConnectionError:
    print('Connection error')
except RequestException:
    print('Error')

常見實例

1) 檢測QQ是否在線

import urllib
import requests
from xml.etree import ElementTree as ET
# 使用內置模塊urllib發送HTTP請求，或者XML格式內容
"""
f = urllib.request.urlopen('http://www.webxml.com.cn//webservices/qqOnlineWebService.asmx/qqCheckOnline?qqCode=493133139')
result = f.read().decode('utf-8')
"""
# 使用第三方模塊requests發送HTTP請求，或者XML格式內容
r = requests.get('http://www.webxml.com.cn//webservices/qqOnlineWebService.asmx/qqCheckOnline?qqCode=493133139')
result = r.text
# 解析XML格式內容
node = ET.XML(result)
# 獲取內容
if node.text == "Y":
    print("在線")
else:
    print("離線")

2) 查看火車停靠信息

import urllib
import requests
from xml.etree import ElementTree as ET
# 使用內置模塊urllib發送HTTP請求，或者XML格式內容
"""
f = urllib.request.urlopen('http://www.webxml.com.cn/WebServices/TrainTimeWebService.asmx/getDetailInfoByTrainCode?TrainCode=G666&UserID=')
result = f.read().decode('utf-8')
"""
# 使用第三方模塊requests發送HTTP請求，或者XML格式內容
r = requests.get('http://www.webxml.com.cn/WebServices/TrainTimeWebService.asmx/getDetailInfoByTrainCode?TrainCode=G666&UserID=')
result = r.text
# 解析XML格式內容
root = ET.XML(result)
for node in root.iter('TrainDetailInfo'):
    print(node.find('TrainStation').text,node.find('StartTime').text,node.tag,node.attrib)

Python -- 正則

1.正則表達式的概念

正則表達式是對字符串操作的一種邏輯公式，就是用事先定義好的一些特定字符、及這些特定字符的組合，組成一個“規則字符串”，這個“規則字符串”用來表達對字符串的一種過濾邏輯

正則表達式的大致匹配過程是：

1.依次拿出表達式和文本中的字符比較，
2.如果每一個字符都能匹配，則匹配成功；一旦有匹配不成功的字符則匹配失敗。
3.如果表達式中有量詞或邊界，這個過程會稍微有一些不同

2.常見的正則表達式符號

'^'     匹配字符開頭，若指定flags MULTILINE,這種也可以匹配上(r"^a","\nabc\neee",flags=re.MULTILINE)
'$'     匹配字符結尾
'*'     匹配*號前的字符0次或多次，re.findall("ab*","cabb3abcbbac")  結果為['abb', 'ab', 'a']
'+'     匹配前一個字符1次或多次，re.findall("ab+","abcdabbbba") 結果['ab', 'abbbb']
'?'     匹配前一個字符1次或0次
'.'     默認匹配除\n之外的任意一個字符，若指定flag DOTALL,則匹配任意字符，包括換行
'{m}'   匹配前一個字符m次
'{n,m}' 匹配前一個字符n到m次，re.findall("ab{1,3}","abb abc abbcbbb") 結果['abb', 'ab', 'abb']
'|'     匹配|左或|右的字符，re.findall("abc|ABC","ABCBabcCD")結果['ABC', 'abc']
'(...)' 分組匹配，re.search("(abc){2}a(123|456)c", "abcabca456c").group() 結果 abcabca456c
 
 
'\A'    只從字符開頭匹配
'\Z'    匹配字符結尾，同$
'\d'    匹配數字0-9
'\D'    匹配非數字
'\w'    匹配[A-Za-z0-9_]
'\W'    匹配非[A-Za-z0-9_]
'\s'    匹配空白字符空格、\t、\n、\r 
 
'(?P<name>...)' 分組匹配 re.search("(?P<province>[0-9]{4})(?P<city>[0-9]{2})(?P<birthday>[0-9]{4})","371481199306143242").groupdict("city") 結果{'province': '3714', 'city': '81', 'birthday': '1993'}

常見的使用方法

re.findall 把所有匹配到的字符放到列表中，以列表中的元素返回

findall(pattern, string, flags=0)

print(re.findall('\d+','one1two2three3four4'))

3.貪婪模式和非貪婪模式

Python里數量詞默認是貪婪的，總是嘗試匹配盡可能多的字符；
非貪婪則相反，總是嘗試匹配盡可能少的字符
但有的時候，我們並不是想要貪婪模式，那怎么辦？

非常簡單，只需要在”*”,”?”,”+”,”{m,n}”后面加上？，使貪婪變成非貪婪

>>> s="This is a number 234-235-22-423"
>>> r=re.findall(".+(\d+-\d+-\d+-\d+)",s)
>>> r
['4-235-22-423']
>>> r=re.findall(".+?(\d+-\d+-\d+-\d+)",s)
>>> r
['234-235-22-423']
>>>

解決方式：非貪婪操作符“？”，這個操作符可以用在”*”,”+”,”?”的后面，要求正則匹配的越少越好

    
>>> re.findall(r"aa(\d+)","aa2343ddd")
['2343']
>>> re.match(r"aa(\d+?)","aa2343ddd").group(1)
'2'

4.Python的多行匹配和點任意匹配

r=re.compile(pattern,re.M)
re.M(re.MULTILINE):多行模式，改變’^’和’$‘的行為,即^ $標志將會匹配每一行

    
>>> re.findall(r"^a(\d+)b","a213b\na2345b\na234b") 
['213']
>>> re.findall(r"^a(\d+)b","a213b\na2345b\na234b",re.M)
['213', '2345', '234']
>>> re.findall(r"a(\d+)b","a213b\na2345b\na234b")  #如果沒有^標志,無需re.M
['213', '2345', '234']

re.S(re.DOTALL):點任意匹配模式

元字符“.”在默認模式下，匹配除換行符外的所有字符。在DOTALL模式下，匹配所有字符，包括換行符

    
>>> re.findall(r".","\n",re.S)                         
['\n']

5.常見的正則表達式

IP：
^(25[0-5]|2[0-4]\d|[0-1]?\d?\d)(\.(25[0-5]|2[0-4]\d|[0-1]?\d?\d)){3}$
手機號：
^1[3458][0-9]\d{8}$
郵箱：
[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+

6.一個簡單的抓取網頁分析的案例

import random
import re

import requests
# Pool of request-header dicts, each carrying one browser User-Agent string.
# get_page picks one entry at random for every request.
hds=[{'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
    {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'},
    {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'},
    {'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'},
    # The closing parenthesis was missing from this entry.
    {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'},
    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'},
    {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'},
    {'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'},
    {'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}]
def get_page(url):
    """Fetch *url* with a randomly chosen User-Agent header.

    Returns the response body as text when the server answers with
    HTTP 200, otherwise ``None``. A failure raised by the request itself
    is printed and also results in ``None``.
    """
    headers = random.choice(hds)
    try:
        # The network call is the statement that can actually raise, so it
        # has to sit inside the try block.
        response = requests.get(url, headers=headers)
    except requests.RequestException as e:
        print(e)
        return None
    if response.status_code == 200:
        return response.text
    return None
'''
<div class="board-item-content">
    <div class="movie-item-info">
        <p class="name"><a href="/films/1203" title="霸王別姬" data-act="boarditem-click" data-val="{movieId:1203}">霸王別姬</a></p>
        <p class="star">
            主演：張國榮,張豐毅,鞏俐
        </p>
        <p class="releasetime">上映時間：1993-01-01(中國香港)</p>
    </div>
    <div class="movie-item-number score-num">
        <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
    </div>
</div>
'''
def get_movie(html):
    """Extract movie entries from the HTML of a board page.

    Each match is a 3-tuple of the text captured from three consecutive
    ``<p>`` elements: the link text of the first one, then the contents
    of the next two (name, cast and release time in the sample markup
    shown above this function).

    Returns a list of such tuples; an empty list when *html* is ``None``
    or empty, which is what get_page yields for a failed download.
    """
    if not html:
        return []
    # re.S lets "." span newlines, because one entry covers several lines.
    pattern = r'<p.*?><a.*?>(.*?)</a></p>.*?<p.*?>(.*?)</p>.*?<p.*?>(.*?)</p>'
    return re.findall(pattern, html, re.S)
def write_file(items):
    """Write the extracted movie tuples to ``movie.txt`` in the working directory.

    *items* is an iterable of 3-element sequences; each one is written as
    three labelled lines followed by a blank line. The file is overwritten
    on every call and encoded as UTF-8.
    """
    # newline='' turns off newline translation, so the explicit '\r\n'
    # below is written as-is instead of becoming '\r\r\n' on Windows.
    with open('movie.txt', 'w', encoding='utf8', newline='') as fileMovie:
        for movie in items:
            # NOTE(review): movie[0] looks like the film title rather than a
            # rank, given the pattern in get_movie - confirm the label.
            fileMovie.write('電影排名：' + movie[0] + '\r\n')
            fileMovie.write('電影主演：' + movie[1].strip() + '\r\n')
            fileMovie.write('上映時間：' + movie[2] + '\r\n\r\n')
        print('文件寫入成功...')
def main(url):
    """Download the page at *url*, extract its movie entries and save them.

    Nothing is parsed or written when the page could not be downloaded.
    """
    html = get_page(url)
    if html is None:
        # get_page returns None for a non-200 answer; there is nothing to parse.
        print('頁面抓取失敗：' + url)
        return
    write_file(get_movie(html))
    
if __name__ == '__main__':
    # Run the scraper against the Maoyan board page when executed as a script.
    main("http://maoyan.com/board/4")

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python模塊之requests,urllib和re 美圖錄爬蟲(requests模塊,re模塊) 常用模塊:re模塊下的常用方法 python的re模塊常用方法 Python的re模塊的常用方法深入理解urllib、urllib2及requests requests庫和urllib包對比 python模塊&&模塊re requests模塊 python3之模塊urllib