python 帶你了解爬蟲


一篇文章帶你了解《python爬蟲》

一 什么是網絡爬蟲:

       1. 通俗理解:爬蟲是一個模擬人類請求網站行為的程序。可以自動請求網頁、並把數據抓取下來,然后使用一定的規則提取有價值的數據。

  2. 專業介紹:百度百科

二 python urllib:

# demo01.py(urllib基本使用)

復制代碼
# 導入urllib庫(該庫不需要安裝)
import urllib.request
# 請求百度,並接收響應
response = urllib.request.urlopen("http://www.baidu.com/")
# 打印頁面
print(response.read().decode('utf-8'))
復制代碼

# demo2.py(用法講解)

復制代碼
# urllib 用法講解
# urlopen : urllib.request.urlopen('網址','數據','超時設置')

import urllib.request
import urllib.parse
import urllib.error

"""
A:
response = urllib.request.urlopen('http://www.baidu.com/')
print(response.read().decode('utf-8'))

B:
data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
response = urllib.request.urlopen("http://httpbin.org/post", data = data)
print(response.read())

C:
response = urllib.request.urlopen("http://httpbin.org/get",timeout=1)
print(response.read())
"""

import socket

# Use a deliberately tiny timeout so the request is very likely to time out.
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    # URLError.reason carries the underlying exception; a socket.timeout
    # instance there means the connection attempt timed out.
    # `response` was never bound on this path, so report the timeout instead
    # of trying to read from it.
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
復制代碼

# demo03.py(響應)

復制代碼
# urllib 響應

import urllib.request
response = urllib.request.urlopen("http://www.baidu.com/")
# 打印響應類型
print(type(response))
# 打印狀態碼
print(response.status)
# 打印響應頭
print(response.getheaders())
復制代碼

# demo04.py(Request 詳解)

復制代碼
# Request 詳解

import urllib.request
from urllib import parse

"""
A:
request = urllib.request.Request('http://www.baidu.com')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))

B:
url = "http://httpbin.org/post"
# 指定請求頭
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    "Host": "httpbin.org"
}
# 請求數據
dict = {
    "name":"Germey"
}
data = bytes(parse.urlencode(dict),encoding='utf-8')
request = urllib.request.Request(url=url,data=data,headers=headers,method='POST')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
"""

# Build a POST request by hand and attach the User-Agent header afterwards.
url = "http://httpbin.org/post"
# Form fields to send; named form_data so the built-in `dict` is not shadowed.
form_data = {
    "name":"Germey"
}
# urlopen() needs the request body as bytes, so encode the urlencoded string.
data = bytes(parse.urlencode(form_data),encoding='utf-8')
request = urllib.request.Request(url=url,data=data,method='POST')
# Headers can also be added after the Request object has been created.
request.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36")
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
復制代碼

# demo05.py (代理)

復制代碼
# handler(代理)
import urllib.request

proxy_header = urllib.request.ProxyHandler({
    "http":"http://xxx.xxx.xxx.xxx:xxxx",
    "https":"https://xxx.xxx.xxx.xxx:xxxx"
})
opener = urllib.request.build_opener(proxy_header)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
復制代碼

# demo06.py(cookie)

復制代碼
# cookie

import http.cookiejar
import urllib.request

"""
A: http.cookiejar 簡單使用
cookie = http.cookiejar.CookieJar()
handir = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handir)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))

B:MozillaCookieJar 將網站的cookie存儲在本地文件中
filename = "utils/cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
handir = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handir)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True,ignore_expires=True)

C: LWPCookieJar 將網站的cookie存儲在本地文件中
filename = "utils/cookie01.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
handir = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handir)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True,ignore_expires=True)

D: 使用文件中的cookie
"""
# D: load cookies from the LWP-format file and send them with the request.
cookie_jar = http.cookiejar.LWPCookieJar()
cookie_jar.load('utils/cookie01.txt', ignore_discard=True, ignore_expires=True)
# Chain the cookie processor straight into the opener.
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie_jar))
page = opener.open('http://www.baidu.com')
body = page.read()
print(body.decode('utf-8'))
復制代碼

# demo07.py(異常處理)

復制代碼
# 異常處理

import urllib.request
from urllib import error

"""
A: urllib error 簡單使用
try:
    response = urllib.request.urlopen('http://www.baidu.com')
except error.URLError as e:
    print(e.reason)
    
B:
try:
    response = urllib.request.urlopen('http://www.baidu.com/')
    print(response.read().decode('utf-8'))
except error.URLError as e:
    print(e.reason)
else:
    print("*************")
    
C: timeout
try:
    response = urllib.request.urlopen('http://www.baidu.com',timeout=0.01)
except error.URLError as e:
    print(e.reason)
"""

# 一個不存在的連接
try:
    response = urllib.request.urlopen("http://www.abcdhaha2.com/")
    html = response.read().decode('utf-8')
    print(html)
except error.URLError as e:
    print(e.reason)
復制代碼

# demo08.py(URL解析)

復制代碼
from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urljoin
from urllib.parse import urlencode

# Syntax: urlparse(url, scheme='http|https', allow_fragments=True)

full_url = 'https://www.baidu.com/index.html;user?id=5#comment'

# A: parse a complete URL and show both the result type and its fields
parsed = urlparse(full_url)
print(type(parsed))
print(parsed)

# B: the URL itself carries no scheme, so one is passed via scheme=
print(urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https"))

# C and D: the same URL parsed with fragments enabled, then disabled
for keep_fragment in (True, False):
    print(urlparse(full_url, allow_fragments=keep_fragment))

# E: fragments disabled on a URL that has no params or query part
print(urlparse('https://www.baidu.com/index.html#comment', allow_fragments=False))

# F (urlunparse): assemble a URL from its six components
data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))

# G (urljoin)
# Syntax: urljoin(base_url, url_to_join)
joined = urljoin("https://www.cnblogs.com/xingxingnbsp/p/xxxxxxxxx.html", "12129466.html")
print(joined)

# H (urlencode): turn a dict into a query string and append it to the base URL
params = {
    'name': 'hello_urllib',
    'age': 18
}
base_url = 'http://www.baidu.com?'
url = "{}{}".format(base_url, urlencode(params))
print(url)
復制代碼

三 python requests:

1. 安裝 requests 庫:pip install requests

# demo01.py

復制代碼
# requests 基本使用

import requests

response = requests.get("http://www.baidu.com")
print(type(response))           # 打印響應類型
print(response.status_code)     # 打印狀態碼
print(type(response.text))      # 打印響應內容類型
print(response.text)            # 打印響應內容
print(response.cookies)         # 打印響應cookie
復制代碼

2. 請求方式:

復制代碼
1 requests.get('網址')
2 requests.post('網址')
3 requests.put('網址')
4 requests.patch('網址')
5 requests.delete('網址')
6 requests.head('網址')
7 requests.options('網址')
復制代碼

3. 基本get請求:

# demo02.py

復制代碼
import requests

"""
A:
response = requests.get('http://www.baidu.com')
print(response.text)

B:
response = requests.get('http://httpbin.org/get?name=hello&age=22')
print(response.text)
"""

data = {
    "name":"hello",
    "age":22
}
response = requests.get('http://httpbin.org/get',params=data)
print(response.text)
復制代碼

4. 解析json:

# demo03.py

復制代碼
# 解析json

import requests
response = requests.get('https://api.jinse.com/v6/www/information/list?catelogue_key=news&limit=23&information_id=18762945&flag=down&version=9.9.9&_source=www')
print(type(response))
print(response.json())
print(type(response.json()))
復制代碼

5. 獲取二進制數據

# demo04.py

復制代碼
import requests

"""
A:
response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
print(type(response.text))
print(type(response.content))
print(response.text)
print(response.content)
"""
# Download the picture and write its raw bytes (response.content) to disk.
response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
# The with-statement closes the file on exit, so no explicit close() is needed.
with open('images/image.png','wb') as f:
    f.write(response.content)
復制代碼

6. 添加headers:

# demo05.py

復制代碼
import requests

headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.get("http://www.baidu.com",headers=headers)
print(response.text)
復制代碼

7. 基本的post請求

# demo06.py

復制代碼
import requests

"""
A:
data = {
    "name":"hello",
    "age":22
}
response = requests.post("http://httpbin.org/post",data=data)
print(response.text)
"""

data = {
    "name":"hello",
    "age":22
}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.post("http://httpbin.org/post",data=data,headers=headers)
print(response.text)
復制代碼

8. 響應:(response屬性)

# demo07.py

復制代碼
import requests

response = requests.get('http://www.baidu.com')
print(type(response.status_code),response.status_code)      # 打印響應 狀態碼類型 和 狀態碼
print(type(response.headers),response.headers)              # 打印響應 頭類型 和 響應頭
print(type(response.cookies),response.cookies)              # 打印響應 cookies類型 和 cookies
print(type(response.url),response.url)                      # 打印響應 URL類型 和 URL
print(type(response.history),response.history)              # 打印歷史記錄
復制代碼

9. 狀態碼判斷:

# demo08.py

復制代碼
import requests

"""
A:
response = requests.get('http://www.baidu.com')
# 這里使用了python三元表達式
exit() if not response.status_code == requests.codes.ok else print('request successfully')

B:
response = requests.get('http://www.baidu.com')
# 這里使用了python三元表達式
exit() if not response.status_code == 200 else print('request successfully')

"""
# Same check as variants A and B, written as a plain if/else on the success case.
response = requests.get('http://www.baidu.com')
if response.status_code == 200:
    print('request successfully')
else:
    exit()

# 以上三種方式表達的意思是一樣的
復制代碼

10. 高級操作:

# demo09.py

復制代碼
import requests

# A: upload a file ------------------------------------------------------------
# Open the file in a with-block so the handle is closed once the upload is done
# (the original left it open for the rest of the script).
with open('images/image.png','rb') as upload:
    files = {
        "files": upload
    }
    response = requests.post('http://www.baidu.com',files=files)
print(response.text)

# B:獲取cookie -------------------------------------------------------------
response = requests.get('http://www.baidu.com')
print(response.cookies)
for key,value in response.cookies.items():
    print(key + "=" + value)
    
# C: session persistence ------------------------------------------------------
# httpbin's endpoints are /cookies/set/<name>/<value> and /cookies; the
# misspelled paths used before only returned 404 pages.
# Two independent requests.get() calls do not share a cookie jar, so the
# cookie set by the first call is not sent with the second one.
requests.get('http://httpbin.org/cookies/set/number/123456789')
response = requests.get('http://httpbin.org/cookies')
print(response.text)

# With a session object, both calls go through the same session and cookie jar.
s = requests.session()
s.get('http://httpbin.org/cookies/set/number/123456789')
response = s.get('http://httpbin.org/cookies')
print(response.text)

# D: 代理設置 --------------------------------------------------------------
# 方式一:
proxies = {
    'http':'http://ip:port',
    'https':'https://ip:port'
}
response = requests.get('http://www.baidu.com',proxies=proxies)
print(response.status_code)

# 方式二:
proxies = {
    'http':'http://user:password@ip:port/',
    'https':'https://user:password@ip:port/'
}
response = requests.get('http://www.baidu.com',proxies=proxies)
print(response.status_code)

# 方式三:
proxies = {
    'http':'socks5://ip:port',
    'https':'socks5://ip:port'
}
response = requests.get('http://www.baidu.com',proxies=proxies)
print(response.status_code)

# E: certificate verification -------------------------------------------------
# Certificates are only involved for HTTPS, so the demo must use https:// URLs;
# over plain http:// the verify/cert arguments have nothing to act on.
response = requests.get('https://www.12306.cn')
print(response.status_code)

# verify=False skips validation of the server certificate.
response = requests.get('https://www.12306.cn',verify=False)
print(response.status_code)

# Note: replace 'path/server.crt' and 'path/key' with your own certificate and key.
response = requests.get('https://www.12306.cn',cert=('path/server.crt','path/key'))
print(response.status_code)

# F: timeout ------------------------------------------------------------------
# Timeout is the common base class of ConnectTimeout and ReadTimeout. With a
# 0.1 s budget the connection phase may time out first, which ReadTimeout
# alone would not catch.
from requests.exceptions import Timeout
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except Timeout:
    print("Timeout")
    
# G: 認證管理 ----------------------------------------------------------------
from requests.auth import HTTPBasicAuth
response = requests.get('http://www.taobao.com', auth=HTTPBasicAuth('user','123'))
print(response.status_code)

response = requests.get('http://www.taobao.com', auth=('user','123'))
print(response.status_code)

# H: 異常處理 ----------------------------------------------------------------
from requests.exceptions import ReadTimeout,ConnectionError,HTTPError,RequestException
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")
except HTTPError:
    print("HTTPError")
except ConnectionError:
    print("ConnectionError")
except RequestException:
    print("Error")
復制代碼

四 BeautifulSoup庫詳解:(網頁解析器)

1. 安裝 :pip install beautifulsoup4

2. BeautifulSoup基本用法:

# demo01.py

復制代碼
# BeautifulSoup 的基本使用
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>這是一個列表</h2>
    <ul>
        <li>選項1</li>
        <li>選項2</li>
        <li>選項3</li>
        <li>選項4</li>
        <li>選項5</li>
        <li>選項6</li>
        <li>選項7</li>
        <li>選項8</li>
        <li>選項9</li>
    </ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html,'lxml')
print(soup.prettify())
print(soup.title.string)
復制代碼

3. 標簽選擇器:(只能拿一次)

# demo02.py

復制代碼
# BeautifulSoup 標簽選擇器(只拿一次)
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>這是一個列表</h2>
    <ul>
        <li>選項1</li>
        <li>選項2</li>
        <li>選項3</li>
        <li>選項4</li>
        <li>選項5</li>
        <li>選項6</li>
        <li>選項7</li>
        <li>選項8</li>
        <li>選項9</li>
    </ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html,'lxml')
print(soup.title)
print(type(soup.title))
print(soup.head)
print(soup.li)
復制代碼

4. 獲取標簽名稱:

# demo03.py

復制代碼
# BeautifulSoup 獲取標簽名稱
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
</body>
</html>
"""
soup = BeautifulSoup(html,'lxml')
print(soup.title.name)
復制代碼

5. 獲取標簽屬性:

# demo04.py

復制代碼
# BeautifulSoup 獲取標簽屬性
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<p class="font-p"></p>
<a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""
soup = BeautifulSoup(html,'lxml')
print(soup.p.attrs)
print(soup.p.attrs["class"])
print(soup.a.attrs["href"])
復制代碼

6. 獲取內容:

# demo05.py

復制代碼
# BeautifulSoup 獲取內容
from bs4 import BeautifulSoup
# Sample document for the content demo. It must contain a real <p> element,
# because the code below reads soup.p.string; the bare text "div" that stood
# here had lost its tag markup.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<p>div</p>
<a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""
soup = BeautifulSoup(html,'lxml')
print(soup.p.string)
print(soup.a.string)
復制代碼

7. 嵌套選擇:

 # demo06.py

復制代碼
# 嵌套選擇
from bs4 import BeautifulSoup
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>這是一個列表</h2>
    <ul>
        <li>選項1</li>
    </ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html,'lxml')
print(soup.ul.li.string)
復制代碼

8. 子節點和孫節點:

 # demo07.py

復制代碼
# 子節點和孫節點
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h2>這是一個列表</h2>
    <ul><li>選項1</li><li>選項2</li><li><a href="http://www.baidu.com">百度一下 你就知道</a></li></ul>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.ul.contents)     # direct children, returned as a list
# The attribute is spelled `children`; the misspelled `childern` is treated by
# BeautifulSoup as a search for a <childern> tag and yields None.
print(soup.ul.children)     # direct children, returned as an iterator
print(soup.ul.descendants)  # generator over all descendants (children, grandchildren, ...)
for i,child in enumerate(soup.ul.descendants):
    print(i,child)
復制代碼

9. 父節點和祖先節點:

 # demo08.py

復制代碼
# 父節點和祖先節點

from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
<div>
    <ol>
        <li><a href="http://www.baidu.com">百度一下 你就知道</a></li>
    </ol>
</div>
</body>
</html>
"""
soup = BeautifulSoup(html,'lxml')
print(soup.a.parent)    # 選擇父節點
print(type(soup.a.parents)) # 選擇所有父節點
print(list(enumerate(soup.a.parents)))
復制代碼

10.兄弟節點:

# demo09.py

復制代碼
# 兄弟節點
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div>
    <h1>我是一個大大的H1</h1>
    <h2>我是一個大大的H2</h2>
    <p>我是一個簡單的p標簽</p>
    <h3>我是一個大大的H3</h3>
    <h4>我是一個大大的H4</h4>
</div>
</body>
</html>
"""
# Do not strip whitespace from the markup: removing every space also turns
# `<html lang="en">` into `<htmllang="en">` and corrupts other tags.
# find_next_siblings()/find_previous_siblings() without arguments return only
# Tag siblings, so the whitespace text nodes between elements are skipped.
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.p.find_next_siblings())))        # sibling tags after the <p>
print(list(enumerate(soup.p.find_previous_siblings())))    # sibling tags before the <p>
復制代碼

11. 標准選擇器(***重點***)

 # demo10.py

復制代碼
from bs4 import BeautifulSoup

# 標准選擇器(重點 建議反復觀看)
# 語法:find_all(name,attrs,recursive,text,**kwargs)
"""
find 返回符合條件的單個元素 find_all 返回所有符合條件的所有元素
    1. find_parent()          # 返回直接父節點
    2. find_parents()         # 獲取所有祖先節點
    3. find_next_sibling()    # 返回當前節點后邊一個兄弟節點
    4. find_next_siblings()   # 返回當前節點后邊所有兄弟節點
    5. find_all_next()        # 返回當前節點后所有符合條件的節點
    6. find_next()            # 返回當前節點后第一個符合條件的節點
    7. find_all_previous()    # 返回當前節點前所有符合條件的節點
    8. find_previous()        # 返回當前節點前第一個符合條件的節點
"""

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項2</li>
        <li class="lisi">選項3</li>
    </ul>
</div>
</body>
</html>
"""


# A:name --------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul'))  # 獲取所有ul標簽 返回列表類型
print(type(soup.find_all('ul')[0])) # 獲取類型
for ul in soup.find_all('ul'): 
    print(ul.find_all('li'))

# B:attrs -------------------------------------------------------------
# 方式一:
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={"id":"list-1"})) # 獲取 id 為 list-1 的所有元素
print(soup.find_all(attrs={"class":"lisi"}))    # 獲取 class 為 lisi 的所有元素
# 方式二:
print(soup.find_all(id = "list-1")) # 獲取 id 為 list-1 的所有元素
print(soup.find_all(class_ = "lisi"))   # 獲取 class 為 lisi 的所有元素
# 以上兩種方式執行結果是一樣的

# C: string (this keyword was called `text` in older Beautiful Soup releases) --
# Returns the text nodes whose content is exactly the given string.
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(string = "選項1"))

# D:css選擇器(***) -----------------------------------------------------
# 1:
soup = BeautifulSoup(html, 'lxml')
print(soup.select('#list-2'))       # ID 選擇器
print(soup.select('.zhangsan'))     # class 選擇器
print(soup.select('ul li'))         # 標簽選擇器
print(soup.select('#divid h2'))     # ID 和 標簽 共同使用

# 2:
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))
    
# 3:屬性選擇器
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.get('id'))
    print(ul['id'])

# 4:獲取內容
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())
復制代碼

五 pyquery 庫詳解

1. 安裝: pip install pyquery

2. 初始化:

# demo01.py

復制代碼
# 初始化
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項2</li>
        <li class="lisi">選項3</li>
    </ul>
</div>
</body>
</html>
"""

# A: 字符串初始化 -------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
print(doc('li'))

# B: URL初始化 ----------------------------------------------------------------------------------------------------------
doc = PyQuery(url="http://www.baidu.com")
print(doc('head'))

# C: 文件初始化(在同級目錄下創建index.html 代碼和上邊的一樣) ---------------------------------------------------------------
# 這種方法會報錯 :UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 187: illegal multibyte sequence
# 解決方法去掉html文件中的中文字符,這種解決方式不推薦(有待研究)
# doc = PyQuery(filename='index.html')
# print(doc('li'))

# 可以改成這種方法(但是,總感覺有問題)
with open("index.html","r",encoding="utf-8")as f:
    doc = f.read()
result = PyQuery(doc)
print(result('li'))
復制代碼

3. 基本CSS選擇器:

# demo02.py

復制代碼
# 基本CSS選擇器
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項2</li>
        <li class="lisi">選項3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
print(doc('#divid #list-1 li'))
復制代碼

4. 查找元素:

A: 子元素

# demo03.py

復制代碼
# 子元素
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項2</li>
        <li class="lisi">選項3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
items = doc('#list-1')
print(type(items))
print(items)
li_list = items.find('li')
print(type(li_list))
print(li_list)
復制代碼

B: 父元素

# demo04.py

復制代碼
# 父元素
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項2</li>
        <li class="lisi">選項3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
items = doc('#list-1')
container = items.parent()
print(type(container))
print(container)
parents = items.parents()
print(type(parents))
print(parents)
復制代碼

C: 兄弟元素

# demo05.py

復制代碼
# 兄弟元素
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項2</li>
        <li class="lisi">選項3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
lis = doc('#list-1 .zhangsan')
print(lis.siblings())
print(lis.siblings('.zhangsan'))
復制代碼

D: 遍歷

# demo06.py

復制代碼
# 遍歷
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項2</li>
        <li class="lisi">選項3</li>
    </ul>
</div>
</body>
</html>
"""
doc = PyQuery(html)
lis = doc('#list-2 .lisi')
print(lis)
li_list = doc('.lisi').items()
print(type(li_list))
for li in li_list:
    print(li)
復制代碼

E: 獲取信息(標簽屬性)

# demo07.py

復制代碼
# 獲取信息(獲取屬性)
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.attr('href'))
print(a.attr.href)
復制代碼

F: 獲取文本

# demo08.py

復制代碼
# 獲取文本
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.text())
復制代碼

G: 獲取html

# demo09.py

復制代碼
# 獲取html
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</div>
</body>
</html>
"""
doc = PyQuery(html)
div = doc('#divid')
print(div)
print(div.html())
復制代碼

H: DOM操作

# demo10.py

復制代碼
# DOM 操作
from pyquery import PyQuery

# Sample document for the DOM demos below. The container must be a real
# <div id="divid"> element (the opening "<" was missing), otherwise the
# '#divid' selector matches nothing.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 學習</title>
</head>
<body>
<h1>BeautifulSoup</h1>
<div id="divid">
    <h2>這是一個列表</h2>
    <ul id="list-1">
        <li class="zhangsan">選項1</li>
        <li class="zhangsan">選項2</li>
        <li class="zhangsan">選項3</li>
    </ul>
    <ul id="list-2">
        <li class="lisi">選項1</li>
        <li class="lisi">選項1</li>
        <li class="lisi">選項1</li>
    </ul>
</div>
</body>
</html>
"""

# 1. addClass,removeClass ----------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.lisi')
print(li)
li.remove_class('lisi')
print(li)
li.add_class('zhangsan')
print(li)

# 2. attr,css ----------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan')
print(li)
li.attr('name','link')
print(li)
li.css('font-size','40px')
print(li)

# 3. remove ------------------------------------------------------------------------------------------------------------
doc = PyQuery(html)
div = doc('#divid')
print(div.text())
# remove() hands back the selection it was called on (the <h2> elements), so
# `div` must not be rebound to its result. Remove the heading in place and
# print the container again to see the text without it.
div.find('h2').remove()
print(div.text())

# 4. Pseudo-class selectors ---------------------------------------------------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan:first-child')       # first item of the list
print(li)
li = doc('.zhangsan:last-child')        # last item of the list
print(li)
li = doc('.zhangsan:nth-child(2)')      # second item of the list
print(li)
li = doc('.zhangsan:gt(0)')             # every item whose index is greater than 0
print(li)
li = doc('.zhangsan:nth-child(1n)')     # every item from the first one onwards (first included)
print(li)
li = doc('.zhangsan:contains(選項3)')    # items whose content contains "選項3"
print(li)
復制代碼

六 selenium庫詳解(自動化測試工具)

selenium 在爬蟲中主要用來解決JavaScript渲染問題

1. 安裝:pip install selenium

2. 基本使用:

# demo01.py

復制代碼
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

"""
項目目標:實現百度搜索
1. 創建瀏覽器對象 請求百度
2. 元素定位輸入框
3. 輸入搜索內容
4. 點擊回車
"""
# The find_element_by_* helpers were removed in Selenium 4; find_element(By..., value)
# is the supported form and also exists in older releases.
from selenium.webdriver.common.by import By

# Create the browser object (Chrome is used here)
browser = webdriver.Chrome()
try:
    # Open Baidu
    browser.get("http://www.baidu.com")
    # Locate the search box (named search_box so the built-in input() is not shadowed)
    search_box = browser.find_element(By.ID, 'kw')
    # Type the search term
    search_box.send_keys("selenium")
    # Press Enter to submit
    search_box.send_keys(Keys.ENTER)
    # Print the current URL
    print(browser.current_url)
    # Print the cookies
    print(browser.get_cookies())
    # Print the page source
    print(browser.page_source)
except Exception as e:
    print(e,"=============================")
finally:
    # quit() closes every window and ends the driver session; close() only
    # closes the current window.
    browser.quit()

"""
有可能會遇到的錯誤
1. selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
    這是由於程序找不到 chromedriver 驅動
解決:
    下載 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
    注意版本:版本對照表 (https://blog.csdn.net/BinGISer/article/details/88559532)

2. selenium.common.exceptions.SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 78
    這是由於 ChromeDriver 和 Chrome 版本不對應
解決:
    刪除之前下載的 chromedriver
    重新下載 chromedriver (http://chromedriver.storage.googleapis.com/index.html)
    注意版本:版本對照表 (https://blog.csdn.net/BinGISer/article/details/88559532)
    
大功告成
"""
復制代碼

3. 聲明瀏覽器對象

# demo02.py

復制代碼
# selenium 聲明瀏覽器
from selenium import webdriver
browser = webdriver.Chrome()    # 谷歌瀏覽器
browser = webdriver.Firefox()   # 火狐瀏覽器
browser = webdriver.Edge()      # 微軟瀏覽器
browser = webdriver.PhantomJS() # 無界面瀏覽器
browser = webdriver.Safari()    # Safari瀏覽器
復制代碼

4. 訪問頁面

 # demo03.py

復制代碼
import time
from selenium import webdriver

# 聲明瀏覽器對象
browser = webdriver.Chrome()
# 訪問淘寶
browser.get('https://www.taobao.com')
# 將瀏覽器最大化顯示
browser.maximize_window()
# 停止5秒
time.sleep(5)
# 打印響應頁面
print(browser.page_source)
# 關閉瀏覽器
browser.close()
復制代碼

5. 查找元素(單個元素)

# demo04.py

復制代碼
# 查找元素(單個元素)
from selenium import webdriver

# The find_element_by_* helpers were removed in Selenium 4; use find_element(By..., value).
from selenium.webdriver.common.by import By

# Create the browser object
browser = webdriver.Chrome()
# Open Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Locate the Taobao search box in three equivalent ways: by id, CSS selector and XPath
input_id = browser.find_element(By.ID, 'q')
input_selector = browser.find_element(By.CSS_SELECTOR, '#q')
input_xpath = browser.find_element(By.XPATH, '//*[@id="q"]')
print(input_id)
print(input_selector)
print(input_xpath)
# Shut the browser down; quit() also ends the driver session
browser.quit()

"""
查找單個元素常用方法:
    browser.find_element_by_xpath()
    browser.find_element_by_name()
    browser.find_element_by_link_text()
    browser.find_element_by_partial_link_text()
    browser.find_element_by_tag_name()
    browser.find_element_by_class_name()
    browser.find_element_by_css_selector()
"""
復制代碼

6. 查找元素(多個元素)

# demo05.py

復制代碼
# 查找元素(多個元素)
from selenium import webdriver

# The find_elements_by_* helpers were removed in Selenium 4; use find_elements(By..., value).
from selenium.webdriver.common.by import By

# Create the browser object
browser = webdriver.Chrome()
# Open Taobao
browser.get('https://www.taobao.com')
# Maximize the browser window
browser.maximize_window()
# Find every element carrying the class J_Cat; the result is a list
li_list = browser.find_elements(By.CSS_SELECTOR, '.J_Cat')
print(li_list)
# Shut the browser down; quit() also ends the driver session
browser.quit()

"""
查找多個元素常用方法:
    browser.find_elements_by_xpath()
    browser.find_elements_by_name()
    browser.find_elements_by_link_text()
    browser.find_elements_by_partial_link_text()
    browser.find_elements_by_tag_name()
    browser.find_elements_by_class_name()
    browser.find_elements_by_css_selector()
"""
復制代碼

7. 元素交互

# demo06.py

復制代碼
import time
from selenium import webdriver

# The find_element_by_* helpers were removed in Selenium 4; use find_element(By..., value).
from selenium.webdriver.common.by import By

# Create the browser object
browser = webdriver.Chrome()
# Open Taobao
browser.get("https://www.taobao.com")
# Maximize the window
browser.maximize_window()
# Locate the search box (named search_box so the built-in input() is not shadowed)
search_box = browser.find_element(By.ID, 'q')
# Type the first search term
search_box.send_keys("內存條")
time.sleep(3)
# Clear the search box
search_box.clear()
time.sleep(5)
# Type the second search term
search_box.send_keys("1T硬盤")
# Locate the search button
button = browser.find_element(By.CLASS_NAME, 'btn-search')
# Click the search button
button.click()
time.sleep(10)
# Shut the browser down; quit() also ends the driver session
browser.quit()
復制代碼

8. 執行javascript

# demo07.py

復制代碼
# 執行 javascript
from selenium import webdriver
browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# 滾動條拉到最下邊
browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
# 彈窗
browser.execute_script('alert("To Bottom")')
復制代碼

9. 獲取元素信息(獲取屬性)

# demo08.py

復制代碼
# 獲取元素信息(獲取屬性)

from selenium import webdriver
# The find_element_by_* helpers were removed in Selenium 4; use find_element(By..., value).
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = "https://www.zhihu.com/"
browser.get(url)
# Locate the logo element and read its src attribute
logo = browser.find_element(By.CSS_SELECTOR, '.SignFlowHomepage-logo')
print(logo)
print(logo.get_attribute('src'))
# quit() closes the window and ends the driver session
browser.quit()
復制代碼

10. 獲取元素信息(獲取文本值)

# demo09.py

復制代碼
# 獲取元素信息(獲取文本值)

from selenium import webdriver
# The find_element_by_* helpers were removed in Selenium 4; use find_element(By..., value).
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    url = "https://www.zhihu.com/explore"
    browser.get(url)
    # Named search_box so the built-in input() is not shadowed
    search_box = browser.find_element(By.ID, 'Popover1-toggle')
    search_box.send_keys('新冠病毒')
    print(search_box.text)
finally:
    # The original never shut the browser down; release it even if the
    # element lookup fails.
    browser.quit()
復制代碼

11. 獲取元素信息(獲取ID,位置,標簽名,大小)

# demo10.py

復制代碼
# 獲取元素信息(獲取ID,位置,標簽名,大小)

from selenium import webdriver
# The find_element_by_* helpers were removed in Selenium 4; use find_element(By..., value).
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = "https://www.zhihu.com/explore"
browser.get(url)
# Named element so the built-in input() is not shadowed
element = browser.find_element(By.ID, 'Popover1-toggle')
print(element.id)           # id attribute of the WebElement object
print(element.location)     # location attribute
print(element.tag_name)     # tag name
print(element.size)         # size attribute
# quit() closes the window and ends the driver session
browser.quit()
復制代碼

12. 獲取元素信息(iframe)

# demo11.py

復制代碼
# 獲取元素信息(iframe)

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
# The find_element_by_* helpers were removed in Selenium 4; use find_element(By..., value).
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
try:
    url = "https://www.runoob.com/try/try.php?filename=tryjquery_hide"
    browser.get(url)
    # Switch into the iframe, then look up the button from there
    browser.switch_to.frame('iframeResult')
    button = browser.find_element(By.CSS_SELECTOR, 'button')
    print(button)
    # While still inside the iframe, try to find the element with class "logo"
    try:
        logo = browser.find_element(By.CLASS_NAME, 'logo')
    except NoSuchElementException:
        print('NO LOGO')
    # Switch back to the parent frame and look the logo up again
    browser.switch_to.parent_frame()
    logo = browser.find_element(By.CLASS_NAME, 'logo')
    print(logo)
    print(logo.text)
finally:
    # Cleanup sits in its own finally so a failing lookup above cannot skip it.
    browser.quit()
復制代碼

13. 等待

# demo12.py

復制代碼
# 等待

""" 
顯示等待就是有條件的等待
隱式等待就是無條件的等待

隱式等待
    當使用了隱式等待執行測試的時候,如果 WebDriver 沒有在 DOM 中找到元素,將繼續等待,超出設定時間后則拋出找不到元素的異常,
    換句話說,當查找元素或元素並沒有立即出現的時候,隱式等待將等待一段時間再查找 DOM,默認的時間是 0

顯式等待
    指定某個條件,然后設置最長等待時間。如果在這個時間還沒有找到元素,那么便會拋出異常。
    只有該條件觸發,才執行后續代碼,這個使用更靈活。 
    主要涉及到selenium.webdriver.support 下的expected_conditions類。 
"""

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
browser = webdriver.Chrome()
browser.get('http://www.taobao.com')
browser.maximize_window()
# Implicit wait: element lookups keep retrying for up to 10 seconds.
# NOTE(review): the Selenium documentation advises against combining implicit
# and explicit waits in real code; both appear here only for demonstration.
browser.implicitly_wait(10)
# Explicit wait: block until the condition holds, for at most 10 seconds.
wait = WebDriverWait(browser,10)
# presence_of_element_located yields a single element, which matches the
# singular variables; the *_all_* variant returns a list instead.
search_box = wait.until(EC.presence_of_element_located((By.ID,'q')))
button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'.btn-search')))
print(search_box)
print(button)
# quit() closes the window and ends the driver session
browser.quit()
復制代碼

14. 瀏覽器的前進和后退

# demo13.py

復制代碼
# 瀏覽器的前進和后退
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
time.sleep(1)
browser.get('https://www.taobao.com')
time.sleep(1)
browser.get('https://www.cnblogs.com/xingxingnbsp/')
time.sleep(1)
browser.back()
time.sleep(2)
browser.forward()
time.sleep(2)
browser.close()
復制代碼

15. Cookies

# demo14.py

復制代碼
# cookies
from selenium import webdriver
browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
print(browser.get_cookies())
browser.add_cookie({"name":"name","domain":"www.zhihu.com","value":"germey"})
print(browser.get_cookies())
browser.delete_all_cookies()
print(browser.get_cookies())
browser.close()
復制代碼

16. 選項卡管理(不兼容)

# demo15.py

復制代碼
# 選項卡管理
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
time.sleep(2)
# Open a second, empty tab through JavaScript
browser.execute_script('window.open()')
print(browser.window_handles)
# switch_to_window() was removed from Selenium; switch_to.window() is the supported call
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(2)
browser.get('https://www.cnblogs.com/xingxingnbsp/')
time.sleep(3)
# Two tabs are open here: close() would shut only the current one, quit()
# closes them all and ends the driver session.
browser.quit()
復制代碼

17. 異常處理

 # demo16.py

復制代碼
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException

# The find_element_by_* helpers were removed in Selenium 4; use find_element(By..., value).
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
# The outer try/finally makes sure the browser is shut down whichever step fails.
try:
    try:
        browser.get('https://www.baidu.com')
    except TimeoutException:
        print('Time Out')
    try:
        browser.find_element(By.ID, 'hello')
    except NoSuchElementException:
        print('No Element')
finally:
    browser.quit()
復制代碼

 文章摘錄:https://www.cnblogs.com/xingxingnbsp/p/12129466.html


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM