python 爬虫《从入门到放弃》


一篇文章带你了解《python爬虫》

一 什么是网络爬虫:

       1. 通俗理解:爬虫是一个模拟人类请求网站行为的程序。它可以自动请求网页,并把数据抓取下来,然后按照一定的规则提取有价值的数据。

  2. 专业介绍:百度百科

二 python urllib:

# demo01.py(urllib基本使用)

# Basic urllib usage. urllib ships with Python, so nothing has to be installed.
import urllib.request

# Request Baidu; the context manager closes the connection when done.
with urllib.request.urlopen("http://www.baidu.com/") as response:
    # Print the page body decoded as UTF-8.
    print(response.read().decode('utf-8'))

# demo2.py(用法讲解)

# urllib 用法讲解 # urlopen : urllib.request.urlopen('网址','数据','超时设置')

import socket
import urllib.error
import urllib.parse
import urllib.request

# Alternative usages (uncomment one variant to try it).
#
# A: plain GET request
# response = urllib.request.urlopen('http://www.baidu.com/')
# print(response.read().decode('utf-8'))
#
# B: POST request; the form data must be URL-encoded bytes
# data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
# response = urllib.request.urlopen("http://httpbin.org/post", data=data)
# print(response.read())
#
# C: GET request with a timeout (in seconds)
# response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
# print(response.read())

# D: handle a timeout. The deliberately tiny timeout makes the request fail.
try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    # A timeout while connecting arrives wrapped in URLError.reason.
    if isinstance(e.reason, socket.timeout):
        print("Time out")
    else:
        print(e.reason)
except socket.timeout:
    # A timeout while waiting for the response is raised unwrapped.
    print("Time out")
else:
    # Only read the body when the request actually succeeded.
    print(response.read())

# demo03.py(响应)

# urllib 响应

import urllib.request

# Inspect the response object returned by urlopen.
with urllib.request.urlopen("http://www.baidu.com/") as response:
    print(type(response))         # response type
    print(response.status)        # HTTP status code
    print(response.getheaders())  # response headers as (name, value) pairs

# demo04.py(Request 详解)

# Request 详解

import urllib.request
from urllib import parse

# Alternative usages (uncomment one variant to try it).
#
# A: wrap the URL in a Request object before opening it
# request = urllib.request.Request('http://www.baidu.com')
# response = urllib.request.urlopen(request)
# print(response.read().decode('utf-8'))
#
# B: pass the headers to the Request constructor
# url = "http://httpbin.org/post"
# headers = {
#     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
#     "Host": "httpbin.org",  # must match the host being requested
# }
# form = {"name": "Germey"}
# data = bytes(parse.urlencode(form), encoding='utf-8')
# request = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
# response = urllib.request.urlopen(request)
# print(response.read().decode('utf-8'))

# C: add headers after construction with Request.add_header
url = "http://httpbin.org/post"

# Form data to send. Named `form` so the builtin `dict` is not shadowed.
form = {"name": "Germey"}
data = bytes(parse.urlencode(form), encoding='utf-8')

request = urllib.request.Request(url=url, data=data, method='POST')
request.add_header(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
)
with urllib.request.urlopen(request) as response:
    print(response.read().decode('utf-8'))

# demo05.py (代理)

# handler(代理)
import urllib.request

# Route HTTP and HTTPS traffic through a proxy.
# Replace the placeholders with a real proxy address and port.
proxy_handler = urllib.request.ProxyHandler({
    "http": "http://xxx.xxx.xxx.xxx:xxxx",
    "https": "https://xxx.xxx.xxx.xxx:xxxx",
})
opener = urllib.request.build_opener(proxy_handler)
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))

# demo06.py(cookie)

# cookie

import http.cookiejar
import urllib.request

# Alternative usages (uncomment one variant to try it).
#
# A: basic http.cookiejar usage
# cookie = http.cookiejar.CookieJar()
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# print(response.read().decode('utf-8'))
#
# B: MozillaCookieJar - store the site's cookies in a local file
# filename = "utils/cookie.txt"
# cookie = http.cookiejar.MozillaCookieJar(filename)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# cookie.save(ignore_discard=True, ignore_expires=True)
#
# C: LWPCookieJar - store the site's cookies in a local file
# filename = "utils/cookie01.txt"
# cookie = http.cookiejar.LWPCookieJar(filename)
# handler = urllib.request.HTTPCookieProcessor(cookie)
# opener = urllib.request.build_opener(handler)
# response = opener.open('http://www.baidu.com')
# cookie.save(ignore_discard=True, ignore_expires=True)

# D: reuse the cookies saved by variant C (run C first to create the file).
cookie = http.cookiejar.LWPCookieJar()
cookie.load('utils/cookie01.txt', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
with opener.open('http://www.baidu.com') as response:
    print(response.read().decode('utf-8'))

# demo07.py(异常处理)

# 异常处理

import urllib.request
from urllib import error

# Alternative usages (uncomment one variant to try it).
#
# A: basic urllib.error usage
# try:
#     response = urllib.request.urlopen('http://www.baidu.com')
# except error.URLError as e:
#     print(e.reason)
#
# B: try / except / else
# try:
#     response = urllib.request.urlopen('http://www.baidu.com/')
#     print(response.read().decode('utf-8'))
# except error.URLError as e:
#     print(e.reason)
# else:
#     print("*************")
#
# C: timeout
# try:
#     response = urllib.request.urlopen('http://www.baidu.com', timeout=0.01)
# except error.URLError as e:
#     print(e.reason)

# D: request a host that does not exist and report why it failed.
try:
    with urllib.request.urlopen("http://www.abcdhaha2.com/") as response:
        html = response.read().decode('utf-8')
except error.URLError as e:
    print(e.reason)
else:
    print(html)

# demo08.py(URL解析)

from urllib.parse import urlencode
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.parse import urlunparse

# Signature: urlparse(url, scheme='', allow_fragments=True)

# A: split a full URL into its six components.
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment')
print(type(result))
print(result)

# B: `scheme` is only a default, used when the URL itself carries none.
result = urlparse('www.baidu.com/index.html;user?id=5#comment', scheme="https")
print(result)

# C: allow_fragments=True (the default) splits off the "#comment" fragment.
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=True)
print(result)

# D: allow_fragments=False leaves "#comment" attached to the query.
result = urlparse('https://www.baidu.com/index.html;user?id=5#comment', allow_fragments=False)
print(result)

# E: with no params/query, the unsplit fragment stays attached to the path.
result = urlparse('https://www.baidu.com/index.html#comment', allow_fragments=False)
print(result)

# F: urlunparse builds a URL from six components
# (scheme, netloc, path, params, query, fragment).
data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))

# G: urljoin(base, url) resolves `url` relative to `base`;
# here the last path segment of the base is replaced.
print(urljoin("https://www.cnblogs.com/xingxingnbsp/p/xxxxxxxxx.html", "12129466.html"))

# H: urlencode turns a dict into a query string.
params = {
    'name': 'hello_urllib',
    'age': 18,
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)

三 python requests:

1. 安装 requests 库:pip install requests

# demo01.py

# requests 基本使用

import requests

# Basic requests usage: one GET request, then inspect the response.
response = requests.get("http://www.baidu.com")
print(type(response))        # response type
print(response.status_code)  # HTTP status code
print(type(response.text))   # type of the decoded body
print(response.text)         # decoded body
print(response.cookies)      # cookies set by the response

2. 请求方式:

requests.get('网址')
requests.post('网址')
requests.put('网址')
requests.patch('网址')
requests.delete('网址')
requests.head('网址')
requests.options('网址')

3. 基本get请求:

# demo02.py

import requests

# Alternative usages (uncomment one variant to try it).
#
# A: plain GET
# response = requests.get('http://www.baidu.com')
# print(response.text)
#
# B: query string written directly in the URL
# response = requests.get('http://httpbin.org/get?name=hello&age=22')
# print(response.text)

# C: let requests build the query string from a dict.
data = {
    "name": "hello",
    "age": 22,
}
response = requests.get('http://httpbin.org/get', params=data)
print(response.text)

4. 解析json:

# demo03.py

# 解析json

import requests

# Parse a JSON response body with Response.json().
response = requests.get('https://api.jinse.com/v6/www/information/list?catelogue_key=news&limit=23&information_id=18762945&flag=down&version=9.9.9&_source=www')
print(type(response))
print(response.json())
print(type(response.json()))

5. 获取二进制数据

# demo04.py

import requests

# Alternative usage (uncomment to try it).
#
# A: compare the decoded text with the raw bytes of a binary response
# response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
# print(type(response.text))
# print(type(response.content))
# print(response.text)
# print(response.content)

# B: download the image and write the raw bytes to disk.
# The images/ directory must already exist.
response = requests.get('https://images.pexels.com/photos/3393793/pexels-photo-3393793.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500')
with open('images/image.png', 'wb') as f:
    # The with-statement closes the file; no explicit f.close() is needed.
    f.write(response.content)

6. 添加headers:

# demo05.py

import requests

# Send a browser-like User-Agent header with the request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.get("http://www.baidu.com", headers=headers)
print(response.text)

7. 基本的post请求

# demo06.py

import requests

# Alternative usage (uncomment to try it).
#
# A: POST form data without custom headers
# data = {
#     "name": "hello",
#     "age": 22,
# }
# response = requests.post("http://httpbin.org/post", data=data)
# print(response.text)

# B: POST form data together with a custom User-Agent header.
data = {
    "name": "hello",
    "age": 22,
}
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
}
response = requests.post("http://httpbin.org/post", data=data, headers=headers)
print(response.text)

8. 响应:(response属性)

# demo07.py

import requests

# Print the type and the value of the most commonly used response attributes.
response = requests.get('http://www.baidu.com')
print(type(response.status_code), response.status_code)  # status code
print(type(response.headers), response.headers)          # response headers
print(type(response.cookies), response.cookies)          # cookies
print(type(response.url), response.url)                  # final URL
print(type(response.history), response.history)          # redirect history

9. 状态码判断:

# demo08.py

import requests

# Alternative usages (uncomment one variant to try it).
#
# A: conditional expression, comparing against the named status code
# response = requests.get('http://www.baidu.com')
# exit() if not response.status_code == requests.codes.ok else print('request successfully')
#
# B: conditional expression, comparing against the literal 200
# response = requests.get('http://www.baidu.com')
# exit() if not response.status_code == 200 else print('request successfully')

# C: plain if/else. All three variants express the same check.
response = requests.get('http://www.baidu.com')
if response.status_code != 200:
    exit()
else:
    print('request successfully')

10. 高级操作:

# demo09.py

import requests
from requests.auth import HTTPBasicAuth
from requests.exceptions import (
    ConnectionError,
    ConnectTimeout,
    HTTPError,
    ReadTimeout,
    RequestException,
)

# A: upload a file ------------------------------------------------------------
# Open the file in a with-block so the handle is closed after the upload.
with open('images/image.png', 'rb') as image_file:
    files = {"files": image_file}
    response = requests.post('http://www.baidu.com', files=files)
print(response.text)

# B: read cookies -------------------------------------------------------------
response = requests.get('http://www.baidu.com')
print(response.cookies)
for key, value in response.cookies.items():
    print(key + "=" + value)

# C: session persistence ------------------------------------------------------
# Two independent requests do not share cookies ...
requests.get('http://httpbin.org/cookies/set/number/123456789')
response = requests.get('http://httpbin.org/cookies')
print(response.text)

# ... whereas a Session keeps the cookie between requests.
s = requests.Session()
s.get('http://httpbin.org/cookies/set/number/123456789')
response = s.get('http://httpbin.org/cookies')
print(response.text)

# D: proxies (replace ip/port/user/password with real values) -----------------
# Option 1: plain proxy
proxies = {
    'http': 'http://ip:port',
    'https': 'https://ip:port',
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)

# Option 2: proxy that requires authentication
proxies = {
    'http': 'http://user:password@ip:port/',
    'https': 'https://user:password@ip:port/',
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)

# Option 3: SOCKS5 proxy (needs the optional extra: pip install requests[socks])
proxies = {
    'http': 'socks5://ip:port',
    'https': 'socks5://ip:port',
}
response = requests.get('http://www.baidu.com', proxies=proxies)
print(response.status_code)

# E: certificate verification -------------------------------------------------
# Certificates are only checked for HTTPS URLs, so the demo must use https://.
response = requests.get('https://www.12306.cn')
print(response.status_code)

# verify=False skips certificate verification (requests emits a warning).
response = requests.get('https://www.12306.cn', verify=False)
print(response.status_code)

# Client-side certificate: change 'path/server.crt' and 'path/key' to your own.
response = requests.get('https://www.12306.cn', cert=('path/server.crt', 'path/key'))
print(response.status_code)

# F: timeout ------------------------------------------------------------------
# A short timeout can expire while connecting or while reading; catch both.
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except (ConnectTimeout, ReadTimeout):
    print("Timeout")

# G: authentication -----------------------------------------------------------
response = requests.get('http://www.taobao.com', auth=HTTPBasicAuth('user', '123'))
print(response.status_code)

# A (user, password) tuple is shorthand for HTTPBasicAuth.
response = requests.get('http://www.taobao.com', auth=('user', '123'))
print(response.status_code)

# H: exception handling, most specific exception first ------------------------
try:
    response = requests.get('http://www.taobao.com', timeout=0.1)
    print(response.status_code)
except ReadTimeout:
    print("Timeout")
except HTTPError:
    print("HTTPError")
except ConnectionError:
    print("ConnectionError")
except RequestException:
    print("Error")

四 BeautifulSoup库详解:(网页解析器)

1. 安装 :pip install beautifulsoup4

2. BeautifulSoup基本用法:

# demo01.py

# BeautifulSoup 的基本使用
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul>
            <li>选项1</li>
            <li>选项2</li>
            <li>选项3</li>
            <li>选项4</li>
            <li>选项5</li>
            <li>选项6</li>
            <li>选项7</li>
            <li>选项8</li>
            <li>选项9</li>
        </ul>
    </div>
</body>
</html>
"""

# Parse the document with the lxml parser.
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())    # re-indented markup
print(soup.title.string)  # text inside <title>

3. 标签选择器:(只能拿一次)

# demo02.py

# BeautifulSoup 标签选择器(只拿一次)
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul>
            <li>选项1</li>
            <li>选项2</li>
            <li>选项3</li>
            <li>选项4</li>
            <li>选项5</li>
            <li>选项6</li>
            <li>选项7</li>
            <li>选项8</li>
            <li>选项9</li>
        </ul>
    </div>
</body>
</html>
"""

# Tag selectors: attribute access returns only the FIRST matching tag.
soup = BeautifulSoup(html, 'lxml')
print(soup.title)
print(type(soup.title))
print(soup.head)
print(soup.li)  # only the first <li>

4. 获取标签名称:

# demo03.py

# BeautifulSoup 获取标签名称
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
</body>
</html>
"""

# Get the name of a tag.
soup = BeautifulSoup(html, 'lxml')
print(soup.title.name)

5. 获取标签属性:

# demo04.py

# BeautifulSoup 获取标签属性
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <p class="font-p"></p>
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""

# Read tag attributes.
soup = BeautifulSoup(html, 'lxml')
print(soup.p.attrs)           # all attributes of <p> as a dict
print(soup.p.attrs["class"])  # a single attribute of <p>
print(soup.a.attrs["href"])   # the link target of <a>

6. 获取内容:

# demo05.py

# BeautifulSoup 获取内容
from bs4 import BeautifulSoup

# The text "div" is wrapped in a <p> tag so that soup.p below finds a tag;
# without it soup.p is None and soup.p.string raises AttributeError.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <p>div</p>
    <a href="http://www.baidu.com">百度一下 你就知道</a>
</body>
</html>
"""

# Read the text content of a tag.
soup = BeautifulSoup(html, 'lxml')
print(soup.p.string)
print(soup.a.string)

7. 嵌套选择:

 # demo06.py

# 嵌套选择
from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul>
            <li>选项1</li>
        </ul>
    </div>
</body>
</html>
"""

# Nested selection: chain tag names to walk down the tree.
soup = BeautifulSoup(html, 'lxml')
print(soup.ul.li.string)

8. 子节点和孙节点:

 # demo07.py

# 子节点和孙节点
from bs4 import BeautifulSoup

# The <ul> is kept on one line so no whitespace text nodes appear among
# its children.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div>
        <h2>这是一个列表</h2>
        <ul><li>选项1</li><li>选项2</li><li><a href="http://www.baidu.com">百度一下 你就知道</a></li></ul>
    </div>
</body>
</html>
"""

# Children and descendants.
soup = BeautifulSoup(html, 'lxml')
print(soup.ul.contents)     # direct children, as a list
print(soup.ul.children)     # direct children, as an iterator
print(soup.ul.descendants)  # all descendants, as a generator
for i, child in enumerate(soup.ul.descendants):
    print(i, child)

9. 父节点和祖先节点:

 # demo08.py

# 父节点和祖先节点

from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <div>
        <ol>
            <li><a href="http://www.baidu.com">百度一下 你就知道</a></li>
        </ol>
    </div>
</body>
</html>
"""

# Parent and ancestors.
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)                    # the direct parent
print(type(soup.a.parents))             # all ancestors, as a generator
print(list(enumerate(soup.a.parents)))

10.兄弟节点:

# demo09.py

# 兄弟节点
import re

from bs4 import BeautifulSoup

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div>
        <h1>我是一个大大的H1</h1>
        <h2>我是一个大大的H2</h2>
        <p>我是一个简单的p标签</p>
        <h3>我是一个大大的H3</h3>
        <h4>我是一个大大的H4</h4>
    </div>
</body>
</html>
"""

# Drop the whitespace BETWEEN tags so the sibling lists contain only tags.
# Removing every space would also glue tag names to their attributes
# (e.g. <html lang="en"> would become <htmllang="en">).
html = re.sub(r'>\s+<', '><', html.strip())

soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.p.next_siblings)))      # siblings after <p>
print(list(enumerate(soup.p.previous_siblings)))  # siblings before <p>

11. 标准选择器(***重点***)

 # demo10.py

from bs4 import BeautifulSoup

# Standard selectors (important - worth reading more than once).
# Signature: find_all(name, attrs, recursive, text, **kwargs)
#
# find      returns the first matching element
# find_all  returns every matching element
#
# Related methods:
#   1. find_parent()         the direct parent
#   2. find_parents()        all ancestors
#   3. find_next_sibling()   the first sibling after the current node
#   4. find_next_siblings()  all siblings after the current node
#   5. find_all_next()       all matching nodes after the current node
#   6. find_next()           the first matching node after the current node
#   7. find_all_previous()   all matching nodes before the current node
#   8. find_previous()       the first matching node before the current node

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
</body>
</html>
"""

# A: name ----------------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul'))           # every <ul>, as a list
print(type(soup.find_all('ul')[0]))  # type of one result
for ul in soup.find_all('ul'):
    print(ul.find_all('li'))

# B: attrs ---------------------------------------------------------------------
# Option 1: an attrs dict
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={"id": "list-1"}))   # elements whose id is list-1
print(soup.find_all(attrs={"class": "lisi"}))  # elements whose class is lisi

# Option 2: keyword arguments (class_ because `class` is a Python keyword)
print(soup.find_all(id="list-1"))    # elements whose id is list-1
print(soup.find_all(class_="lisi"))  # elements whose class is lisi
# Both options produce the same results.

# C: text ----------------------------------------------------------------------
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text="选项1"))

# D: CSS selectors (important) -------------------------------------------------
# 1: basic selectors
soup = BeautifulSoup(html, 'lxml')
print(soup.select('#list-2'))    # id selector
print(soup.select('.zhangsan'))  # class selector
print(soup.select('ul li'))      # tag (descendant) selector
print(soup.select('#divid h2'))  # id and tag combined

# 2: nested select
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))

# 3: read attributes
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.get('id'))
    print(ul['id'])

# 4: read text content
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())

五 pyquery 库详解

1. 安装: pip install pyquery

2. 初始化:

# demo01.py

# 初始化
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
</body>
</html>
"""

# A: initialise from a string --------------------------------------------------
doc = PyQuery(html)
print(doc('li'))

# B: initialise from a URL -----------------------------------------------------
doc = PyQuery(url="http://www.baidu.com")
print(doc('head'))

# C: initialise from a file ----------------------------------------------------
# Create index.html next to this script with the same markup as above.
# Reported by the author: PyQuery(filename='index.html') failed with
#   UnicodeDecodeError: 'gbk' codec can't decode byte 0x80 in position 187
# Removing the Chinese text from the file avoids the error but is not
# recommended.
# doc = PyQuery(filename='index.html')
# print(doc('li'))

# Instead, read the file with an explicit encoding and initialise from the
# resulting string.
with open("index.html", "r", encoding="utf-8") as f:
    markup = f.read()
result = PyQuery(markup)
print(result('li'))

3. 基本CSS选择器:

# demo02.py

# 基本CSS选择器
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
</body>
</html>
"""

# Basic CSS selector: every <li> inside #list-1 inside #divid.
doc = PyQuery(html)
print(doc('#divid #list-1 li'))

4. 查找元素:

A: 子元素

# demo03.py

# 子元素
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
</body>
</html>
"""

# Child elements: select the list, then search inside it with find().
doc = PyQuery(html)
items = doc('#list-1')
print(type(items))
print(items)

li_list = items.find('li')
print(type(li_list))
print(li_list)

B: 父元素

# demo04.py

# 父元素
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
</body>
</html>
"""

# Parent elements.
doc = PyQuery(html)
items = doc('#list-1')

container = items.parent()  # the direct parent
print(type(container))
print(container)

parents = items.parents()   # all ancestors
print(type(parents))
print(parents)

C: 兄弟元素

# demo05.py

# 兄弟元素
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
</body>
</html>
"""

# Sibling elements.
doc = PyQuery(html)
lis = doc('#list-1 .zhangsan')
print(lis.siblings())             # all siblings
print(lis.siblings('.zhangsan'))  # only siblings matching the selector

D: 遍历

# demo06.py

# 遍历
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项2</li>
            <li class="lisi">选项3</li>
        </ul>
    </div>
</body>
</html>
"""

# Iteration: items() yields each match wrapped as its own PyQuery object.
doc = PyQuery(html)
lis = doc('#list-2 .lisi')
print(lis)

li_list = doc('.lisi').items()
print(type(li_list))
for li in li_list:
    print(li)

E: 获取信息(标签属性)

# demo07.py

# 获取信息(获取属性)
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <a href="http://www.baidu.com">百度一下 你就知道</a>
    </div>
</body>
</html>
"""

# Read an attribute, in call style and in attribute style.
doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.attr('href'))
print(a.attr.href)

F: 获取文本

# demo08.py

# 获取文本
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <a href="http://www.baidu.com">百度一下 你就知道</a>
    </div>
</body>
</html>
"""

# Read the text content of an element.
doc = PyQuery(html)
a = doc('#divid a')
print(a)
print(a.text())

G: 获取html

# demo09.py

# 获取html
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <a href="http://www.baidu.com">百度一下 你就知道</a>
    </div>
</body>
</html>
"""

# Read the inner HTML of an element.
doc = PyQuery(html)
div = doc('#divid')
print(div)
print(div.html())

H: DOM操作

# demo10.py

# DOM 操作
from pyquery import PyQuery

html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BeautifulSoup 学习</title>
</head>
<body>
    <h1>BeautifulSoup</h1>
    <div id="divid">
        <h2>这是一个列表</h2>
        <ul id="list-1">
            <li class="zhangsan">选项1</li>
            <li class="zhangsan">选项2</li>
            <li class="zhangsan">选项3</li>
        </ul>
        <ul id="list-2">
            <li class="lisi">选项1</li>
            <li class="lisi">选项1</li>
            <li class="lisi">选项1</li>
        </ul>
    </div>
</body>
</html>
"""

# 1. add_class / remove_class --------------------------------------------------
doc = PyQuery(html)
li = doc('.lisi')
print(li)
li.remove_class('lisi')
print(li)
li.add_class('zhangsan')
print(li)

# 2. attr / css ----------------------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan')
print(li)
li.attr('name', 'link')
print(li)
li.css('font-size', '40px')
print(li)

# 3. remove --------------------------------------------------------------------
doc = PyQuery(html)
div = doc('#divid')
print(div.text())
# Remove the <h2> from inside the div, then print the div again to show
# what is left. `div` must keep pointing at the container, not at the
# result of remove().
div.find('h2').remove()
print(div.text())

# 4. pseudo-class selectors ----------------------------------------------------
doc = PyQuery(html)
li = doc('.zhangsan:first-child')    # first item of the list
print(li)
li = doc('.zhangsan:last-child')     # last item of the list
print(li)
li = doc('.zhangsan:nth-child(2)')   # second item of the list
print(li)
li = doc('.zhangsan:gt(0)')          # items whose index is greater than 0
print(li)
li = doc('.zhangsan:nth-child(1n)')  # every item, the first one included
print(li)
li = doc('.zhangsan:contains(选项3)')  # items whose text contains "选项3"
print(li)

六 selenium库详解(自动化测试工具)

selenium 在爬虫中主要用来解决 JavaScript 渲染问题

1. 安装:pip install selenium

2. 基本使用:

# demo01.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Goal: run a Baidu search.
#   1. Create the browser object and open Baidu.
#   2. Locate the search box.
#   3. Type the search text.
#   4. Press Enter.

# Create the browser object (Chrome is used here).
browser = webdriver.Chrome()
try:
    # Open Baidu.
    browser.get("http://www.baidu.com")
    # Locate the search box. find_element(By.ID, ...) replaces the
    # find_element_by_id helper, which newer Selenium releases removed.
    search_box = browser.find_element(By.ID, 'kw')
    # Type the search text.
    search_box.send_keys("selenium")
    # Press Enter.
    search_box.send_keys(Keys.ENTER)
    # Print the current URL.
    print(browser.current_url)
    # Print the cookies.
    print(browser.get_cookies())
    # Print the page source.
    print(browser.page_source)
except Exception as e:
    print(e, "=============================")
finally:
    # quit() closes every window and stops the driver process.
    browser.quit()

# Errors you may run into:
#
# 1. selenium.common.exceptions.WebDriverException: Message: 'chromedriver'
#    executable needs to be in PATH.
#    Please see https://sites.google.com/a/chromium.org/chromedriver/home
#    Cause: the program cannot find the chromedriver binary.
#    Fix:   download chromedriver
#           (http://chromedriver.storage.googleapis.com/index.html).
#           Mind the version; see the version table
#           (https://blog.csdn.net/BinGISer/article/details/88559532).
#
# 2. selenium.common.exceptions.SessionNotCreatedException: Message: session
#    not created: This version of ChromeDriver only supports Chrome version 78
#    Cause: the ChromeDriver and Chrome versions do not match.
#    Fix:   delete the chromedriver downloaded earlier and download a matching
#           one (http://chromedriver.storage.googleapis.com/index.html).
#           Mind the version; see the version table
#           (https://blog.csdn.net/BinGISer/article/details/88559532).

3. 声明浏览器对象

# demo02.py

# selenium 声明浏览器
from selenium import webdriver

# Each line shows how to create a driver for one browser. In a real script
# keep only the line for the browser you use: every call launches a browser.
browser = webdriver.Chrome()     # Google Chrome
browser = webdriver.Firefox()    # Mozilla Firefox
browser = webdriver.Edge()       # Microsoft Edge
# NOTE(review): PhantomJS support was dropped in Selenium 4 - confirm your
# Selenium version, or use a headless Chrome/Firefox instead.
browser = webdriver.PhantomJS()  # headless browser
browser = webdriver.Safari()     # Safari

4. 访问页面

 # demo03.py

import time

from selenium import webdriver

# Create the browser object.
browser = webdriver.Chrome()
try:
    # Open Taobao.
    browser.get('https://www.taobao.com')
    # Maximise the browser window.
    browser.maximize_window()
    # Pause for 5 seconds.
    time.sleep(5)
    # Print the page source.
    print(browser.page_source)
finally:
    # Close the browser even if one of the steps above fails.
    browser.quit()

5. 查找元素(单个元素)

# demo04.py

# 查找元素(单个元素)
from selenium import webdriver
from selenium.webdriver.common.by import By

# Create the browser object.
browser = webdriver.Chrome()
try:
    # Open Taobao.
    browser.get('https://www.taobao.com')
    # Maximise the browser window.
    browser.maximize_window()
    # Locate the Taobao search box (all three strategies find the same element).
    input_id = browser.find_element(By.ID, 'q')
    input_selector = browser.find_element(By.CSS_SELECTOR, '#q')
    input_xpath = browser.find_element(By.XPATH, '//*[@id="q"]')
    print(input_id)
    print(input_selector)
    print(input_xpath)
finally:
    # Close the browser.
    browser.quit()

# Locator strategies for finding a single element:
#   browser.find_element(By.ID, ...)
#   browser.find_element(By.XPATH, ...)
#   browser.find_element(By.NAME, ...)
#   browser.find_element(By.LINK_TEXT, ...)
#   browser.find_element(By.PARTIAL_LINK_TEXT, ...)
#   browser.find_element(By.TAG_NAME, ...)
#   browser.find_element(By.CLASS_NAME, ...)
#   browser.find_element(By.CSS_SELECTOR, ...)
# The older find_element_by_* helpers were removed in newer Selenium releases.

6. 查找元素(多个元素)

# demo05.py

# 查找元素(多个元素)
from selenium import webdriver
from selenium.webdriver.common.by import By

# Create the browser object.
browser = webdriver.Chrome()
try:
    # Open Taobao.
    browser.get('https://www.taobao.com')
    # Maximise the browser window.
    browser.maximize_window()
    # Find every element carrying the J_Cat class
    # (on the page: class="J_Cat a-all").
    li_list = browser.find_elements(By.CSS_SELECTOR, '.J_Cat')
    print(li_list)
finally:
    # Close the browser.
    browser.quit()

# Locator strategies for finding multiple elements (a list is returned):
#   browser.find_elements(By.XPATH, ...)
#   browser.find_elements(By.NAME, ...)
#   browser.find_elements(By.LINK_TEXT, ...)
#   browser.find_elements(By.PARTIAL_LINK_TEXT, ...)
#   browser.find_elements(By.TAG_NAME, ...)
#   browser.find_elements(By.CLASS_NAME, ...)
#   browser.find_elements(By.CSS_SELECTOR, ...)
# The older find_elements_by_* helpers were removed in newer Selenium releases.

7. 元素交互

# demo06.py

import time

from selenium import webdriver
from selenium.webdriver.common.by import By

# Create the browser object.
browser = webdriver.Chrome()
try:
    # Open Taobao.
    browser.get("https://www.taobao.com")
    # Maximise the window.
    browser.maximize_window()
    # Locate the search box (named search_box so the builtin `input`
    # is not shadowed).
    search_box = browser.find_element(By.ID, 'q')
    # Type "内存条" (memory module).
    search_box.send_keys("内存条")
    time.sleep(3)
    # Clear the search box.
    search_box.clear()
    time.sleep(5)
    # Type "1T硬盘" (1 TB hard disk).
    search_box.send_keys("1T硬盘")
    # Locate the search button.
    button = browser.find_element(By.CLASS_NAME, 'btn-search')
    # Click the search button.
    button.click()
    time.sleep(10)
finally:
    # Close the browser.
    browser.quit()

8. 执行 JavaScript

# demo07.py

# 执行 JavaScript
from selenium import webdriver

# JavaScript snippets executed in the page.
SCROLL_TO_BOTTOM_JS = 'window.scrollTo(0,document.body.scrollHeight)'
SHOW_ALERT_JS = 'alert("To Bottom")'

browser = webdriver.Chrome()
browser.get("https://www.taobao.com")
# Scroll the page all the way down.
browser.execute_script(SCROLL_TO_BOTTOM_JS)
# Show an alert dialog.
browser.execute_script(SHOW_ALERT_JS)

9. 获取元素信息(获取属性)

# demo08.py

# 获取元素信息(获取属性)

from selenium import webdriver
from selenium.webdriver.common.by import By

# Read an attribute of an element.
browser = webdriver.Chrome()
try:
    url = "https://www.zhihu.com/"
    browser.get(url)
    logo = browser.find_element(By.CSS_SELECTOR, '.SignFlowHomepage-logo')
    print(logo)
    print(logo.get_attribute('src'))
finally:
    browser.quit()

10. 获取元素信息(获取文本值)

# demo09.py

# 获取元素信息(获取文本值)

from selenium import webdriver
from selenium.webdriver.common.by import By

# Read the text of an element.
browser = webdriver.Chrome()
try:
    url = "https://www.zhihu.com/explore"
    browser.get(url)
    # Named search_box so the builtin `input` is not shadowed.
    search_box = browser.find_element(By.ID, 'Popover1-toggle')
    search_box.send_keys('新冠病毒')
    print(search_box.text)
finally:
    # The original script never closed the browser.
    browser.quit()

11. 获取元素信息(获取ID,位置,标签名,大小)

# demo10.py

# 获取元素信息(获取ID,位置,标签名,大小)

from selenium import webdriver
from selenium.webdriver.common.by import By

# Read an element's id, location, tag name and size.
browser = webdriver.Chrome()
try:
    url = "https://www.zhihu.com/explore"
    browser.get(url)
    # Named search_box so the builtin `input` is not shadowed.
    search_box = browser.find_element(By.ID, 'Popover1-toggle')
    print(search_box.id)        # Selenium's internal element id
    print(search_box.location)  # position on the page
    print(search_box.tag_name)  # tag name
    print(search_box.size)      # width and height
finally:
    browser.quit()

12. 获取元素信息(iframe)

# demo11.py

# 获取元素信息(iframe)

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# Working with elements inside an iframe.
browser = webdriver.Chrome()
try:
    url = "https://www.runoob.com/try/try.php?filename=tryjquery_hide"
    browser.get(url)

    # Switch into the iframe; lookups now search only inside it.
    browser.switch_to.frame('iframeResult')
    button = browser.find_element(By.CSS_SELECTOR, 'button')
    print(button)

    # The logo belongs to the outer page, so it cannot be found from here.
    try:
        logo = browser.find_element(By.CLASS_NAME, 'logo')
    except NoSuchElementException:
        print('NO LOGO')

    # Switch back to the parent frame, where the logo can be found.
    browser.switch_to.parent_frame()
    logo = browser.find_element(By.CLASS_NAME, 'logo')
    print(logo)
    print(logo.text)
finally:
    # Always shut the browser down, even if a lookup above fails.
    browser.quit()

13. 等待

# demo12.py

# 等待

""" 
显式等待就是有条件的等待
隐式等待就是无条件的等待

隐式等待
    当使用了隐式等待执行测试的时候,如果 WebDriver 没有在 DOM 中找到元素,将继续等待,超出设定时间后则抛出找不到元素的异常,
    换句话说,当查找元素或元素并没有立即出现的时候,隐式等待将等待一段时间再查找 DOM,默认的时间是 0

显式等待
    指定某个条件,然后设置最长等待时间。如果在这个时间还没有找到元素,那么便会抛出异常。
    只有该条件触发,才执行后续代码,这个使用更灵活。 
    主要涉及到selenium.webdriver.support 下的expected_conditions类。 
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

browser = webdriver.Chrome()
try:
    browser.get('http://www.taobao.com')
    browser.maximize_window()

    # Implicit wait (alternative): browser.implicitly_wait(10)
    # It is not enabled here because the Selenium documentation warns against
    # mixing implicit and explicit waits: the wait times become unpredictable.

    # Explicit wait: poll for up to 10 seconds until the condition holds.
    wait = WebDriverWait(browser, 10)
    # presence_of_element_located returns the single matching element;
    # the *_all_elements_* variant would return a list instead.
    search_box = wait.until(EC.presence_of_element_located((By.ID, 'q')))
    button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn-search')))
    print(search_box)
    print(button)
finally:
    browser.quit()

14. 浏览器的前进和后退

# demo13.py

# 浏览器的前进和后退
import time
from selenium import webdriver

# Pages to visit in order, building up the browser history.
PAGES = (
    'https://www.baidu.com',
    'https://www.taobao.com',
    'https://www.cnblogs.com/xingxingnbsp/',
)

browser = webdriver.Chrome()
for page in PAGES:
    browser.get(page)
    time.sleep(1)

# Step one page back in the history, then forward again.
browser.back()
time.sleep(2)
browser.forward()
time.sleep(2)
browser.close()

15. Cookies

# demo14.py

# cookies
from selenium import webdriver

# Cookie to add to the current session.
new_cookie = {"name": "name", "domain": "www.zhihu.com", "value": "germey"}

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')

# Cookies set by the site itself.
print(browser.get_cookies())

# After adding our own cookie.
browser.add_cookie(new_cookie)
print(browser.get_cookies())

# After deleting every cookie.
browser.delete_all_cookies()
print(browser.get_cookies())

browser.close()

16. 选项卡管理(不兼容)

# demo15.py

# 选项卡管理
import time
from selenium import webdriver

# Tab management.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    time.sleep(2)

    # Open a new, empty tab.
    browser.execute_script('window.open()')
    print(browser.window_handles)

    # Switch to the new tab. switch_to.window replaces the
    # switch_to_window method, which newer Selenium releases removed.
    browser.switch_to.window(browser.window_handles[1])
    browser.get('https://www.taobao.com')
    time.sleep(2)
    browser.get('https://www.cnblogs.com/xingxingnbsp/')
    time.sleep(3)
finally:
    # quit() closes both tabs; close() would only close the current one.
    browser.quit()

17. 异常处理

 # demo16.py

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.by import By

# Exception handling.
browser = webdriver.Chrome()
try:
    # Loading the page may time out.
    try:
        browser.get('https://www.baidu.com')
    except TimeoutException:
        print('Time Out')

    # Looking up an element that does not exist raises NoSuchElementException.
    try:
        browser.find_element(By.ID, 'hello')
    except NoSuchElementException:
        print('No Element')
finally:
    # Shut the browser down whatever happened above.
    browser.quit()

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM