# python 爬蟲之美麗湯 BeautifulSoup
作者: jwang106
1. 使用requests獲取網頁的html源碼
import requests
from bs4 import BeautifulSoup
# Download the news index page.
response = requests.get('https://www.autohome.com.cn/news/')
# Decode the body using the encoding detected from the content itself
# (apparent_encoding) rather than the one taken from the headers.
response.encoding = response.apparent_encoding
# Bare expression: displays the decoded HTML in a REPL/notebook; no effect in a script.
response.text
requests用法總結
response = requests.get(url)
# get傳參
>>> payload = {'key1': 'value1', 'key2': 'value2', 'key3': None}
>>> r = requests.get('http://httpbin.org/get', params=payload)
# 參數也可以傳遞列表
>>> payload = {'key1': 'value1', 'key2': ['value2', 'value3']}
>>> r = requests.get('http://httpbin.org/get', params=payload)
>>> print(r.url)
http://httpbin.org/get?key1=value1&key2=value2&key2=value3
# 編碼
response.encoding
# 返回headers中的編碼解析的結果
text
# 返回二進制結果
content
# response.json()返回JSON格式,可能拋出異常
response.json()
# 根據響應內容推測出的編碼
apparent_encoding
# 狀態碼 404 200等
status_code
# For convenience, Requests also ships a built-in status-code lookup object
# (requests.codes.ok is 200). print() must be called as a function in Python 3.
print(r.status_code == requests.codes.ok)
2. 使用美麗湯
舉例: 如果目標是爬取某個html里某個id下a標簽的圖片
import uuid

# Parse the downloaded page with the standard-library HTML parser.
soup = BeautifulSoup(response.text, features='html.parser')
# soup.find(id='xxx') is the simplest way to locate an element by id.
# The result of find() can itself be searched again with find()/find_all():
# find() returns the first match (or None), find_all() returns a list of all matches.
target = soup.find(id='auto-channel-lazyload-article')
# If the container is missing, use an empty list so the loop is skipped
# instead of failing with AttributeError on None.
li_list = target.find_all('li') if target is not None else []
for li in li_list:
    a = li.find('a')
    if a is None:
        # This <li> carries no link, so there is nothing to scrape.
        continue
    print(a.attrs.get('href'))
    title = a.find('h3')
    if title is not None:
        print(title.text)
    img = a.find('img')
    src = img.attrs.get('src') if img is not None else None
    if not src:
        # No image (or no src attribute) inside this link.
        continue
    # The site uses protocol-relative URLs ("//host/path"); requests needs a scheme.
    img_url = 'https:' + src if src.startswith('//') else src
    print(img_url)
    img_response = requests.get(url=img_url)
    # A random UUID gives every downloaded image a unique local file name.
    file_name = str(uuid.uuid4()) + '.jpg'
    with open(file_name, 'wb') as f:
        f.write(img_response.content)
打印一下這些元素的type,就更容易懂了
# Print the type of the parsed document, of a find() result,
# and of one element of a find_all() result.
print(type(soup))
print(type(target))
print(type(li_list[0]))
output:
<class 'bs4.BeautifulSoup'>
<class 'bs4.element.Tag'>
<class 'bs4.element.Tag'>
打印一下a
# Take the first <li> and look up the first <a> tag inside it.
a = li_list[0].find('a')
# Bare expression: in a REPL/notebook this displays the tag's attribute dict.
a.attrs
output:
{'href': '//www.autohome.com.cn/news/201901/928448.html#pvareaid=102624'}
可以看到是一個字典,並且汽車之家使用了小技巧來防止爬取,就是href里沒有寫https:(鏈接以 // 開頭,屬於協議相對URL),沒有難度,我們自己在前面加上 'https:' 就ok了。
後面的代碼就很好懂了:使用requests獲取圖片,然後寫入本地文件。
美麗湯總結
# Parse the HTML text with the 'html.parser' backend.
soup = BeautifulSoup(response.text, features='html.parser')
# First <div> in the document.
soup.find('div')
# First element whose id attribute is '1'.
soup.find(id='1')
# First <div> whose id attribute is '1'.
soup.find('div', id='1')
find是找第一個 find_all是所有,返回列表
3. 講一下uuid
通用唯一識別碼(英語:Universally Unique Identifier,UUID)
uuid.uuid1([node[, clock_seq]])
Generate a UUID from a host ID, sequence number, and the current time.
uuid.uuid3(namespace, name)
Generate a UUID based on the MD5 hash of a namespace identifier (which is a UUID) and a name (which is a string).
uuid.uuid4()
Generate a random UUID.