一、BeautifulSoup安裝
pip install beautifulsoup4 lxml
二、使用示例
from bs4 import BeautifulSoup

# Sample document used by every section below.  The literal is split into
# adjacent string pieces purely for readability; the concatenated value is
# a single line of HTML with single spaces between the elements.
html_doc = (
    " <html><head><title>The Dormouse's story</title></head> <body> asdf "
    '<div class="title"> '
    "<b>The Dormouse's story總共</b> <h1>f</h1> </div> "
    '<div class="story">Once upon a time there were three little sisters; '
    'and their names were '
    '<a class="sister0" id="link1">Els<span>f</span>ie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; '
    'and they lived at the bottom of a well.</div> '
    'ad<br/>sf <p class="story">...</p> </body> </html> '
)

# Parse with the lxml parser (the lxml package must be installed separately).
soup = BeautifulSoup(html_doc, features="lxml")
1.name標簽名稱
tag1 = soup.find('a')
print(tag1)  # the first <a> tag in the document
name = tag1.name  # read the tag name
print(name)
tag1.name = 'span'  # rename the tag to <span>
print(soup)  # whole document, now showing the renamed tag
# Put the original name back so the later sections, which look up <a> tags
# in the same shared soup, still find this element.
tag1.name = name
2.attrs標簽屬性
tag2 = soup.find('a')
attrs = tag2.attrs  # dict of every attribute on the first <a> tag
print(attrs)

# All <a> tags whose attributes include id="link1".
link1 = soup.find_all('a', attrs={'id': 'link1'})
print(link1)

tag2.attrs = {'ik': 123}  # replace the whole attribute dict
print(tag2.attrs)
tag2.attrs['id'] = 'xxxx'  # add a single attribute
print(tag2.attrs)
tag2.attrs['id'] = 'qqq'  # overwrite an existing attribute
print(tag2.attrs)
3.find與find_all查找區別
# find returns only the first matching tag.
tag3 = soup.find('a')
print(tag3)

# find_all returns every matching tag.
tag4 = soup.find_all('a')
print(tag4)
4.clear,將標簽的所有子標簽全部清空(保留標簽名)
# clear() removes every child of the tag but keeps the tag itself.
# Demonstrate it on a freshly parsed document so the shared `soup` is left
# intact for the sections that follow.
scratch = BeautifulSoup(html_doc, features="lxml")
tag5 = scratch.find('body')
tag5.clear()
print(scratch)
5.has_attr,檢查標簽是否具有該屬性
tag6 = soup.find('a')
v = tag6.has_attr('id')  # True when the tag carries an id attribute
print(v)
6.get_text,獲取標簽內部文本內容
tag7 = soup.find('a')
# get_text() takes no attribute name: its first positional argument is the
# separator inserted between text fragments, so it is left at its default.
v = tag7.get_text()
print(v)
7.decompose,遞歸地刪除該標簽及其所有子標簽(包含標簽本身)
# decompose() removes the tag and everything inside it from the tree.
# Demonstrate it on a freshly parsed document so the shared `soup` keeps
# its <body> for the sections that follow.
scratch = BeautifulSoup(html_doc, features="lxml")
body = scratch.find('body')
body.decompose()
print(scratch)
8.extract,遞歸地刪除該標簽及其所有子標簽,並返回被刪除的標簽
# extract() removes the tag from the tree and returns it.
# Demonstrate it on a freshly parsed document so the shared `soup` keeps
# its <body> for the sections that follow.
scratch = BeautifulSoup(html_doc, features="lxml")
body = scratch.find('body')
removed = body.extract()
print(removed)  # the detached <body> element
print(scratch)  # the document without <body>
9.decode,轉換為字符串(含當前標簽);decode_contents(不含當前標簽)
body = soup.find('body')
# decode_contents() renders only what is inside the tag; use body.decode()
# instead to include the <body> tag itself.
v = body.decode_contents()
print(v)
10.encode,轉換為字節(含當前標簽);encode_contents(不含當前標簽)
body = soup.find('body')
# encode_contents() renders only what is inside the tag, as bytes; use
# body.encode() instead to include the <body> tag itself.
v = body.encode_contents()
print(v)
11.標簽的內容
tag8 = soup.find('span')
print(tag8.string)  # read the tag's text content
print(tag8)
tag8.string = 'new content'  # replace the tag's content
print(tag8)

tag9 = soup.find('body')
# stripped_strings walks the tag recursively and yields each text fragment
# with surrounding whitespace removed; it is lazy, so printing it shows the
# generator object rather than the strings.
v = tag9.stripped_strings
print(v)
print(next(v))  # first text fragment
12.children,所有子標簽
body = soup.find('body')
# .children iterates over the direct children of the tag only.
v = body.children
13.descendants,所有子子孫孫標簽
body = soup.find('body')
# .descendants iterates over every nested node: children, their children,
# and so on.
v = body.descendants
更多用法可以查閱官方文檔:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html