喜歡我的博客可以加關注,有問題可以提問我。
1.基本使用(下面的 html 由於過長就不複製了,都複用第一個)
from bs4 import BeautifulSoup

# Sample markup parsed by this snippet; kept on one line so the literal is unchanged.
html = """ <html> <head><title>dsojfeoifjosieofiej</title></head> <meta http-equiv="content-type" content="text/html;charset=utf-8"> <meta http-equiv="X-UA-Compatible" content="IE=Edge"> <meta content="always" name="referrer"> <meta name="theme-color" content="#2932e1"> <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" /> <link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="百度搜索" /> <link rel="icon" sizes="any" mask href="//www.baidu.com/img/baidu_85beaf5496f291521eb75ba38eacbd87.svg"> <link rel="dns-prefetch" href="//s1.bdstatic.com"/> <link rel="dns-prefetch" href="//t11.baidu.com"/> <link rel="dns-prefetch" href="//t12.baidu.com"/> <link rel="dns-prefetch" href="//b1.bdstatic.com"/> """

# Parse the markup with the lxml backend.
soup = BeautifulSoup(html, 'lxml')

# Print the re-indented document, then the text of its <title> tag.
pretty_markup = soup.prettify()
print(pretty_markup)
title_tag = soup.title
print(title_tag.string)
2.選擇元素
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Accessing a tag name as an attribute selects an element of that name.
print(soup.title)
print(soup.head)
print(soup.p)  # only the first <p> is output
3.獲取名稱
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Print the tag name of the <p> element (presumes `html` contains one).
first_paragraph = soup.p
print(first_paragraph.name)
4.獲取屬性
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Two ways to read the "name" attribute of a <p> tag:
# through the attrs mapping, and by indexing the tag directly.
paragraph_attrs = soup.p.attrs
print(paragraph_attrs['name'])
paragraph = soup.p
print(paragraph['name'])
5.獲取內容
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Print the text content of the <p> element (presumes `html` contains one).
first_paragraph = soup.p
print(first_paragraph.string)
6.嵌套選擇
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Attribute access can be chained: <head>, then its <title>, then the title text.
head_tag = soup.head
title_tag = head_tag.title
print(title_tag.string)
7.子節點和子孫節點
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Child nodes, via .contents.
print(soup.p.contents)

# Child nodes, via .children: print the object itself, then each child with its index.
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)

# Descendant nodes, via .descendants.
# (The attribute was misspelled "desccendants", which bs4 resolves as a tag
# lookup instead of the descendants iterator.)
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)
8.父節點和祖先節點
from bs4 import BeautifulSoup

# Parent node: the element that contains the <a> tag.
soup = BeautifulSoup(html, 'lxml')
anchor = soup.a
print(anchor.parent)

# Ancestor nodes: re-parse so this example stands alone, then list
# every ancestor of the <a> tag together with its index.
soup = BeautifulSoup(html, 'lxml')
indexed_ancestors = list(enumerate(soup.a.parents))
print(indexed_ancestors)
9.兄弟節點
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Sibling nodes of the <a> tag via next_siblings, paired with their index.
following = list(enumerate(soup.a.next_siblings))
print(following)

# Sibling nodes of the <a> tag via previous_siblings, paired with their index.
preceding = list(enumerate(soup.a.previous_siblings))
print(preceding)
10.標準選擇器
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# find_all() can be called on a tag as well: for each <ul>, print its <li> items.
unordered_lists = soup.find_all('ul')
for unordered_list in unordered_lists:
    items = unordered_list.find_all('li')
    print(items)
10.1加參數
from bs4 import BeautifulSoup

# Filter by attribute using an explicit attrs dictionary.
soup = BeautifulSoup(html, 'lxml')
by_id = soup.find_all(attrs={'id': 'list-1'})
print(by_id)
by_name = soup.find_all(attrs={'name': 'elements'})
print(by_name)

# Filter using keyword arguments instead; `class_` carries a trailing
# underscore because `class` is a reserved word in Python.
soup = BeautifulSoup(html, 'lxml')
by_id_keyword = soup.find_all(id='list-1')
print(by_id_keyword)
by_class_keyword = soup.find_all(class_='elements')
print(by_class_keyword)
10.2text
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Match on text content; the result is the matched content, not the tags.
# `string=` is the current name of the older `text=` keyword (bs4 >= 4.4.0).
print(soup.find_all(string='Foo'))
10.3 find(返回單個元素,也就是第一個匹配的元素)
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# find() returns a single element: the first match.
first_ul = soup.find('ul')
print(first_ul)

# Show the type of the object that find() returned.
print(type(soup.find('ul')))

# Look up a tag name that is not expected to be in the document.
missing = soup.find('page')
print(missing)
10.4 find_parents() 與 find_parent()(這裡和上面的類似,就不粘貼代碼了)
10.5 find_next_siblings() 與 find_next_sibling()(這裡和上面的類似,就不粘貼代碼了)
11. CSS 選擇器
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Elements with class "panel-heading" nested under an element with class "panel".
panel_headings = soup.select('.panel .panel-heading')
print(panel_headings)

# <li> tags nested under a <ul> tag.
list_items = soup.select('ul li')
print(list_items)

# Elements with class "element" under the element whose id is "list-2".
list_2_elements = soup.select('#list-2 .element')
print(list_2_elements)

# Type of a single item taken from a select() result.
first_ul = soup.select('ul')[0]
print(type(first_ul))
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# select() can be called on a tag too: for each <ul>, print the <li> items inside it.
# (The inner selector was 'ul', which would only match lists nested in lists.)
for ul in soup.select('ul'):
    print(ul.select('li'))
11.1 獲取屬性
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# For every selected <ul>, read its id twice: by indexing the tag,
# and through the tag's attrs mapping.
selected_lists = soup.select('ul')
for list_tag in selected_lists:
    print(list_tag['id'])
    print(list_tag.attrs['id'])
11.2 獲取內容
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'lxml')

# Print the text content of every <li> matched by the CSS selector.
list_items = soup.select('li')
for item in list_items:
    item_text = item.get_text()
    print(item_text)