python beautifulsoup 對html 進行爬取分類（部分）

本文轉載自查看原文 2019-11-09 10:14 248

# Demo: parse a small, incomplete HTML document, then print the re-formatted
# markup and the text of its <title>.
from bs4 import BeautifulSoup

html = '''
<html><head><title>The Domouse's story</title></head>
<body>
The Dormouse's story
Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.
...
'''

soup = BeautifulSoup(html, features='lxml')

# prettify() formats the parsed tree; the printed result has the markup that
# was missing from the input filled in automatically.
pretty_markup = soup.prettify()
print(pretty_markup)

# Title of the document.
page_title = soup.title.string
print(page_title)

結果：
<html>
<head>
<title>
 The Domouse's story
</title>
</head>
<body>

 
 The Dormouse's story
 


 Once upon a time there were little sisters;and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">
 
 </a>
 <a class="sister" hred="http://example.com/lacle" id="link2">
 Lacle
 </a>
 and
 <a class="sister" hred="http://example.com/tilie" id="link3">
 Tillie
 </a>
 and they lived at bottom of a well.


 ...

</body>
</html>
The Domouse's story

選擇元素
# Demo: select elements by using the tag name as an attribute of the soup.
from bs4 import BeautifulSoup

# The <p>/<b> markup matters here: without any <p> element in the document,
# soup.p is None and the last print below would show "None" instead of a tag.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
soup = BeautifulSoup(html, 'lxml')

print(soup.title)
# <title>The Domouse's story</title>
print(type(soup.title))
# <class 'bs4.element.Tag'>
print(soup.head)
# <head><title>The Domouse's story</title></head>
print(soup.p)  # when several tags match, only the first one is returned
# <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

獲取標籤名稱：
# Demo: read the name of a tag through its .name attribute.
from bs4 import BeautifulSoup

html = '''
<html><head><title>The Domouse's story</title></head>
<body>
The Dormouse's story
Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.
...
'''

soup = BeautifulSoup(html, features='lxml')

title_tag = soup.title
print(title_tag.name)
# title

獲取屬性：
# Demo: read an attribute of a tag, either through .attrs or by indexing
# the tag directly.
from bs4 import BeautifulSoup

# The first <p> must exist and carry name="dromouse": without any <p> element
# soup.p is None and both lookups below fail before printing anything.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
soup = BeautifulSoup(html, 'lxml')

# Both forms read the same attribute of the first <p>.
print(soup.p.attrs['name'])
# dromouse
print(soup.p['name'])
# dromouse

獲取標籤內容：
# Demo: read the text content of a tag through its .string attribute.
from bs4 import BeautifulSoup

# The first <p> must exist: without any <p> element soup.p is None and
# soup.p.string raises AttributeError instead of printing the text.
html = '''
<html><head><title>The Domouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were little sisters;and their names were
<a href="http://example.com/elsie"class="sister"id="link1"></a>
<a hred="http://example.com/lacle"class="sister"id="link2">Lacle</a>and
<a hred="http://example.com/tilie"class="sister"id="link3">Tillie</a>
and they lived at bottom of a well.</p>
<p class="story">...</p>
'''
soup = BeautifulSoup(html, 'lxml')

# The first <p> has a single child (<b>), so .string resolves to that
# child's text.
print(soup.p.string)
# The Dormouse's story

根據name查找
# Demo: look up every element with a given tag name using find_all().
from bs4 import BeautifulSoup

html = '''
<div class="panel">
 <div class="panel-heading"name="elements">
 <h4>Hello</h4>
 </div>
 <div class="panel-body">
 <ul class="list"Id="list-1">
 <li class="element">Foo</li>
 <li class="element">Bar</li>
 <li class="element">Jay</li>
 </ul>
 <ul class="list list-small"Id="list-2">
 <li class="element">Foo</li>
 <li class="element">Bar</li>
 </ul>
 </div>
<div>
'''

soup = BeautifulSoup(html, features='lxml')

# find_all() gives back a list of the matching tags.
ul_tags = soup.find_all('ul')
print(ul_tags)

# Each entry of that list is a Tag object.
first_ul = ul_tags[0]
print(type(first_ul))

結果：

[<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>, <ul class="list list-small" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
</ul>]
<class 'bs4.element.Tag'>

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 Python和BeautifulSoup進行網頁爬取使用 BeautifulSoup 和 Selenium 進行網頁爬取 Python使用BeautifulSoup爬取網頁信息 python3.8通過python selenium+requests+BeautifulSoup+ BrowserMobProxy對頁面進行徹底爬取 python 爬取html頁面 python爬蟲（BeautifulSoup）爬取B站視頻字幕 Python之爬取網頁時遇到的問題——BeautifulSoup python網絡爬蟲之解析網頁的BeautifulSoup(爬取電影圖片)[三] Python之爬取網頁時遇到的問題——BeautifulSoup python+beautifulsoup爬取華為應用市場的應用信息