Python3.x：bs4解析html基礎用法

代碼：

import urllib.request
from bs4 import BeautifulSoup
import re

# Page to download (a fund detail page on eastmoney.com).
url = r'http://fund.eastmoney.com/340007.html?spm=search'
# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
req = urllib.request.Request(url=url, headers=headers)
# The with-statement closes the HTTP response even if read()/decode() raises;
# the timeout keeps the script from blocking forever on an unresponsive server.
with urllib.request.urlopen(req, timeout=30) as res:
    # The body is decoded as UTF-8 text before being handed to the parser.
    html = res.read().decode('utf-8')

# Parse the page with the standard-library "html.parser" backend
# (no extra install needed).
soup = BeautifulSoup(html, "html.parser")
# Alternatively: soup = BeautifulSoup(html, "html5lib")

# Print the first <title> tag
print(soup.title)
# Print the tag name of the first <title> tag
print(soup.title.name)
# Print the string contained in the first <title> tag
print(soup.title.string)
# Print the tag name of the parent of the first <title> tag
print(soup.title.parent.name)

# Print the first <p> tag
print(soup.p)
# Print the class attribute of the first <p> tag
print(soup.p['class'])
# Print the href attribute of the first <a> tag
print(soup.a['href'])

# Print all direct child nodes of the first <p> tag (as a list)
print(soup.p.contents)

# Print the first <a> tag
print(soup.a)
# Print every <a> tag in the document, shown as a list
print(soup.find_all('a'))

# Print the first tag whose id attribute equals gz_gszze
print(soup.find(id='gz_gszze'))
# Print the text content of the first tag whose id attribute equals gz_gszze
print(soup.find(id='gz_gszze').get_text())

# Print all text content of the document
print(soup.get_text())
# Print all attributes of the first <a> tag
print(soup.a.attrs)

# Loop over every <a> tag
for link in soup.find_all('a'):
    # Print the href attribute of this link (get() is used instead of indexing)
    print(link.get('href'))

# Loop over the direct children of soup.p and print each one
for child in soup.p.children:
    print(child)

# Regex match: tags whose tag name contains "sp"
for tag in soup.find_all(re.compile("sp")):
    print(tag.name)

# Searching by CSS class is very handy, but "class" is a reserved word in
# Python, so using it as a keyword argument is a syntax error. Since
# Beautiful Soup 4.1.1 the class_ keyword argument can be used to search for
# tags with a given CSS class.
# Find all <dl> tags whose class is dataItem02
for tag in soup.find_all("dl", class_="dataItem02"):
    print(tag.name)
# Or, equivalently, pass the class through the attrs dict
for tag in soup.find_all('dl', attrs={'class': "dataItem02"}):
    print(tag.name)

# Find every <span> tag whose class matches the regex 'ui-font-' and print
# its text.
# The search starts from soup, not from `child`: at this point `child` is only
# the variable left over from the earlier loop over soup.p.children (its last
# child, which may be a plain text node), so searching from it was accidental.
for tagspan in soup.find_all("span", class_=re.compile('ui-font-')):
    print(tagspan.get_text())

# List used to collect the child nodes found below.
content_list = []
# Find all <dl> tags whose class is dataItem02 (class_ is used because "class"
# is a reserved word in Python).
for tag in soup.find_all("dl", class_="dataItem02"):
    # Print every direct child of the <dl> tag and keep it in the list.
    for child in tag.children:
        print(child)
        content_list.append(child)
# Print the text of the first collected node. The list is empty when the page
# has no matching <dl> (or it has no children), so guard the index access
# instead of failing with IndexError.
if content_list:
    print('content_list[0]：'+content_list[0].get_text())

find與find_all一起用：

    # The links we want are all the <a> tags inside the first <div> whose
    # class is 'postlist'.
    # Types involved:
    #   BeautifulSoup(...) -> <class 'bs4.BeautifulSoup'>
    #   find(...)          -> <class 'bs4.element.Tag'>
    #   find_all(...)      -> <class 'bs4.element.ResultSet'>
    # A ResultSet cannot be searched again with find/find_all, so find() is
    # used first to get a single Tag and find_all() is then called on that Tag.
    post_div = soup.find('div', class_='postlist')
    all_a = post_div.find_all('a', target='_blank')
    for a in all_a:
        title = a.get_text()  # text content of the link
        # Skip links that have no text at all.
        if title:
            print("標題：" + title)

# The maximum page number is the text of the span at index 10
# (i.e. the 11th <span> in the document).
pic_max = soup.find_all('span')[10].text

# Title: text of the <h2> tag whose class is 'main-title'.
title = soup.find('h2',class_='main-title').text

# The picture is the <img> tag whose alt attribute is '圖書'.
# NOTE(review): `mess`, `requests` and `file_name` are not defined or imported
# in the visible part of this file -- confirm they are provided elsewhere.
pic_url = mess.find('img',alt = '圖書')
# pic_url['src'] is the value of the tag's src attribute; download it.
html = requests.get(pic_url['src'],headers = headers)

# A picture is not a text file, so it is written in binary mode using
# html.content (the raw bytes of the response).
# open(path + file name, mode); common modes: r read-only, r+ read/write,
# w create (overwrites an existing file), a append, b binary.
# The with-statement closes the file even if write() raises.
with open(file_name,'wb') as f:
    f.write(html.content)

# Basic usage of re.findall(pattern, string, flags=0): it returns a list with
# every non-overlapping match of pattern in string.
# All examples below search the same sample URL.
sample_url = "http://www.cnblogs.com/lizm166/p/8143231.html"

# Example 1: a plain literal pattern. The r prefix makes it a raw string,
# which is the usual way to write regex patterns.
str_1 = re.findall(r"com", sample_url)
print(str_1)
# Output: ['com']

# Example 2: ^ anchors the match at the start of the string, so this only
# matches "http" when the string begins with it.
str_2 = re.findall(r"^http", sample_url)
print(str_2)
# Output: ['http']

# Example 3: $ anchors the match at the end of the string, so this only
# matches "html" when the string ends with it.
str_3 = re.findall(r"html$", sample_url)
print(str_3)
# Output: ['html']

# Example 4: [...] matches any single character listed inside the brackets.
# The alternatives are written back to back; a comma inside the brackets would
# itself be treated as one of the characters to match.
str_4 = re.findall(r"[nw]b", sample_url)
print(str_4)
# Output: ['nb']

# Example 5: \d matches a single decimal digit; \d\d\d matches three in a row
# (matches do not overlap, so the trailing lone digit is not reported).
str_5 = re.findall(r"\d", sample_url)
str_6 = re.findall(r"\d\d\d", sample_url)
print(str_5)
# Output: ['1', '6', '6', '8', '1', '4', '3', '2', '3', '1']
print(str_6)
# Output: ['166', '814', '323']

# Example 6: lowercase \d matches digits; uppercase \D is the opposite and
# matches every character that is not a digit.
str_7 = re.findall(r"\D", sample_url)
print(str_7)
# Output: ['h', 't', 't', 'p', ':', '/', '/', 'w', 'w', 'w', '.', 'c', 'n', 'b', 'l', 'o', 'g', 's', '.', 'c', 'o', 'm', '/', 'l', 'i', 'z', 'm', '/', 'p', '/', '.', 'h', 't', 'm', 'l']

# Example 7: \w matches a "word" character: letters, digits and the
# underscore (for str patterns this also covers Unicode word characters).
str_8 = re.findall(r"\w", sample_url)
print(str_8)
# Output: ['h', 't', 't', 'p', 'w', 'w', 'w', 'c', 'n', 'b', 'l', 'o', 'g', 's', 'c', 'o', 'm', 'l', 'i', 'z', 'm', '1', '6', '6', 'p', '8', '1', '4', '3', '2', '3', '1', 'h', 't', 'm', 'l']

# Example 8: \W is the opposite of \w and matches every character that is not
# a letter, digit or underscore (punctuation, whitespace, ...).
str_9 = re.findall(r"\W", sample_url)
print(str_9)
# Output: [':', '/', '/', '.', '.', '/', '/', '/', '.']

# Get all <a> tags whose target attribute is _blank (the attribute filter is
# passed through the attrs dict, equivalent to the keyword-argument form).
tr.find_all('a', attrs={'target': '_blank'})

作者：整合俠
鏈接：http://www.cnblogs.com/lizm166/p/8205085.html
來源：博客園
著作權歸作者所有。商業轉載請聯繫作者獲得授權，非商業轉載請注明出處。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 bs4 python解析html bs4 解析以及用法 Python（00）：BeautifulSoup(BS4)解析HTML和XML python 3.x 爬蟲基礎---Requersts,BeautifulSoup4（bs4） Python爬蟲bs4解析實戰 Python：數據解析（bs4 / xpath） python爬蟲基礎_requests和bs4 python的基礎爬蟲（利用requests和bs4） bs4解析庫 bs4和xpath的用法