BeautifulSoup4-提取HTML中所有URL鏈接


'''
Extract all URL links from an HTML document.
'''

 

import requests
from bs4 import BeautifulSoup
import re

# The demo page can also be fetched over the network; the request is left
# commented out and the HTML is defined inline below instead.
# r = requests.get("https://python123.io/ws/demo.html")
# demo = r.text

# HTML document parsed by every example below.
demo = """
<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><b>The demo python introduces several python courses.</b></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
</body></html>
"""

# Notes on find_all(name, attrs, recursive, string, **kwargs):
#   <tag>(..)  is equivalent to  <tag>.find_all(..)
#   soup(..)   is equivalent to  soup.find_all(..)

soup = BeautifulSoup(demo, "html.parser")

# Extract every URL: 1) find all <a> tags, 2) read each tag's href attribute.
for link in soup.find_all("a"):
    print(link.get("href"))

# Find the <a> tags; the result is a list of matching tags.
print(soup.find_all("a"))
# Output recorded by the original author (attribute order may differ by parser/version):
# [<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]

# Passing a list of names finds <a> and <b> tags in a single call.
print(soup.find_all(["a", "b"]))

# Passing True matches every tag in the document.
for tag in soup.find_all(True):
    print(tag.name)
# Output recorded by the original author:
# html
# head
# title
# body
# p
# b
# p
# a
# a

# Show only tags whose name starts with "b" (here <body> and <b>).
# The pattern is anchored with ^ because Beautiful Soup applies regex filters
# with search(); an unanchored "b" would match a "b" anywhere in the name.
for tag in soup.find_all(re.compile(r"^b")):
    print(tag.name)

# A string given as the second positional argument is matched against the
# CSS class, so this returns the <p class="course"> element.
print(soup.find_all("p", "course"))

# Return the elements whose id attribute equals "link1".
print(soup.find_all(id="link1"))

# Return the elements whose id attribute starts with "link" (anchored for the
# same search() reason as above).
print(soup.find_all(id=re.compile(r"^link")))

# Attribute filters can also be passed as a dict via attrs.
print(soup.find_all(attrs={"class": "py1"}))


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM