BeautifulSoup4-提取HTML中所有URL鏈接

本文為轉載文章，請查看原文。 2020-06-27 07:31 1102

'''

Extract every URL link found in an HTML document.

'''

import requests
from bs4 import BeautifulSoup
import re

# The markup below is a local copy of the python123 demo page, so the script
# can run without network access. To fetch the live page instead:
# r = requests.get("https://python123.io/ws/demo.html")
# demo = r.text

# NOTE(review): the <p>/<b> wrappers were missing from this copy (apparently
# stripped when the article was republished), yet the rest of the script
# expects them: the recorded tag listing contains "p, b, p" and a later query
# looks for <p class="course">. They are restored here; class="title" on the
# first paragraph is taken from the upstream demo page -- confirm.
demo = """
<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><b>The demo python introduces several python courses.</b></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1">Basic Python</a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
</body></html>
"""

# find_all(name, attrs, recursive, string, limit, **kwargs) searches the parse
# tree for matching elements. Call shorthand:
#   <tag>(..)  is equivalent to  <tag>.find_all(..)
#   soup(..)   is equivalent to  soup.find_all(..)

# Parse the demo markup with Python's built-in "html.parser" backend.
soup = BeautifulSoup(demo, "html.parser")

# Step 1: collect every <a> tag. Step 2: read the href attribute of each one.
for link in soup.find_all('a'):
    # .get() yields None instead of raising when an <a> has no href.
    print(link.get("href"))

# Look up every <a> tag by name. The bare string on the next-but-one line is
# the output recorded in the original article; it is a no-op expression.
print(soup.find_all(name='a'))
'[<a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a>, <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>]'

# A list of names matches any of them: here both <a> and <b> tags.
print(soup.find_all(name=['a', 'b']))

# Passing True matches every tag in the document (text strings are skipped),
# so this prints the name of each tag that was parsed.
for tag in soup.find_all(True):
    print(tag.name)

# Output recorded in the original article (kept as a no-op string literal):
'''
html
head
title
body
p
b
p
a
a
'''

# Print only the tags whose name starts with "b" (e.g. <body> and <b>).
# Beautiful Soup applies a compiled pattern with search(), so the "^" anchor
# is required: a bare "b" would match any tag name that merely contains "b".
for tag in soup.find_all(re.compile(r'^b')):
    print(tag.name)

# A string passed as the second positional argument is matched against the
# CSS class: find the <p> tags whose class is "course".
print(soup.find_all('p', 'course'))

# Keyword arguments filter on attributes: tags whose id equals "link1".
print(soup.find_all(id="link1"))

# Tags whose id starts with "link". The pattern is applied with search(), so
# it is anchored with "^" to mean "starts with" rather than "contains".
print(soup.find_all(id=re.compile(r"^link")))

# Filter through an explicit attribute dictionary: tags whose class is "py1".
print(soup.find_all(attrs={"class": "py1"}))

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找： JAVA提取字符串中所有的URL鏈接，並加上a標簽 ｜ js去除字符串中所有html標簽替換某特殊字符以及獲取URL 參數 ｜ Python中BeautifulSoup中對HTML標簽的提取 ｜ 獲取SpringBoot中所有的url和其參數 ｜ html之超鏈接及URL ｜ JAVA匹配html中所有img標簽 ｜ 使用BeautifulSoup 爬取一個頁面上的所有的超鏈接 ｜ 提取網頁里所有鏈接的方法 ｜ JAVA獲取請求鏈接中所有參數（GET請求） ｜ java爬取網站中所有網頁的源代碼和鏈接