Python3 用正則表達式去掉 HTML 標籤
1.引用一段代碼
import re
# Sample markup: a <pre> element whose text is interrupted by <br> tags.
# Adjacent string literals are joined at compile time, so `html` is a single
# line of text with no newline characters in it.
html = (
    '<pre class="line mt-10 q-content" accuse="qContent">'
    '目的是通過第一次soup.find按class粗略篩選並通過soup.find_all篩選出列表中的a標簽並讀入href和title屬性<br><br>'
    '但是由於目標鏈接可能有圖片鏈接,而這是我不想要的.請問如何去除?<br></pre>'
)
# Matches one tag: '<', then any run of characters other than '>', then '>'.
reg = re.compile('<[^>]*>')
# Replace every tag with the empty string and print what is left.
text_only = reg.sub('', html)
print(text_only)
2.重點
# The whole technique in two steps: compile a pattern that matches any single
# tag, then substitute every match with the empty string.
reg = re.compile('<[^>]*>')
text_only = reg.sub('', html)
print(text_only)
3.實例
開始
import requests
import re
from bs4 import BeautifulSoup
# Request the login page for every fid in range(250, 999) and print the
# <span class="l_schoolName2"> elements found on each page, markup included.
#
# The unused open('test.log', 'r') handle was removed: nothing read from it,
# it was never closed, and it made the script fail when the file was missing.
for x in range(250, 999):
    url = 'http://ananas.mooc1.mti100.com/tologin?fid={0}'.format(x)
    try:
        # timeout=1 keeps a slow or dead fid from stalling the whole scan.
        response = requests.get(url, timeout=1).text
    except (
        requests.exceptions.InvalidURL,
        requests.exceptions.ConnectionError,
        requests.exceptions.ReadTimeout,
    ):
        # Best-effort scan: a fid that cannot be fetched is skipped, not fatal.
        continue
    soup = BeautifulSoup(response, features="lxml")
    result = soup.find_all('span', attrs={'class': 'l_schoolName2'})
    # result is formatted as-is, so the output still contains the <span> tags.
    print('學校:{0}'.format(result))
輸出
學校:[<span class="l_schoolName2" id="schoolName2">
杭州師范大學
</span>]
學校:[<span class="l_schoolName2" id="schoolName2">
去除標籤之後
import requests
import re
from bs4 import BeautifulSoup
# Request the login page for every fid in range(250, 999), collect the
# <span class="l_schoolName2"> elements and print them with the markup
# stripped, so that only the text between the tags remains.
#
# The pattern is compiled once, outside the loop, because it never changes.
# [^>] already matches newlines, so tags spanning several lines are removed
# with or without the re.S flag.
reg = re.compile('<[^>]*>', re.S)
for x in range(250, 999):
    url = 'http://ananas.mooc1.mti100.com/tologin?fid={0}'.format(x)
    try:
        # timeout=1 keeps a slow or dead fid from stalling the whole scan.
        response = requests.get(url, timeout=1).text
    except (
        requests.exceptions.InvalidURL,
        requests.exceptions.ConnectionError,
        requests.exceptions.ReadTimeout,
    ):
        # Best-effort scan: a fid that cannot be fetched is skipped, not fatal.
        continue
    soup = BeautifulSoup(response, features="lxml")
    result = soup.find_all('span', attrs={'class': 'l_schoolName2'})
    # str(result) renders the matches as "[<span ...>...</span>]"; removing the
    # tags leaves the enclosing brackets and the inner text (or "[]" when
    # nothing matched).
    print('學校:{0}'.format(reg.sub('', str(result))))
輸出
學校:[]
學校:[]
學校:[
上海電子信息職業技術學院
]
學校:[]
學校:[
超星大學