網頁獲取使用 requests 套件,網頁解析的方式有 re(正則表達式)與 BeautifulSoup 兩種。
1.網頁獲取:
import requests
# Query the DBLP publication-search API (XML output) and print the raw response.
url="https://dblp.uni-trier.de/search/publ/api?q=heterogeneous%20graph%20year%3A2021%3A%7Cyear%3A2020%3A%20venue%3AInf._Sci.%3A&h=1000&format=xml"
# A timeout keeps the script from blocking forever if the server stalls.
response = requests.get(url, timeout=30)  # fetch the page
# Fail loudly on a 4xx/5xx status instead of printing an error page as if it were data.
response.raise_for_status()
# Use the charset detected from the body so that response.text is decoded with it.
encoding = response.apparent_encoding
response.encoding = encoding  # set the character set
print(response.text)  # print the XML of the page
2.網頁解析:
import re
# Pull the fields of each <hit> out of the XML text with a single regular expression.
# Every named group in the pattern is mandatory, so a successful match always
# yields a string for each of them (never None).
# NOTE(review): with re.S and lazy `.*?`, a hit that lacks one of these elements
# may be skipped or matched across into a following hit - verify on real data.
obj = re.compile(r'<hit.*?pid=".*?">(?P<name>.*?)</author>.*?<title>(?P<title>.*?)</title><venue>(?P<venue>.*?)</venue><volume>(?P<volume>.*?)</volume><pages>(?P<pages>.*?)</pages><year>(?P<year>.*?)</year>.*?<doi>(?P<doi>.*?)</doi><ee>(?P<ee>.*?)</ee>',re.S)

# (regex group name, printed prefix) pairs, in the order they are stored and printed.
_FIELDS = (
    ("name", "firstAuthor:"),  # first author
    ("title", "title:"),
    ("venue", "venue:"),
    ("volume", "volume:"),
    ("pages", "pages:"),
    ("year", "year:"),
    ("doi", "doi:"),
    ("ee", "ee:"),
)

# `xml`, `list` and `i` are not defined in this fragment; they are expected to
# come from surrounding code (presumably `xml` is the fetched response text and
# `list[i]` is the row being filled - TODO confirm).
# NOTE(review): `list` shadows the builtin; consider renaming it where it is defined.
result = obj.finditer(xml)
for it in result:
    record = list[i]
    for group_name, prefix in _FIELDS:
        value = it.group(group_name)
        record.append(value)
        print(prefix + value)
# Dump everything collected once all matches have been processed.
print(list)