這是一個使用 requests 和 BeautifulSoup 庫、簡單爬取單個網頁中所有超鏈接的小爬蟲。有任何問題歡迎留言討論。
import requests
from bs4 import BeautifulSoup


def getHTMLText(url):
    """Fetch a web page and return its HTML as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body on success. If the request fails
        (any ``requests.RequestException``, which includes the HTTP error
        raised by ``raise_for_status``), the literal string '產生異常' is
        returned instead of raising.
    """
    try:
        # Send the request with a 6-second timeout.
        res = requests.get(url, timeout=6)
        # Raise requests.HTTPError when the server answers with an error
        # status (4xx/5xx).
        res.raise_for_status()
        # Decode using the encoding guessed from the body rather than the
        # one declared in the headers.
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        # Only request-related failures are turned into the fallback string;
        # KeyboardInterrupt and genuine programming errors now propagate.
        return '產生異常'


def main():
    """Download the target page and print the href of every <a> tag in it."""
    # Target page; replace it with any site you like.
    url = 'https://www.cnblogs.com/huwt/'
    demo = getHTMLText(url)
    # NOTE: on failure `demo` is the fallback string, which parses to a
    # document without links, so nothing is printed in that case.
    soup = BeautifulSoup(demo, 'html.parser')
    # Select every <a> tag that carries an href attribute.
    a_labels = soup.find_all('a', attrs={'href': True})
    # Print the value of each href, i.e. the hyperlink itself.
    for a in a_labels:
        print(a.get('href'))


# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
測試結果:
https://www.cnblogs.com/huwt/ https://www.cnblogs.com/huwt/ https://www.cnblogs.com/ https://www.cnblogs.com/huwt/ https://i.cnblogs.com/EditPosts.aspx?opt=1 https://msg.cnblogs.com/send/%E8%B7%AF%E6%BC%AB%E6%BC%AB%E6%88%91%E4%B8%8D%E7%95%8F https://www.cnblogs.com/huwt/rss https://i.cnblogs.com/ https://www.cnblogs.com/huwt/archive/2019/04/10.html https://www.cnblogs.com/huwt/p/10680209.html https://www.cnblogs.com/huwt/p/10680209.html https://i.cnblogs.com/EditPosts.aspx?postid=10680209 https://www.cnblogs.com/huwt/p/10685968.html https://www.cnblogs.com/huwt/p/10685968.html https://i.cnblogs.com/EditPosts.aspx?postid=10685968 https://www.cnblogs.com/huwt/archive/2019/04/08.html https://www.cnblogs.com/huwt/p/10673470.html https://www.cnblogs.com/huwt/p/10673470.html https://i.cnblogs.com/EditPosts.aspx?postid=10673470 https://www.cnblogs.com/huwt/archive/2019/03/31.html https://www.cnblogs.com/huwt/p/10633896.html https://www.cnblogs.com/huwt/p/10633896.html https://i.cnblogs.com/EditPosts.aspx?postid=10633896 https://www.cnblogs.com/huwt/p/10632084.html https://www.cnblogs.com/huwt/p/10632084.html https://i.cnblogs.com/EditPosts.aspx?postid=10632084 https://www.cnblogs.com/huwt/archive/2019/03/30.html https://www.cnblogs.com/huwt/p/10629625.html https://www.cnblogs.com/huwt/p/10629625.html https://i.cnblogs.com/EditPosts.aspx?postid=10629625 https://www.cnblogs.com/huwt/archive/2019/03/25.html https://www.cnblogs.com/huwt/p/10597502.html https://www.cnblogs.com/huwt/p/10597502.html https://i.cnblogs.com/EditPosts.aspx?postid=10597502 https://www.cnblogs.com/huwt/archive/2019/03/24.html https://www.cnblogs.com/huwt/p/10591353.html https://www.cnblogs.com/huwt/p/10591353.html https://i.cnblogs.com/EditPosts.aspx?postid=10591353 https://www.cnblogs.com/huwt/archive/2019/03/16.html https://www.cnblogs.com/huwt/p/10540942.html https://www.cnblogs.com/huwt/p/10540942.html https://i.cnblogs.com/EditPosts.aspx?postid=10540942 
https://www.cnblogs.com/huwt/p/10541675.html https://www.cnblogs.com/huwt/p/10541675.html https://i.cnblogs.com/EditPosts.aspx?postid=10541675 https://www.cnblogs.com/huwt/default.html?page=2 [Finished in 1.1s]