Python 使用selenium抓取網頁文本和下載音頻
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Download audio and scrape transcripts from https://podcast.duolingo.com/spanish.

The third-party ``requests`` and ``selenium`` packages must be installed and
``webdriver.Chrome()`` must be able to start a browser; otherwise the script
raises at import time or when the first download is attempted.
"""
import os
import re
import shelve
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By


def mainProc():
    """Run the whole job: collect the index pages, then process every episode."""
    db = openDb()
    try:
        get_pages(db)
        get_episodes(db)
    finally:
        # The shelf is opened with writeback=True, so the appended URLs only
        # reach the disk on sync/close. Close it even if a step above raises.
        db.close()


def openDb():
    """Open the ``data`` shelf and make sure its two URL lists exist.

    The keys are initialised on demand instead of testing for a ``data.dat``
    file: the file name(s) a shelf uses on disk depend on the dbm backend, so
    a file-name test can wrongly reset an existing database.

    Returns:
        The open shelf (``writeback=True``) holding the ``"pages"`` and
        ``"episodes"`` lists.
    """
    db = shelve.open("data", writeback=True)
    db.setdefault("pages", [])
    db.setdefault("episodes", [])
    return db


def get_pages(db):
    """Visit the paginated index and store new page and episode URLs in ``db``.

    Page 1 is the main URL itself; later pages are the main URL with the page
    number appended (``.../spanish2``, ``.../spanish3``, ...). Pages already
    listed in ``db["pages"]`` are not requested again. The walk stops at the
    first page that does not answer with status 200.

    Args:
        db: Open shelf with ``"pages"`` and ``"episodes"`` lists.
    """
    main = 'https://podcast.duolingo.com/spanish'
    # Site root: the main URL without its trailing "spanish".
    site_root = main[:-7]

    for i in range(1, 100):
        page = main if i == 1 else main + str(i)
        if page in db["pages"]:
            continue

        r = requests.get(page)
        print(f'{page} with status code {r.status_code}.')
        if r.status_code != 200:
            break
        db["pages"].append(page)

        # Collect every episode link on the page and turn it into a full URL.
        # The first two characters of each href are dropped (presumably a
        # leading "./" -- confirm against the live markup).
        hrefs = re.findall(r'entry-title">\s*<a href="(.*)" rel', r.text)
        db["episodes"].extend(site_root + href[2:] for href in hrefs)


def get_episodes(db):
    """Fetch every stored episode page and save its transcript and audio.

    Episodes whose page does not answer with status 200 are skipped.

    Args:
        db: Open shelf whose ``"episodes"`` list holds the episode URLs.
    """
    for episode in db["episodes"]:
        r = requests.get(episode)
        print(f'{episode} with status code {r.status_code}.')
        if r.status_code != 200:
            continue
        # Hand the already-downloaded page to both steps so it is not
        # requested a second time.
        get_transcript(episode, response=r)
        get_audios(r, episode)


def get_transcript(episode, response=None):
    """Write the transcript of one episode to ``transcript/<slug>.txt``.

    Nothing is downloaded or written when the file already exists.

    Args:
        episode: URL of the episode page; its last path segment names the file.
        response: Optional, already fetched ``requests`` response for
            ``episode``. When omitted the page is requested here.
    """
    # rstrip('/') keeps a URL with a trailing slash from yielding an empty name.
    slug = episode.rstrip('/').split('/')[-1]
    filename = 'transcript/' + slug + '.txt'
    if os.path.exists(filename):
        print(filename, 'existed!')
        return

    if response is None:
        response = requests.get(episode)
        print('{episode} with status code {status}.'.format(
            episode=episode, status=response.status_code))

    os.makedirs('transcript', exist_ok=True)
    with open(filename, 'w', encoding="utf-8") as fp:
        # Each match is a (text inside <strong>, rest of the paragraph) pair;
        # the two parts are written back to back, followed by a blank line.
        for parts in re.findall('strong>(.*)</strong>(.*)</p>', response.text):
            fp.write(''.join(parts))
            fp.write('\n\n')
    print(filename, 'added!')


def get_audios(r, episode, download_dir="audio"):
    """Download the audio of one episode by driving Chrome.

    The player URL is read from the ``src`` of the ``<iframe>`` in the episode
    page, opened in Chrome, and the element with id ``download-player`` is
    clicked. The function then waits until ``download_dir`` no longer
    contains any ``.crdownload`` file.

    Args:
        r: ``requests`` response of the episode page.
        episode: URL of the episode page (used in the skip message only).
        download_dir: Directory that receives the download; created if needed.
    """
    sources = re.findall('<iframe .* src="(.*)" height', r.text)
    if not sources:
        # No player on this page: skip it instead of failing on sources[0].
        print(f'{episode} has no audio iframe, skipped.')
        return
    audio = "https:" + sources[0]

    # Use one absolute path for both the browser preference and the polling
    # loop below, so they always look at the same directory.
    download_dir = os.path.abspath(download_dir)
    os.makedirs(download_dir, exist_ok=True)

    # Custom download configuration.
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--ignore-certificate-errors")
    prefs = {"download.default_directory": download_dir}
    chromeOptions.add_experimental_option("prefs", prefs)

    # Download the file.
    print(audio)
    browser = webdriver.Chrome(options=chromeOptions)
    try:
        browser.get(audio)
        browser.find_element(By.ID, 'download-player').click()
        # Sleep before each check so the browser has time to create its
        # temporary ".crdownload" file; leave once none is present.
        while True:
            time.sleep(5)
            if not any(name.endswith(".crdownload")
                       for name in os.listdir(download_dir)):
                break
    finally:
        # quit() ends the WebDriver session as well as closing the window.
        browser.quit()


if __name__ == "__main__":
    mainProc()