Python 使用selenium抓取網頁文本和下載音頻

本文轉載自查看原文 2020-01-20 16:15 1165 爬蟲

#!/usr/bin/env python
# -*- coding: utf-8 -*-

'一個自動從https://podcast.duolingo.com/spanish中下載音頻並且爬取文本的程序'
'需要配置下載以下所需庫，並且配置好webdriver.Chrome()，否則報錯'

import os
import re
import shelve
import time

import requests
from selenium import webdriver
                   
def mainProc():
    """Run the whole scrape: collect page URLs, then process every episode.

    Opens the shelf with ``openDb``, lets ``get_pages`` record listing pages
    and episode links in it, then passes it to ``get_episodes``, which
    handles the transcript and audio of each recorded episode.
    """
    db = openDb()
    try:
        get_pages(db)
        get_episodes(db)
    finally:
        # Close even when a request or the browser fails part-way, so the
        # URLs collected so far are flushed to disk instead of being lost.
        db.close()

def openDb():
    """Open the ``data`` shelf in the current directory, initialising it if new.

    Returns:
        A shelf opened with ``writeback=True`` that always contains the
        list-valued keys ``"pages"`` and ``"episodes"``. Values already
        stored under those keys are left untouched.
    """
    # shelve derives the on-disk file name(s) from whichever dbm backend is
    # available ("data", "data.db", "data.dat", ...). Probing for "data.dat"
    # is therefore not a reliable "already initialised" check and can wipe
    # the stored lists on every run; test for the keys themselves instead.
    db = shelve.open("data", writeback=True)
    for key in ("pages", "episodes"):
        if key not in db:
            db[key] = []
    return db

def get_pages(db):
    """Walk the podcast listing pages and record their episode links.

    The first listing page is ``main`` itself; later pages are ``main``
    with the page number appended (".../spanish2", ".../spanish3", ...).
    A page already present in ``db["pages"]`` is skipped without a request.
    Scanning stops at the first page that does not answer with status 200.

    Args:
        db: Mapping holding lists under ``"pages"`` and ``"episodes"``;
            both lists are appended to in place.
    """
    main = 'https://podcast.duolingo.com/spanish'
    # Site root: ``main`` without its trailing "spanish" path segment.
    site_root = main[:-len('spanish')]
    # Raw string so that ``\s`` reaches the regex engine instead of being an
    # invalid string escape; lazy group so the capture ends at the first
    # `" rel` rather than the last one on the line.
    link_pattern = re.compile(r'entry-title">\s*<a href="(.*?)" rel')

    for i in range(1, 100):
        page = main if i == 1 else main + str(i)
        if page in db["pages"]:
            continue

        r = requests.get(page)
        print(f'{page} with status code {r.status_code}.')
        if r.status_code != 200:
            break
        db["pages"].append(page)

        for href in link_pattern.findall(r.text):
            # The first two characters of the href are dropped before it is
            # joined to the site root (presumably a "./" prefix -- confirm
            # against the live markup).
            episode = site_root + href[2:]
            # Avoid queueing the same episode twice.
            if episode not in db["episodes"]:
                db["episodes"].append(episode)
                        
def get_episodes(db):
    """Fetch every stored episode URL and hand it on for saving.

    Each URL in ``db["episodes"]`` is requested once. When the answer has
    status 200 the URL goes to ``get_transcript`` and the response plus URL
    go to ``get_audios``; any other status just moves on to the next URL.
    """
    for episode_url in db["episodes"]:
        response = requests.get(episode_url)
        print(f'{episode_url} with status code {response.status_code}.')
        if response.status_code == 200:
            # Store the page text, then download the audio.
            get_transcript(episode_url)
            get_audios(response, episode_url)

def get_transcript(episode):
    """Save the transcript of one episode page as ``transcript/<name>.txt``.

    ``<name>`` is the last path segment of ``episode``. If the file already
    exists nothing is fetched. Otherwise the page is requested and every
    ``<strong>...</strong>...</p>`` match is written as one paragraph
    followed by a blank line.

    Args:
        episode: URL of the episode page.
    """
    filename = 'transcript/' + episode.split('/')[-1] + '.txt'
    if os.path.exists(filename):
        print(filename, 'existed!')
        return

    req = requests.get(episode)
    print('{episode} with status code {status}.'.format(episode=episode, status=req.status_code))
    if req.status_code != 200:
        # Do not create the file for a failed fetch: an empty transcript
        # would make every later run report "existed!" and never retry.
        return

    os.makedirs('transcript', exist_ok=True)
    with open(filename, 'w', encoding="utf-8") as fp:
        for lines in re.findall('strong>(.*)</strong>(.*)</p>', req.text):
            # Each match is a (first group, second group) pair; write the
            # two parts back to back.
            fp.write(''.join(lines))
            fp.write('\n\n')
    print(filename, 'added!')
def get_audios(r, episode):
    """Download the audio of one episode by driving Chrome through selenium.

    The player URL is taken from the first ``<iframe ... src="..." height``
    on the episode page and prefixed with ``https:``. Chrome is told to
    save downloads into the ``audio`` directory under the current working
    directory, the player's download button is clicked, and the function
    waits until that directory holds no ``.crdownload`` file.

    Args:
        r: Response of the episode page; only ``r.text`` is read.
        episode: Episode URL, used only in the "skipped" message.
    """
    match = re.search('<iframe .* src="(.*)" height', r.text)
    if match is None:
        # No embedded player on this page: skip it instead of aborting the
        # whole run with an IndexError.
        print(f'{episode}: no audio player iframe found, skipped.')
        return
    audio = "https:" + match.group(1)

    # Download into the same directory that is polled below, rather than a
    # machine-specific absolute path. Chrome needs an absolute path here.
    download_dir = os.path.abspath("audio")
    os.makedirs(download_dir, exist_ok=True)

    # Custom download configuration.
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--ignore-certificate-errors")
    prefs = {"download.default_directory": download_dir}
    chromeOptions.add_experimental_option("prefs", prefs)

    print(audio)
    # ``options=`` and ``find_element("id", ...)`` are accepted by both
    # Selenium 3.8+ and Selenium 4; the ``chrome_options=`` keyword and
    # ``find_element_by_id`` were removed in Selenium 4.
    browser = webdriver.Chrome(options=chromeOptions)
    try:
        browser.get(audio)
        browser.find_element("id", "download-player").click()
        # Unfinished Chrome downloads carry a ".crdownload" suffix. Sleep
        # before each check so the download has a chance to start, and so
        # the wait is one pause per poll rather than one per partial file.
        while True:
            time.sleep(5)
            if not any(name.endswith(".crdownload")
                       for name in os.listdir(download_dir)):
                break
    finally:
        # quit() ends the driver session as well as the window, also when
        # an error occurs above.
        browser.quit()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    mainProc()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找：
- Python抓取網頁動態數據——selenium webdriver的使用
- 使用Python3爬蟲抓取網頁來下載小說
- Python selenium自動化網頁抓取器
- python+selenium動態抓取網頁數據
- 怎樣使用python爬蟲進行網頁圖片抓取
- python使用ip代理抓取網頁
- selenium提取網頁文本
- python selenium 關於將網頁打包為靜態網頁（mhtml）下載。
- 使用瀏覽器從網頁下載音頻文件
- 網頁爬蟲--python3.6+selenium+BeautifulSoup實現動態網頁的數據抓取，適用於對抓取頻率不高的情況