scrapy爬蟲下載音頻文件並儲存到本地

本文轉載自查看原文 2020-05-29 21:59 1481 爬蟲/ python

玩爬蟲，怎么能少了scrapy框架呢。scrapy框架被稱為復雜但好用的爬蟲框架。

當初學框架的時候是一頭霧水，一旦實戰成功過后，感覺瞬間打通了任督二脈，很有成就感。

接下來，將對scrapy框架爬蟲代碼編寫流程做簡要說明。首先在items.py文件中定義所需爬取的字段：

import scrapy


class OnlinelistenningItem(scrapy.Item):
    """Scrapy item declaring the four fields used for one downloaded lesson."""
    file_text = scrapy.Field()  # Field name on the left, a Field object on the right; the item is dict-like, so read the value with item['file_text'].
    text_path = scrapy.Field()  # presumably the destination path of the text file -- confirm against the spider
    file_paths = scrapy.Field()  # presumably the destination path(s) of the downloaded file -- confirm against the spider
    file_urls = scrapy.Field()  # presumably the download URL(s) -- confirm against the spider

四、解析網頁，得到相應數據

1. 打開spider_name.py文件

2. 將解析獲取到的url音頻下載鏈接返回

# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
import time
from scrapy import Selector
from ..items import OnlinelistenningItem
from scrapy.utils.project import get_project_settings


class TingroomSpider(scrapy.Spider):
    """Crawl tingroom.com lesson categories and emit one item per audio lesson.

    Request flow:
        ``parse`` (category index)
        -> ``listenningParse`` (paginated article lists of one category)
        -> ``listenningArticlePage`` (transcript text + audio download URL).

    ``result_path`` and ``article_title`` travel between callbacks through
    ``Request.meta``.
    """

    name = 'tingroom'
    allowed_domains = ['tingroom.com']
    # Entry page; every further URL is discovered by parsing from here.
    start_urls = ['http://www.tingroom.com/lesson/']
    domain = 'http://www.tingroom.com'
    # Storage root taken from the FILES_STORE setting, with a trailing
    # Windows-style separator.
    root_dir = get_project_settings().get('FILES_STORE') + '\\'
    # Optional whitelist of category titles to crawl. None means "crawl every
    # category"; set it to a list/set of titles to restrict the crawl.
    download_list = None

    # Characters that are illegal or awkward in Windows file names, mapped to
    # safe replacements (None drops the character).
    _TITLE_TRANSLATION = str.maketrans({
        ':': '：',
        '/': '_',
        '\\': '_',
        '*': None,
        '?': '？',
        '"': '”',
        '|': None,
        '<': '《',
        '>': '》',
        ' ': '_',
    })
    # Substrings that mark a text fragment as page noise rather than content.
    _NOISE_MARKERS = ('google_ad_client', 'tingroom', '單詞翻譯:')

    def parse(self, response):
        """Parse the category index and request each selected category page.

        Yields:
            One request per category link, handled by ``listenningParse``,
            carrying the category's storage directory in ``meta['result_path']``.
        """
        for link in response.xpath('/html/body/div[5]//ul[@id="line_01"]//a'):
            # default='' keeps .strip() from failing on anchors with no
            # text node or no href.
            title = link.xpath('./text()').extract_first(default='').strip()
            href = link.xpath('./@href').extract_first(default='').strip()
            if not title or not href:
                continue
            if self.download_list is not None and title not in self.download_list:
                continue
            first_url = parse.urljoin(self.domain, href)  # category link
            print('first_url:', first_url)
            yield scrapy.http.Request(
                first_url,
                meta={'result_path': self.root_dir + title},
                callback=self.listenningParse,
            )

    def listenningParse(self, response):
        """Parse one page of a category's article list.

        Yields:
            A request for the next list page (handled by this same method), if
            a next-page link exists, and one request per article link, handled
            by ``listenningArticlePage``.
        """
        # .get() returns None instead of raising when the key is absent.
        result_path = response.meta.get('result_path')

        # Build a fresh meta dict for every request instead of forwarding
        # response.meta, which also holds Scrapy's own bookkeeping keys
        # (retry/redirect counters, download slot, ...).
        # NOTE(review): this literal and the Chinese markers used in
        # listenningArticlePage are in Traditional Chinese -- confirm they
        # match the text the site actually serves.
        next_href = response.xpath(
            '//div[@class="dede_pages"]//a[text()="下一頁"]/@href'
        ).extract_first()
        if next_href:
            yield scrapy.http.Request(
                parse.urljoin(self.domain, next_href),
                meta={'result_path': result_path},
                callback=self.listenningParse,
            )

        # Collect the title and link of every article on this page.
        for link in response.xpath('//a[@class="goog"]'):
            href = link.xpath('./@href').extract_first(default='').strip()
            raw_title = link.xpath('./text()').extract_first()
            if not href or raw_title is None:
                continue
            # The title becomes a file name, so sanitise it in a single pass.
            article_title = raw_title.strip().translate(self._TITLE_TRANSLATION)
            yield scrapy.http.Request(
                parse.urljoin(self.domain, href),
                meta={'result_path': result_path, 'article_title': article_title},
                callback=self.listenningArticlePage,
            )

    def listenningArticlePage(self, response):
        """Parse an article page into an ``OnlinelistenningItem``.

        Yields:
            One item holding the cleaned transcript lines, the transcript
            path, the audio path and the audio URL. Nothing is yielded when
            the page exposes no usable download URL or file extension.
        """
        meta = response.meta

        # Transcript / subtitle text: strip whitespace noise from each node.
        fragments = (
            raw.strip().replace('\n', '').replace('\r', '').replace('\t', '').replace('\xa0', '')
            for raw in response.xpath('//div[@class="content"]//text()').extract()
        )
        file_text = []
        for fragment in fragments:
            if not fragment:
                continue
            if fragment == '點擊':
                # This marker separates the body text from the vocabulary list.
                fragment = '\n' + '=' * 60 + '\n' + '\n重要詞匯：'
            elif (
                any(marker in fragment for marker in self._NOISE_MARKERS)
                or fragment == '收聽單詞發音'
                or fragment.isdigit()
            ):
                # Ad script, site boilerplate or a bare number: skip it.
                continue
            file_text.append(fragment + '\n')

        # Audio link; the pipeline downloads and stores it.
        movie_value = response.xpath('//param[@name="movie"]/@value')
        download_url = movie_value.re_first(r'http:.*')
        # Extension taken from the link itself (mp3, rm, ...).
        file_type = movie_value.re_first(r'com.*(\..*)')
        if not download_url or not file_type:
            return

        article_title = meta.get('article_title')
        result_path = meta.get('result_path')
        # Windows-style separator, consistent with root_dir.
        file_path = result_path + '\\' + article_title + file_type
        text_path = result_path + '\\' + article_title + '.txt'

        item = OnlinelistenningItem()
        item['file_text'] = file_text
        item['text_path'] = text_path  # written by custom code in the pipeline
        item['file_paths'] = [file_path]  # destination of the audio file
        item['file_urls'] = [download_url]  # audio download link
        yield item

五、儲存數據

1. 打開pipelines.py文件：

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from .items import OnlinelistenningItem  # 如果需要判斷不同的item，需要導入item，用isinstance(item, item_name)來判斷
from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request

class OnlinelistenningSelfPipeline(FilesPipeline):
    """Files pipeline that downloads an item's audio file and writes its text.

    The audio destination comes from ``item['file_paths'][0]``; the text lines
    in ``item['file_text']`` are written to ``item['text_path']``.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's first file URL.

        The item is passed as the request meta so ``file_path`` can read the
        destination from ``request.meta``. Nothing is yielded when the item
        has no usable URL, instead of failing on a missing or None entry.
        """
        file_urls = item.get('file_urls') or []
        if file_urls and file_urls[0]:
            yield Request(file_urls[0], meta=item)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path recorded in the request meta.

        The path may be relative to FILES_STORE or absolute. The keyword-only
        ``item`` parameter is accepted because newer Scrapy releases pass it
        to ``file_path``; it is not needed here.
        """
        return request.meta.get('file_paths')[0]

    def item_completed(self, results, item, info):
        """Write the item's text file and report the download outcome.

        Args:
            results: list of ``(success, detail)`` tuples, one per request
                yielded by ``get_media_requests``.
            item: the item being processed.
            info: pipeline bookkeeping object (unused).

        Returns:
            The item, unchanged.
        """
        import os  # local import keeps this block self-contained

        text_path = item['text_path']
        # The audio download normally creates the directory, but it may have
        # failed or been skipped, so make sure the directory exists.
        parent_dir = os.path.dirname(text_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(text_path, 'w', encoding='utf8') as f:
            f.writelines(item['file_text'])

        # Only claim success when at least one download actually succeeded.
        if any(ok for ok, _ in results):
            print(f'{item["file_paths"][0]}下載完成！\n{"*" * 50}\n')
        else:
            print(f'{item["file_paths"][0]}下載失敗！\n{"*" * 50}\n')
        return item

六、user_agent和ip代理設置

1. 打開middlewares.py文件

2. 輸入下面代碼：

from fake_useragent import UserAgent
class OnlinelistenningUseragentMiddleware(object):
    """Downloader middleware that gives every request a random User-Agent."""

    def __init__(self):
        self.ua = UserAgent()  # source of random User-Agent strings

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with a random one.

        Args:
            request: the outgoing request; its ``headers`` mapping is modified.
            spider: the spider issuing the request (unused).

        Returns:
            None, so that Scrapy continues processing the request.
        """
        # Assign rather than setdefault: Scrapy's built-in UserAgentMiddleware
        # (priority 500) has normally filled in the header before a middleware
        # registered at 543 runs, so setdefault would never change it.
        request.headers['User-Agent'] = self.ua.random

import json, random
class OnlinelistenningProxyMiddleware(object):
    """Downloader middleware that routes every request through one fixed proxy."""

    def __init__(self):
        """Nothing to set up; the proxy address is hard-coded."""

    def process_request(self, request, spider):
        """Store the proxy address under the request's ``meta['proxy']`` key."""
        request.meta['proxy'] = 'https://114.98.25.25:4216'

七、setting.py文件設置

# Do not obey robots.txt
ROBOTSTXT_OBEY = False  # per the original author this must be off, otherwise no data can be crawled

# Root directory for stored files:
# Mind the slash direction on Windows. With the wrong direction the program
# still runs, but it leaves an empty folder (named after the custom directory)
# under the current code path.
FILES_STORE = 'F:\\在線英語聽力室\\聽力教程new'  # storage path; use doubled backslashes -- with a single backslash Python would read sequences such as \202 or \201 as octal escape sequences

# Download timeout, in seconds (kept long for slow, long-lived connections)
DOWNLOAD_TIMEOUT = 1800  


# Proxy and User-Agent middlewares
DOWNLOADER_MIDDLEWARES = {  
    'OnlineListenning.middlewares.OnlinelistenningUseragentMiddleware': 543,  # enable the User-Agent middleware; the number is its order, and lower numbers handle the request first
    'OnlineListenning.middlewares.OnlinelistenningProxyMiddleware': 542,  # enable the IP proxy middleware
}

# Enable the download pipeline
ITEM_PIPELINES = {
    'OnlineListenning.pipelines.OnlinelistenningSelfPipeline': 1,  # replace with the name of your own pipeline
}

# Delay between downloads, in seconds
DOWNLOAD_DELAY = 3

# Number of concurrent requests per domain, limited to reduce load on the server
CONCURRENT_REQUESTS_PER_DOMAIN = 8

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 (七) 爬蟲之爬取視頻和音頻文件 Android 使用URLConnection下載音頻文件爬蟲下載圖片信息保存到本地文件夾下音頻文件格式關於音頻文件的上傳 Html5選擇本地視頻音頻文件播放 scrapy爬蟲系列之三--爬取圖片保存到本地使用瀏覽器從網頁下載音頻文件教你如何下載微信公眾號的音頻文件重命名從喜馬拉雅下載的音頻文件

scrapy爬蟲下載音頻文件並儲存到本地

目錄

一、新建工程

二、新建spider

三、定義所需爬取字段

四、解析網頁，得到相應數據

五、儲存數據

六、user_agent和ip代理設置

七、setting.py文件設置

免責聲明！