# Simple sequential scraper for videos from xiaohuar.com
import hashlib
import os
import re
import time

import requests
movie_path = "C:\mp4" #視頻存儲目錄
def get_page(url):
'''
返回一個字符串的網頁頁面
:param url:
:return:
'''
try:
response = requests.get(url) # 請求傳入的url
if response.status_code == 200: # 如果頁面返回200:正常返回text字符串
return response.text
except Exception:
pass
def parse_index(index_page):
'''
正則匹配到頁面中的每個視頻鏈接[],[],[],[],[],[],[]...
:param index_page:
:return: 每次返回一個列表,也就是一個url
'''
urls = re.findall('class="items".*?href="(.*?)"',index_page,re.S) #查找匹配的url
for url in urls:
print(url)
if not url.startswith("http"):
'''
因為某些鏈接不知只是域名后邊的字符串,所以加判斷,
'''
url = "http://www.xiaohuar.com"+url
yield url
def parse_detail(detail_page):
'''
接收上面函數傳入的url,正則匹配查到視頻的url鏈接
:param detail_page:
:return: 返回視頻的url鏈接
'''
l = re.findall('id="media".*?src="(.*?)"',detail_page,re.S)
if l:
movie_url = l[0]
if movie_url.endswith("mp4"):
yield movie_url
def get_movie(url):
'''
接收一個視頻的url
:param url:
:return:
'''
try:
response = requests.get(url)
# response:請求到的資源
if response.status_code == 200:
m = hashlib.md5()
m.update(str(time.time()).encode("utf-8"))
m.update(url.encode("utf-8"))
filepath = "%s\%s.mp4" % (movie_path, m.hexdigest()) # 視頻名字是movie_path/時間字符串的哈希值的加密字符串
with open(filepath, "wb") as f:
f.write(response.content) #文件是以wb模式打開,所以用content的方式寫入
print("%s 下載成功" % url)
except Exception:
pass
def main():
'''
url:格式化后的url字符串;
index_page:第一次請求到的頁面;
detail_urls:頁面中的url列表
detail_page:上邊列表中的url每個發送一次get請求
movie_urls:解析后的視頻地址
:return: 文件寫入硬盤
'''
base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html' # 請求地址
for i in range(5): # 視頻總共有五頁
url = base_url.format(page_num=i)
index_page = get_page(url)
detail_urls = parse_index(index_page)
for detail_url in detail_urls:
detail_page = get_page(detail_url)
movie_urls = parse_detail(detail_page)
for movie_url in movie_urls:
get_movie(movie_url)
if __name__ == '__main__':
main()
# Concurrent version of the crawler (thread pool + completion callbacks)
import requests #pip3 install requests
import re
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor
pool=ThreadPoolExecutor(50)
movie_path=r'C:\mp4'
def get_page(url):
try:
response=requests.get(url)
if response.status_code == 200:
return response.text
except Exception:
pass
def parse_index(index_page):
index_page=index_page.result()
urls=re.findall('class="items".*?href="(.*?)"',index_page,re.S)
for detail_url in urls:
if not detail_url.startswith('http'):
detail_url='http://www.xiaohuar.com'+detail_url
pool.submit(get_page,detail_url).add_done_callback(parse_detail)
def parse_detail(detail_page):
detail_page=detail_page.result()
l=re.findall('id="media".*?src="(.*?)"',detail_page,re.S)
if l:
movie_url=l[0]
if movie_url.endswith('mp4'):
pool.submit(get_movie,movie_url)
def get_movie(url):
try:
response=requests.get(url)
if response.status_code == 200:
m=hashlib.md5()
m.update(str(time.time()).encode('utf-8'))
m.update(url.encode('utf-8'))
filepath='%s\%s.mp4' %(movie_path,m.hexdigest())
with open(filepath,'wb') as f:
f.write(response.content)
print('%s 下載成功' %url)
except Exception:
pass
def main():
base_url='http://www.xiaohuar.com/list-3-{page_num}.html'
for i in range(5):
url=base_url.format(page_num=i)
pool.submit(get_page,url).add_done_callback(parse_index)
if __name__ == '__main__':
main()