# 簡易版爬取校花網視頻
# Simple (sequential) scraper: downloads videos from xiaohuar.com
import requests
import re
import hashlib
import time
import os

movie_path = r"C:\mp4"  # directory where downloaded videos are stored


def get_page(url):
    '''
    Fetch *url* and return the response body as text.

    :param url: page URL to request
    :return: the page HTML on HTTP 200, otherwise None
    '''
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        # Best-effort crawl: report and skip pages that fail to load
        # (the original silently swallowed every exception).
        print("request failed: %s (%s)" % (url, e))


def parse_index(index_page):
    '''
    Yield every detail-page URL found on a listing page.

    Matches the href of anchors near elements with class="items";
    site-relative links get the domain prefixed.

    :param index_page: listing-page HTML string
    :return: generator of absolute detail-page URLs
    '''
    urls = re.findall('class="items".*?href="(.*?)"', index_page, re.S)
    for url in urls:
        print(url)
        if not url.startswith("http"):
            # Some hrefs are only the path after the domain, so re-attach it.
            url = "http://www.xiaohuar.com" + url
        yield url


def parse_detail(detail_page):
    '''
    Yield the video URL embedded in a detail page, if any.

    :param detail_page: detail-page HTML string
    :return: generator yielding at most one .mp4 URL
    '''
    matches = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    if matches:
        movie_url = matches[0]
        if movie_url.endswith("mp4"):
            yield movie_url


def get_movie(url):
    '''
    Download the video at *url* into ``movie_path``.

    The filename is the MD5 of the current timestamp plus the URL, so
    concurrent/repeated downloads cannot collide on names.

    :param url: direct .mp4 URL
    :return: None; writes the file to disk as a side effect
    '''
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Fix: create the target directory if it does not exist yet,
            # otherwise open() below raises FileNotFoundError.
            os.makedirs(movie_path, exist_ok=True)
            m = hashlib.md5()
            m.update(str(time.time()).encode("utf-8"))
            m.update(url.encode("utf-8"))
            # Fix: os.path.join instead of a non-raw "%s\%s.mp4" literal.
            filepath = os.path.join(movie_path, "%s.mp4" % m.hexdigest())
            with open(filepath, "wb") as f:
                f.write(response.content)  # opened in wb mode, so write raw bytes
            print("%s 下載成功" % url)
    except (requests.RequestException, OSError) as e:
        print("download failed: %s (%s)" % (url, e))


def main():
    '''
    Crawl the first five listing pages, resolve every detail page found
    on them, and download each video link to disk.

    :return: None; files are written to ``movie_path``
    '''
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'  # listing-page template
    for i in range(5):  # the site has five listing pages in total
        url = base_url.format(page_num=i)
        index_page = get_page(url)
        if index_page is None:
            # Fix: get_page returns None on failure; the original passed
            # that into re.findall and crashed with TypeError.
            continue
        for detail_url in parse_index(index_page):
            detail_page = get_page(detail_url)
            if detail_page is None:
                continue
            for movie_url in parse_detail(detail_page):
                get_movie(movie_url)


if __name__ == '__main__':
    main()
# Concurrent variant: fans the crawl out over a thread pool, chaining
# stages with Future callbacks (fetch -> parse index -> parse detail -> download).
import requests  # pip3 install requests
import re
import hashlib
import time
import os
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(50)  # shared pool for every stage of the pipeline
movie_path = r'C:\mp4'  # directory where downloaded videos are stored


def get_page(url):
    '''
    Fetch *url* and return the response body as text.

    :param url: page URL to request
    :return: the page HTML on HTTP 200, otherwise None
    '''
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException as e:
        # Best-effort crawl: report and skip pages that fail to load
        # (the original silently swallowed every exception).
        print('request failed: %s (%s)' % (url, e))


def parse_index(index_page):
    '''
    Done-callback for a listing-page fetch: extract every detail-page
    URL and submit a fetch for each, chained to parse_detail.

    :param index_page: concurrent.futures.Future wrapping get_page's result
    :return: None; schedules follow-up work on the pool
    '''
    index_page = index_page.result()
    if index_page is None:
        # Fix: a failed fetch yields None; the original fed it to
        # re.findall and the TypeError vanished inside the callback.
        return
    urls = re.findall('class="items".*?href="(.*?)"', index_page, re.S)
    for detail_url in urls:
        if not detail_url.startswith('http'):
            # Some hrefs are only the path after the domain, so re-attach it.
            detail_url = 'http://www.xiaohuar.com' + detail_url
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)


def parse_detail(detail_page):
    '''
    Done-callback for a detail-page fetch: extract the video URL (if
    any) and submit the download.

    :param detail_page: concurrent.futures.Future wrapping get_page's result
    :return: None; schedules the download on the pool
    '''
    detail_page = detail_page.result()
    if detail_page is None:
        # Fix: guard against failed fetches (see parse_index).
        return
    matches = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
    if matches:
        movie_url = matches[0]
        if movie_url.endswith('mp4'):
            pool.submit(get_movie, movie_url)


def get_movie(url):
    '''
    Download the video at *url* into ``movie_path``.

    The filename is the MD5 of the current timestamp plus the URL, so
    concurrent downloads cannot collide on names.

    :param url: direct .mp4 URL
    :return: None; writes the file to disk as a side effect
    '''
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # Fix: create the target directory if it does not exist yet,
            # otherwise open() below raises FileNotFoundError.
            os.makedirs(movie_path, exist_ok=True)
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(url.encode('utf-8'))
            # Fix: os.path.join instead of a non-raw '%s\%s.mp4' literal.
            filepath = os.path.join(movie_path, '%s.mp4' % m.hexdigest())
            with open(filepath, 'wb') as f:
                f.write(response.content)  # opened in wb mode, so write raw bytes
            print('%s 下載成功' % url)
    except (requests.RequestException, OSError) as e:
        print('download failed: %s (%s)' % (url, e))


def main():
    '''
    Kick off a fetch for each of the five listing pages; everything
    downstream is driven by Future done-callbacks on the shared pool.
    The pool's non-daemon worker threads keep the process alive until
    all chained tasks finish.

    :return: None
    '''
    base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'  # listing-page template
    for i in range(5):  # the site has five listing pages in total
        url = base_url.format(page_num=i)
        pool.submit(get_page, url).add_done_callback(parse_index)


if __name__ == '__main__':
    main()