# Simple sequential scraper for videos from xiaohuar.com
import hashlib
import os
import re
import time

import requests
movie_path = "C:\mp4" #視頻存儲目錄
def get_page(url):
'''
返回一個字符串的網頁頁面
:param url:
:return:
'''
try:
response = requests.get(url) # 請求傳入的url
if response.status_code == 200: # 如果頁面返回200:正常返回text字符串
return response.text
except Exception:
pass
def parse_index(index_page):
'''
正則匹配到頁面中的每個視頻鏈接[],[],[],[],[],[],[]...
:param index_page:
:return: 每次返回一個列表,也就是一個url
'''
urls = re.findall('class="items".*?href="(.*?)"',index_page,re.S) #查找匹配的url
for url in urls:
print(url)
if not url.startswith("http"):
'''
因為某些鏈接不知只是域名后邊的字符串,所以加判斷,
'''
url = "http://www.xiaohuar.com"+url
yield url
def parse_detail(detail_page):
'''
接收上面函數傳入的url,正則匹配查到視頻的url鏈接
:param detail_page:
:return: 返回視頻的url鏈接
'''
l = re.findall('id="media".*?src="(.*?)"',detail_page,re.S)
if l:
movie_url = l[0]
if movie_url.endswith("mp4"):
yield movie_url
def get_movie(url):
'''
接收一個視頻的url
:param url:
:return:
'''
try:
response = requests.get(url)
# response:請求到的資源
if response.status_code == 200:
m = hashlib.md5()
m.update(str(time.time()).encode("utf-8"))
m.update(url.encode("utf-8"))
filepath = "%s\%s.mp4" % (movie_path, m.hexdigest()) # 視頻名字是movie_path/時間字符串的哈希值的加密字符串
with open(filepath, "wb") as f:
f.write(response.content) #文件是以wb模式打開,所以用content的方式寫入
print("%s 下載成功" % url)
except Exception:
pass
def main():
'''
url:格式化后的url字符串;
index_page:第一次請求到的頁面;
detail_urls:頁面中的url列表
detail_page:上邊列表中的url每個發送一次get請求
movie_urls:解析后的視頻地址
:return: 文件寫入硬盤
'''
base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html' # 請求地址
for i in range(5): # 視頻總共有五頁
url = base_url.format(page_num=i)
index_page = get_page(url)
detail_urls = parse_index(index_page)
for detail_url in detail_urls:
detail_page = get_page(detail_url)
movie_urls = parse_detail(detail_page)
for movie_url in movie_urls:
get_movie(movie_url)
if __name__ == '__main__':
main()
# Concurrent version of the crawler (thread pool + completion callbacks)
import requests #pip3 install requests
import re
import hashlib
import time
from concurrent.futures import ThreadPoolExecutor
pool=ThreadPoolExecutor(50)
movie_path=r'C:\mp4'
def get_page(url):
try:
response=requests.get(url)
if response.status_code == 200:
return response.text
except Exception:
pass
def parse_index(index_page):
index_page=index_page.result()
urls=re.findall('class="items".*?href="(.*?)"',index_page,re.S)
for detail_url in urls:
if not detail_url.startswith('http'):
detail_url='http://www.xiaohuar.com'+detail_url
pool.submit(get_page,detail_url).add_done_callback(parse_detail)
def parse_detail(detail_page):
detail_page=detail_page.result()
l=re.findall('id="media".*?src="(.*?)"',detail_page,re.S)
if l:
movie_url=l[0]
if movie_url.endswith('mp4'):
pool.submit(get_movie,movie_url)
def get_movie(url):
try:
response=requests.get(url)
if response.status_code == 200:
m=hashlib.md5()
m.update(str(time.time()).encode('utf-8'))
m.update(url.encode('utf-8'))
filepath='%s\%s.mp4' %(movie_path,m.hexdigest())
with open(filepath,'wb') as f:
f.write(response.content)
print('%s 下載成功' %url)
except Exception:
pass
def main():
base_url='http://www.xiaohuar.com/list-3-{page_num}.html'
for i in range(5):
url=base_url.format(page_num=i)
pool.submit(get_page,url).add_done_callback(parse_index)
if __name__ == '__main__':
main()