[小爬蟲]——某網站視頻爬蟲


[小爬蟲]——某網站視頻爬蟲

  • 技術路線:requests + re(另用 threadpool 做多線程下載)

  • 關於exe下載:可能涉及到侵權

  • 源代碼:下面

  • 爬取思路:在 html 中找出加載資源的 js 文件,從中截取一段以 .m3u8 結尾、經過 %XX(16 進制)編碼的字符串,解碼後得到有效的 m3u8 鏈接;爬取此 m3u8 文件,在其中找到第二層 .m3u8 鏈接,拼接成完整地址後下載,裡面列出了此視頻的所有 .ts 文件;將 .ts 逐個爬下來並按順序合併即可

  • 效果:10s內完成400M的下載

  • 總結:看了許多人的 blog 後發現,許多網站存儲、播放視頻都採取同樣的策略,即 [ 兩層 .m3u8 + 無加密 .ts 文件 ]

  • 代碼:


import requests, re, os, threadpool

# Directory that receives the downloaded segments and the merged video.
# The trailing backslash lets the rest of the script build file paths by
# plain string concatenation.
root = r'C:\Users\Administrator\PycharmProjects\freeTest\video' + '\\'

# Request headers sent with every HTTP call.
kv = {'User-Agent': 'Mozilla/5.0'}

# Total number of .ts segments found in the playlist (set by download()).
mp4sNum = 0
# Number of segments the workers have visited so far (progress numerator).
tim = 0

# Per-thread work lists filled by makeList().
ls = []

# Base URL prepended to every segment path (set by play()).
wangzhi = ''

def getHtml(url):
    """Fetch *url* and return the response body as text.

    Sends a GET request with the module-level ``kv`` headers and a
    30 second timeout, then decodes the body with the encoding that
    requests detects from the content (``apparent_encoding``).

    Args:
        url: Address to fetch.

    Returns:
        The decoded body, or ``None`` when the request fails or the server
        answers with an HTTP error status. ``'Html Error.'`` is printed in
        that case.
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are expected here. A bare ``except``
        # would also swallow KeyboardInterrupt and programming errors.
        print('Html Error.')
        return None
    r.encoding = r.apparent_encoding
    return r.text

def tran2(t):
    """Convert one hexadecimal digit character to its integer value.

    The letters ``A``-``F`` map to 10-15 in either case (lowercase digits
    used to raise ``ValueError``). Any other input is passed to ``int()``
    unchanged, exactly as before.

    Args:
        t: A single hex digit such as ``'7'``, ``'C'`` or ``'c'``.

    Returns:
        The integer value of the digit.

    Raises:
        ValueError: If *t* is neither a hex letter nor parseable by ``int()``.
    """
    letters = {'A': 10, 'B': 11, 'C': 12, 'D': 13, 'E': 14, 'F': 15}
    if isinstance(t, str):
        value = letters.get(t.upper())
        if value is not None:
            return value
    return int(t)

def tran1(s):
    """Decode ``%XX`` percent-escapes in *s*.

    Every ``%`` followed by two hexadecimal digits (upper or lower case) is
    replaced by the single character with that code point. The escapes are
    not combined into UTF-8 sequences. Anything else, including a ``%`` that
    is not followed by two hex digits, is copied through unchanged.

    The previous character-by-character loop parsed the high nibble with
    ``int()``, so escapes such as ``%AB`` crashed. It also looked at
    ``s[i - 1]`` / ``s[i - 2]`` for ``i < 2``, which wraps around to the end
    of the string, and indexed past the end on a trailing ``%``.

    Args:
        s: Text that may contain percent-escapes.

    Returns:
        The decoded text.
    """
    return re.sub(
        r'%([0-9A-Fa-f]{2})',
        lambda match: chr(int(match.group(1), 16)),
        s,
    )

def makeList(n, mp4s):
    """Split *mp4s* into consecutive chunks stored in the global ``ls``.

    Appends ``n + 10`` empty lists to ``ls`` and then distributes the items
    of *mp4s*, in order, over ``ls[1]``, ``ls[2]``, ... with at most
    ``len(mp4s) // n + 1`` items per chunk. ``ls[0]`` is never written to
    by this function.

    Args:
        n: Number of chunks to aim for (the thread count).
        mp4s: Segment paths to distribute.
    """
    ls.extend([] for _ in range(n + 10))
    chunk = len(mp4s) // n + 1
    bucket = 1
    for seen, segment in enumerate(mp4s, start=1):
        # Move on to the next bucket once the current one is full.
        if seen > bucket * chunk:
            bucket += 1
        ls[bucket].append(segment)

def fastNB(tvs):
    """Download every segment path in *tvs* into ``root``.

    For each path the file name (without extension) is taken from the last
    URL component and the segment is saved as ``root + name + '.ts'``.
    Segments whose file already exists are skipped. A failed download is
    retried until it succeeds. Progress (``tim / mp4sNum``) is printed after
    every attempt.

    Args:
        tvs: Segment paths, each appended to the global ``wangzhi`` base URL.
    """
    if not tvs:
        return
    global tim, mp4sNum
    for i in tvs:
        tim += 1
        name = i.split('/')[-1].split('.')[0]
        path = root + str(name) + '.ts'
        if os.path.exists(path):
            continue
        while True:
            try:
                # The timeout keeps a stalled connection from blocking this
                # worker forever; a timeout simply triggers another attempt.
                r = requests.get(wangzhi + i, headers=kv, timeout=30)
                with open(path, 'wb') as f:
                    f.write(r.content)
                done = True
            except (requests.RequestException, OSError):
                # Network or file-system failure: retry the same segment.
                done = False
            print('\r進度為:{:.2%}'.format(tim / mp4sNum), end='')
            if done:
                break
            # NOTE(review): the HTTP status is not checked, so an error page
            # would be saved as segment data. Adding raise_for_status() would
            # need a retry cap, otherwise a permanent 404 loops forever.

def download(text):
    """Download all .ts segments listed in *text* and merge them.

    Steps:
      1. Extract the segment paths from the playlist text and store their
         count in the global ``mp4sNum``.
      2. Ask for a thread count, split the paths with ``makeList`` and let a
         ``threadpool`` pool run ``fastNB`` on the sub-lists in ``ls``.
      3. Rename the segment files to zero-padded sequence numbers in
         playlist order.
      4. Concatenate them into ``root + 'video.ts'`` and delete the pieces.

    Args:
        text: Contents of the second-level .m3u8 playlist.

    Raises:
        ValueError: If the thread count entered is not an integer.
        OSError: If a segment file cannot be renamed, read or removed.
    """
    # Local import keeps this block self-contained.
    import shutil

    global mp4sNum
    mp4s = re.findall(pattern=r'/\d*?/\w*?/.*?/.*?/.*?\.ts', string=text)
    mp4sNum = len(mp4s)
    print('共有', mp4sNum, '個ts文件')
    if not mp4s:
        # Nothing to fetch; avoid prompting and writing an empty video.ts.
        return

    # A thread count below 1 would break the chunking (division by zero
    # for 0) and create a pool without workers, so clamp it to 1.
    n = max(1, int(input('輸入創建的線程數量:')))
    makeList(n, mp4s)
    pool = threadpool.ThreadPool(n)
    for req in threadpool.makeRequests(fastNB, ls):
        pool.putRequest(req)
    pool.wait()

    print('\n正在重新排序...')
    ordered = []
    for cnt, i in enumerate(mp4s, start=1):
        name = i.split('/')[-1].split('.')[0]
        # zfill(5) pads to five digits and leaves longer numbers untouched.
        pathNew = root + str(cnt).zfill(5) + '.ts'
        os.rename(root + name + '.ts', pathNew)
        ordered.append(pathNew)

    print('正在拼裝視頻...')
    with open(root + 'video.ts', 'wb') as merged:
        for path in ordered:
            # Close each segment before removing it; an open handle blocks
            # deletion on Windows.
            with open(path, 'rb') as segment:
                shutil.copyfileobj(segment, merged)
            os.remove(path)

def play(text):
    """Resolve the real playlist for a video page and start the download.

    Steps:
      1. Find the ``/upload/playdata/*.js`` resource in the page and fetch it.
      2. Remove the span delimited by the escaped "在线播放" ("play online")
         labels, if present, then take the first ``http....m3u8`` string.
      3. Percent-decode it (``tran1``) into the first-level playlist URL,
         always using the ``http`` scheme.
      4. Pick the player host whose name occurs in that URL, fetch the
         first-level playlist, build the second-level playlist URL, set the
         global ``wangzhi`` base URL and hand the playlist to ``download``.

    If a page cannot be fetched or an expected link is missing, a short
    message is printed and the function returns without downloading.

    Args:
        text: HTML of the video page, or ``None`` if fetching it failed.
    """
    global wangzhi
    if not text:
        # getHtml() already reported the failure.
        return
    js = re.findall(pattern=r'/upload/playdata/.*?\.js', string=text)
    if not js:
        print('Play Error: playdata js not found.')
        return
    text = getHtml('http://www.xxx.com' + js[0])
    if not text:
        return

    # Drop everything between the escaped "在线播放" labels before searching.
    tmp = re.findall(pattern=r'\%u5728\%u7ebf\%u64ad\%u653e.*\%24\%u5728\%u7ebf\%u64ad\%u653e', string=text)
    if tmp:
        text = text.replace(tmp[0], '')
    tmp = re.findall(pattern=r'http.*?\.m3u8', string=text)
    if not tmp:
        print('Play Error: m3u8 link not found.')
        return

    encoded = tmp[0]
    if encoded.startswith('https'):
        urlFake = 'http' + tran1(encoded.replace('https', ''))
    else:
        # A plain http link already carries its scheme; prefixing another
        # 'http' would produce 'httphttp...'.
        urlFake = tran1(encoded)

    name = ''
    playerName = ['這里存各種網站使用的主用/備用播放器']
    for i in playerName:
        if urlFake.find(i) != -1:
            name = i
            break
    if not name:
        # Without a host name the URLs built below would be 'http://.com/...'.
        print('Play Error: unknown player host.')
        return

    text = getHtml(urlFake)
    if not text:
        return
    tmp = re.findall(pattern=r'/.*?m3u8', string=text)
    if not tmp:
        print('Play Error: second-level m3u8 link not found.')
        return
    url = 'http://' + name + '.com/' + tmp[0]
    wangzhi = 'http://' + name + '.com/'
    download(getHtml(url))

def main():
    """Prompt for a video link, fetch its page and start the download."""
    print('下載位置在同目錄的video下')
    link = input('Please input a link:')
    page = getHtml('http://www.xxx.com/' + link)
    play(page)

# Run the interactive downloader only when executed as a script.
if __name__ == '__main__':
    main()


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM