[小爬蟲]——某網站視頻爬蟲
-
技術路線:requests + re
-
關於exe下載:可能涉及到侵權
-
源代碼:下面
-
爬取思路:在html中找出加載資源的js文件,從中截取出一段以.m3u8結尾的「亂碼」字符串(即%XX形式的16進制百分號編碼),經過16進制解碼後得到一個有效的m3u8鏈接;爬取此m3u8文件,並在此文件中找到新的.m3u8鏈接,再次轉碼後下載,裡面存儲有此視頻的全部.ts文件;將這些.ts文件爬下來並按順序合成即可
-
效果:10s內完成400M的下載
-
總結:去看了許多人的blog,許多網站存儲、播放視頻都是採取一樣的策略,即[ 兩層.m3u8 + 無加密.ts文件 ]
-
代碼:
import requests, re, os, threadpool
# Output directory for downloaded segments and the merged video.
# The trailing backslash lets callers build paths by plain concatenation.
root = r'C:\Users\Administrator\PycharmProjects\freeTest\video' + '\\'

# Request headers sent with every HTTP call.
kv = {'User-Agent': 'Mozilla/5.0'}

# Total number of .ts segments found in the playlist (set by download()).
mp4sNum = 0

# Number of segments processed so far (incremented by fastNB(), used for progress).
tim = 0

# Per-worker buckets of segment paths (filled by makeList()).
ls = []

# Base URL that segment paths are appended to (set by play()).
wangzhi = ''
def getHtml(url):
    """Fetch *url* and return the response body as text.

    The request is sent with the module-level ``kv`` headers and a
    30-second timeout. The response encoding is set from the detected
    (apparent) encoding before the text is returned.

    Returns:
        The decoded body, or ``None`` when the request fails or the server
        answers with an HTTP error status ('Html Error.' is printed).
    """
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are treated as "page unavailable";
        # programming errors are no longer swallowed by a bare except.
        print('Html Error.')
        return None
    r.encoding = r.apparent_encoding
    return r.text
def tran2(t):
    """Convert a single hexadecimal digit character to its integer value.

    ``'A'``-``'F'`` map to 10-15; lowercase ``'a'``-``'f'`` are accepted as
    well. Any other input is handed to ``int()`` unchanged, so decimal
    digits work and invalid characters raise ``ValueError``.
    """
    # Only single characters are looked up, so '' (which str.find would
    # report at index 0) still falls through to int() and raises.
    letter_index = 'ABCDEF'.find(t.upper()) if len(t) == 1 else -1
    if letter_index != -1:
        return 10 + letter_index
    return int(t)
def tran1(s):
    """Decode ``%XX`` percent-escapes in *s* and return the result.

    Every ``%`` followed by exactly two hexadecimal digits (either case) is
    replaced by the character with that code point; all other text,
    including a malformed or truncated escape, is copied through unchanged.

    The previous index-based loop looked at ``s[i - 1]`` / ``s[i - 2]`` even
    for ``i < 2``, which wrapped around to the end of the string (and raised
    ``IndexError`` on one-character input), and it could not parse a high
    nibble of ``A``-``F``.
    """
    return re.sub(
        r'%([0-9A-Fa-f]{2})',
        lambda match: chr(int(match.group(1), 16)),
        s,
    )
def makeList(n, mp4s):
    """Split *mp4s* into consecutive chunks stored in the global ``ls``.

    ``n + 10`` empty buckets are appended to ``ls`` first. Items are then
    placed, in order, into buckets starting at index 1 (bucket 0 is never
    filled here); a bucket receives ``len(mp4s) // n + 1`` items before the
    next one is used.
    """
    ls.extend([] for _ in range(n + 10))
    chunk_size = len(mp4s) // n + 1
    bucket = 1
    for seen, segment in enumerate(mp4s, start=1):
        # Move on to the next bucket once the current one is full.
        if seen > bucket * chunk_size:
            bucket += 1
        ls[bucket].append(segment)
def fastNB(tvs):
    """Download every segment path in *tvs* into the ``root`` directory.

    Each path is appended to the global base URL ``wangzhi`` and saved as
    ``<root><basename>.ts``. Segments whose file already exists are skipped.
    A failed request is retried until it succeeds (the retry is unbounded,
    as before). Progress is printed as ``tim / mp4sNum`` after every
    attempt.

    NOTE(review): the HTTP status is not checked, so an error page returned
    with a non-2xx status would still be written as segment data.
    """
    global tim, mp4sNum
    if not tvs:
        return
    for segment in tvs:
        tim += 1
        name = segment.split('/')[-1].split('.')[0]
        path = root + str(name) + '.ts'
        if os.path.exists(path):
            continue
        while True:
            try:
                # The timeout keeps a stalled connection from hanging this
                # worker forever; a timeout simply triggers another attempt.
                r = requests.get(wangzhi + segment, headers=kv, timeout=30)
            except requests.RequestException:
                print('\r進度為:{:.2%}'.format(tim / mp4sNum), end='')
                continue
            # File errors (e.g. a missing output directory) are permanent, so
            # they propagate instead of being retried in a silent loop.
            with open(path, 'wb') as f:
                f.write(r.content)
            print('\r進度為:{:.2%}'.format(tim / mp4sNum), end='')
            break
def download(text):
    """Download all ``.ts`` segments listed in *text* and merge them.

    Steps:
      1. Extract the segment paths from the playlist text and store their
         count in the global ``mp4sNum``.
      2. Ask for a thread count, split the paths with ``makeList`` and let a
         thread pool run ``fastNB`` on every bucket in ``ls``.
      3. Rename the downloaded files to zero-padded sequence numbers in
         playlist order.
      4. Concatenate them into ``<root>video.ts``, deleting each segment
         file after it has been appended.
    """
    global mp4sNum
    mp4s = re.findall(pattern=r'/\d*?/\w*?/.*?/.*?/.*?\.ts', string=text)
    mp4sNum = len(mp4s)
    print('共有', mp4sNum, '個ts文件')
    # At least one worker is required; 0 would divide by zero in makeList.
    n = max(1, int(input('輸入創建的線程數量:')))
    makeList(n, mp4s)
    pool = threadpool.ThreadPool(n)
    for req in threadpool.makeRequests(fastNB, ls):
        pool.putRequest(req)
    pool.wait()

    print('\n正在重新排序...')
    ordered_paths = []
    for index, segment in enumerate(mp4s, start=1):
        name = segment.split('/')[-1].split('.')[0]
        # zfill(5) pads 1-4 digit numbers to five digits and leaves longer
        # numbers untouched, matching the previous hand-written padding.
        ordered = root + str(index).zfill(5) + '.ts'
        os.rename(root + name + '.ts', ordered)
        ordered_paths.append(ordered)

    print('正在拼裝視頻...')
    # Context managers close both handles even if a read/write fails.
    with open(root + 'video.ts', 'wb') as merged:
        for path in ordered_paths:
            with open(path, 'rb') as part:
                # Copy in fixed-size chunks; binary data has no real "lines".
                for chunk in iter(lambda: part.read(1 << 20), b''):
                    merged.write(chunk)
            os.remove(path)
def play(text):
    """Resolve the video playlist from a page's HTML and start the download.

    Flow:
      1. Find the ``/upload/playdata/*.js`` resource in *text* and fetch it.
      2. Remove the percent-escaped "online play" label span if present,
         then take the first ``http...m3u8`` match and decode it with
         ``tran1``.
      3. Fetch that first-level playlist, take the first ``/...m3u8`` path
         from it and build the second-level playlist URL from the matched
         player name.
      4. Store the base URL in the global ``wangzhi`` and hand the
         second-level playlist text to ``download``.

    If a page cannot be fetched or an expected link is missing, a message is
    printed and the function returns without downloading.
    """
    global wangzhi
    if text is None:
        print('Play Error: the video page could not be loaded.')
        return
    js = re.findall(pattern=r'/upload/playdata/.*?\.js', string=text)
    if not js:
        print('Play Error: no playdata js file found in the page.')
        return
    text = getHtml('http://www.xxx.com' + js[0])
    if text is None:
        return  # getHtml has already reported the failure

    # Strip the escaped "online play" label span before searching for links.
    label = re.findall(pattern=r'\%u5728\%u7ebf\%u64ad\%u653e.*\%24\%u5728\%u7ebf\%u64ad\%u653e', string=text)  # extract "online play"
    if label:
        text = text.replace(label[0], '')
    links = re.findall(pattern=r'http.*?\.m3u8', string=text)
    if not links:
        print('Play Error: no m3u8 link found in the playdata js.')
        return
    # NOTE(review): this removes 'https' and prepends 'http'; it appears to
    # assume the encoded link starts with 'https' - confirm against real data.
    urlFake = 'http' + tran1(links[0].replace('https', ''))

    name = ''
    playerName = ['這里存各種網站使用的主用/備用播放器']
    for i in playerName:
        if urlFake.find(i) != -1:
            name = i
            break

    text = getHtml(urlFake)
    if text is None:
        return  # getHtml has already reported the failure
    tmp = re.findall(pattern=r'/.*?m3u8', string=text)
    if not tmp:
        print('Play Error: no second-level m3u8 link found.')
        return
    url = 'http://' + name + '.com/' + tmp[0]
    wangzhi = 'http://' + name + '.com/'

    playlist = getHtml(url)
    if playlist is None:
        return  # getHtml has already reported the failure
    download(playlist)
def main():
    """Prompt for a page link, fetch that page and start processing it."""
    print('下載位置在同目錄的video下')
    link = input('Please input a link:')
    page = getHtml('http://www.xxx.com/' + link)
    play(page)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()