How the video scraper works: get all the course (category) IDs from the catalog page → get each one's sub-items → parse the sub-item links to work out how many lessons that course has → loop over the lesson links and download the videos they point to.
Required Python library: requests (install it with pip install requests).
The HTML parsing is adapted from code found online (it uses html.parser.HTMLParser from the standard library).
It could be optimized further, but I was lazy!
# coding: UTF-8
import requests
from html.parser import HTMLParser

# Global variables

id_list = set()   # course IDs that have already been processed
id_dict = {}      # maps course ID -> number of lessons in that course
cookies = {}      # cookies parsed from the account's Cookie header

# HTML parser: collects the value of a given attribute (attr) on a given tag (key)
class MyHTMLParser(HTMLParser):
    def __init__(self, key, attr):
        HTMLParser.__init__(self)
        self.links = []
        self.keys = key
        self.attr = attr

    def handle_starttag(self, tag, attrs):
        # e.g. key="source", attr="src" collects every <source src="..."> value
        if tag == self.keys:
            for (variable, value) in attrs:
                if variable == self.attr:
                    self.links.append(value)


# Parse a raw Cookie header string into the global cookies dict
def getCookies(cookies_str):
    global cookies
    for line in cookies_str.split(';'):
        # maxsplit=1: split only on the first '=' so values may themselves contain '='
        name, value = line.strip().split('=', 1)
        cookies[name] = value

# Fetch one lesson page, pick out the <source src="..."> link and download the .mp4
def getHtml(url, key, value):
    global cookies
    r = requests.get(url, cookies=cookies)
    content = r.content.decode('UTF-8')
    hp = MyHTMLParser("source", "src")
    hp.feed(content)
    hp.close()
    print(hp.links)
    for link in hp.links:
        link_str = str(link)
        if link_str.find(".mp4") >= 0:
            downloadFile(link, key, value)
        else:
            print("No matching video found")


# Count the lessons in a course by counting its ".html?ss=1" lesson links
def getCourseNum(url):
    global cookies
    url_list = set()
    r = requests.get(url, cookies=cookies)
    content = r.content.decode('UTF-8')
    hp = MyHTMLParser("a", "href")
    hp.feed(content)
    hp.close()
    for link in hp.links:
        link_str = str(link)
        if link_str.find("http://www.jikexueyuan.com/course/") >= 0 and link_str.find(".html?ss=1") >= 0:
            url_list.add(link_str)
    return len(url_list)

# Collect every course ID linked from the catalog page
def getIdList(root):
    global cookies
    r = requests.get(root, cookies=cookies)
    content = r.content.decode('UTF-8')
    hp = MyHTMLParser("a", "href")
    hp.feed(content)
    hp.close()
    # reference the globals defined at the top of the file
    global id_list
    global id_dict

    prefix = "http://www.jikexueyuan.com/course/"
    for link in hp.links:
        link_str = str(link)
        if link_str.startswith(prefix) and link_str.find(".html") >= 0:
            # slice out the course ID between the prefix and ".html"
            # (lstrip/rstrip remove characters, not substrings, so they are unreliable here)
            c_id = link_str[len(prefix):link_str.find(".html")]
            if c_id not in id_list:
                id_dict[c_id] = getCourseNum(link_str)
                print(c_id, id_dict[c_id])
                id_list.add(c_id)
    print(id_dict)

# Download one video and save it as <course id>_<lesson number>.mp4
def downloadFile(url, key, value):
    #url = 'http://cv4.jikexueyuan.com/10de45bbf83e450ff5e11ff4599d7166/201603202253/cocos2d-x/course_712/01/video/c712b_01_h264_sd_960_540.mp4'
    r = requests.get(url)
    file_name = str(key) + "_" + str(value) + ".mp4"
    with open(file_name, "wb") as code:
        code.write(r.content)

if __name__ == "__main__":
    count = 0
    # Parse the cookies; downloading during the free period requires the cookies of a logged-in account
    cookiesStr = "obtain this from the Chrome browser"
    getCookies(cookiesStr)

    root = "http://ke.jikexueyuan.com/xilie/331?huodong=shequn_0307"
    getIdList(root)

    head = "http://www.jikexueyuan.com/course/"

    for key in id_dict:
        if id_dict[key] <= 0:
            print(id_dict[key], "no lessons found")
            continue  # skip this course instead of stopping the whole loop
        for i in range(1, id_dict[key] + 1):
            url = head + key + "_" + str(i) + ".html?ss=1"
            print("Downloading:")
            print(url)
            count += 1
            getHtml(url, key, i)
    print("Total videos:")
    print(count)
Room for improvement: the script never retrieves the video titles, so the downloaded files are awkward to browse. It could fetch each video's title and create a folder per course, which would make the saved videos much easier to watch; a rough sketch of that idea follows below.
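A minimal sketch of that improvement, reusing the global cookies dict from the script above. getCourseTitle and downloadFileNamed are hypothetical helpers that do not exist in the original script, and scraping the <title> tag is only an assumption about where the course name can be found:

import os
import requests

def getCourseTitle(url):
    # hypothetical helper: pull the <title> text from the course page and
    # strip characters that are not legal in file names
    r = requests.get(url, cookies=cookies)
    content = r.content.decode('UTF-8')
    start = content.find("<title>") + len("<title>")
    end = content.find("</title>", start)
    title = content[start:end].strip()
    return "".join(ch for ch in title if ch not in r'\/:*?"<>|')

def downloadFileNamed(url, title, index):
    # hypothetical replacement for downloadFile: group the videos into a
    # folder named after the course and number the lessons inside it
    os.makedirs(title, exist_ok=True)
    file_name = os.path.join(title, "%s_%02d.mp4" % (title, index))
    r = requests.get(url)
    with open(file_name, "wb") as f:
        f.write(r.content)

downloadFileNamed would then take the place of downloadFile, with the title looked up once per course in the main loop and passed down through getHtml.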
The cookies really can be copied and used directly. That means if someone intercepts a user's browser login information, they can also log in directly and pull out useful data. Is this how hackers steal user information by grabbing cookies? Interesting.
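A quick way to see this for yourself, as a minimal sketch: paste your own Cookie header (copied from the browser) into the placeholder string below. With valid cookies, requests is treated by the server just like the browser session the cookies came from:

import requests

# placeholder: paste your own Cookie header here
cookie_header = "name1=value1; name2=value2"
cookies = dict(item.strip().split("=", 1) for item in cookie_header.split(";") if "=" in item)

# request a page that normally requires login; with valid cookies the server
# responds as if the original browser session were making the request
r = requests.get("http://www.jikexueyuan.com/", cookies=cookies)
print(r.status_code)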