Technology choices
Downloader: Requests
Parser: regular expressions
Result screenshot:

Get the packages ready
# -*- coding: utf-8 -*-
import requests  # third-party downloader
import re  # regular expressions
import json  # for serializing the data
from requests.exceptions import RequestException  # exception handling
from multiprocessing import Pool  # multiprocessing support
Start coding by creating a new .py file.
1. Downloading the page with requests
response = requests.get(url)
url: the link we currently want to crawl
requests.get(): fetches the page
Note that encoding needs attention here;

it is handled like this:
response = requests.get(url)
if response.status_code == 200:
    return response.content.decode("utf-8")
return None
What gets returned this way is the page content as a string.
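As a side note, requests also offers response.text, which decodes the bytes using the encoding requests guessed from the response; decoding response.content yourself pins the encoding to UTF-8 explicitly. A minimal sketch of the difference (the URL is just the one used later in this article):

import requests

response = requests.get('https://coding.imooc.com/?page=1')
# response.text decodes using the encoding requests inferred from the response
print(type(response.text))                     # <class 'str'>
# response.content is the raw bytes; decoding it ourselves fixes the encoding
print(type(response.content.decode("utf-8")))  # <class 'str'>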
2. except RequestException: catching exceptions
To make the code more robust, we catch exceptions wherever they might be raised.
try:
    response = requests.get(url)
    if response.status_code == 200:
        return response.content.decode("utf-8")
    return None
except RequestException:
    return None
More exceptions are documented on the official site:
http://www.python-requests.org/en/master/_modules/requests/exceptions/#RequestException
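If finer-grained handling is ever needed, the more specific classes in requests.exceptions (for example Timeout and ConnectionError) are subclasses of RequestException, so they can be caught individually before the catch-all. A hedged sketch; the timeout value is an arbitrary choice for illustration:

import requests
from requests.exceptions import Timeout, ConnectionError, RequestException

def get_one_page(url):
    try:
        response = requests.get(url, timeout=10)  # illustrative timeout
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except Timeout:
        return None  # the request took too long
    except ConnectionError:
        return None  # DNS failure, refused connection, etc.
    except RequestException:
        return None  # any other requests-related error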
At this point we can write a main method to drive the program.
The code is as follows:
# -*- coding: utf-8 -*-
import requests
from requests.exceptions import RequestException

def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None

def main():
    url = 'https://coding.imooc.com/?page=1'
    html = get_one_page(url)
    print(html)

if __name__ == '__main__':
    main()
That is enough to download the page.
Next comes parsing the page.
3. Introduction to the regular expressions used
re.compile() method: compiles a regular expression
It takes a regular-expression string and compiles it into a pattern object.
re.findall(pattern, html) method: finds all matching content
Parameters:
pattern: the compiled regular expression
html: the page content obtained via response.content.decode("utf-8")
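To make those two calls concrete, here is a toy example on a hand-written HTML fragment; the fragment and capture groups are made up for illustration and are much simpler than the real imooc markup:

import re

html = ('<div class="box"><span>Teacher A</span><p class="big-text">9.8</p></div>'
        '<div class="box"><span>Teacher B</span><p class="big-text">9.5</p></div>')

# re.S lets "." also match newlines, which matters for multi-line HTML
pattern = re.compile('<div class="box">.*?<span>(.*?)</span>.*?class="big-text">(.*?)</p>', re.S)

# findall returns a list of tuples: one tuple per match, one element per capture group
items = re.findall(pattern, html)
print(items)  # [('Teacher A', '9.8'), ('Teacher B', '9.5')]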
def parse_one_page(html):
    pattern = re.compile('<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
                         '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
                         '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
                         '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        # format each record as a dictionary
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5]
        }
Complete code:
# -*- coding: utf-8 -*-
import requests
import re
from requests.exceptions import RequestException

def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile('<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
                         '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
                         '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
                         '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5]
        }

def main():
    url = 'https://coding.imooc.com/?page=1'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)

if __name__ == '__main__':
    main()
Save the parsed data to a local file.
4. Saving to a file
with open('imooctest.txt', 'a', encoding='utf-8') as f:
with ... as: opens the file so that it is closed automatically and binds it to the object f
Parameters:
imooctest.txt: the file name
a: append mode
encoding: the encoding to write with; without it the saved data may come out garbled
f.write(json.dumps(content, ensure_ascii=False) + '\n')
json.dumps: converts the dictionary we just built into a string
ensure_ascii=False: without it, non-ASCII characters are written as \uXXXX escapes instead of readable text
+'\n': one record per line
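A quick illustration of what the ensure_ascii flag changes (the dictionary below is just sample data):

import json

item = {'teacher': 'bobby', 'title': 'Python分布式爬蟲'}

# default: non-ASCII characters are escaped as \uXXXX sequences
print(json.dumps(item))
# ensure_ascii=False keeps the Chinese characters readable in the output file
print(json.dumps(item, ensure_ascii=False))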
The code is as follows:
# -*- coding: utf-8 -*-
import requests
import re
import json
from requests.exceptions import RequestException

def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile('<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
                         '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
                         '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
                         '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5]
        }

def write_to_file(content):
    with open('imooctest.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main():
    url = 'https://coding.imooc.com/?page=1'
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    main()
5. Crawling all pages with multiple processes
Analyzing the site, we find that the pages to crawl are the following:
https://coding.imooc.com/?page=1
https://coding.imooc.com/?page=2
https://coding.imooc.com/?page=3
https://coding.imooc.com/?page=4
We need to construct URLs in this format.
The main calls could look something like this (a multi-process version using Pool is sketched below):
for i in range(4):
    main(i + 1)
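With multiprocessing, that loop is replaced by Pool.map, which hands each page number to a worker process. A minimal standalone sketch of the pattern, where the worker only builds the URL instead of actually crawling:

from multiprocessing import Pool

def crawl(page):
    # stand-in for main(page); in the real script this downloads and parses the page
    return 'https://coding.imooc.com/?page=' + str(page)

if __name__ == '__main__':
    pool = Pool()  # defaults to one worker per CPU core
    results = pool.map(crawl, [i + 1 for i in range(4)])
    print(results)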
Complete code:
# -*- coding: utf-8 -*-
import requests
import re
import json
from requests.exceptions import RequestException
from multiprocessing import Pool

def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile('<div class="box">.*?lecturer-info.*?<span>(.*?)</span>.*?shizhan-intro-box.*?title=".*?">'
                         '(.*?)</p>.*?class="grade">(.*?)</span>.*?imv2-set-sns.*?</i>'
                         '(.*?)</span>.*?class="big-text">(.*?)</p>.*?shizan-desc.*?>'
                         '(.*?)</p>.*?</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'teacher': item[0],
            'title': item[1],
            'grade': item[2],
            'people': item[3],
            'score': item[4],
            'describe': item[5]
        }

def write_to_file(content):
    with open('imoocAll.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

def main(page):
    url = 'https://coding.imooc.com/?page=' + str(page)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [i + 1 for i in range(4)])
    # single-process alternative:
    # for i in range(4):
    #     main(i + 1)
At this point we can crawl the information for every practical course on imooc. With this data in hand, you can run whatever analysis you like.
