Python新手爬蟲三：爬取PPT模板

本文轉載自查看原文 2020-03-16 12:50 3680 Python爬蟲

爬取網站：第一PPT（http://www.1ppt.com/），此網站真的很良心。

老樣子，先上最後成功的源碼（先在D盤創建一個"D:\PPT"文件夾，然後直接執行代碼即可獲取到PPT）：

import os
import urllib.request  # binds `urllib` AND loads the `request` submodule used below

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


def getPPT(url, save_dir=r'D:\PPT'):
    """Crawl the template categories linked from *url* and download their PPTs.

    The function reads the module-level ``headers`` dict (bound by the
    script's ``__main__`` guard) and sends it with every HTTP request.

    For each of the first 56 category links found in the navigation block
    ``<div class="col_nav i_nav clearfix">``:

    * a sub-directory named after the link's ``title`` attribute is created
      under *save_dir*; if that directory already exists the category is
      considered done and skipped;
    * listing pages 1..14 of the category are requested (pages answering
      404, or without a ``<ul class="tplist">``, are skipped);
    * for every template on a listing page, the first link inside
      ``<ul class="downurllist">`` of its detail page is downloaded and
      saved as ``<first h1 text>.rar``.

    A failure on a single template is reported with ``print`` and the crawl
    continues. Errors while fetching the home page or a listing page are
    not caught and propagate to the caller.

    Args:
        url: Site root without a trailing slash, e.g. ``http://www.1ppt.com``.
        save_dir: Directory under which the per-category folders are
            created. Missing parent directories are created as needed.
    """
    f = requests.get(url, headers=headers)  # fetch the home page
    f.encoding = f.apparent_encoding  # use the encoding detected from the body
    soup1 = BeautifulSoup(f.text, 'lxml')
    # Every <a> inside the navigation <div> is one template category.
    classHtml = soup1.find('div', class_="col_nav i_nav clearfix").select('a')

    for i in classHtml[:56]:  # only the first 56 categories
        # Third '/'-separated piece of the href is the category keyword
        # that appears in the listing-page file names.
        classUrl = i['href'].split('/')[2]

        category_dir = os.path.join(save_dir, i['title'])
        if os.path.isdir(category_dir):
            # An existing folder means this category was handled before.
            continue
        os.makedirs(category_dir)

        n = 0  # number of templates attempted in this category
        # The real page count is unknown, so probe a fixed 14 pages and
        # ignore the ones that do not exist.
        for y in range(1, 15):
            pagesUrl = url + i['href'] + '/ppt_' + classUrl + '_' + str(y) + '.html'
            a = requests.get(pagesUrl, headers=headers)
            if a.status_code == 404:
                continue

            soup2 = BeautifulSoup(a.text, 'lxml')
            listing = soup2.find('ul', class_='tplist')
            if listing is None:
                # find() returns None when the page has no template list.
                continue

            # 'li > a' selects the link to each template's detail page.
            for downppt in listing.select('li > a'):
                try:
                    # The download link lives on the template's own page.
                    b = requests.get(url + downppt['href'], headers=headers)
                    b.encoding = b.apparent_encoding
                    soup3 = BeautifulSoup(b.text, 'lxml')
                    downList = soup3.find('ul', class_='downurllist').select('a')
                    pptName = soup3.select('h1')[0].get_text()  # template title
                    print('Downloading......')
                    urllib.request.urlretrieve(
                        downList[0]['href'],
                        os.path.join(category_dir, pptName + '.rar'),
                    )
                    print(i['title'] + 'type template download completed the ' + str(n) + ' few.' + pptName)
                except Exception:
                    # Best effort: one broken template must not stop the crawl.
                    # `Exception` (not a bare except) lets Ctrl-C still abort.
                    print(i['title'] + 'type download failed the ' + str(n) + ' few.')
                n += 1

if __name__ == '__main__':
    # getPPT() reads the module-level name `headers`, so it has to be bound
    # here before the crawl starts. A random User-Agent string is used.
    random_agent = UserAgent().random
    headers = {'user-agent': random_agent}
    site_root = 'http://www.1ppt.com'
    getPPT(site_root)