一個簡單的 Python 爬蟲,爬取知乎


一個簡單的 Python 爬蟲,爬取知乎

  • 主要實現:爬取一個收藏夾裡所有問題答案下的圖片
  • 文字信息暫未收錄,可自行實現,比圖片更簡單
  • 具體代碼裡有詳細註釋,請自行閱讀

項目源碼:

  1 # -*- coding:utf-8 -*-
  2 
  3 from spider import SpiderHTML
  4 from multiprocessing import Pool
  5 import sys,urllib,http,os,random,re,time
  6 __author__ = 'waiting'
  7 '''
  8 使用了第三方的類庫 BeautifulSoup4,請自行安裝
  9 需要目錄下的spider.py文件
 10 運行環境:python3.4,windows7
 11 '''
 12 
 13 #收藏夾的地址
 14 url = 'https://www.zhihu.com/collection/30822111'  #page參數改為代碼添加
 15 
 16 #本地存放的路徑,不存在會自動創建
 17 store_path = 'E:\\zhihu\收藏夾\\會員才知道的世界'
 18 
 19 class zhihuCollectionSpider(SpiderHTML):
 20   def __init__(self,pageStart, pageEnd, url):
 21     self._url = url
 22     self._pageStart = int(pageStart)
 23     self._pageEnd = int(pageEnd)+1
 24     self.downLimit = 0            #低於此贊同的答案不收錄
 25 
 26   def start(self):
 27     for page in range(self._pageStart,self._pageEnd):    #收藏夾的頁數
 28       url = self._url + '?page='+str(page)
 29       content = self.getUrl(url)
 30       questionList = content.find_all('div',class_='zm-item')
 31       for question in questionList:            #收藏夾的每個問題
 32         Qtitle = question.find('h2',class_='zm-item-title')
 33         if Qtitle is None:                #被和諧了
 34           continue
 35 
 36         questionStr = Qtitle.a.string
 37         Qurl = 'https://www.zhihu.com'+Qtitle.a['href']  #問題題目
 38         Qtitle = re.sub(r'[\\/:*?"<>]','#',Qtitle.a.string)      #windows文件/目錄名不支持的特殊符號
 39         try:
 40           print('-----正在獲取問題:'+Qtitle+'-----')    #獲取到問題的鏈接和標題,進入抓取
 41         except UnicodeEncodeError:
 42           print(r'---問題含有特殊字符無法顯示---')
 43         try:
 44           Qcontent = self.getUrl(Qurl)
 45         except:
 46           print('!!!!獲取出錯!!!!!')
 47           pass
 48         answerList = Qcontent.find_all('div',class_='zm-item-answer  zm-item-expanded')
 49         self._processAnswer(answerList,Qtitle)            #處理問題的答案
 50         time.sleep(5)
 51 
 52 
 53   def _processAnswer(self,answerList,Qtitle):
 54     j = 0      
 55     for answer in answerList:
 56       j = j + 1
 57       
 58       upvoted = int(answer.find('span',class_='count').string.replace('K','000'))   #獲得此答案贊同數
 59       if upvoted < self.downLimit:
 60         continue
 61       authorInfo = answer.find('div',class_='zm-item-answer-author-info')        #獲取作者信息
 62       author = {'introduction':'','link':''}
 63       try:
 64         author['name'] = authorInfo.find('a',class_='author-link').string       #獲得作者的名字
 65         author['introduction'] = str(authorInfo.find('span',class_='bio')['title']) #獲得作者的簡介
 66         author['link'] = authorInfo.find('a',class_='author-link')['href']      
 67       except AttributeError:
 68         author['name'] = '匿名用戶'+str(j)
 69       except TypeError:                                  #簡介為空的情況
 70         pass                                     #匿名用戶沒有鏈接
 71 
 72       file_name = os.path.join(store_path,Qtitle,'info',author['name']+'_info.txt')
 73       if os.path.exists(file_name):              #已經抓取過
 74         continue
 75   
 76       self.saveText(file_name,'{introduction}\r\n{link}'.format(**author))      #保存作者的信息
 77       print('正在獲取用戶`{name}`的答案'.format(**author))
 78       answerContent = answer.find('div',class_='zm-editable-content clearfix')
 79       if answerContent is None:                #被舉報的用戶沒有答案內容
 80         continue
 81   
 82       imgs = answerContent.find_all('img')
 83       if len(imgs) == 0:                    #答案沒有上圖
 84         pass
 85       else:
 86         self._getImgFromAnswer(imgs,Qtitle,**author)
 87 
 88   #收錄圖片
 89   def _getImgFromAnswer(self,imgs,Qtitle,**author):
 90     i = 0
 91     for img in imgs:
 92       if 'inline-image' in img['class']:          #不抓取知乎的小圖
 93         continue
 94       i = i + 1
 95       imgUrl = img['src']
 96       extension = os.path.splitext(imgUrl)[1]
 97       path_name = os.path.join(store_path,Qtitle,author['name']+'_'+str(i)+extension)
 98       try:
 99         self.saveImg(imgUrl,path_name)          #捕獲各種圖片異常,流程不中斷
100       except:                  
101         pass
102         
103   #收錄文字
104   def _getTextFromAnswer(self):
105     pass
106 
# Command-line usage, e.g. ``zhihu.py 1 5`` crawls pages 1 through 5.
# With a single argument only that page is crawled; with no arguments
# page 1 is crawled.
if __name__ == '__main__':
    args = sys.argv[1:]
    if len(args) >= 2:
        page, pageEnd = args[0], args[1]
    elif len(args) == 1:
        page = pageEnd = args[0]
    else:
        page = pageEnd = 1

    spider = zhihuCollectionSpider(page, pageEnd, url)
    spider.start()

很多初學者,對Python的概念都是模糊不清的,C語言、Python能做什麼,學的時候,該按照什麼線路去學習,學完往哪方面發展,想深入了解,詳情可以點擊有道雲筆記鏈接了解:http://note.youdao.com/noteshare?id=e4fa02e7b56d7909a27674cdb3da08aa


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM