代碼如下:
import getpass
import os
import re
import subprocess
import urllib.request
from io import StringIO

from docx import Document
from docx.shared import Inches
from w3lib.html import remove_tags
from zhihu_oauth import ZhihuClient
from zhihu_oauth.exception import NeedCaptchaException
# Upper bound on how many answers are exported per question.
# NOTE(review): the original loop broke only once its counter exceeded 50,
# which exports 51 answers; that count is kept here -- confirm whether 50
# was the intended limit.
MAX_ANSWERS = 51

# File the captcha image is saved to when the login requires one.
CAPTCHA_FILE = 'a.gif'


def safe_filename(title):
    """Return *title* made safe for use as a file name.

    Path separators, characters reserved on common file systems and control
    characters are replaced with ``_``.  If nothing usable is left after
    stripping whitespace, ``'question'`` is returned.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title).strip()
    return cleaned or 'question'


def write_question_html(out, title, answers, max_answers=MAX_ANSWERS):
    """Write the question title and its answers to *out* as one HTML page.

    Args:
        out: Writable text stream.
        title: Question title, written as the ``<h1>`` heading.
        answers: Iterable of answer objects exposing ``author.name``,
            ``voteup_count`` and ``content`` (written as-is inside ``<p>``).
        max_answers: Stop after this many answers have been written.

    Returns:
        The number of answers written.
    """
    out.write('<html><head></head><body>')
    out.write('<h1>' + title + '</h1>')
    written = 0
    if max_answers > 0:
        for written, answer in enumerate(answers, start=1):
            # Progress indicator on the console.
            print(answer.author.name)
            out.write('<br><br>' + "回答者:" + answer.author.name + ' '
                      + '贊數:' + str(answer.voteup_count))
            out.write('<p>' + answer.content + '</p>')
            # Check the limit at the end of the iteration so that no answer
            # beyond the limit is pulled from the iterable.
            if written >= max_answers:
                break
    out.write('</body></html>')
    return written


def login(client):
    """Log *client* in with credentials read from the terminal.

    Credentials are prompted for instead of being stored in the source.
    When the login raises ``NeedCaptchaException``, the captcha image is
    saved to ``a.gif``, the user is asked to type it in, and the login is
    retried with the same credentials.
    """
    username = input('username:')
    password = getpass.getpass('password:')
    try:
        client.login(username, password)
    except NeedCaptchaException:
        # Save the captcha image so the user can open it and read the text.
        with open(CAPTCHA_FILE, 'wb') as captcha_file:
            captcha_file.write(client.get_captcha())
        captcha = input('please input captcha:')
        client.login(username, password, captcha)


def main():
    """Export the answers of one Zhihu question to a .docx file.

    The answers are first written to a temporary HTML file, which pandoc
    then converts to .docx.

    Raises:
        ValueError: If the entered question id is not an integer.
        subprocess.CalledProcessError: If pandoc exits with an error.
    """
    client = ZhihuClient()
    login(client)

    num = input('請輸入問題序號:')
    question = client.question(int(num))

    title = question.title
    base_name = safe_filename(title)
    html_path = base_name + '.html'
    docx_path = base_name + '.docx'

    # pandoc reads its input as UTF-8, so write the HTML explicitly as UTF-8
    # rather than relying on the locale's default encoding.
    with open(html_path, 'w', encoding='utf-8') as html_file:
        write_question_html(html_file, title, question.answers)

    # check=True: if the conversion fails, an exception is raised before the
    # HTML file is removed, so the downloaded content is not lost.
    subprocess.run(['pandoc', html_path, '-o', docx_path], check=True)
    os.remove(html_path)


if __name__ == '__main__':
    main()
參考資料:https://github.com/7sDream/zhihu-oauth
這段代碼主要利用zhihu-oauth實現對知乎內容的獲取。這個項目很好地封裝了知乎的API,可以非常容易地實現知乎的各種操作。
在Ubuntu系統下通過
pip install -U zhihu_oauth
可以安裝zhihu_oauth模塊,為以後的下載做好準備。
在下載知乎文章中主要遇到的問題是,不好處理圖片。
當直接將content保存到docx中的時候,會直接將<img>標籤保存到docx中,不好處理。後來在issue中找到了比較好的解決方案。
一種解決方案是直接保存成html,然後用pandoc轉換成docx。具體的代碼實現可以參考上面的代碼。