# -*- coding: utf-8 -*-
"""Crawl the answers of a Zhihu question page by page and save them as text.

Each page is rendered in headless Chrome through Selenium, scrolled so that
lazily loaded answers appear, and the long answers are written to
``<questionId>/page_<page>.txt``. A small tkinter form is also built for
entering the question id and the page range.
"""
import html.parser
import os
import random
import re
import threading
import time
import traceback
import urllib.request
import tkinter.messagebox
from tkinter import *
from tkinter import ttk

import requests
from bs4 import BeautifulSoup
from selenium import webdriver


def _scroll_page(driver, times):
    """Simulate a user scrolling down so that more answers get loaded."""
    for i in range(times):
        print('第'+str(i)+'次點擊')
        driver.execute_script("window.scrollTo(0, " + str(1000 * i) + ");")
        time.sleep(3)
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")


def getHtml(questionId, page):
    """Render one answer page of a question and save its long answers.

    Args:
        questionId: Question identifier as a string; it is used both in the
            URL and as the name of the output directory.
        page: Page number to fetch.

    Returns:
        A BeautifulSoup tree parsed from the rendered page source.

    The output directory ``questionId`` must already exist (``main`` creates
    it). Only answers whose text is longer than 300 characters are saved.
    """
    chrome_options = webdriver.ChromeOptions()
    # Run maximized (full-screen window); without it element lookups fail.
    chrome_options.add_argument('--start-maximized')
    # Hide the "controlled by automated software" info bar.
    chrome_options.add_argument('--disable-infobars')
    # Incognito mode.
    chrome_options.add_argument('--incognito')
    # Do not show a browser window.
    chrome_options.add_argument('--headless')

    # Start the browser.
    driver = webdriver.Chrome(executable_path="chromedriver",
                              options=chrome_options)
    try:
        # Open the Zhihu page to crawl.
        driver.get("https://www.zhihu.com/question/" + questionId
                   + "/answers/updated?page=" + str(page))
        _scroll_page(driver, 12)

        # Parse the rendered HTML of the page.
        result_soup = BeautifulSoup(driver.page_source, 'html.parser')

        answers = driver.find_elements_by_class_name("RichContent-inner")
        pieces = ["start\n"]
        for answer in answers:
            answer_text = answer.text  # one WebDriver round trip per answer
            if len(answer_text) > 300:
                pieces.append(
                    answer_text + "\n-----------我是分隔符------\n")

        out_path = os.path.join(questionId, "page_" + str(page) + ".txt")
        with open(out_path, 'w', encoding="utf-8") as zhpage:
            zhpage.write("".join(pieces))
        print("爬取回答頁面成功!!!")
        return result_soup
    finally:
        # Always shut the browser down, even when scraping or writing fails,
        # so failed pages do not leave Chrome processes behind.
        driver.quit()


def readTxt(path):
    """Return the whole content of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


def main(questionId, startPage, endPage):
    """Crawl pages ``startPage`` .. ``endPage - 1`` of a question.

    Args:
        questionId: Question identifier as a string.
        startPage: First page to fetch (inclusive).
        endPage: Page at which to stop (exclusive, as in ``range``).

    A failure on one page is printed and the crawl goes on with the next
    page. After each successful page the crawler sleeps 5-7 seconds.
    """
    mkdir([questionId])
    for page in range(startPage, endPage):
        try:
            getHtml(questionId, page)
        except Exception:
            # Best effort: report the failure and continue with the next page.
            traceback.print_exc()
        else:
            time.sleep(random.randint(5, 7))


def mkdir(paths):
    """Create every directory in ``paths`` that does not exist yet."""
    for path in paths:
        if not os.path.exists(path):
            # makedirs also creates missing parents; exist_ok covers the case
            # where the directory appears between the check and the call.
            os.makedirs(path, exist_ok=True)


def getanswer():
    """Button callback: read the form fields and start the crawl.

    Reads the module-level ``var_id``, ``var_start`` and ``var_end``
    variables. ``Variable.get()`` may return strings, so the values are
    converted explicitly before use.

    Raises:
        ValueError: If a page field does not contain an integer.
    """
    questionId = str(var_id.get()).strip()
    start = int(var_start.get())
    end = int(var_end.get())
    main(questionId, start, end)


if __name__ == '__main__':
    # NOTE(review): this hard-coded crawl runs before the form is built.
    main("308829198", 101, 200)

    tk = Tk()
    tk.title('獲取知乎問題所有答案')
    tk.geometry('600x150')
    frame = Frame(tk)
    Label(tk, text='問題標識:(例:https://www.zhihu.com/question/324405640/answer/720532471中的324405640 )',
          width=200, anchor=W, justify=LEFT).place(x=10, y=10)

    var_id = Variable()
    question_id = Entry(tk, textvariable=var_id, width=30)
    question_id.place(x=10, y=40)

    Label(tk, text='開始頁:').place(x=230, y=40)
    var_start = Variable()
    Entry(tk, textvariable=var_start, width=10).place(x=290, y=40)
    var_start.set(1)

    Label(tk, text='結束頁:').place(x=360, y=40)
    var_end = Variable()
    Entry(tk, textvariable=var_end, width=10).place(x=420, y=40)
    var_end.set(10)

    Button(tk, text="獲取答案", command=getanswer).place(x=200, y=80)
    # NOTE(review): the event loop is disabled, so the window never becomes
    # interactive and the script exits right after building the widgets.
    #tk.mainloop()