Python 獲取頁面文字信息


# -*- coding: utf-8 -*-

from selenium import webdriver
import time, re,requests,os,time,random,traceback
import urllib.request,threading
from bs4 import BeautifulSoup
import html.parser
from tkinter import *
from tkinter import ttk
import tkinter.messagebox 


def getHtml(questionId,page):
    """Scrape one page of answers for a Zhihu question and save the long ones.

    Opens ``https://www.zhihu.com/question/<questionId>/answers/updated?page=<page>``
    in headless Chrome, scrolls through the page, collects the text of every
    element with class ``RichContent-inner`` that is longer than 300
    characters, and writes the result to ``<questionId>/page_<page>.txt``.

    Args:
        questionId: Question identifier as a string (it is concatenated into
            the URL and the output path).
        page: Page number; converted with ``str()``.

    Returns:
        The ``BeautifulSoup`` object parsed from the final page source.

    Raises:
        Whatever the webdriver or ``open()`` raise; the browser is always
        shut down first.
    """
    chrome_options = webdriver.ChromeOptions()
    # Maximized window; the original author notes that element lookup fails without it.
    chrome_options.add_argument('--start-maximized')
    # Hide the "browser is being controlled by automated software" banner.
    chrome_options.add_argument('--disable-infobars')
    # Incognito mode.
    chrome_options.add_argument('--incognito')
    # No visible browser window.
    chrome_options.add_argument('--headless')

    # NOTE(review): `executable_path=` and `find_elements_by_class_name` are
    # Selenium 3-style calls — confirm against the installed Selenium version.
    driver = webdriver.Chrome(executable_path = "chromedriver",options=chrome_options)
    try:
        driver.get("https://www.zhihu.com/question/"+questionId+"/answers/updated?page="+str(page))

        # Simulate a user scrolling down in 1000px steps, pausing between
        # steps, then jump to the bottom of the document.
        for i in range(12):
            print(''+str(i)+'次點擊')
            driver.execute_script("window.scrollTo(0, "+str(1000 * i)+");")
            time.sleep(3)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Parse the rendered HTML; this is the function's return value.
        result_soup = BeautifulSoup(driver.page_source, 'html.parser')

        pieces = ["start\n"]
        for answer in driver.find_elements_by_class_name("RichContent-inner"):
            # Read the element text once; each access goes through the driver.
            text = answer.text
            if len(text) > 300:
                pieces.append(text + "\n-----------我是分隔符------\n")
        txt = "".join(pieces)
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        driver.quit()

    # The target folder must already exist.
    with open(questionId +"/page_"+str(page)+".txt", 'w',encoding="utf-8") as zhpage:
        zhpage.write(txt)
    print("爬取回答頁面成功!!!")
    return result_soup

def readTxt(path):
    """Return the entire contents of the UTF-8 text file at ``path``.

    Args:
        path: Path of the file to read.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even if read() raises.
    with open(path,'r',encoding='utf-8') as f:
        return f.read()
        

def main(questionId,startPage,endPage):
    """Crawl pages ``startPage`` up to (but not including) ``endPage``.

    Creates the output directory named after the question, then calls
    ``getHtml`` for each page, sleeping a random 5-7 seconds after each
    successful page.

    Args:
        questionId: Question identifier; coerced to ``str``.
        startPage: First page to fetch; coerced to ``int``.
        endPage: Page at which to stop (exclusive); coerced to ``int``.

    Raises:
        ValueError: If a page bound cannot be converted to an integer.
    """
    # Values may arrive as strings (e.g. read from Tk variables); range()
    # needs ints and the URL/path building needs a str.
    questionId = str(questionId)
    startPage = int(startPage)
    endPage = int(endPage)

    mkdir([questionId])
    for page in range(startPage,endPage):
        try:
            getHtml(questionId,page)
            time.sleep(random.choice(range(5,8)))
        except Exception:
            # Best-effort crawl: report the failure and move on to the next page.
            traceback.print_exc()

def mkdir(paths):
    """Create every directory in ``paths`` that does not exist yet.

    Args:
        paths: Iterable of directory paths. Missing parent directories are
            created as well.

    Raises:
        OSError: If a directory cannot be created (for example, the path
            exists but is not a directory).
    """
    for path in paths:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(path, exist_ok=True)

def getanswer():
    """Button callback: read the form fields and start the crawl.

    Reads the module-level Tk variables ``var_id``, ``var_start`` and
    ``var_end``, validates them, and calls ``main``. Shows an error dialog
    and returns without crawling when the input is invalid.
    """
    questionId = str(var_id.get()).strip()
    if not questionId:
        tkinter.messagebox.showerror('輸入錯誤', '請輸入問題標識')
        return

    # Entry contents come back as text; main() iterates with range(), which needs ints.
    try:
        start = int(str(var_start.get()).strip())
        end = int(str(var_end.get()).strip())
    except ValueError:
        tkinter.messagebox.showerror('輸入錯誤', '開始頁和結束頁必須是整數')
        return

    main(questionId,start,end)

# Script entry point: crawl a hard-coded question id and page range.
# NOTE(review): this runs before the Tk widgets below are created, and the
# GUI code below is outside this guard, so it also executes on import —
# confirm whether that is intended.
if __name__ == '__main__':
    main(str(308829198),101,200)


# Build the main window; its title reads "get all answers of a Zhihu question".
tk = Tk()
tk.title('獲取知乎問題所有答案')
tk.geometry('600x150')

# NOTE(review): `frame` is created but not referenced again in the visible code.
frame = Frame(tk)
# Question-id field: label with an example URL, plus the entry bound to var_id.
Label(tk,text='問題標識:(例:https://www.zhihu.com/question/324405640/answer/720532471中的324405640 )',width=200,anchor=W, justify=LEFT).place(x=10,y=10)
var_id = Variable()
question_id = Entry(tk,textvariable=var_id,width=30)
question_id.place(x=10,y=40)

# Start-page field, bound to var_start, default 1.
Label(tk,text='開始頁:').place(x=230,y=40)
var_start = Variable()
# place() returns None, so `e` does not hold the Entry widget.
e = Entry(tk, textvariable=var_start,width=10).place(x=290,y=40)
var_start.set(1)


# End-page field, bound to var_end, default 10.
Label(tk,text='結束頁:').place(x=360,y=40)
var_end = Variable()
e = Entry(tk, textvariable=var_end,width=10).place(x=420,y=40)
var_end.set(10)

# "Get answers" button; clicking it calls getanswer().
Button(tk, text="獲取答案", command=getanswer).place(x=200,y=80)
# The event-loop call below is commented out, so no Tk main loop is started here.
#tk.mainloop()

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM