爬取網站中出現的高頻關鍵詞


import requests
from bs4 import BeautifulSoup
import jieba
    
    
# Fetch the page source and parse it
def get_html(url, timeout=10):
    """Download *url* and return its content parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get`` so a stalled server cannot hang
            the script forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend,
        or ``None`` if the request failed (connection error, timeout, or
        an HTTP error status). A message is printed in the failure case.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Must be *called*: a bare attribute reference never raises, so
        # 4xx/5xx responses would otherwise be parsed as if they were fine.
        response.raise_for_status()
        # Use the encoding detected from the body rather than the one
        # guessed from the headers.
        response.encoding = response.apparent_encoding
    except requests.RequestException:
        # Only request-related failures are handled here; unrelated errors
        # (and KeyboardInterrupt) are allowed to propagate.
        print('爬取出錯')
        return None
    return BeautifulSoup(response.text, 'html.parser')


# Count how many times each keyword occurs
def count_word(txt):
    """Segment *txt* with jieba and tally the resulting words.

    Tokens whose length is exactly one character are ignored.

    Args:
        txt: The text to segment.

    Returns:
        A dict mapping each kept token to the number of times it occurred.
    """
    frequency = {}
    for token in jieba.cut(txt):
        if len(token) != 1:
            frequency[token] = frequency.get(token, 0) + 1
    return frequency


def main():
    """Fetch the C114 home page and print its most frequent words.

    At most 15 entries are printed, ordered by descending count; fewer are
    printed when the page yields fewer distinct words. Nothing further is
    printed if the page could not be fetched.
    """
    url = 'http://www.c114.com.cn/'
    html = get_html(url)
    if html is None:
        # get_html yields None after a failed fetch (it prints its own
        # message), so there is nothing to analyse.
        return
    print('get html')
    t = html.get_text('+', strip=True)
    # Keep only characters with a code point >= 256, which drops English
    # letters, digits and ASCII punctuation (including the '+' separator).
    txt = "".join(ch for ch in t if ord(ch) >= 256)
    print('get txt')
    counts = count_word(txt)
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # Slicing avoids an IndexError when fewer than 15 words were found.
    for word, count in items[:15]:
        print('{:<15}{:>5}'.format(word, count))


# Run only when executed as a script, so importing this module does not
# trigger a network request.
if __name__ == '__main__':
    main()

 分別以 C114通信網[http://www.c114.com.cn/] 和 通信人家園[http://www.txrjy.com/forum.php] 這兩個網站為例:

 

 

 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM