import requests
from bs4 import BeautifulSoup
import jieba
# Fetch a page and parse it into a BeautifulSoup tree.
def get_html(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    Returns None when the request fails or the server answers with an
    HTTP error status.
    """
    try:
        # Timeout keeps the scraper from hanging forever on a dead host.
        response = requests.get(url, timeout=10)
        # BUG FIX: the original referenced .raise_for_status without
        # calling it (missing parentheses), so HTTP errors were ignored.
        response.raise_for_status()
        # Trust the encoding sniffed from the body; many Chinese sites
        # mis-declare their charset in the response headers.
        response.encoding = response.apparent_encoding
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException:
        # Narrowed from a bare except; keep the original best-effort
        # behaviour of reporting the failure and returning None.
        print('爬取出错')
        return None
# Count how often each keyword appears.
def count_word(txt):
    """Return a dict mapping each multi-character word in *txt* to its count.

    *txt* is segmented with jieba; single-character tokens are skipped.
    """
    freq = {}
    for token in jieba.cut(txt):
        # Single characters are mostly noise (particles, punctuation).
        if len(token) > 1:
            freq[token] = freq.get(token, 0) + 1
    return freq
def main():
    """Scrape the C114 front page and print its 15 most frequent words."""
    url = 'http://www.c114.com.cn/'
    html = get_html(url)
    print('get html')
    # BUG FIX: get_html returns None on failure; the original would then
    # crash with AttributeError on .get_text.
    if html is None:
        return
    t = html.get_text('+', strip=True)
    # Keep only non-ASCII characters, i.e. strip English letters/digits.
    txt = "".join(i for i in t if ord(i) >= 256)
    print('get txt')
    counts = count_word(txt)
    items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
    # BUG FIX: slicing avoids an IndexError when fewer than 15 distinct
    # words were found (the original indexed items[0..14] blindly).
    for word, count in items[:15]:
        print('{:<15}{:>5}'.format(word, count))
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
# 分别以 C114 通信网 (http://www.c114.com.cn/) 和 通信人家园 (http://www.txrjy.com/forum.php) 这两个网站为例


