1.主題
虎撲體育網2016年至今關於巴薩的新聞
2.用python 編寫爬蟲程序,從網絡上爬取相關主題的數據。
在虎撲巴薩新聞中用開發者工具找出相關信息
#新聞列表
# Hupu Soccer list page for the FC Barcelona tag.
pageUrl = 'https://voice.hupu.com/soccer/tag/380.html'


def getListPage(pageUrl):
    """Fetch one list page and return detail dicts for every article on it."""
    response = requests.get(pageUrl)
    response.encoding = 'utf-8'
    page = BeautifulSoup(response.text, 'html.parser')
    collected = []
    for entry in page.select('.voice-main'):
        # Only blocks that carry a timestamp are real articles.
        if len(entry.select('.time')) > 0:
            detail_url = entry.select('a')[0].attrs['href']  # detail-page link
            collected.append(getNewDetail(detail_url))
    return collected
# 詳情信息 def getNewDetail(newsUrl): resd = requests.get(newsUrl) resd.encoding = 'utf-8' soupd = BeautifulSoup(resd.text, 'html.parser') news = {} # 用字典存放信息 news['title'] = soupd.select('h1.headline')[0].text.strip() # 標題 news['source'] = soupd.select('.comeFrom')[0].text.strip().lstrip('來源:') # 來源 d = soupd.select('.time')[0].text.strip() news['time'] = datetime.strptime(d, '%Y-%m-%d %H:%M:%S') # 時間 n = soupd.select('.artical-content-read')[0].text if n.find('編輯:') > 0: news['content'] = n[:n.find('編輯:')].strip() else: news['content'] = n[:n.find('.artical-fn-tip-height')].strip() writeNewsDetail(news['content']) news['newsUrl'] = newsUrl # 鏈接 return (news)
# 導出到Excel表 df = pandas.DataFrame(newstotal) df.to_excel('Barcelona.xlsx') #生成文本 file = open('Barcelona.txt', 'r', encoding='utf-8') word = file.read() file.close()
3.對爬了的數據進行文本分析,生成詞雲。
文本分析:
for w in range(0, len(words)): wordDict[words[w]] = word.count(str(words[w])) dictList = list(wordDict.items()) dictList.sort(key=lambda x: x[1], reverse=True) bwc = {} f = open('count.txt', 'a', encoding="utf-8") for i in range(200): print(dictList[i]) f.write(dictList[i][0] + ':' + str(dictList[i][1]) + '\n') bwc[dictList[i][0]] = dictList[i][1] f.close()
生成詞雲:
font = r'C:\Windows\Fonts\simhei.TTF' image = Image.open('./meixi.png') graph = np.array(image) wc = WordCloud(font_path=font, background_color='White', max_words=50, mask=graph) wc.generate_from_frequencies(bwc) image_color = ImageColorGenerator(graph) plt.imshow(wc) plt.axis("off") plt.show()
原圖:
詞雲:
4.對文本分析結果進行解釋說明。
從結果看,巴薩新聞中出現最多的是:梅西、內馬爾、皇馬。
梅西:梅西作為巴薩的頭號球星,關於巴薩的報道幾乎都與他有關
內馬爾:過去幾年內馬爾是巴薩的二號球星,去年離開巴薩,詞頻略低於梅西
皇馬:皇馬是巴薩近十年的爭冠對手,兩隊對陣既是國家德比,也是歐洲關注度最高的比賽
源代碼:
import re
from datetime import datetime

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas
import requests
from bs4 import BeautifulSoup
from PIL import Image, ImageSequence
from wordcloud import WordCloud, ImageColorGenerator


def writeNewsDetail(content):
    """Append one article body to the accumulated corpus file."""
    # FIX: context manager instead of manual open/close.
    with open('Barcelona.txt', 'a', encoding='utf-8') as f:
        f.write(content)


# List page
def getListPage(pageUrl):
    """Fetch one list page and return detail dicts for every article on it."""
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newslist = []
    for news in soup.select('.voice-main'):
        # Only blocks that carry a timestamp are real articles.
        if len(news.select('.time')) > 0:
            newsUrl = news.select('a')[0].attrs['href']  # detail-page link
            newslist.append(getNewDetail(newsUrl))
    return newslist


# Detail page
def getNewDetail(newsUrl):
    """Fetch one news detail page and return a dict with title, source,
    time, content and url; also appends the body to Barcelona.txt."""
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    news = {}  # collect the fields in a dict
    news['title'] = soupd.select('h1.headline')[0].text.strip()  # headline
    # BUG FIX: lstrip('來源:') removes a *character set*, not a prefix; strip
    # the literal prefix instead so source names keep their leading chars.
    source = soupd.select('.comeFrom')[0].text.strip()
    prefix = '來源:'
    if source.startswith(prefix):
        source = source[len(prefix):]
    news['source'] = source
    d = soupd.select('.time')[0].text.strip()
    news['time'] = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')  # publish time
    n = soupd.select('.artical-content-read')[0].text
    if n.find('編輯:') > 0:
        # Cut off the trailing "editor: xxx" credit.
        news['content'] = n[:n.find('編輯:')].strip()
    else:
        # BUG FIX: the original searched the text for the CSS selector string
        # '.artical-fn-tip-height'; find() returned -1 and n[:-1] silently
        # dropped the last character. Keep the whole body.
        news['content'] = n.strip()
    writeNewsDetail(news['content'])
    news['newsUrl'] = newsUrl  # article link
    return news


# ---- crawl ----
pageUrl = 'https://voice.hupu.com/soccer/tag/380.html'
newstotal = []
for i in range(1, 100):
    listPageUrl = 'https://voice.hupu.com/soccer/tag/380.html?page={}'.format(i)
    newstotal.extend(getListPage(listPageUrl))
for news in newstotal:
    print(news)

# ---- export to Excel ----
df = pandas.DataFrame(newstotal)
df.to_excel('Barcelona.xlsx')

# ---- reload the corpus for analysis ----
# FIX: context manager instead of manual open/close.
with open('Barcelona.txt', 'r', encoding='utf-8') as file:
    word = file.read()

# Punctuation/digits to blank out before segmentation.
sep = '''-/.·+—…(),,。:?“”:、;!《》【】0123456789'''
# Stop words filtered out of the frequency table.
# FIX: the pasted original had two syntax errors here — '奧''世' (adjacent
# string literals fusing two entries into one) and ,自己' (missing the
# opening quote); both are restored as the intended separate entries.
exclude = {' ', '\n', '\t', '\xa0', '到', '文', '人', '動', '和', '分', '能',
           '年', '手', '得', '也', '本', '要', '就', '很', '於', '將', '都',
           '下', '但', '小', '而', '她', '更', '最', '重', '還', '自', '次',
           '之', '月', '局', '說', '間', '里', '兩', '沒', '家', '用', '級',
           '被', '並', '感', '回', '讓', '作', '去', '位', '虎撲', '與', '天',
           '看', '面', '事', '因', '行', '拿', '已', '其', '教', '起', '從',
           '未', '表', '強', '練', '可', '多', '現', '發', '組', '如', '接',
           '沒有', '可以', '此', '所', '我們', '出', '球', '的', '在', '我',
           '是', '一', '了', '他', '這', '場', '們', '會', '不', '比', '有',
           '為', '隊', '日', '馬', '中', '斯', '時', '梅', '拉', '蒂', '上',
           '對', '前', '個', '進', '虎', '(', '(', '撲', '經', '日訊', '只',
           '球員', '巴', '克', '加', '阿', '球隊', '好', '內', '報', '特',
           '以', '那', '過', '道', '奧', '世', '杯', '他們', '已經', '當',
           '尼', '利', '達', '德', '賽季', '戰', '世界', '足球', '你', '想',
           '聯', '同', '自己', '表示', '望', '烏', '轉', 'o', '約', '第',
           '卡', '名', '么', '據', '體育', 'a', '新', '希望', '非', '踢',
           '根', '談', '諾', '可能', '問', '這樣', '因為', '衛', '正',
           '巴黎', '西班牙', '打', '做', '非常', '聯賽', 'r', '00',
           '羅貝托', '奇', '如果', '認為', '布', '萬', '期', '給', '地',
           '點', '體育報', '尼奧', '情', '離', '接受', '需', '勝', '入',
           '相', '但是', '每', '近', '不是', '消息', 'n', '表現', '圖',
           '三', '外', '變'}

# Teach jieba the domain-specific terms so they segment as single tokens.
jieba.add_word('國家德比')
jieba.add_word('C羅')
jieba.add_word('梅開二度')
jieba.add_word('大四喜')
jieba.add_word('拉基蒂奇')

for c in sep:
    word = word.replace(c, ' ')
wordList = list(jieba.cut(word))
wordDict = {}
words = list(set(wordList) - exclude)
# NOTE(review): str.count() counts substring hits, so short words nested in
# longer ones are over-counted — frequencies are approximate.
for w in words:
    wordDict[w] = word.count(str(w))
dictList = list(wordDict.items())
dictList.sort(key=lambda x: x[1], reverse=True)  # most frequent first
bwc = {}
# FIX: guard against fewer than 200 distinct words (original raised
# IndexError) and close the file via a context manager.
with open('count.txt', 'a', encoding='utf-8') as f:
    for i in range(min(200, len(dictList))):
        print(dictList[i])
        f.write(dictList[i][0] + ':' + str(dictList[i][1]) + '\n')
        bwc[dictList[i][0]] = dictList[i][1]

# ---- render the word cloud, using the Messi photo as a shape mask ----
font = r'C:\Windows\Fonts\simhei.TTF'
image = Image.open('./meixi.png')
graph = np.array(image)
wc = WordCloud(font_path=font, background_color='White', max_words=50,
               mask=graph)
wc.generate_from_frequencies(bwc)
image_color = ImageColorGenerator(graph)
plt.imshow(wc)
plt.axis("off")
plt.show()