Web Crawler Project: Scraping Website Data and Generating a Word Cloud


1. Topic

News about Barcelona on Hupu Sports (hupu.com) from 2016 to the present

2. Write a crawler in Python to scrape data on the chosen topic from the web.

Use the browser's developer tools on the Hupu Barcelona news pages to locate the relevant elements.
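Before writing the full crawler, it can help to confirm that the selectors found with the developer tools actually match the page. A minimal sketch (using the same URL and CSS classes as the code below) might look like this:

import requests
from bs4 import BeautifulSoup

# Fetch the first list page and check the '.voice-main' blocks found by the selector.
res = requests.get('https://voice.hupu.com/soccer/tag/380.html')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
blocks = soup.select('.voice-main')
print(len(blocks))                                     # number of news entries on this page
if blocks:
    print(blocks[0].select('a')[0].attrs.get('href'))  # first article link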

# News list page
pageUrl = 'https://voice.hupu.com/soccer/tag/380.html'
def getListPage(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newslist = []
    for news in soup.select('.voice-main'):
        if len(news.select('.time')) > 0:
            newsUrl = news.select('a')[0].attrs['href']  # article link
            newslist.append(getNewDetail(newsUrl))
    return newslist

  

# Article detail
def getNewDetail(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    news = {}  # store the fields in a dict
    news['title'] = soupd.select('h1.headline')[0].text.strip()  # title
    news['source'] = soupd.select('.comeFrom')[0].text.strip().lstrip('來源:')  # source (strip the leading "來源:" label)
    d = soupd.select('.time')[0].text.strip()
    news['time'] = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')  # publication time
    n = soupd.select('.artical-content-read')[0].text
    if n.find('編輯:') > 0:
        news['content'] = n[:n.find('編輯:')].strip()  # cut off the trailing "編輯:" (editor) line
    else:
        news['content'] = n.strip()  # no editor marker found: keep the full article text
    writeNewsDetail(news['content'])
    news['newsUrl'] = newsUrl  # article link
    return news
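In the full script (listed at the end of this post), these two functions are driven by a loop over the paginated list URLs, collecting every article into newstotal:

pageUrl = 'https://voice.hupu.com/soccer/tag/380.html'
newstotal = []
for i in range(1, 100):  # crawl list pages 1-99 of the Barcelona tag
    listPageUrl = 'https://voice.hupu.com/soccer/tag/380.html?page={}'.format(i)
    newstotal.extend(getListPage(listPageUrl))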

  

# Export to an Excel file
df = pandas.DataFrame(newstotal)
df.to_excel('Barcelona.xlsx')
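Writing .xlsx files with to_excel requires an Excel writer backend such as openpyxl to be installed; if it is not available, exporting to CSV is a simple alternative (an optional sketch, not part of the original script):

# Alternative export: CSV with a BOM so Excel detects the UTF-8 Chinese text correctly.
df.to_csv('Barcelona.csv', encoding='utf-8-sig', index=False)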

# Read the text file written by the crawler
file = open('Barcelona.txt', 'r', encoding='utf-8')
word = file.read()
file.close()
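Note that writeNewsDetail opens Barcelona.txt in append mode ('a'), so the text accumulates across repeated runs. Truncating the file before a fresh crawl keeps the word counts limited to a single crawl (an optional step, not in the original script):

# Optional: clear the accumulated text file before re-running the crawler.
open('Barcelona.txt', 'w', encoding='utf-8').close()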


3. Run text analysis on the scraped data and generate a word cloud.

Text analysis:
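The snippet below assumes the preparation step from the full listing at the end of this post has already run: punctuation is replaced with spaces, the raw text is segmented with jieba, and stopwords are removed:

for c in sep:                            # sep: punctuation/digit characters to strip
    word = word.replace(c, ' ')
wordList = list(jieba.cut(word))         # segment the raw text with jieba
wordDict = {}
words = list(set(wordList) - exclude)    # unique words minus the stopword set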

for w in range(0, len(words)):
    wordDict[words[w]] = word.count(str(words[w]))  # frequency of each candidate word in the raw text

dictList = list(wordDict.items())
dictList.sort(key=lambda x: x[1], reverse=True)  # sort by frequency, highest first
bwc = {}
f = open('count.txt', 'a', encoding="utf-8")
for i in range(200):  # keep the top 200 words
    print(dictList[i])
    f.write(dictList[i][0] + ':' + str(dictList[i][1]) + '\n')
    bwc[dictList[i][0]] = dictList[i][1]  # frequencies fed to the word cloud
f.close()
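For reference, the same top-200 table can be produced more idiomatically with collections.Counter over the segmented token list from the full listing. This counts jieba tokens rather than raw substring occurrences, so the numbers can differ slightly from the loop above (a sketch of an alternative, not the method used in this post):

from collections import Counter

# Tally each segmented token that is not in the stopword set.
counts = Counter(w for w in wordList if w not in exclude)
bwc_alt = dict(counts.most_common(200))  # same shape as bwc: {word: frequency}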

  

Generate the word cloud:

font = r'C:\Windows\Fonts\simhei.TTF'  # a Chinese font is required, otherwise the words render as boxes
image = Image.open('./meixi.png')      # mask image
graph = np.array(image)
wc = WordCloud(font_path=font, background_color='White', max_words=50, mask=graph)
wc.generate_from_frequencies(bwc)      # build the cloud from the top-200 frequency dict
image_color = ImageColorGenerator(graph)  # colour generator derived from the mask
plt.imshow(wc)
plt.axis("off")
plt.show()
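As written, image_color is created but never applied, so the cloud keeps WordCloud's default colours. If the words should take their colours from the mask image instead, the cloud can be recoloured before plotting (an optional tweak, not in the original script):

# Optional: colour the words according to the underlying mask image.
plt.imshow(wc.recolor(color_func=image_color))
plt.axis("off")
plt.show()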

Original image (the mask, meixi.png):

Word cloud output:

4. Interpretation of the text analysis results

From the results, the terms that appear most often in the Barcelona news are: Messi, Neymar, and Real Madrid.

Messi: as Barcelona's number-one star, almost every report about the club involves him.

Neymar: for the past few years Neymar was Barcelona's second star; he left the club last year, so his frequency is slightly lower than Messi's.

Real Madrid: Barcelona's main title rival over the past decade; their meetings are El Clásico, the most closely watched fixture in Europe.

 

Full source code:

import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
from PIL import Image, ImageSequence
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud, ImageColorGenerator
import pandas
import jieba

def writeNewsDetail(content):
    f = open('Barcelona.txt', 'a', encoding='utf-8')
    f.write(content)
    f.close()

# News list page
def getListPage(pageUrl):
    res = requests.get(pageUrl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newslist = []
    for news in soup.select('.voice-main'):
        if len(news.select('.time')) > 0:
            newsUrl = news.select('a')[0].attrs['href']  # article link
            newslist.append(getNewDetail(newsUrl))
    return newslist

# Article detail
def getNewDetail(newsUrl):
    resd = requests.get(newsUrl)
    resd.encoding = 'utf-8'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    news = {}  # store the fields in a dict
    news['title'] = soupd.select('h1.headline')[0].text.strip()  # title
    news['source'] = soupd.select('.comeFrom')[0].text.strip().lstrip('來源:')  # source (strip the leading "來源:" label)
    d = soupd.select('.time')[0].text.strip()
    news['time'] = datetime.strptime(d, '%Y-%m-%d %H:%M:%S')  # publication time
    n = soupd.select('.artical-content-read')[0].text
    if n.find('編輯:') > 0:
        news['content'] = n[:n.find('編輯:')].strip()  # cut off the trailing "編輯:" (editor) line
    else:
        news['content'] = n.strip()  # no editor marker found: keep the full article text
    writeNewsDetail(news['content'])
    news['newsUrl'] = newsUrl  # article link
    return news

pageUrl = 'https://voice.hupu.com/soccer/tag/380.html'
newstotal = []
for i in range(1, 100):  # crawl list pages 1-99 of the Barcelona tag
    listPageUrl = 'https://voice.hupu.com/soccer/tag/380.html?page={}'.format(i)
    newstotal.extend(getListPage(listPageUrl))

for news in newstotal:
    print(news)

# Export to an Excel file
df = pandas.DataFrame(newstotal)
df.to_excel('Barcelona.xlsx')

# Read the text file written by the crawler
file = open('Barcelona.txt', 'r', encoding='utf-8')
word = file.read()
file.close()
# Punctuation and digits to strip, and stopwords (mostly single characters and filler words) to exclude from the counts
sep = '''-/.·+—…(),,。:?“”:、;!《》【】0123456789'''
exclude = {' ', '\n', '\t', '\xa0', '到', '文', '人', '動', '和', '分', '能', '年', '手', '得', '也', '本', '要', '就', '很', '於', '將', '都',
           '下', '但', '小', '而', '她', '更', '最', '重', '還', '自', '次', '之', '月', '局', '說', '間', '里', '兩', '沒', '家',
           '用', '級', '被', '並', '感', '回', '讓', '作', '去', '位', '虎撲', '與', '天', '看', '面', '事', '因', '行', '拿', '已', '其',
           '教', '起', '從', '未', '表', '強', '練', '可', '多', '現', '發', '組', '如', '接', '沒有', '可以', '此', '所', '我們', '出',
           '球', '的', '在', '我', '是', '一', '了', '他', '這', '場', '們', '會', '不', '比', '有', '為', '隊', '日', '馬', '中', '斯', '時', '梅',
           '拉', '蒂', '上', '對', '前', '個', '進', '虎', '(', '(', '撲', '經', '日訊', '只', '球員', '巴', '克', '加', '阿', '球隊', '好', '內', '報', '特',
           '以', '那', '過', '道', '奧', '世', '杯', '他們', '已經', '當', '尼', '利', '達', '德', '賽季', '戰', '世界', '足球', '你', '想', '聯', '同',
           '自己', '表示', '望', '烏', '轉', 'o', '約', '第', '卡', '名', '么', '據', '體育', 'a', '新', '希望', '非', '踢', '根', '談', '諾',
           '可能', '問', '這樣', '因為', '衛', '正', '巴黎', '西班牙', '打', '做', '非常', '聯賽', 'r', '00', '羅貝托', '奇', '如果', '認為', '布',
           '萬', '期', '給', '地', '點', '體育報', '尼奧', '情', '離', '接受', '需', '勝', '入', '相', '但是', '每', '近', '不是', '消息', 'n',
           '表現', '圖', '三', '外', '變'
           }

# Register multi-character terms so jieba keeps them as single tokens
jieba.add_word('國家德比')
jieba.add_word('C羅')
jieba.add_word('梅開二度')
jieba.add_word('大四喜')
jieba.add_word('拉基蒂奇')

for c in sep:
    word = word.replace(c, ' ')          # replace punctuation/digits with spaces
wordList = list(jieba.cut(word))         # segment the raw text with jieba
wordDict = {}
words = list(set(wordList) - exclude)    # unique words minus the stopword set

for w in range(0, len(words)):
    wordDict[words[w]] = word.count(str(words[w]))

dictList = list(wordDict.items())
dictList.sort(key=lambda x: x[1], reverse=True)
bwc = {}
f = open('count.txt', 'a', encoding="utf-8")
for i in range(200):
    print(dictList[i])
    f.write(dictList[i][0] + ':' + str(dictList[i][1]) + '\n')
    bwc[dictList[i][0]] = dictList[i][1]
f.close()

font = r'C:\Windows\Fonts\simhei.TTF'
image = Image.open('./meixi.png')
graph = np.array(image)
wc = WordCloud(font_path=font, background_color='White', max_words=50, mask=graph)
wc.generate_from_frequencies(bwc)
image_color = ImageColorGenerator(graph)
plt.imshow(wc)
plt.axis("off")
plt.show()

  

