python 網頁爬取數據生成文字雲圖

本文轉載自查看原文 2017-09-17 10:22 1480 python 文字雲/ PYTHON/ python 數據抓取

1. 需要的三個包：

from wordcloud import WordCloud        #詞雲庫
import matplotlib.pyplot as plt        #數學繪圖庫
import jieba;

2. 定義變量（將對應的變量放到一個全局的文件 Param.py 中）：

import re;
# First page of the comment listing; the scraper starts here.
pdurl_first = 'https://movie.douban.com/subject/26363254/comments?start=0'

# Request headers: a desktop-browser User-Agent string.
head = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/59.0.3071.109 Chrome/59.0.3071.109 Safari/537.36',
}

# Captures the href of the anchor tagged class="next", up to (but not
# including) the first "&amp;" in that href.
reg = re.compile(r'<a href="(.*?)&amp;.*?class="next">')

# Cookies sent with every request.
# NOTE(review): these look like values captured from a live browser session
# (e.g. "dbcl2", "ck") -- confirm, and prefer loading them from outside the
# source tree instead of committing them.
# Fixes relative to the earlier revision: the analytics cookie is spelled
# "__utmb" (two underscores, like its "__utm*" siblings), and the duplicated
# "push_doumail_num" entry is listed only once.
cookies = {
    "__utma": "30149280.503249607.1504402391.1504402391.1504402391.1",
    "__utmb": "30149280.2.9.1504402391",
    "__utmc": "30149280",
    "__utmt": "1",
    "__utmz": "30149280.1504402391.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
    "ap": "1",
    "as": '"https://movie.douban.com/subject/26363254/comments?start=225&limit=20&sort=new_score&status=P"',
    "bid": "g7k4BGd2sRk",
    "ck": "76vs",
    "dbcl2": '"166279730:fohmXhoM9uU"',
    "ps": "y",
    "push_doumail_num": "0",
}

3. 抓取數據

import requests;
import re;
from GrabData import Param;
import pandas as pd;
from bs4 import BeautifulSoup;

class GrabComent:
    """Scrape the comment pages starting at ``Param.pdurl_first`` into a CSV file.

    Instantiating the class runs the whole crawl: each page is fetched, the
    comment fields matched by ``ren`` are appended to ``csv_path``, and the
    "next page" link matched by ``Param.reg`` is followed. The crawl stops
    when a response is not HTTP 200 or when a page has no "next" link.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            ``request_timeout``.
    """

    # Applied to the full page text (re.S lets "." span newlines); every match
    # yields a 6-tuple of the captured groups, which becomes one CSV row.
    ren = re.compile(r'<span class="votes">(.*?)</span>.*?comment">.*?</span>.*?<span.*?class="">(.*?)</a>.*?<span>(.*?)</span>.*?title="(.*?)"></span>.*?title="(.*?)"><p .*? > (.*?)</p>',re.S)

    # Destination file; rows from every page are appended to it.
    csv_path = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'

    # Seconds to wait for a response; without a timeout requests can block forever.
    request_timeout = 30

    def __init__(self):
        print('開始抓取數據')
        # The "next" href is relative (it starts at the query string), so it is
        # joined onto the listing URL with its own query string stripped.
        base_url = Param.pdurl_first.split('?', 1)[0]
        html = self._fetch(Param.pdurl_first)
        while html.status_code == 200:
            # Parse and store the current page BEFORE looking for the next
            # link, so the final page (which has no link) is not lost.
            zhanlang = re.findall(self.ren, html.text)
            print(zhanlang)
            if zhanlang:
                pd.DataFrame(zhanlang).to_csv(
                    self.csv_path, header=False, index=False,
                    mode='a+', encoding='utf-8')  # 'a+' appends to the file

            next_links = re.findall(Param.reg, html.text)
            if not next_links:
                # Last page reached: stop cleanly instead of raising IndexError.
                break
            url_next = base_url + next_links[0]
            print("下一頁地址："+url_next)
            html = self._fetch(url_next)

    def _fetch(self, url):
        """Return the response for ``url`` using the shared headers and cookies."""
        return requests.get(url, headers=Param.head, cookies=Param.cookies,
                            timeout=self.request_timeout)


if __name__ == '__main__':
    GrabComent()

4. 生成雲圖

from wordcloud import WordCloud        #詞雲庫
import matplotlib.pyplot as plt        #數學繪圖庫
import jieba;

class WordYun:
    """Render a word cloud from the text stored in ``csv_path``.

    Instantiating the class runs the whole pipeline: the file is read and
    segmented with jieba, then the result is drawn with WordCloud and shown
    in a matplotlib window.
    """

    # Input file. A normal (non-raw) string so each "\\" is a single
    # backslash; the earlier raw-string form kept doubled backslashes.
    csv_path = 'H:\\python_projects\\ticket\\zhanlangpinglun.csv'

    # Font handed to WordCloud; without it the text renders as empty boxes.
    font_path = "D:\\Windows\\Fonts\\STFANGSO.ttf"

    def __init__(self):
        print("開始讀取文件!")
        self.main()

    def main(self):
        """Read the segmented text and display it as a word cloud."""
        text = self.readFile()
        self.showTitle(text)

    def showTitle(self, text1):
        """Generate a word cloud from ``text1`` and show it without axes.

        Args:
            text1: the text passed to ``WordCloud.generate``.
        """
        wc1 = WordCloud(
            background_color="white",
            width=1000,
            height=860,
            font_path=self.font_path,
            margin=2,
        )
        wc2 = wc1.generate(text1)
        plt.imshow(wc2)
        plt.axis("off")
        plt.show()

    def readFile(self):
        """Return the file's words of length > 1, joined by single spaces.

        The file is read as UTF-8 and closed as soon as it has been read.
        NOTE(review): UTF-8 assumes the file was written with pandas'
        default ``to_csv`` encoding -- confirm against the writer.
        """
        with open(self.csv_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Single-character tokens are dropped.
        words = [word for word in jieba.cut(content) if len(word) > 1]
        txt = ' '.join(words)
        print("readFile返回的結果："+txt)
        return txt


if __name__ == '__main__':
    WordYun()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 python 爬取豆瓣電影短評並wordcloud生成詞雲圖 python生成文字雲 python爬取網頁數據 python爬取網頁數據方法如何使用python爬取網頁動態數據 python-2：爬取某個網頁（虎撲）帖子的標題做詞雲圖 python爬取網頁數據 python之爬取網頁數據總結（一） python爬取網頁數據 Java爬取B站彈幕 —— Python雲圖Wordcloud生成彈幕詞雲