Scraping Weibo Comments with Python (Part 2)


This follows up on the first post about scraping Sina Weibo comments. A few things were not well thought out the first time, and I have fixed them here. Someone asked about crawling "the first 50 pages of hot comments, or the first 100 pages of the latest comments". After looking at the data, it seems each post only exposes its first 100 pages of comments: once page=101 the XHR response comes back empty, with no content at all. So the script now grabs up to the most recent 100 pages of comments per post, i.e. about 1,000 comments.
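As a quick way to check that cap yourself, here is a minimal sketch (Python 2, to match the script below) that walks a post's comment pages until the API stops returning data. It assumes headers carries a valid logged-in Cookie for m.weibo.cn; probe_max_page and hard_limit are just illustrative names, and the JSON layout mirrors the one used by the full script.

import json
import time
import requests

def probe_max_page(weibo_id, headers, hard_limit=120):
    # Request comment pages one by one until the API returns no data.
    for page in range(1, hard_limit + 1):
        url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%d' % (weibo_id, page)
        resp = requests.get(url, headers=headers)
        d = json.loads(resp.text)
        # Past the cap (around page 100) the JSON no longer carries a 'data' field.
        if 'data' not in d or not d['data']:
            return page - 1
        time.sleep(1)  # be gentle with the API
    return hard_limit

In my case the data went empty right after page 100, which is why the full script simply caps the number of pages it requests per post.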

The code has changed a little, but the approach is the same.

# -*- coding: utf-8 -*-
# Python 2 script; only the imports actually used below are kept.
import sys
import json
import urllib2
import requests

import jieba
import numpy as np
import PIL.Image
import matplotlib.pyplot as plt
from wordcloud import WordCloud

reload(sys)
sys.setdefaultencoding('utf8')
def download(url, headers, num_try=2):
    # Fetch a URL with a simple retry; return the response body, or None on failure.
    while num_try > 0:
        num_try -= 1
        try:
            content = requests.get(url, headers=headers)
            return content.text
        except requests.exceptions.RequestException as e:
            print 'Download error', e
    return None
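
# Fill in the Cookie below with the value from a logged-in m.weibo.cn browser session.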
header_dict = {
                'Content-Type':'application/json; charset=utf-8',
                'Accept':'application/json, text/plain, */*',
                'Accept-Encoding':'gzip, deflate, br',
                'Accept-Language':'zh-CN,zh;q=0.9',
                'Connection':'keep-alive',
                'Cookie':'...',
                'Host':'m.weibo.cn',
                'Referer':'https://m.weibo.cn/u/1241148864?display=0&retcode=6102',
                'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
                'X-Requested-With':'XMLHttpRequest'
               }

def wordcloudplot(txt):
    # A font that supports Chinese is required, otherwise the cloud renders empty boxes.
    path = '/Users/cy/Downloads/msyh.ttf'
    # The mask image defines the overall shape of the word cloud.
    alice_mask = np.array(PIL.Image.open('/Users/cy/Desktop/1.jpg'))
    wordcloud = WordCloud(font_path=path,
                          background_color="white",
                          margin=5, width=1800, height=800, mask=alice_mask,
                          max_words=2000, max_font_size=60,
                          random_state=42)
    wordcloud = wordcloud.generate(txt)
    wordcloud.to_file('/Users/cy/Desktop/2.jpg')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


def main():
    # Segment the collected comment text with jieba and draw the word cloud.
    a = []
    f = open(r'/Users/cy/Downloads/a.json', 'r').read()
    words = list(jieba.cut(f))
    for word in words:
        if len(word) > 1:  # drop single-character tokens
            a.append(word)
    txt = r' '.join(a)
    wordcloudplot(txt)

def get_comment(que):
    # Fetch the comments of every weibo id in que and write their text to a.json.
    f = open('/Users/cy/Downloads/a.json', 'w')
    total_number = 10  # per-post page limit; raise it (e.g. to 100) to grab more pages
    for each in que:
        for i in range(1, total_number):
            textmood = {"id": each,
                        "page": i}
            textmood = json.dumps(textmood)
            uu = 'https://m.weibo.cn/status/' + str(each)
            header = {'Connection': 'keep-alive',
                      'Cookie': '.......',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      'Host': 'm.weibo.cn',
                      'Referer': uu,
                      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
                      'X-Requested-With': 'XMLHttpRequest'
                      }
            # id and page are already in the query string; the JSON body just mirrors them.
            url = 'https://m.weibo.cn/api/comments/show?id=%s&page=%s' % (str(each), str(i))
            print url

            req = urllib2.Request(url=url, data=textmood, headers=header)
            res = urllib2.urlopen(req)
            contents = res.read()
            d = json.loads(contents, encoding="utf-8")
            total_numbers = d['total_number']
            print total_numbers
            # 10 comments per page, so work out how many pages this post really has.
            pages_available = total_numbers / 10 + 1
            if 'data' in d:
                data = d['data']
                if data != "":
                    for each_one in data:
                        if each_one != "":
                            if each_one['text'] != "":
                                # Keep only the text before the first HTML tag and skip replies.
                                mm = each_one['text'].split('<')
                                if r'回复' not in mm[0]:
                                    index = mm[0]
                                    print index
                                    f.write(index.encode("u8"))
            # Stop early when this post has fewer pages than the limit.
            if i >= pages_available:
                break
    f.close()

def get_identified():
    # Collect the numeric ids of the user's recent posts, then fetch their comments.
    que = []
    url = 'https://m.weibo.cn/api/container/getIndex?uid=1241148864&luicode=10000011&lfid=100103type%3D3%26q%3D%E5%BC%A0%E6%9D%B0&featurecode=20000180&type=uid&value=1241148864&containerid=1076031241148864'
    for i in range(1, 3):
        if i > 1:
            url = 'https://m.weibo.cn/api/container/getIndex?uid=1241148864&luicode=10000011&lfid=100103type%3D3%26q%3D%E5%BC%A0%E6%9D%B0&featurecode=20000180&type=uid&value=1241148864&containerid=1076031241148864&page=' + str(i)
        print url

        req = download(url, header_dict, 2)
        print req
        if req is None:
            continue
        d = json.loads(req, encoding="utf-8")
        print d

        data = []
        try:
            data = d['data']['cards']
            print data
        except KeyError as e:
            print e

        if data != "":
            for each in data:
                print each['itemid']
                mm = each['itemid']
                if mm != "":
                    # The weibo id sits after the '-' in itemid; strip the leading character.
                    identity = mm.split('-')
                    num = identity[1][1:]
                    que.append(num)
                    print num

    get_comment(que)

if __name__ == '__main__':
    get_identified()
    main()
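
To run it end to end, fill in the two Cookie placeholders with values copied from a logged-in m.weibo.cn browser session and make sure the local font, mask image and output paths exist. get_identified() collects the weibo ids and writes the comment text to a.json, after which main() segments the text with jieba and renders the word cloud to 2.jpg.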
