Preface: this article shows the simplest way to collect Sina Weibo data, namely the posts published by specified Weibo users and the comments those posts receive. The list of target users and other properties can be adjusted through configuration options.
Since we are after the simplest approach, we should first compare the candidate crawl targets. The most obvious is the regular web site; next there is the m-site, i.e. the mobile web version; and finally there is a legacy entry point for the old version of the site. The web site can be ruled out right away: it is the most troublesome target, because its requests are encrypted by JavaScript and are painful to handle.
So why not settle for the next best thing? If we watch the m-site's network requests, we can find one request that returns exactly the weibo data we need. With that settled, we can start on the code.
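As a quick sanity check, a minimal probe like the following shows the kind of JSON that m-site endpoint returns (a sketch; the user_id value is a placeholder, and some accounts may additionally require a logged-in cookie):

import requests

# Minimal probe of the m-site getIndex API.
# containerid '100505' + uid returns the user's profile card, which is
# exactly what get_user_info below consumes.
user_id = '1669879400'  # placeholder uid
params = {'containerid': '100505' + user_id}
r = requests.get('https://m.weibo.cn/api/container/getIndex?', params=params)
js = r.json()
print(js['ok'])                                # 1 on success
print(js['data']['userInfo']['screen_name'])   # the account's display name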
The first step is to fetch the user's profile. It contains the user's total post count, from which we can compute the total number of pages. The code is as follows:
def get_json(self, params):
    """Fetch the JSON data of a page."""
    url = 'https://m.weibo.cn/api/container/getIndex?'
    r = requests.get(url, params=params, cookies=self.cookie)
    return r.json()

def get_page_count(self):
    """Get the total number of weibo pages."""
    try:
        weibo_count = self.user['statuses_count']
        page_count = int(math.ceil(weibo_count / 10.0))  # 10 weibo per page
        return page_count
    except KeyError:
        sys.exit(u'Program error: user info has no statuses_count')
def get_user_info(self):
    """Get the user's profile information."""
    params = {'containerid': '100505' + str(weibo_config['user_id'])}
    js = self.get_json(params)
    if js['ok']:
        info = js['data']['userInfo']
        user_info = {}
        user_info['id'] = weibo_config['user_id']
        user_info['screen_name'] = info.get('screen_name', '')
        user_info['gender'] = info.get('gender', '')
        user_info['statuses_count'] = info.get('statuses_count', 0)
        user_info['followers_count'] = info.get('followers_count', 0)
        user_info['follow_count'] = info.get('follow_count', 0)
        user_info['description'] = info.get('description', '')
        user_info['profile_url'] = info.get('profile_url', '')
        user_info['profile_image_url'] = info.get('profile_image_url', '')
        user_info['avatar_hd'] = info.get('avatar_hd', '')
        user_info['urank'] = info.get('urank', 0)
        user_info['mbrank'] = info.get('mbrank', 0)
        user_info['verified'] = info.get('verified', False)
        user_info['verified_type'] = info.get('verified_type', 0)
        user_info['verified_reason'] = info.get('verified_reason', '')
        user = self.standardize_info(user_info)
        self.user = user
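standardize_info is referenced above but not shown in this post; a minimal sketch, assuming it only needs to normalize string values (zero-width characters, stray whitespace) before storage, could look like this:

def standardize_info(self, weibo):
    """A sketch (assumption): normalize string fields so they can be
    stored safely; the real project may do more cleanup."""
    for k, v in weibo.items():
        if isinstance(v, str):
            weibo[k] = v.replace(u'\u200b', '').strip()
    return weibo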
Next, the data is collected page by page:
page_count = self.get_page_count()
wrote_count = 0
page1 = 0
random_pages = random.randint(1, 5)
self.start_date = datetime.now().strftime('%Y-%m-%d')
for page in tqdm(range(1, page_count + 1), desc='Progress'):
    is_end = self.get_one_page(page)
    if is_end:
        break

    if page % 20 == 0:  # write to MySQL every 20 pages
        self.weibo_to_mysql(wrote_count)
        wrote_count = self.got_count

    # A random wait helps avoid rate limiting. Crawling too fast gets the
    # crawler blocked for a while (the block lifts automatically after some
    # time); random waits mimic human behaviour and lower that risk. By
    # default we wait 6-10 seconds after every 1-5 pages; if you still get
    # blocked, increase the sleep time.
    if (page - page1) % random_pages == 0 and page < page_count:
        sleep(random.randint(6, 10))
        page1 = page
        random_pages = random.randint(1, 5)

self.weibo_to_mysql(wrote_count)  # write the remaining (fewer than 20) pages
print(u'Crawl finished: %d weibo collected in total' % self.got_count)
The code for collecting a single page of weibo is as follows:
def get_one_page(self, page):
    """Get all weibo on one page."""
    try:
        js = self.get_weibo_json(page)
        if js['ok']:
            weibos = js['data']['cards']
            for w in weibos:
                if w['card_type'] == 9:
                    wb = self.get_one_weibo(w)
                    if wb:
                        if wb['id'] in self.weibo_id_list:
                            continue
                        created_at = datetime.strptime(
                            wb['created_at'], '%Y-%m-%d')
                        since_date = datetime.strptime(
                            self.since_date, '%Y-%m-%d')
                        if created_at < since_date:
                            # Pinned weibo can predate since_date;
                            # skip them instead of stopping.
                            if self.is_pinned_weibo(w):
                                continue
                            else:
                                print(u'{}Fetched weibo of {} ({}), page {}{}'.format(
                                    '-' * 30, self.user['screen_name'],
                                    self.user['id'], page, '-' * 30))
                                return True  # reached since_date: stop paging
                        if 'retweet' not in wb.keys():
                            self.weibo.append(wb)
                            self.weibo_id_list.append(wb['id'])
                            self.got_count += 1
        print(u'{}Fetched weibo of {} ({}), page {}{}'.format(
            '-' * 30, self.user['screen_name'],
            self.user['id'], page, '-' * 30))
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()
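get_weibo_json is not shown above. Assuming it reuses get_json with the mobile API's weibo-list container ('107603' + uid), a sketch could be:

def get_weibo_json(self, page):
    """A sketch (assumption): fetch one page of the user's weibo list
    via the '107603' + uid container of the getIndex API."""
    params = {
        'containerid': '107603' + str(weibo_config['user_id']),
        'page': page
    }
    return self.get_json(params)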
The code that extracts the details of a single weibo:
def get_one_weibo(self, info):
    """Get all the information of one weibo."""
    try:
        weibo_info = info['mblog']
        weibo_id = weibo_info['id']
        retweeted_status = weibo_info.get('retweeted_status')
        is_long = weibo_info.get('isLongText')
        if retweeted_status:  # a retweet
            retweet_id = retweeted_status.get('id')
            is_long_retweet = retweeted_status.get('isLongText')
            if is_long:
                weibo = self.get_long_weibo(weibo_id)
                if not weibo:
                    weibo = self.parse_weibo(weibo_info)
            else:
                weibo = self.parse_weibo(weibo_info)
            if is_long_retweet:
                retweet = self.get_long_weibo(retweet_id)
                if not retweet:
                    retweet = self.parse_weibo(retweeted_status)
            else:
                retweet = self.parse_weibo(retweeted_status)
            retweet['created_at'] = self.standardize_date(
                retweeted_status['created_at'])
            weibo['retweet'] = retweet
        else:  # an original weibo
            if is_long:
                weibo = self.get_long_weibo(weibo_id)
                if not weibo:
                    weibo = self.parse_weibo(weibo_info)
            else:
                weibo = self.parse_weibo(weibo_info)
        weibo['created_at'] = self.standardize_date(
            weibo_info['created_at'])
        return weibo
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()
def get_long_weibo(self, id):
    """Get the full text of a long weibo, retrying up to 5 times."""
    for i in range(5):
        url = 'https://m.weibo.cn/detail/%s' % id
        html = requests.get(url, cookies=self.cookie).text
        # The detail page embeds the weibo as a JS object; cut out the
        # "status" JSON between '"status":' and '"hotScheme"'.
        html = html[html.find('"status":'):]
        html = html[:html.rfind('"hotScheme"')]
        html = html[:html.rfind(',')]
        html = '{' + html + '}'
        js = json.loads(html, strict=False)
        weibo_info = js.get('status')
        if weibo_info:
            weibo = self.parse_weibo(weibo_info)
            return weibo
        sleep(random.randint(6, 10))
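standardize_date is also not shown. Weibo's created_at comes in relative forms such as '刚刚', 'x分钟前', '昨天 HH:MM' or 'MM-DD'; a sketch that maps them onto 'YYYY-MM-DD' (assuming `from datetime import datetime, timedelta`) might be:

def standardize_date(self, created_at):
    """A sketch (assumption): convert Weibo's relative timestamps
    into a plain 'YYYY-MM-DD' date string."""
    if u'刚刚' in created_at or u'分钟' in created_at or u'小时' in created_at:
        created_at = datetime.now().strftime('%Y-%m-%d')
    elif u'昨天' in created_at:
        yesterday = datetime.now() - timedelta(days=1)
        created_at = yesterday.strftime('%Y-%m-%d')
    elif created_at.count('-') == 1:  # 'MM-DD' carries no year
        created_at = str(datetime.now().year) + '-' + created_at
    return created_at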
That is the core code for collecting weibo posts. Besides the posts themselves we also want the comments on each weibo, and the principle is the same: find the data source. With the experience gained above, it is easy to locate the interface we need. The code is as follows:
def add_comments_json(self, jsondata):
    """Parse a list of comment objects, including nested replies."""
    import time
    for data in jsondata:
        item = dict()
        item['id'] = data.get('id')
        item['mid'] = data.get('mid')
        item['like_count'] = data.get('like_count')
        item['source'] = data.get('source')
        item['floor_number'] = data.get('floor_number')
        item['screen_name'] = data.get('user').get('screen_name')
        # gender: map the API's 'm'/'f' codes to Chinese labels
        item['gender'] = data.get('user').get('gender')
        if item['gender'] == 'm':
            item['gender'] = '男'
        elif item['gender'] == 'f':
            item['gender'] = '女'
        item['rootid'] = data.get('rootid')
        # normalize created_at ('Mon Mar 02 ...') to 'YYYY-MM-DD'
        item['create_time'] = data.get('created_at')
        item['create_time'] = time.strptime(item['create_time'],
                                            '%a %b %d %H:%M:%S %z %Y')
        item['create_time'] = time.strftime('%Y-%m-%d', item['create_time'])
        # strip HTML tags and special characters from the comment text
        item['comment'] = data.get('text')
        item['comment'] = BeautifulSoup(item['comment'],
                                        'html.parser').get_text()
        item['comment'] = self.clear_character_chinese(item['comment'])
        print('Floor {}, comment: {}'.format(item['floor_number'],
                                             item['comment']))
        # replies to this comment are parsed recursively
        comments = data.get('comments')
        if comments:
            self.add_comments_json(comments)
        self.comments.append(item)
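clear_character_chinese is not shown either; presumably it strips emoji and other symbols from the comment text. A regex-based sketch, assuming we only keep Chinese characters, letters, digits and common punctuation:

import re

def clear_character_chinese(self, sentence):
    """A sketch (assumption): keep Chinese characters, letters, digits
    and basic punctuation; drop emoji and other symbols."""
    pattern = re.compile(u'[^\u4e00-\u9fa5A-Za-z0-9,。!?、:; ]')
    return pattern.sub('', sentence)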
def get_comments_page(self, max_id, id_type, mid):
    """Fetch one page of hot comments for the weibo with the given mid."""
    params = {
        'max_id': max_id,
        'max_id_type': id_type
    }
    try:
        url = 'https://m.weibo.cn/comments/hotflow?id={id}&mid={mid}'
        headers = {
            # Replace with your own cookie from a logged-in
            # m.weibo.cn session.
            'Cookie': '<your weibo cookie>',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        r = requests.get(url.format(id=mid, mid=mid), params=params,
                         headers=headers)
        print(r.url)
        if r.status_code == 200:
            return r.json()
    except requests.ConnectionError as e:
        print('error', e.args)
def add_comments(self, jsondata):
    """Parse the top-level comments of one hotflow response page."""
    import time
    datas = jsondata.get('data').get('data')
    for data in datas:
        item = dict()
        item['id'] = data.get('id')
        item['mid'] = data.get('mid')
        item['like_count'] = data.get('like_count')
        item['source'] = data.get('source')
        item['floor_number'] = data.get('floor_number')
        item['screen_name'] = data.get('user').get('screen_name')
        # gender: map the API's 'm'/'f' codes to Chinese labels
        item['gender'] = data.get('user').get('gender')
        if item['gender'] == 'm':
            item['gender'] = '男'
        elif item['gender'] == 'f':
            item['gender'] = '女'
        item['rootid'] = data.get('rootid')
        # normalize created_at ('Mon Mar 02 ...') to 'YYYY-MM-DD'
        item['create_time'] = data.get('created_at')
        item['create_time'] = time.strptime(item['create_time'],
                                            '%a %b %d %H:%M:%S %z %Y')
        item['create_time'] = time.strftime('%Y-%m-%d', item['create_time'])
        # strip HTML tags and special characters from the comment text
        item['comment'] = data.get('text')
        item['comment'] = BeautifulSoup(item['comment'],
                                        'html.parser').get_text()
        item['comment'] = self.clear_character_chinese(item['comment'])
        print('Floor {}, comment: {}'.format(item['floor_number'],
                                             item['comment']))
        self.comments.append(item)
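Each hotflow response carries a max_id and max_id_type under data, which feed the next request; when max_id comes back as 0 there are no more pages. A sketch of a driver loop that ties the two methods together (get_comments is a name I am assuming, not from the original post):

def get_comments(self, mid):
    """A sketch: page through all comments of one weibo via max_id."""
    max_id, id_type = 0, 0
    while True:
        js = self.get_comments_page(max_id, id_type, mid)
        if not js or js.get('ok') != 1:
            break
        self.add_comments(js)
        data = js.get('data', {})
        max_id = data.get('max_id', 0)
        id_type = data.get('max_id_type', 0)
        if max_id == 0:  # last page reached
            break
        sleep(random.randint(2, 5))  # stay polite between pages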
With that in place, we can inspect the collected data.
The complete code is available in my open-source project for reading or download; feel free to star it, or leave a comment to discuss.