python 簡單爬取今日頭條熱點新聞(一)

本文轉載自查看原文 2019-02-14 14:15 7926 python/ requests

今日頭條如今在自媒體領域算是比較強大的存在，今天就帶大家利用python爬取今日頭條的熱點新聞，理論上是可以做到無限爬取的；

在瀏覽器中打開今日頭條的鏈接，選中左側的熱點，在瀏覽器開發者模式network下很快能找到一個‘?category=news_hot...’字樣的文件，查看該文件發現新聞內容的數據全部存儲在data里面，且能發現數據類型為json；如下圖：

這樣一來就簡單了，只要找到這個文件的requests url即可通過python requests來爬取網頁了；

查看請求的url，如下圖：

發現鏈接為：https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time=0&max_behot_time_tmp=0&tadrequire=true&as=A1B5AC16548E0FA&cp=5C647E601F9AEE1&_signature=F09fYAAASzBjiSc9oUU9MxdPX3

其中有9個參數，對比如下表：

其中max_behot_time在獲取的json數據中獲得，具體數據見如下截圖：

在網上找了下大神對as和cp算法的分析，發現兩個參數在js文件：home_4abea46.js中有，具體算法如下代碼：

// Excerpt quoted from Toutiao's home_4abea46.js. The IIFE attaches an `ascp`
// object to its first argument (`window` at the call site); `ascp.getHoney()`
// returns the `as` and `cp` values used in the feed request.
!function(t) {
    var e = {};
    // Returns {as, cp}, two strings derived from the current Unix time.
    // NOTE: inside this function `t` and `e` shadow the outer `t` and `e`.
    e.getHoney = function() {
        // t: current time in whole seconds (floored, not rounded)
        // e: t rendered as uppercase hexadecimal
        // i: uppercased string form of md5(t); `md5` is defined outside this
        //    excerpt, presumably returning the hex MD5 digest of the timestamp
        var t = Math.floor((new Date).getTime() / 1e3)
          , e = t.toString(16).toUpperCase()
          , i = md5(t).toString().toUpperCase();
        // The interleaving below indexes e[0..7], so any timestamp whose hex
        // form is not exactly 8 digits gets this fixed fallback pair instead.
        if (8 != e.length)
            return {
                as: "479BB4B7254C150",
                cp: "7E0AC8874BB0985"
            };
        // n: first 5 digest chars, a: last 5 digest chars.
        // s interleaves n[k] with e[k] for k = 0..4 (digest char first).
        for (var n = i.slice(0, 5), a = i.slice(-5), s = "", o = 0; 5 > o; o++)
            s += n[o] + e[o];
        // r interleaves e[k + 3] with a[k] for k = 0..4 (hex char first).
        for (var r = "", c = 0; 5 > c; c++)
            r += e[c + 3] + a[c];
        // as = "A1" + s + last 3 hex digits; cp = first 3 hex digits + r + "E1".
        return {
            as: "A1" + s + e.slice(-3),
            cp: e.slice(0, 3) + r + "E1"
        }
    }
    ,
    // Publish the helper object on the outer `t` (the first IIFE argument).
    t.ascp = e
// NOTE(review): the trailing comma below suggests this expression continues in
// the original bundle; the quoted excerpt stops here.
}(window, document),

　python獲取as和cp值的代碼如下：(代碼參考blog：https://www.cnblogs.com/xuchunlin/p/7097391.html)

def get_as_cp(now=None):
	"""Build the ``as`` and ``cp`` query parameters for the Toutiao feed API.

	Python port of ``ascp.getHoney`` from Toutiao's ``home_4abea46.js``.

	Args:
		now: Unix timestamp in seconds (int or float). Defaults to the
			current time. Exposed so the result can be reproduced.

	Returns:
		dict with the keys ``'as'`` and ``'cp'``. When the hexadecimal form
		of the timestamp is not exactly 8 digits long, the fixed fallback
		pair from the reference JavaScript is returned.
	"""
	# The reference JS uses Math.floor(); round() could yield a timestamp one
	# second in the future, so truncate instead.
	timestamp = int(time.time() if now is None else now)
	hex_ts = format(timestamp, 'X')  # uppercase hex without the '0x' prefix
	if len(hex_ts) != 8:
		return {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}

	digest = hashlib.md5(str(timestamp).encode('utf-8')).hexdigest().upper()
	head, tail = digest[:5], digest[-5:]
	# 'as' body: digest[k] followed by hex_ts[k] for k = 0..4.
	as_body = ''.join(d + h for d, h in zip(head, hex_ts))
	# 'cp' body: hex_ts[k + 3] followed by tail[k] for k = 0..4.
	cp_body = ''.join(h + d for h, d in zip(hex_ts[3:8], tail))
	return {
		'as': 'A1' + as_body + hex_ts[-3:],
		'cp': hex_ts[:3] + cp_body + 'E1',
	}

　　這樣完整的鏈接就構成了，另外提一點就是：_signature參數去掉也是可以獲取到json數據的，因此這樣請求的鏈接就完成了；下面附上完整代碼：

import requests
import json
from openpyxl import Workbook
import time
import hashlib
import os
import datetime

# Site root; site-relative paths returned by the feed are appended to it.
url = 'https://www.toutiao.com'
# Hot-news feed endpoint; the max_behot_time value and the remaining query
# parameters are appended to it for every request.
start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='

# Request headers presenting a desktop browser user agent.
headers = {
	'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}
# Cookie value copied from a browser session, sent so that Toutiao does not
# block the crawler.
cookies = dict(tt_webid='6649949084894053895')

# Paging parameter for the first request.
max_behot_time = '0'

# Accumulators filled by main():
#   title      - news titles
#   source_url - news links as returned by the feed
#   s_url      - complete news links
#   source     - names of the publishing accounts
title, source_url, s_url, source = [], [], [], []
# Publishing account name -> complete link of that account.
media_url = dict()


def get_as_cp(now=None):
	"""Build the ``as`` and ``cp`` query parameters for the Toutiao feed API.

	Python port of ``ascp.getHoney`` from Toutiao's ``home_4abea46.js``.

	Args:
		now: Unix timestamp in seconds (int or float). Defaults to the
			current time. Exposed so the result can be reproduced.

	Returns:
		dict with the keys ``'as'`` and ``'cp'``. When the hexadecimal form
		of the timestamp is not exactly 8 digits long, the fixed fallback
		pair from the reference JavaScript is returned.
	"""
	# The reference JS uses Math.floor(); round() could yield a timestamp one
	# second in the future, so truncate instead.
	timestamp = int(time.time() if now is None else now)
	hex_ts = format(timestamp, 'X')  # uppercase hex without the '0x' prefix
	if len(hex_ts) != 8:
		return {'as': '479BB4B7254C150', 'cp': '7E0AC8874BB0985'}

	digest = hashlib.md5(str(timestamp).encode('utf-8')).hexdigest().upper()
	head, tail = digest[:5], digest[-5:]
	# 'as' body: digest[k] followed by hex_ts[k] for k = 0..4.
	as_body = ''.join(d + h for d, h in zip(head, hex_ts))
	# 'cp' body: hex_ts[k + 3] followed by tail[k] for k = 0..4.
	cp_body = ''.join(h + d for h, d in zip(hex_ts[3:8], tail))
	return {
		'as': 'A1' + as_body + hex_ts[-3:],
		'cp': hex_ts[:3] + cp_body + 'E1',
	}


def getdata(url, headers, cookies, timeout=10):
	"""Fetch ``url`` and return its body decoded from JSON.

	Args:
		url: Full request URL.
		headers: HTTP headers passed to ``requests.get``.
		cookies: Cookies passed to ``requests.get``.
		timeout: Seconds to wait for the server before giving up. Without
			a timeout ``requests`` may block indefinitely.

	Returns:
		The decoded JSON document.

	Raises:
		requests.HTTPError: The server answered with a 4xx/5xx status.
		requests.Timeout: No response arrived within ``timeout`` seconds.
		ValueError: The response body is not valid JSON.
	"""
	print(url)  # log the URL first so a failing request can be identified
	response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
	# Fail with a clear HTTP error rather than an opaque JSON decode error
	# when the server returns an error page.
	response.raise_for_status()
	return json.loads(response.text)


def savedata(title, s_url, source, media_url):
	"""Write the collected news to ``result/result-<timestamp>.xlsx``.

	The ``result`` directory is created under the current working directory
	when it does not exist yet. One row is written per news item.

	Args:
		title: News titles.
		s_url: Complete news links, parallel to ``title``.
		source: Publishing account names, parallel to ``title``.
		media_url: Mapping from account name to the account's link.
	"""
	result_dir = os.path.join(os.getcwd(), 'result')
	os.makedirs(result_dir, exist_ok=True)  # no separate existence check needed
	# %M is minutes; the former '%m' repeated the month, so every file saved
	# within the same hour received the same name and overwrote the last one.
	stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')
	filename = os.path.join(result_dir, 'result-' + stamp + '.xlsx')

	wb = Workbook()
	ws = wb.active
	ws.title = 'data'  # rename the worksheet
	# Header row.
	ws['A1'] = '標題'
	ws['B1'] = '新聞鏈接'
	ws['C1'] = '頭條號'
	ws['D1'] = '頭條號鏈接'
	# Data rows start at row 2, directly below the header.
	for row, (headline, link, account) in enumerate(zip(title, s_url, source), start=2):
		ws.cell(column=1, row=row, value=headline)
		ws.cell(column=2, row=row, value=link)
		ws.cell(column=3, row=row, value=account)
		# A missing account link leaves the cell empty instead of aborting
		# the whole export.
		ws.cell(column=4, row=row, value=media_url.get(account))

	wb.save(filename=filename)



def main(max_behot_time, title, source_url, s_url, source, media_url):
	"""Fetch the hot-news feed three times and collect the unique items.

	The list and dict arguments are extended in place, so the caller sees
	the results in the objects it passed in.

	Args:
		max_behot_time: Paging value (string) for the first request; later
			requests use the value returned under ``next`` in the response.
		title: Receives the news titles.
		source_url: Receives the news links as returned by the feed.
		s_url: Receives the complete news links.
		source: Receives the names of the publishing accounts.
		media_url: Receives account name -> complete account link.
	"""
	seen_titles = set(title)  # O(1) duplicate check instead of scanning the list
	# Each pass corresponds to one refresh of the feed. A refresh does not
	# always yield the same number of items, and duplicates are skipped.
	for _ in range(3):
		ascp = get_as_cp()
		demo = getdata(
			start_url + max_behot_time
			+ '&max_behot_time_tmp=' + max_behot_time
			+ '&tadrequire=true&as=' + ascp['as'] + '&cp=' + ascp['cp'],
			headers, cookies)
		print(demo)
		for item in demo['data']:
			account = item['source']
			if account not in media_url:
				media_url[account] = url + item['media_url']
			headline = item['title']
			if headline in seen_titles:
				continue
			seen_titles.add(headline)
			link = item['source_url']
			# Site-relative links are completed with the site root.
			full_link = link if link.startswith(('http://', 'https://')) else url + link
			# All four lists grow together here, once per new item. The
			# complete links used to be rebuilt for every title on every
			# refresh, which filled s_url with duplicates and shifted it
			# out of step with title.
			title.append(headline)
			source_url.append(link)
			source.append(account)
			s_url.append(full_link)
			print('標題：', headline)
			print('新聞鏈接：', full_link)
			print('頭條號：', account)
		print(max_behot_time)
		max_behot_time = str(demo['next']['max_behot_time'])  # paging value for the next request
		print(len(title))  # number of news items collected so far

# Script entry point: crawl into the module-level accumulators, then export
# them to a spreadsheet.
if __name__ == '__main__':
	main(
		max_behot_time=max_behot_time,
		title=title,
		source_url=source_url,
		s_url=s_url,
		source=source,
		media_url=media_url,
	)
	savedata(title=title, s_url=s_url, source=source, media_url=media_url)

　　簡單百行代碼搞定今日頭條熱點新聞爬取並存儲到本地，同理也可以爬取其他頻道的新聞；本次的爬取程序到此結束，下次從爬取的公眾號對公眾號下的新聞進行爬取，主要爬取公眾號的粉絲量以及最近10條新聞或圖文的閱讀量及評論數等數據；請期待...

最后送上程序運行的截圖及數據存儲的表格截圖：

---------------------------------------------------------

歡迎大家留言交流，共同進步。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python3從零開始爬取今日頭條的新聞【二、首頁熱點新聞抓取】 Python爬取今日頭條段子爬取今日頭條 Python 爬蟲實例（2）—— 爬取今日頭條 python爬取今日頭條街拍 Python3：爬取新浪、網易、今日頭條、UC四大網站新聞標題及內容使用python-aiohttp爬取今日頭條 scrapy爬取今日頭條爬取今日頭條文章使用scrapy爬蟲,爬取今日頭條首頁推薦新聞（scrapy+selenium+PhantomJS）