淘寶直播數據爬取 + 淘寶模擬登陸


直播數據爬取

可以在 js 數據中找到 sign 的加密方式

分析得知 sign 的生成方式為:先拼接字符串 (d.token + "&" + 時間戳 + "&" + appkey + "&" + data),再對該字符串做 MD5,取十六進制摘要作為 sign

d.token 的值來自 cookie:cookie 中 _m_h5_tk 字段的格式為「token_時間戳」,取下劃線前面的部分就是 d.token。這個 token 會過期,有效期大概是 2 小時

請求數據接口的時候請求頭必須帶cookie,且包含_m_h5_tk與_m_h5_tk_enc這兩個字段,而且發現不需要登錄就可以獲取這個cookie,響應返回的信息當中也包括了這兩個字段,這樣就可以獲取到token了

import requests
from urllib import parse

def _m_h5():
    """Request the feed endpoint once and read the two h5 token cookies.

    Sends a GET with a fixed ``appKey`` and the decoded ``data`` payload,
    then looks up ``_m_h5_tk`` and ``_m_h5_tk_enc`` in the response cookies.
    Both values are printed before being returned.

    :return: tuple ``(_m_h5_tk, _m_h5_tk_enc)``
    :raises KeyError: if either cookie is absent from the response
    """
    endpoint = 'https://acs.m.taobao.com/h5/mtop.taobao.social.feed.aggregate/1.0/'
    query = {
        'appKey': 12574478,
        'data': parse.unquote('%7B%22param%......innerId%22%3A%22%22%7D'),
    }
    # The request exists only to make the server hand out the cookies.
    response = requests.get(endpoint, params=query)
    cookies = requests.utils.dict_from_cookiejar(response.cookies)
    token_pair = (cookies['_m_h5_tk'], cookies['_m_h5_tk_enc'])
    print(*token_pair)
    return token_pair


# Run the helper once when the script is executed; this performs a live HTTP
# request and prints both cookie values.
_m_h5()

import requests
import time
import json
import hashlib

from urllib import parse
from openpyxl import load_workbook
from apscheduler.schedulers.blocking import BlockingScheduler


class TaobaoLiveSpider:
    """Query three Taobao Live statistics payloads and append one row to ``data.xlsx``.

    Each query is a signed GET against ``self.url``. The signature is the MD5
    hex digest of ``token & timestamp & appKey & data``, where the token is
    the part of the ``_m_h5_tk`` cookie before the first underscore.
    """

    # appKey sent with every request and mixed into the signature.
    APP_KEY = '27522***'
    # Endpoint that is only called to obtain the _m_h5_tk cookies.
    COOKIE_URL = 'https://acs.m.taobao.com/h5/mtop.taobao.social.feed.aggregate/1.0/'
    # Seconds to wait on any single HTTP request so a scheduled run cannot hang forever.
    REQUEST_TIMEOUT = 20
    # Spreadsheet columns filled by write_excel(), in order.
    EXCEL_COLUMNS = 'ABCDEFGHIJKLMNOP'

    def __init__(self):
        """Build the three request payloads and the per-instance timestamp."""
        self.start_time = parse.quote('2020-10-15 11:00:11')
        self.end_time = parse.quote('2020-11-14 11:00:11')
        # View-count query (fixed format). This payload must accompany every
        # request; per the original author a malformed one is rejected as an
        # illegal request parameter.
        self.data = parse.unquote(f'%7B%22param%2_abstract_indicator%5C%22%2C%5C%22queryDetail%5C%22%3Afalse%2C%5C%22startTime%5C%22%3A%5C%22{self.start_time}%5C%22%2C%5C%22endTime%5C%22%3A%5C%22{self.end_time}%5C%22%2C%5C%22timeType%5C%22%3A2%2C%5C%22sign%5C%2AfaC%5C%22%7D%5D%5C%22%2C%5C%22extra%5C%22%3Anull%7D%22%2C%22innerId%22%3A%22%22%7D')
        # Guided-deal query (fixed format).
        self.guid_deal_data = parse.unquote(f'%7B%22paracbot_slr_lime_rpt_ov_deal%5C%22%2C%5C%22queryDetail%5C%22%3Afalse%2C%5C%22startTime%5C%22%3A%5C%22{self.start_time}%5C%22%2C%5C%22endTime%5C%22%3A%5C%22{self.end_time}%5C%22%2C%5C%22timeType%5C%22%3A2%2C%5C%22sign%5C%22%3Anull%2C%5C%22limit%5C%22%3A1%2C%5C%22row%5C%22%3A%5C%22%5B%5D%5C%22%2C%5C%22measure%5C%22%3A%5C%22%5B%...22%22%7D')
        # Average fan online time query (fixed format).
        self.fans_average_on_line_time_data = parse.unquote(f'%7B%22pabstract_indicator%5C%22%2C%5C%22queryDetail%5C%22%3Afalse%2C%5C%22startTime%5C%22%3A%5C%22{self.start_time}%5C%22%2C%5C%22endTime%5C%22%3A%5C%22{self.end_time}%5C%22%2C%5C%22timeType%5C%22%3A2%2C%5C%22sign%5C%25C%5C%...2%7D')

        # Millisecond timestamp, always 13 digits. Building it from
        # str(round(time.time(), 3)) loses trailing zeros and yields a
        # shorter string whenever the millisecond part ends in 0.
        self.now_time = str(int(time.time() * 1000))
        self.url = 'https://h5api.m.taobao.com/h5/mtop.alibaba.iic.xinsightshop.olap.query/1.0/'
        self.m_ht_tk, self.m_h5_tk_enc = '', ''
        self.local_now_time = time.strftime('%Y-%m-%d %H:%M:%S')
        # Not read anywhere in this class; kept so existing attribute access keeps working.
        self.line = 2

    def get_taobao_cookie(self):
        """Fetch fresh token cookies and store them on the instance.

        Sets ``self.m_ht_tk`` and ``self.m_h5_tk_enc``; returns nothing.

        :raises KeyError: if the response does not carry both cookies
        """
        params = {
            'appKey': self.APP_KEY,
            'data': self.data
        }
        resp = requests.get(self.COOKIE_URL, params=params, timeout=self.REQUEST_TIMEOUT)
        cookies = requests.utils.dict_from_cookiejar(resp.cookies)
        self.m_ht_tk = cookies['_m_h5_tk']
        self.m_h5_tk_enc = cookies['_m_h5_tk_enc']

    def headers(self):
        """Refresh the token cookies and return the request headers that embed them.

        :return: dict with ``cookie``, ``origin``, ``referer`` and ``user-agent``
        """
        self.get_taobao_cookie()
        m_ht_tk, m_h5_tk_enc = self.m_ht_tk, self.m_h5_tk_enc
        return {
            'cookie': f'_samesite_flag_=true; enc=zUS4Q0thPUweeJoV2t7U6F7Boh...cookie14=Uoe0b0ORnwCV7g%3D%3D; _m_h5_tk={m_ht_tk}; _m_h5_tk_enc={m_h5_tk_enc}; l=eBQZK...M-z4WFdhUU3iV4LClmbXGZtE1Ab_kVQUg5TgQGwt5iPeN5-BXtyeA6i1..',
            'origin': 'https://databot.taobao.com',
            'referer': 'https://databot.taobao.com/',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
        }

    def params(self, data):
        """Return the signed query parameters for one request.

        :param data: decoded payload string to send and to sign
        :return: dict of query parameters, including ``sign``
        """
        return {
            'jsv': '2.6.0',
            'appKey': self.APP_KEY,
            't': self.now_time,
            'sign': self.sign(data),
            'api': 'mtop.alibaba.iic.xinsightshop.olap.query',
            'v': '1.0',
            'type': 'originaljson',
            'dataType': 'json',
            'timeout': 20000,
            'H5Request': 'true',
            'data': data
        }

    def sign(self, data):
        """Compute the request signature for ``data``.

        :param data: decoded payload string
        :return: MD5 hex digest of ``token&timestamp&appKey&data``
        """
        # Only the part of the cookie before the first underscore is the token.
        token = self.m_ht_tk.split('_')[0]
        raw = '&'.join((token, self.now_time, self.APP_KEY, data))
        return hashlib.md5(raw.encode()).hexdigest()

    def _fetch(self, data):
        """GET ``self.url`` for one payload and return the decoded JSON body."""
        # headers() refreshes the token that params() signs with, so it must run first.
        headers = self.headers()
        params = self.params(data)
        return requests.get(self.url, headers=headers, params=params,
                            timeout=self.REQUEST_TIMEOUT).json()

    @staticmethod
    def _cells(resp):
        """Return the second ``cellset`` row of a response, which holds the values."""
        return resp['data']['data']['cellset'][1]

    def spider(self):
        """Run the three queries and collect every metric.

        The full first response and the final metric tuple are printed.

        :return: 16-tuple of the fifteen metric values followed by
            ``self.local_now_time``, in the order write_excel() expects
        :raises KeyError, IndexError: if a response lacks the expected layout
        """
        resp = self._fetch(self.data)
        print(resp)
        visit = self._cells(resp)
        watching_count = visit[5]['value']
        click_probability = visit[7]['value']
        on_line_count = visit[6]['value']
        average_on_line_time = visit[2]['value']
        live_room_watching_count = visit[0]['value']
        fans_make_up = visit[-2]['value']
        extra_add_flow = visit[1]['value']
        add_fans = visit[4]['value']
        goods_click = visit[3]['value']
        goods_click_fans_make_up = visit[-1]['value']

        deal = self._cells(self._fetch(self.guid_deal_data))
        guid_deal_count = deal[0]['value']
        guid_deal_percent = deal[1]['value']
        guid_deal_money = deal[2]['value']
        guid_deal_money_percent = deal[3]['value']

        fans = self._cells(self._fetch(self.fans_average_on_line_time_data))
        fans_average_on_line_time = fans[0]['value']

        result = (
            watching_count, click_probability, on_line_count, average_on_line_time,
            fans_average_on_line_time, live_room_watching_count, fans_make_up,
            extra_add_flow, add_fans, goods_click, goods_click_fans_make_up,
            guid_deal_count, guid_deal_percent, guid_deal_money,
            guid_deal_money_percent, self.local_now_time,
        )
        print(*result)
        return result

    def write_excel(self):
        """Append one row of freshly scraped metrics to ``Sheet1`` of ``data.xlsx``.

        Column A receives the local timestamp; columns B to P receive the
        fifteen metrics in the order spider() returns them.
        """
        workbook = load_workbook('data.xlsx')
        sheet = workbook['Sheet1']
        *metrics, self.local_now_time = self.spider()
        row = sheet.max_row + 1
        for column, value in zip(self.EXCEL_COLUMNS, [self.local_now_time, *metrics]):
            sheet[f'{column}{row}'] = value
        workbook.save('data.xlsx')

def main():
    """Run one scrape: create a new TaobaoLiveSpider and call its write_excel()."""
    TaobaoLiveSpider().write_excel()


# Schedule main() on a 30-minute interval, then hand control to the scheduler.
scheduler = BlockingScheduler()
scheduler.add_job(main, trigger='interval', seconds=30 * 60, id='main')
scheduler.start()


模擬登陸

import re
import os
import json

import requests


# Shared HTTP session used by UsernameLogin for all of its requests, so that
# cookies set by one response are sent with the next request.
s = requests.Session()
# File the session cookies are serialized to and restored from
COOKIES_FILE_PATH = 'taobao_login_cookies.txt'


class UsernameLogin:
    """Log in to Taobao with a user name and an already-encrypted password.

    login() first tries cookies saved in ``COOKIES_FILE_PATH``. If they are
    missing or no longer work it checks the account, posts the credentials,
    requests an ``st`` code and redeems it, then saves the new cookies.
    All requests go through the module-level session ``s``.
    """

    def __init__(self, loginId, umidToken, ua, password2):
        """
        Store the credentials and the URLs used during login.

        :param loginId: user name
        :param umidToken: parameter added by the newer login page
        :param ua: Taobao ``ua`` parameter
        :param password2: encrypted password
        """
        # URL that tells whether a captcha is required
        self.user_check_url = 'https://login.taobao.com/newlogin/account/check.do?appName=taobao&fromSite=0'
        # URL that verifies user name and password
        self.verify_uaername_password_url = "https://login.taobao.com/newlogin/login.do?appName=taobao&fromSite=0"
        # URL that redeems an st code
        self.vst_url = 'https://login.taobao.com/member/vst.htm?st={}'
        # Page fetched to confirm the session is logged in
        self.my_taobao_url = 'https://zhaoshang.tmall.com/channel/index.htm?'

        # Taobao user name
        self.loginId = loginId
        # umidToken copied from the browser's login request
        self.umidToken = umidToken
        # Browser fingerprint parameter, copied from the browser or a capture
        # tool; per the original author it can be reused
        self.ua = ua
        # Encrypted password, copied from the browser or a capture tool; per
        # the original author it can be reused
        self.password2 = password2

        # Request timeout in seconds
        self.timeout = 3

    def _user_check(self):
        """
        Ask the server whether this account needs a captcha.

        :return: True if the response data contains ``isCheckCodeShowed``
        """
        data = {
            'loginId': self.loginId,
            'ua': self.ua,
        }
        try:
            response = s.post(self.user_check_url, data=data, timeout=self.timeout)
            response.raise_for_status()
        except Exception as e:
            print(f'檢測是否需要驗證碼請求失敗,原因:{e}')
            raise
        check_resp_data = response.json()['content']['data']
        # The original author notes a slider captcha usually appears after
        # several wrong passwords in a short time.
        needcode = 'isCheckCodeShowed' in check_resp_data
        print('是否需要滑塊驗證:{}'.format(needcode))
        return needcode

    def _get_umidToken(self):
        """
        Read the umidToken embedded in the login page.

        :return: the token string
        :raises RuntimeError: if the page does not contain a umidToken
        """
        response = s.get('https://login.taobao.com/member/login.jhtml', timeout=self.timeout)
        token_match = re.search(r'"umidToken":"(.*?)"', response.text)
        if token_match is None:
            raise RuntimeError('獲取umidToken失敗!response:{}'.format(response.text))
        print(token_match.group(1))
        return token_match.group(1)

    def _login_form(self):
        """Return the form fields posted to the login endpoint."""
        # If login fails, copy the form data from your own browser session.
        return {
            'loginId': self.loginId,
            'password2': self.password2,
            'keepLogin': 'false',
            'ua': self.ua,
            'umidGetStatusVal': '255',
            'screenPixel': '1920x1080',
            'navlanguage': 'zh-CN',
            'navUserAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
            'navPlatform': 'Win32',
            'appName': 'taobao',
            'appEntrance': 'taobao_pc',
            '_csrf_token': 'kmnF0VwnseLLEq0kw7qLI',
            # Use the token given to the constructor rather than a pasted constant.
            'umidToken': self.umidToken,
            'hsiz': '167696b8b2d8a066b7b0e4e0e140d905',
            'bizParams': '',
            'style': 'default',
            'appkey': '00000000',
            'from': 'tbTop',
            'isMobile': 'false',
            'lang': 'zh_CN',
            'returnUrl': 'https://www.taobao.com/',
            'fromSite': '0'
        }

    @property
    def verify_login_password(self):
        """
        Post the credentials and return the URL used to request an st code.

        Note that reading this property performs a network request.

        :return: first entry of ``asyncUrls`` in the login response
        :raises RuntimeError: if the response carries no such URL
        """
        headers = {
            'Referer': 'https://login.taobao.com/member/login.jhtml',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded',
            'origin': 'https://login.taobao.com'
        }
        try:
            response = s.post(self.verify_uaername_password_url, headers=headers,
                              data=self._login_form(), timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('驗證用戶名和密碼請求失敗,原因:')
            raise
        payload = response.json()
        print(payload)
        # A rejected login has no asyncUrls; look it up defensively so the
        # failure surfaces as the RuntimeError below instead of a KeyError.
        content = payload.get('content') or {}
        async_urls = (content.get('data') or {}).get('asyncUrls') or []
        apply_st_url = async_urls[0] if async_urls else None
        if apply_st_url:
            print('驗證用戶名密碼成功,st碼申請地址:{}'.format(apply_st_url))
            return apply_st_url
        raise RuntimeError('用戶名密碼驗證失敗!response:{}'.format(response.text))

    def _apply_st(self):
        """
        Request an st code from the URL returned by verify_login_password.

        :return: the st code
        :raises RuntimeError: if the response does not contain an st code
        """
        apply_st_url = self.verify_login_password
        try:
            response = s.get(apply_st_url, timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('申請st碼請求失敗,原因:')
            raise
        st_match = re.search(r'"data":{"st":"(.*?)"}', response.text)
        if st_match:
            print('獲取st碼成功,st碼:{}'.format(st_match.group(1)))
            return st_match.group(1)
        raise RuntimeError('獲取st碼失敗!response:{}'.format(response.text))

    def login(self):
        """
        Log in, reusing saved cookies when they still work.

        :return: True on success
        :raises RuntimeError: if the st login response has no redirect URL
        """
        # Saved cookies that still work make a fresh login unnecessary.
        if self._load_cookies():
            return True
        # Report whether a slider captcha is required (result is only printed).
        self._user_check()
        st = self._apply_st()
        headers = {
            'Host': 'login.taobao.com',
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        try:
            response = s.get(self.vst_url.format(st), headers=headers, timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('st碼登錄請求,原因:')
            raise
        # A successful login page contains a redirect to the user's home page.
        my_taobao_match = re.search(r'top.location.href = "(.*?)"', response.text)
        if my_taobao_match:
            print('登錄淘寶成功,跳轉鏈接:{}'.format(my_taobao_match.group(1)))
            self.my_taobao_url = my_taobao_match.group(1)
            self._serialization_cookies()
            return True
        raise RuntimeError('登錄失敗!response:{}'.format(response.text))

    def _load_cookies(self):
        """
        Load saved cookies into the session and check that they still work.

        :return: True if the cookies were loaded and the nickname page could
            be read; False if there is no cookie file or the check failed
            (the file is deleted in that case)
        """
        # 1. No cookie file means there is nothing to reuse.
        if not os.path.exists(COOKIES_FILE_PATH):
            return False
        # 2. Load the cookies into the shared session.
        s.cookies = self._deserialization_cookies()
        # 3. Any failure to read the nickname is treated as expired cookies.
        try:
            self.get_taobao_nick_name()
        except Exception:
            os.remove(COOKIES_FILE_PATH)
            print('cookies過期,刪除cookies文件!')
            return False
        print('加載淘寶cookies登錄成功!!!')
        return True

    def _serialization_cookies(self):
        """
        Write the session cookies to ``COOKIES_FILE_PATH`` as JSON.

        The cookie dict is also printed.
        """
        cookies_dict = requests.utils.dict_from_cookiejar(s.cookies)
        print(cookies_dict)
        with open(COOKIES_FILE_PATH, 'w', encoding='utf-8') as file:
            json.dump(cookies_dict, file)
            print('保存cookies文件成功!')

    def _deserialization_cookies(self):
        """
        Read ``COOKIES_FILE_PATH`` and rebuild a cookie jar from it.

        :return: cookie jar built from the saved dict
        """
        with open(COOKIES_FILE_PATH, 'r', encoding='utf-8') as file:
            cookies_dict = json.load(file)
        return requests.utils.cookiejar_from_dict(cookies_dict)

    @staticmethod
    def _extract_nick_name(html):
        """Return the nickname found in ``html``, or None if absent or empty."""
        nick_match = re.search(r"erNick = '(.*?)'", html)
        if nick_match and nick_match.group(1):
            return nick_match.group(1)
        return None

    def get_taobao_nick_name(self):
        """
        Fetch ``self.my_taobao_url`` and extract the nickname from the page.

        :return: the nickname
        :raises RuntimeError: if the page contains no nickname
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        try:
            response = s.get(self.my_taobao_url, headers=headers, timeout=self.timeout)
            response.raise_for_status()
        except Exception:
            print('獲取淘寶主頁請求失敗!原因:')
            raise
        nick_name = self._extract_nick_name(response.text)
        if nick_name:
            print(f'登錄淘寶成功,你的用戶名是:{nick_name}')
            return nick_name
        raise RuntimeError('獲取淘寶昵稱失敗!response:{}'.format(response.text))

    def get_live(self):
        """Request the first page of the live list and print the raw response text."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'referer': 'https://liveplatform.taobao.com/live/liveList.htm?'
        }
        url = 'https://liveplatform.taobao.com/live/action.do?currentPage=1&pagesize=20&api=get_live_list'
        response = s.get(url, headers=headers, timeout=self.timeout).text
        print(response)


if __name__ == '__main__':
    # loginId, umidToken, ua and password2 are all copied from the browser's
    # login request. How to obtain them:
    #   1. Open https://login.taobao.com/member/login.jhtml in a browser.
    #   2. Open the developer tools (F12) and tick "Preserve log" so the
    #      request history survives the page redirect.
    #   3. Log in with user name and password, then find the request
    #      newlogin/login.do; that is the login request.
    #   4. Copy the four parameters into the variables below.
    #   5. If it still fails, the original author offers help through the
    #      WeChat public account mentioned in the source article.

    # Taobao user name: a phone number or a user name both work
    loginId = 'username'
    # Parameter added by the redesigned login page. It must be a plain
    # string; a trailing comma here would silently turn it into a tuple.
    umidToken = 'ae2d51d'
    # Important Taobao parameter, copied from the browser or a capture tool; reusable
    ua = '137#Qtc9hE9o9IpDz/4p38vMDW2hkgwdkrNMhdtCDn7DaPtK8zuCzjGfX9gy1gOwsfS1'
    # Encrypted password, copied from the browser or a capture tool; reusable
    password2 = 'c3107629fbd23e5b0b6c4696e146d7207299d6d80fb8b00154a143736cb'

    ul = UsernameLogin(loginId, umidToken, ua, password2)
    ul.login()
    ul.get_taobao_nick_name()
    ul.get_live()


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM