python3使用requests爬取新浪熱門微博


微博登錄的實現代碼來源:https://gist.github.com/mrluanma/3621775


相關環境

使用的是 Python 3.4。配置好環境後,可以直接使用 pip 或 easy_install 命令安裝第三方庫,比如本示例所依賴的庫:

pip install requests
pip install rsa

代碼實現

以下代碼主要是在登錄成功後,爬取熱門微博的 TOP 100,再保存到 hotwb.html 文件裡面

import re
import json
import urllib.parse
import base64
import binascii
import json
 
import rsa
import requests
import logging

from pprint import pprint 

# File that receives the scraped hot-feed HTML (opened in append mode by gethotwb).
wbdom = r'd:\pyzone\hotwb.html'

# Client identifier passed as the ``client`` query parameter of the SSO URLs.
weclient = 'ssologin.js(v1.4.5)'

# Log record layout: timestamp padded to 15 columns, then the message.
FORMAT = '%(asctime)-15s %(message)s'

# Browser-like User-Agent string attached to every request of the session.
user_agent = (
    'Mozilla/5.0 (Windows NT 5.1) '
    'AppleWebKit/536.11 (KHTML, like Gecko) '
    'Chrome/20.0.1132.57 '
    'Safari/536.11'
)

logging.basicConfig(format=FORMAT, level=logging.DEBUG)
logger = logging.getLogger('weibo')

# One shared session so cookies set during login are reused by later requests.
session = requests.session()
session.headers.update({'User-Agent': user_agent})
 
 
def encrypt_passwd(passwd, pubkey, servertime, nonce):
    """Encrypt the password with the RSA public key obtained from pre-login.

    The plaintext is ``servertime``, a tab, ``nonce``, a newline and then
    ``passwd``. ``pubkey`` is parsed as a hexadecimal modulus; the public
    exponent is 0x10001.

    Returns:
        The ciphertext, hex-encoded, as ``bytes``.
    """
    modulus = int(pubkey, 16)
    exponent = int('10001', 16)
    public_key = rsa.PublicKey(modulus, exponent)

    plaintext = '%s\t%s\n%s' % (servertime, nonce, passwd)
    ciphertext = rsa.encrypt(plaintext.encode(), public_key)
    return binascii.hexlify(ciphertext)
 
 
def wblogin(username, password):
    """Log in to Sina Weibo through the SSO endpoints using the shared session.

    The flow is: request the pre-login parameters (servertime, nonce, pubkey,
    rsakv), post the login form with the RSA-encrypted password, follow the
    redirect URL embedded in the response, and inspect the final callback
    payload.

    Args:
        username: Account name, as ``str`` or ``bytes``.
        password: Plain-text password.

    Returns:
        True if the final response contains the expected JSON payload,
        False if any step returns a response that cannot be parsed.
    """
    # Accept both str and bytes; b64encode itself only takes bytes.
    if isinstance(username, str):
        username = username.encode()

    # b64encode returns bytes: decode it before formatting, otherwise the URL
    # would contain the repr "b'...'". Quote it because base64 output may
    # contain '+' and '=', which are significant inside a query string.
    prelogin_su = urllib.parse.quote(base64.b64encode(username).decode('ascii'))
    resp = session.get(
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=sso&callback=sinaSSOController.preloginCallBack&'
        'su=%s&rsakt=mod&client=%s' %
        (prelogin_su, weclient)
    )

    # The response is a JSONP call; pull out the JSON object inside it.
    pre_login_match = re.match(r'[^{]+({.+?})', resp.content.decode('gbk'))
    if pre_login_match is None:
        logger.info('login fail.. (unexpected prelogin response)')
        return False
    pre_login = json.loads(pre_login_match.group(1))

    data = {
        'entry': 'weibo',
        'gateway': 1,
        'from': '',
        'savestate': 7,
        'userticket': 1,
        'ssosimplelogin': 1,
        'su': base64.b64encode(urllib.parse.quote(username).encode()),
        'service': 'miniblog',
        'servertime': pre_login['servertime'],
        'nonce': pre_login['nonce'],
        'vsnf': 1,
        'vsnval': '',
        'pwencode': 'rsa2',
        'sp': encrypt_passwd(password, pre_login['pubkey'],
                             pre_login['servertime'], pre_login['nonce']),
        'rsakv': pre_login['rsakv'],
        'encoding': 'gbk',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.si'
               'naSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    resp = session.post(
        'http://login.sina.com.cn/sso/login.php?client=%s' % weclient,
        data=data
    )

    # The login response redirects via a script-level replace("...") call.
    redirect_match = re.search(r'replace\([\"\']([^\'\"]+)[\"\']',
                               resp.content.decode('gbk'))
    if redirect_match is None:
        logger.info('login fail.. (no redirect URL in login response)')
        return False

    resp = session.get(redirect_match.group(1))
    login_match = re.match(r'[^{]+({.+?}})', resp.content.decode('gbk'))
    if login_match is None:
        logger.info('login fail..')
        return False

    logger.info('login success..')
    pprint(json.loads(login_match.group(1)))
    return True

def gethotwb(url):
    """Fetch pages 1-10 of the hot feed and append their HTML to ``wbdom``.

    Each page is requested through the shared session, decoded as UTF-8 and
    parsed as JSON; the ``data.html`` fragment is written after a page
    separator line.

    Args:
        url: Feed URL prefix; the page number is appended to it.
    """
    # ``with`` closes the file even if a request or the JSON lookup raises
    # part-way through the loop.
    with open(wbdom, mode='a', encoding='utf-8') as f:
        for x in range(1, 11):       # page 1 to 10
            r = session.get(url + str(x))
            r.encoding = 'utf-8'
            f.write('\n<p>--------page:' + str(x) + '---------</p>\n\n')
            f.write(json.loads(r.text)['data']['html'])
 
if __name__ == '__main__':
    # Scrape the hot feed only after a successful login.
    if wblogin(b'xx@163.com', 'xx'):
        gethotwb('http://hot.weibo.com/ajax/feed?type=h&v=9999&page=')

總結

  1. 測試過程中連接了翻牆用的 VPN;由於異地登錄需要驗證碼,此時返回 retcode=4049,而登錄成功時 retcode 為 0
  2. python各個版本之間不兼容好蛋痛

大家中秋快樂!

參考文檔

requests文檔 http://docs.python-requests.org/zh_CN/latest/

微博登錄過程分析 http://www.cnblogs.com/pzxbc/archive/2012/02/03/2335027.html


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM