python3使用requests爬取新浪热门微博


微博登录的实现代码来源:https://gist.github.com/mrluanma/3621775


相关环境

使用的是 Python 3.4,配置好环境后可以直接使用 pip 或 easy_install 命令安装第三方库,比如本示例需要依赖的库:

pip install requests
pip install rsa

代码实现

以下代码主要是登录成功后,爬取热门微博的TOP 100,再保存到hotwb.html文件里边

import re
import json
import urllib.parse
import base64
import binascii
import json
 
import rsa
import requests
import logging

from pprint import pprint 

# Output file that gethotwb() appends the fetched HTML fragments to.
wbdom = r'd:\pyzone\hotwb.html'
# Value sent as the `client` query parameter to the Sina SSO endpoints.
weclient = 'ssologin.js(v1.4.5)'
# Log line layout: timestamp followed by the message.
FORMAT = '%(asctime)-15s %(message)s'
# Browser-like User-Agent attached to every request made through `session`.
user_agent = ' '.join([
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.11 (KHTML, like Gecko)',
    'Chrome/20.0.1132.57 Safari/536.11',
])

logging.basicConfig(format=FORMAT, level=logging.DEBUG)
logger = logging.getLogger('weibo')

# One shared session so cookies set during login are reused by later requests.
session = requests.Session()
session.headers.update({'User-Agent': user_agent})
 
 
def encrypt_passwd(passwd, pubkey, servertime, nonce):
    """Return the hex-encoded RSA encryption of the login password.

    The plaintext that gets encrypted is
    ``"<servertime>\\t<nonce>\\n<passwd>"``.

    Args:
        passwd: The account password.
        pubkey: RSA modulus as a hexadecimal string.
        servertime: Server time value taken from the prelogin response.
        nonce: Nonce value taken from the prelogin response.

    Returns:
        The ciphertext as hexadecimal ``bytes``.
    """
    # The public exponent is fixed at 0x10001 (65537).
    public_key = rsa.PublicKey(int(pubkey, 16), 0x10001)
    plaintext = '{}\t{}\n{}'.format(servertime, nonce, passwd)
    ciphertext = rsa.encrypt(plaintext.encode(), public_key)
    return binascii.hexlify(ciphertext)
 
 
def wblogin(username, password):
    """Log in to Sina Weibo via the SSO endpoints using the shared session.

    The flow is: fetch the prelogin parameters (servertime, nonce, pubkey,
    rsakv), POST the RSA-encrypted credentials, then request the redirect
    URL embedded in the login response.

    Args:
        username: Account name as ``str`` or ``bytes`` (bytes are decoded
            as UTF-8).
        password: Account password in plain text.

    Returns:
        True if the final response contains the nested JSON payload this
        function looks for, False otherwise (including when an
        intermediate response does not have the expected shape).
    """
    # Normalise to text so both str and bytes callers are supported.
    if isinstance(username, bytes):
        username = username.decode('utf-8')

    # b64encode() returns bytes; decode it before interpolating, otherwise
    # the literal text "b'...'" ends up in the URL on Python 3. The value is
    # also percent-quoted because base64 output may contain '+', '/' and '='.
    prelogin_su = base64.b64encode(username.encode('utf-8')).decode('ascii')
    resp = session.get(
        'http://login.sina.com.cn/sso/prelogin.php?'
        'entry=sso&callback=sinaSSOController.preloginCallBack&'
        'su=%s&rsakt=mod&client=%s' %
        (urllib.parse.quote(prelogin_su, safe=''), weclient)
    )

    # The response is a JSONP call; pull out the JSON object argument.
    pre_login_match = re.match(r'[^{]+({.+?})', resp.content.decode('gbk'))
    if pre_login_match is None:
        logger.info('login fail.. (unexpected prelogin response)')
        return False
    pre_login = json.loads(pre_login_match.group(1))

    data = {
        'entry': 'weibo',
        'gateway': 1,
        'from': '',
        'savestate': 7,
        'userticket': 1,
        'ssosimplelogin': 1,
        'su': base64.b64encode(urllib.parse.quote(username).encode()),
        'service': 'miniblog',
        'servertime': pre_login['servertime'],
        'nonce': pre_login['nonce'],
        'vsnf': 1,
        'vsnval': '',
        'pwencode': 'rsa2',
        'sp': encrypt_passwd(password, pre_login['pubkey'],
                             pre_login['servertime'], pre_login['nonce']),
        'rsakv': pre_login['rsakv'],
        'encoding': 'gbk',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.si'
               'naSSOController.feedBackUrlCallBack',
        'returntype': 'META'
    }
    resp = session.post(
        'http://login.sina.com.cn/sso/login.php?client=%s' % weclient,
        data=data
    )

    # The login page redirects through a location.replace("...") call.
    redirect_match = re.search(r'replace\([\"\']([^\'\"]+)[\"\']',
                               resp.content.decode('gbk'))
    if redirect_match is None:
        logger.info('login fail.. (no redirect URL in login response)')
        return False

    resp = session.get(redirect_match.group(1))
    login_match = re.match(r'[^{]+({.+?}})', resp.content.decode('gbk'))
    if login_match is None:
        logger.info('login fail..')
        return False

    logger.info('login success..')
    pprint(json.loads(login_match.group(1)))
    return True

def gethotwb(url):
    """Fetch pages 1-10 of a paged feed and append their HTML to ``wbdom``.

    For each page the page number is appended to ``url``, the response is
    parsed as JSON, and its ``data.html`` value is written to the output
    file after a page separator line.

    Args:
        url: Feed URL prefix; the page number is concatenated onto it.
    """
    # The context manager closes the file even if a request or the JSON
    # parsing raises part-way through.
    with open(wbdom, mode='a', encoding='utf-8') as out:
        for page in range(1, 11):       # page 1 to 10
            resp = session.get(url + str(page))
            resp.encoding = 'utf-8'
            out.write('\n<p>--------page:' + str(page) + '---------</p>\n\n')
            out.write(json.loads(resp.text)['data']['html'])
 
if __name__ == '__main__':
    # Only scrape the hot feed when the login attempt reports success.
    if wblogin(b'xx@163.com', 'xx'):
        gethotwb('http://hot.weibo.com/ajax/feed?type=h&v=9999&page=')

总结

  1. 测试的过程中连接了翻墙的VPN,异地登录需要验证码,此时retcode=4049,登录成功是0
  2. python各个版本之间不兼容好蛋痛

大家中秋快乐!

参考文档

requests文档 http://docs.python-requests.org/zh_CN/latest/

微博登录过程分析 http://www.cnblogs.com/pzxbc/archive/2012/02/03/2335027.html


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM