Python3 爬蟲：爬取 B 站排行榜信息

本文轉載自：查看原文 | 2019-04-25 00:36 | 1246 | python 爬蟲代碼
import requests, re, time, os

# Lookup tables used to build the ranking request URLs and the output
# file names.

# Ranking board key -> display name used in the output file name.
category_dic = dict(
    all="全站榜",
    origin="原創榜",
    rookie="新人榜",
)

# Value of the `day` query parameter -> display name of the ranking period.
day_dic = {
    1: "日排行榜",
    3: "三日排行榜",
    7: "周排行榜",
    30: "月排行榜",
}

# Value of the `rid` query parameter -> section name, for the "all" and
# "origin" boards.
all_or_origin_dic = dict([
    (0, "全站"),
    (1, "動畫"),
    (168, "國創相關"),
    (3, "音樂"),
    (129, "舞蹈"),
    (4, "游戲"),
    (36, "科技"),
    (188, "數碼"),
    (160, "生活"),
    (119, "鬼畜"),
    (155, "時尚"),
    (5, "娛樂"),
    (181, "影視"),
])

# Section name -> id tables; defined but not registered in BaseDict below.
bangumi_dic = {"番劇": 1, "國產動畫": 4}

cinema_dic = {"記錄篇": 177, "電影": 23, "電視劇": 11}

# The "rookie" board has the same sections as the other boards except
# section 168, in the same order.
rookie_dic = {
    rid: name for rid, name in all_or_origin_dic.items() if rid != 168
}

# Ranking board key -> table of sections available on that board.
# (bangumi_dic and cinema_dic are deliberately left out.)
BaseDict = dict(
    all=all_or_origin_dic,
    origin=all_or_origin_dic,
    rookie=rookie_dic,
)

# Ranking board key -> value sent as the `type` query parameter.
dic = dict(all=1, origin=2, rookie=3)

# Directory where the ranking text files are saved.  A raw string keeps the
# Windows backslashes literal; in a plain string "\圖" is an invalid escape
# sequence that Python only tolerates with a warning.
base_path = r"D:\圖片\bilibili_ranking"


def get_url():
    """Lazily generate every ranking API URL with the keys that produced it.

    Walks each board in ``category_dic``, each section registered for that
    board in ``BaseDict``, and each period in ``day_dic``.

    Yields:
        A ``(url, [board, rid, day])`` pair: ``url`` is the request URL with
        ``rid``, ``day`` and ``type`` filled in, and the list holds the board
        key, section id and period used to build it.
    """
    url_template = "https://api.bilibili.com/x/web-interface/ranking?jsonp=jsonp&rid={}&day={}&type={}&arc_type=0&callback=__jp1"
    supported_boards = ("all", "origin", "rookie")

    for board in category_dic:
        if board not in supported_boards:
            continue
        # The `type` value depends only on the board, so look it up once.
        type_code = dic.get(board)
        for rid in BaseDict.get(board):
            for day in day_dic:
                request_url = url_template.format(rid, day, type_code)
                yield request_url, [board, rid, day]


# Script body: request every ranking combination produced by get_url() and
# append the parsed entries to one text file per combination.
s = requests.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
    "Referer": "https://www.bilibili.com/ranking/all/0/0/3"
}

# Compiled once instead of on every iteration.  It is matched against the
# response text *after* all double quotes are removed, so it looks for bare
# `key:value,` pairs in the order author, play, pts, title.
pattern = re.compile(
    r'.*?author:(?P<author>.*?),.*?play:(?P<play>.*?),.*?pts:(?P<pts>.*?),.*?title:(?P<title>.*?),'
)

# open() does not create missing parent directories, so make sure the output
# directory exists before the first file is written.
os.makedirs(base_path, exist_ok=True)

url_list = get_url()
for url, (category, rid, day) in url_list:
    print("向{}發請求".format(url))
    # A timeout keeps one stalled connection from hanging the whole run.
    response = s.get(url=url, headers=headers, timeout=10)
    data = response.text.replace('"', "")
    result_list = pattern.findall(data)

    # File name is "<board>-<section>-<period>"; section 168 is missing from
    # rookie_dic, so fall back to all_or_origin_dic for it.
    path = os.path.join(base_path, "{}-{}-{}".format(category_dic.get(category),
                                                     rookie_dic.get(rid) or all_or_origin_dic.get(rid),
                                                     day_dic.get(day)))

    # Append mode: repeated runs add to existing files instead of replacing
    # them.  The context manager closes the file even if a write fails.
    with open(path + ".txt", "a", encoding="utf-8") as f:
        print('正在寫入....{}'.format(path + ".txt"))
        # Each match is an (author, play, pts, title) tuple; rank is the
        # 1-based position in the response.
        for rank, (author, play, pts, title) in enumerate(result_list, start=1):
            f.write("排名：{}\n".format(rank))
            f.write("標題：{}\n".format(title))
            f.write("作者：{}\n".format(author))
            f.write("播放量：{}\n".format(play))
            f.write("綜合分數：{}\n".format(pts))
            f.write("-" * 90 + "\n")

    # Pause between requests.
    time.sleep(2)
免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。