記錄python爬取貓眼票房排行榜(帶stonefont字體網頁),保存到text文件,csv文件和MongoDB數據庫中

本文轉載自查看原文 2019-01-08 16:35 965 Python爬蟲

貓眼票房排行榜頁面顯示如下:

注意右邊的票房數據顯示,爬下來的數據是這樣顯示的:

網頁源代碼中是這樣顯示的:

這是因為網頁中使用了某種字體的緣故,分析源代碼可知:

親測可行:

　　代碼中獲取的是國內票房榜,稍加修改也可適用於最受期待榜和北美票房榜

解決思路如下:

1.獲取網頁數據後,查找字體信息,獲取到字體鏈接,下載字體並保存到本地

2.使用fontTools讀取字體中的字符集,並構造字典(依據基准字體)

3.根據字典，替換網頁中的相關數據信息。

注意：如果使用BeautifulSoup，一定要先使用字典替換字符集，再進行解析。若直接解析，BeautifulSoup會將無法識別的字符置為空。

#!/usr/bin/env python # -*- coding: utf-8 -*-
""" @Project:pachong @author:sandu @Email: sandu12345@msn.cn @Software: PyCharm @file: test_maoyan.py @time: 2019-01-08 0008 上午 10:05 """

import csv
import json
import os
import re
from hashlib import md5

import pymongo
import requests
from fontTools.ttLib import TTFont
from requests.exceptions import RequestException

import woff2otf

# MongoDB settings: host, database name and collection name.
MONGO_URL = 'localhost'
MONGO_DB = 'maoyan'
MONGO_TABLE = 'maoyan_beimei'

# Module-level client and database handle; save_to_mongo writes through `db`.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


# Fetch a single page
def get_one_page(url, timeout=10):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout ``requests`` may block forever on
            a stalled connection.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request raised
        ``RequestException``, which includes timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


# Parsing a single page: the pieces of the pattern and what each one captures.
# '.*?board-index.*?>(\d+).*?'        rank number
# '.*?data-src="(.*?)".*?'            image link
# '.*?name.*?title.*?>(.*?)</a>.*?'   movie title
# '.*?star">(.*?)</p>.*?', re.S       cast list (contains line breaks, so re.S is required or nothing matches)
# '.*?releasetime">(.*?)</p>.*?'      release date
# '.*?integer">(.*?)</i>.*?'          integer part of the score
# '.*?fraction">(.*?)</i>.*?'         fractional part of the score

# '.*?realtime.*?stonefont">(.*?)</span></span>(.*?)</p>.*?'          real-time box office
# '.*?total-boxoffice.*?stonefont">(.*?)</span></span>(.*?)</p>.*?'   total box office
# Putting it all together, wrapped in the outermost <dd>

def parse_one_page(html):
    """Yield one record for every ``<dd>`` movie entry found in ``html``.

    Args:
        html: Page markup to scan. ``None`` or an empty string yields
            nothing (``get_one_page`` returns ``None`` on failure).

    Yields:
        dict with the string-valued keys 'index', 'img', 'name', 'star',
        'time', '實時票房' and '總票房'.
    """
    if not html:
        return

    # Raw strings: '\d' in a normal string literal is an invalid escape
    # sequence. re.S lets '.' span the line breaks inside each entry.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)'
        r'.*?data-src="(.*?)"'
        r'.*?name.*?title.*?>(.*?)</a>'
        r'.*?star">(.*?)</p>'
        r'.*?releasetime">(.*?)</p>'
        r'.*?realtime.*?stonefont">(.*?)</span></span>(.*?)</p>'
        r'.*?total-boxoffice.*?stonefont">(.*?)</span></span>(.*?)</p>'
        r'.*?</dd>',
        re.S)

    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'img': item[1],
            # Strip surrounding whitespace and line breaks.
            'name': item[2].strip(),
            # Strip, then drop the first three characters
            # (presumably a leading label on the cast line - TODO confirm).
            'star': item[3].strip()[3:],
            # Drop the first five characters
            # (presumably a leading label on the date line - TODO confirm).
            'time': item[4][5:],
            # Obfuscated number followed by the unit text after the spans.
            '實時票房': item[5] + item[6].strip(),
            '總票房': item[7] + item[8].strip(),
        }


# Save to file
def save_to_file(content, path='beimei.text'):
    """Append ``content`` to a text file as one JSON document per line.

    Args:
        content: JSON-serialisable object (``main`` passes one movie dict).
        path: File to append to; defaults to 'beimei.text' in the current
            working directory.
    """
    # ensure_ascii=False writes Chinese text as-is instead of \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + '\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)


# Save to the database
def save_to_mongo(result):
    """Insert ``result`` into the ``MONGO_TABLE`` collection of ``db``.

    Uses ``insert_one``: ``Collection.insert`` was deprecated in PyMongo 3.0
    and removed in PyMongo 4.0. As noted where ``main`` calls this function,
    the dict gains an ``_id`` key once it has been stored.

    Args:
        result: The document (dict) to store.

    Returns:
        True when the insert produced a document id, False otherwise.
    """
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('Successfully Saved to Mongo', result)
        return True
    return False


# Request the image URL and fetch the image's binary data
def download_image(url, timeout=10):
    """Fetch the image at ``url`` and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server; without it the request
            could block indefinitely.

    Returns:
        Always ``None``. A request failure is reported with a printed
        message; a non-200 response is ignored silently.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('請求圖片出錯')
        return None
    if response.status_code == 200:
        # response.content is the binary payload; response.text is the text form.
        save_image(response.content)
    return None


# Store data to CSV
def write_to_file3(item, path='beimei.csv'):
    """Append ``item`` as one row of a CSV file (no header row is written).

    Args:
        item: Mapping holding the movie fields listed in ``fieldnames``.
            Keys outside that list (for example the ``_id`` added after a
            MongoDB insert) are ignored instead of raising ``ValueError``.
        path: CSV file to append to; defaults to 'beimei.csv' in the
            current working directory.
    """
    fieldnames = ['index', 'img', 'name', 'star', 'time', '實時票房', '總票房']
    # 'a' is append mode; utf_8_sig keeps the exported CSV from showing
    # garbled characters.
    with open(path, 'a', encoding='utf_8_sig', newline='') as f:
        w = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
        w.writerow(item)


# Parse the font
def get_font_regx(html):
    """Replace the page's obfuscated glyph entities with the digits they show.

    The page references a woff font. That font is downloaded to
    'maoyan.woff', converted to 'maoyan.otf', and each of its 'uni....'
    glyphs is compared with the glyphs of the reference font 'base.otf',
    whose glyph-to-character table is hard-coded below. Every
    ``&#x<hex>;`` entity whose glyph matches a reference glyph is replaced
    by that character; all other text, including unrelated entities, is
    left untouched.

    Args:
        html: Page markup. ``None``, an empty string, or markup without a
            woff font link is returned unchanged.

    Returns:
        The markup with the recognised entities replaced.

    Raises:
        requests.exceptions.RequestException: The font download failed or
            returned an error status.
    """
    if not html:
        return html

    # Look up the font link on the page.
    font_urls = re.findall(r"url\('(.*?)'\)\sformat\('woff'\);", html)
    if not font_urls:
        return html

    # Download the font to the local directory.
    resp = requests.get('http:%s' % font_urls[0], timeout=10)
    resp.raise_for_status()
    with open('maoyan.woff', 'wb') as fontfile:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:
                fontfile.write(chunk)
    woff2otf.convert('maoyan.woff', 'maoyan.otf')

    # base.otf is a font captured on one earlier visit and identified by
    # hand; it is the reference sample that later fonts are compared with.
    baseFont = TTFont('base.otf')
    maoyanFont = TTFont('maoyan.otf')

    baseNumList = ['.', '3', '5', '1', '2', '7', '0', '6', '9', '8', '4']
    baseUniCode = ['x', 'uniE78E', 'uniF176', 'uniEFE6', 'uniF074', 'uniE9C8',
                   'uniE912', 'uniEA71', 'uniE74E', 'uniE4B8', 'uniEE71']
    base_glyphs = [baseFont['glyf'][name] for name in baseUniCode]

    # Build the mapping by glyph name rather than by list position, so a
    # glyph without a match cannot shift the remaining pairs.
    page_glyphs = maoyanFont['glyf']
    new_dict = {}
    for glyph_name in maoyanFont.getGlyphOrder():
        if not glyph_name.startswith('uni'):
            continue
        glyph = page_glyphs[glyph_name]
        for char, base_glyph in zip(baseNumList, base_glyphs):
            if glyph == base_glyph:
                new_dict[glyph_name] = char
                break

    # 'uniE78E' appears in the markup as '&#xe78e;'. Only those exact
    # entities are replaced.
    for glyph_name, char in new_dict.items():
        entity = '&#x' + glyph_name[3:].lower() + ';'
        html = html.replace(entity, char)
    return html


def save_image(content):
    """Write ``content`` to '<cwd>/<md5 of content>.jpg' unless it exists.

    Args:
        content: Raw image bytes.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if os.path.exists(file_path):
        return
    with open(file_path, 'wb') as f:
        f.write(content)


def main(offset):
    """Scrape one board page and store every movie entry found on it.

    Args:
        offset: Paging offset appended to the board URL.
    """
    url = 'https://maoyan.com/board/2?offset=' + str(offset)  # paging
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a failed request or non-200 status.
        print('Failed to fetch page:', url)
        return
    html = get_font_regx(html)
    for item in parse_one_page(html):
        print(item)
        save_to_file(item)  # save to the text file
        # Write the CSV row before the database insert: once stored in the
        # database the dict carries an extra _id value.
        write_to_file3(item)
        save_to_mongo(item)  # save to the database
        # download_image(item['img'])  # download the image into the current directory


# Script entry point: scrape the first board page (offset 0). The
# commented-out lines are earlier sketches for looping over offsets and for
# using a pool (no Pool import is visible in this file).
if __name__ == '__main__': # for i in range(0, 100):
    # main(str(i*10))
    # Start multiple threads
    # pool = Pool()
    # pool.map(main,0)
 main(0)

# 注:如何獲取代碼中base.otf相關信息?
# 1.根據獲取到的woff字體文件,使用百度字體編輯器,獲取字體數字等相關信息,地址:http://fontstore.baidu.com/static/editor/index.html


# 2.將獲取到的woff文件使用woff2otf.convert('maoyan.woff', 'base.otf')轉化成base.otf文件保存到當前目錄(./woff2otf.py font.woff font.otf),從而獲得baseFont(代碼中變量)
# 3.根據百度字體編輯器獲取到的信息,構造baseNumList和baseUniCode(代碼中變量)

# 4.再次發起請求根據獲得的字體跟這個構造的基准字體進行對照,從而獲得新的字體映射關系

# 注: woff2otf是導入的一個py文件,鏈接地址:https://github.com/hanikesn/woff2otf,作用是輸入woff字體,輸出otf字體

# 保存到csv文件中

# 保存到MongoDB數據庫中

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 一起學爬蟲——使用xpath庫爬取貓眼電影國內票房榜 Python寫文件：將爬取結果寫入本地txt，寫入本地csv，寫入本地excel，保存到服務端數據庫 Python爬取豆瓣音樂TOP250，爬取的數據保存到csv文件和xls文件 python讀取mongoDb數據庫保存到Excel中 python爬取數據並保存到數據庫中（第一次練手完整代碼） Python爬蟲根據關鍵詞爬取知網論文摘要並保存到數據庫中【入門必學】 python之scrapy爬取數據保存到mysql數據庫 Python爬取百度實時熱點排行榜爬取芒果TV電視劇排行榜 Bilibili動漫排行榜信息爬取分析