爬蟲技術:爬取今日頭條數據-崔慶才思路


一. urllib庫中將字典轉化為url的查詢參數

二.請求異常的處理,以及內部的判斷邏輯

  1.返回的json數據為空:原因是requests的請求對象沒有加請求頭和cookies

import requests
from urllib.parse import urlencode
def get_page_index():
    """Request the Toutiao search API without any headers or cookies.

    Demonstrates the failure case: the server still answers HTTP 200
    but returns an empty result set ("data": null) when no User-Agent
    or session cookies are supplied.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289"
    }
    # urlencode() turns the dict into a query string: a=1&b=2
    full_url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    resp = requests.get(full_url)
    if resp.status_code == 200:
        print(resp.text)


if __name__ == '__main__':
    get_page_index()
# 結果:
{"count":0,"return_count":0,"query_id":"6537385837821170952","has_more":0,"request_id":"20190919170154010017090029827CF0A","search_id":"20190919170154010017090029827CF0A","cur_ts":1568883714,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","log_pb":{"impr_id":"20190919170154010017090029827CF0A"},"data":null,"data_head":[{"challenge_code":1366,"cell_type":71,"keyword":"街拍","url":"sslocal://search?keyword=%E8%A1%97%E6%8B%8D\u0026from=\u0026source=search_tab"}],"ab_fields":null,"latency":0,"search_type":2,"tab_rank":null}

   2.正常獲得數據

import requests
from urllib.parse import urlencode
def get_page_index():
    """Request the Toutiao search API with browser headers and cookies.

    With a real User-Agent and session cookies the same endpoint returns
    the actual search results instead of an empty payload.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289"
    }
    full_url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    resp = requests.get(full_url, headers=headers, cookies=cookies)
    if resp.status_code == 200:
        print(resp.content.decode("utf-8"))


if __name__ == '__main__':
    # Real browser identity is what makes the API return data.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
    cookies = {"Cookie": "tt_webid=6719272225969096196; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6719272225969096196; csrftoken=b28e41c77cd4f268af393de7d3e9d47a; UM_distinctid=16c4159a9ae7e3-04be696c185f6c-3f385c06-1fa400-16c4159a9afa94; CNZZDATA1259612802=1303724616-1564459685-https%253A%252F%252Fwww.toutiao.com%252F%7C1564459685; WIN_WH=1536_710; s_v_web_id=e588fb5c6570d79a16b67e84decce3d8; __tasessionId=y99fyeyyt1568882979794"}
    get_page_index()

# 結果:
{"count":20,"return_count":20,"query_id":"6537385837821170952","has_more":1,"request_id":"20190919170856010017031149086E0FC","search_id":"20190919170856010017031149086E0FC","cur_ts":1568884136,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","tokens":["街拍"],"log_pb":{"impr_id":"20190919170856010017031149086E0FC"},"data":[{"ala_src":"user","app_info":{"query_type":"AladdinRpcQueryType"},"cell_type。。。。。。。。。。。。省略

 

 

四:

圖片地址位置定位:要先請求這個網址,獲得響應後解析出對應的image_url

  解析報錯:SyntaxError: Non-UTF-8 code starting with '\xe5',在程序上方添加 # -*- coding:utf-8 -*-

  json中的鍵值對,期望用雙引號而不是單引號。原因是正則錯誤:

五:完整的代碼

# -*- coding:utf-8 -*-
import re
import requests
from urllib.parse import urlencode
import os
from requests.exceptions import RequestException
import json
import pymongo
from bs4 import BeautifulSoup
from config import *
from hashlib import md5

# Client handle for the MongoDB server (connection URL comes from config.py)
client = pymongo.MongoClient(MONGO_URL)
# Database object used by the save functions below
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results.

    :param offset: paging offset (multiples of 20)
    :param keyword: search term
    :return: decoded response body on HTTP 200, otherwise None
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289"
    }
    # Point 1: urlencode() turns {"a": "1", "b": "2"} into "a=1&b=2"
    search_url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    try:
        resp = requests.get(search_url, headers=headers, cookies=cookies)
    except RequestException:  # Point 2: catch-all for requests-level failures
        print("請求出錯")
        return None
    if resp.status_code != 200:
        return None
    return resp.content.decode()


def parse_page_index(html):
    """Parse the index-page JSON and yield each article URL (generator).

    Fix: the API can answer with ``"data": null`` (as in the no-headers
    example above), in which case ``"data" in data.keys()`` is True but
    iterating ``data.get("data")`` raises ``TypeError: 'NoneType' object
    is not iterable``.  Guard on the value itself, not just the key.
    """
    data = json.loads(html)
    if not data:
        return
    items = data.get("data")  # Point 3: dict .get() never raises KeyError
    if not items:
        # Covers both a missing "data" key and an explicit null value.
        return
    for item in items:
        url = item.get("article_url")
        if url:
            yield url


def get_page_detial(url):
    """Download one article detail page; return its decoded body or None."""
    try:  # Point 4: standard request exception handling
        resp = requests.get(url, headers=headers, cookies=cookies)
    except RequestException:
        print("請求出錯")
        return None
    return resp.content.decode() if resp.status_code == 200 else None


def parse_page_detial(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Side effect: every image URL found is downloaded immediately.
    :return: {"title", "images", "url"} dict, or None when no gallery.
    """
    title = BeautifulSoup(html, "lxml").select("title")[0].get_text()  # Point 5: soup CSS selector
    # The gallery JSON sits inside a JSON.parse("...") call in a script
    # tag; re.S lets "." span the newlines inside it (Point 6).
    match = re.search(re.compile('gallery: JSON.parse\("(.*?)"\),', re.S), html)
    if not match:
        return None
    # The embedded string is escaped ({\"count\":...,\\u002F...}), which
    # json.loads rejects; strip the backslashes and restore the "/"
    # characters encoded as u002F before decoding.
    raw = match.group(1).replace("\\", "").replace("u002F", "/")
    gallery = json.loads(raw)
    if not gallery or "sub_images" not in gallery.keys():
        return None
    images = [entry.get("url") for entry in gallery.get("sub_images")]
    for image_url in images:
        download(image_url)
    return {
        "title": title,
        "images": images,
        "url": url
    }

def save_to_mongo(ret_dict):
    """Insert one result dict into the configured MongoDB collection.

    Point 8: connection settings come from the config module.
    Fix: ``Collection.insert()`` is deprecated in pymongo 3.x and removed
    in 4.x — use ``insert_one()`` instead; it returns an InsertOneResult,
    which is truthy on success just like the old return value.

    :return: True when the insert succeeded, False otherwise.
    """
    if db[MONGO_TABLE].insert_one(ret_dict):
        print("插入數據到數據庫成功", ret_dict["title"])
        return True
    return False


def download(url):
    """Fetch one image and hand its raw bytes to saveimg()."""
    print("正在下載圖片", url)
    try:
        resp = requests.get(url, headers=headers, cookies=cookies)
        if resp.status_code == 200:
            # Images are binary: keep .content, never decode.
            saveimg(resp.content)
        return None
    except RequestException:
        print("請求出錯")
        return None

def saveimg(content):
    """Write image bytes to the working directory, named by their MD5 hash.

    Point 9/10: hashing the content gives a stable, de-duplicating
    filename — identical images map to the same path and are written
    only once.
    """
    target = "{}/{}.jpg".format(os.getcwd(), md5(content).hexdigest())
    if os.path.exists(target):
        return
    with open(target, "wb") as fh:
        fh.write(content)


def main():
    """Walk the search result pages and persist every gallery found."""
    keyword = "街拍"
    # START_PAGE/END_PAGE come from config; the API pages in steps of 20.
    for offset in range(START_PAGE, END_PAGE, 20):
        index_html = get_page_index(offset, keyword)
        if not index_html:
            continue
        for article_url in parse_page_index(index_html):
            detail_html = get_page_detial(article_url)
            if not detail_html:
                continue
            result = parse_page_detial(detail_html, article_url)
            if result:
                save_to_mongo(result)


if __name__ == '__main__':
    # Placeholder values — replace "xx" with real browser header/cookie
    # strings, otherwise the API returns an empty result set.
    headers = {
        "User-Agent": "xx"}
    cookies = {
        "Cookie": "xx"}
    main()

試運行爬取所有的街拍:報錯json.decoder.JSONDecodeError,因此代碼還得進行優化,排除異常。

六:知識點總結

urlencode是從urllib.parse中的一個方法:將字典變成url的查詢參數

from urllib.parse import urlencode

data = {"a":1,"b":2}

url = "http://www.baidu.com/?"

print(url + urlencode(data))

http://www.baidu.com/?a=1&b=2

 md5加密的不一致問題

一直以來都是用 hashlib中的md5進行加密,md5.update(二進制) md5.hexdigest(),可能會出現對相同的字符串進行加密,加密結果不一樣的問題,看來是update方法造成的。

# One shared md5 object: update() appends to the running stream, so the
# four printed digests are md5("1"), md5("11"), md5("113"), md5("1133").
from hashlib import md5

fp = md5()
demo = ["1", "1", "3", "3"]
for item in demo:
    fp.update(item.encode("utf-8"))
    print(fp.hexdigest())

# 結果:

c4ca4238a0b923820dcc509a6f75849b
6512bd43d9caa6e02c990b0a82652dca
73278a4a86960eeb576a8fd4c9ec6997
fd06b8ea02fe5b1c2496fe1700e9d16c

# 原因是md5.update()會將上次的串和這次的進行拼接,1,11,113,1133,每次加密的串都不同,結果肯定不同。
所以每加密之前,都對md5進行實例化,才能保證相同內容加密結果一樣,因為以前這個方法都是放在函數里面的,每次調用函數,都會重新實例化md5,因此不存在問題。循環就存在問題
上面代碼可以改為

# A fresh md5 object per item: equal inputs now yield equal digests.
from hashlib import md5

demo = ["1", "1", "3", "3"]
for item in demo:
    fp = md5()
    fp.update(item.encode("utf-8"))
    print(fp.hexdigest())

# 結果為:

c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3

for i in demo:
    print(md5(i.encode("utf-8")).hexdigest()) # 這種方式行,因為每次都重新實例化了 # 結果

c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3

# 看源碼也沒有理解update真正意圖,只是說用字符串更新對象。 后續解決

 os模塊的使用方法

os的基本用法

1. os.getcwd():查看當前所在路徑。

current_path = os.getcwd()
print(current_path)

# 運行結果
C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\User


2. os.listdir(path):列舉目錄下的所有文件。返回的是列表類型。

dir_list = os.listdir(current)

print(dir_list)

# 運行結果

['11.py', 'cuiqingcai.py', 'Localization.sublime-settings', 'oscrypto-ca-bundle.crt', 'Package Control.cache', 'Package Control.last-run', 'Package Control.merged-ca-bundle', 'Package Control.sublime-settings', 'Package Control.user-ca-bundle', 'Preferences.sublime-settings', 'reids分布式鎖', 'sha1.py', 'test.py', 'untitled.sublime-build']

具體用法見:https://www.cnblogs.com/yufeihlf/p/6179547.html

 Mongo數據庫與python的交互

import pymongo  # 交互模塊

# 第一步,建立客戶端,鏈接mongo服務器,ip和port

 from pymongo import MongoClient
 client = MongoClient(host,port)
 collection = client[db名][集合名]   # db名--相當於數據庫的名稱 集合名---相當於表名稱
  
# 第二步,添加數據

ret = collection.insert_one({"name":"test10010","age":33})
 print(ret) # 通過返回的數據進行判斷
if ret:
  xxxx

示例:
import pymongo

client = pymongo.MongoClient("localhost")

# 鏈接指定數據庫中的指定集合,不存在就新建

collection = client["test"]["new"]

ret = collection.insert({"new":"python"})

print(ret)

# 結果:
5d85ce978a808f42364b045c

插入前:

插入后:
 
        

 正則表達式知識點回顧:

import re

pattern = re.compile("匹配規則", re.S)

re.compile() 返回的就是一個匹配規則。配合search、findall、match等方法使用

import  re

a = """aaaaaaabbbbbbbb
111111ccccc"""

pattern1 = re.compile("aaaaaaa(.*?)cccc")
print(re.search(pattern1,a))

# None  re.S可以匹配全部文本,不擔心換行問題

pattern2 = pattern1 = re.compile("aaaaaaa(.*?)cccc",re.S)
print(re.search(pattern2,a))

# <re.Match object; span=(0, 26), match='aaaaaaabbbbbbbb\n111111cccc'>

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM