爬蟲技術:爬取今日頭條數據-崔慶才思路


一. urllib庫中將字典轉化為url的查詢參數

二.請求異常的處理,以及內部的判斷邏輯

  1.返回的json數據為空:原因是requests的請求對象沒有加請求頭和cookies

import requests
from urllib.parse import urlencode
def get_page_index():
    """Request the Toutiao search API without any headers or cookies.

    Demonstrates the failure case: the server still answers HTTP 200
    but returns an empty result set ("data": null) when no User-Agent
    or session cookies are supplied.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289"
    }
    # urlencode() turns the dict into a query string: a=1&b=2
    full_url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    resp = requests.get(full_url)
    if resp.status_code == 200:
        print(resp.text)


if __name__ == '__main__':
    get_page_index()
# 結果:
{"count":0,"return_count":0,"query_id":"6537385837821170952","has_more":0,"request_id":"20190919170154010017090029827CF0A","search_id":"20190919170154010017090029827CF0A","cur_ts":1568883714,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","log_pb":{"impr_id":"20190919170154010017090029827CF0A"},"data":null,"data_head":[{"challenge_code":1366,"cell_type":71,"keyword":"街拍","url":"sslocal://search?keyword=%E8%A1%97%E6%8B%8D\u0026from=\u0026source=search_tab"}],"ab_fields":null,"latency":0,"search_type":2,"tab_rank":null}

   2.正常獲得數據

import requests
from urllib.parse import urlencode
def get_page_index():
    """Request the Toutiao search API with browser headers and cookies.

    With a real User-Agent and session cookies the same endpoint returns
    the actual search results instead of an empty payload.
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": "0",
        "format": "json",
        "keyword": "街拍",
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289"
    }
    full_url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    resp = requests.get(full_url, headers=headers, cookies=cookies)
    if resp.status_code == 200:
        print(resp.content.decode("utf-8"))


if __name__ == '__main__':
    # Real browser identity is what makes the API return data.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"}
    cookies = {"Cookie": "tt_webid=6719272225969096196; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6719272225969096196; csrftoken=b28e41c77cd4f268af393de7d3e9d47a; UM_distinctid=16c4159a9ae7e3-04be696c185f6c-3f385c06-1fa400-16c4159a9afa94; CNZZDATA1259612802=1303724616-1564459685-https%253A%252F%252Fwww.toutiao.com%252F%7C1564459685; WIN_WH=1536_710; s_v_web_id=e588fb5c6570d79a16b67e84decce3d8; __tasessionId=y99fyeyyt1568882979794"}
    get_page_index()

# 結果:
{"count":20,"return_count":20,"query_id":"6537385837821170952","has_more":1,"request_id":"20190919170856010017031149086E0FC","search_id":"20190919170856010017031149086E0FC","cur_ts":1568884136,"offset":20,"message":"success","pd":"synthesis","show_tabs":1,"keyword":"街拍","city":"西安","tokens":["街拍"],"log_pb":{"impr_id":"20190919170856010017031149086E0FC"},"data":[{"ala_src":"user","app_info":{"query_type":"AladdinRpcQueryType"},"cell_type。。。。。。。。。。。。省略

 

 

四:

圖片地址位置定位:要先請求這個網址,獲得響應後解析出對應的image_url

  解析報錯:SyntaxError: Non-UTF-8 code starting with '\xe5',在程序上方添加 # -*- coding:utf-8 -*-

  json中的鍵值對,期望用雙引號而不是單引號。原因是正則錯誤:

五:完整的代碼

# -*- coding:utf-8 -*-
import re
import requests
from urllib.parse import urlencode
import os
from requests.exceptions import RequestException
import json
import pymongo
from bs4 import BeautifulSoup
from config import *
from hashlib import md5

# Client handle for the MongoDB server (connection URL comes from config.py)
client = pymongo.MongoClient(MONGO_URL)
# Database object used by the save functions below
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results.

    :param offset: paging offset (multiples of 20)
    :param keyword: search term
    :return: decoded response body on HTTP 200, otherwise None
    """
    params = {
        "aid": "24",
        "app_name": "web_search",
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "en_qc": "1",
        "cur_tab": "1",
        "from": "search_tab",
        "pd": "synthesis",
        "timestamp": "1568883030289"
    }
    # Point 1: urlencode() turns {"a": "1", "b": "2"} into "a=1&b=2"
    search_url = "https://www.toutiao.com/api/search/content/?" + urlencode(params)
    try:
        resp = requests.get(search_url, headers=headers, cookies=cookies)
    except RequestException:  # Point 2: catch-all for requests-level failures
        print("請求出錯")
        return None
    if resp.status_code != 200:
        return None
    return resp.content.decode()


def parse_page_index(html):
    """Parse the index-page JSON and yield each article URL (generator).

    Fix: the API can answer with ``"data": null`` (as in the no-headers
    example above), in which case ``"data" in data.keys()`` is True but
    iterating ``data.get("data")`` raises ``TypeError: 'NoneType' object
    is not iterable``.  Guard on the value itself, not just the key.
    """
    data = json.loads(html)
    if not data:
        return
    items = data.get("data")  # Point 3: dict .get() never raises KeyError
    if not items:
        # Covers both a missing "data" key and an explicit null value.
        return
    for item in items:
        url = item.get("article_url")
        if url:
            yield url


def get_page_detial(url):
    """Download one article detail page; return its decoded body or None."""
    try:  # Point 4: standard request exception handling
        resp = requests.get(url, headers=headers, cookies=cookies)
    except RequestException:
        print("請求出錯")
        return None
    return resp.content.decode() if resp.status_code == 200 else None


def parse_page_detial(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Side effect: every image URL found is downloaded immediately.
    :return: {"title", "images", "url"} dict, or None when no gallery.
    """
    title = BeautifulSoup(html, "lxml").select("title")[0].get_text()  # Point 5: soup CSS selector
    # The gallery JSON sits inside a JSON.parse("...") call in a script
    # tag; re.S lets "." span the newlines inside it (Point 6).
    match = re.search(re.compile('gallery: JSON.parse\("(.*?)"\),', re.S), html)
    if not match:
        return None
    # The embedded string is escaped ({\"count\":...,\\u002F...}), which
    # json.loads rejects; strip the backslashes and restore the "/"
    # characters encoded as u002F before decoding.
    raw = match.group(1).replace("\\", "").replace("u002F", "/")
    gallery = json.loads(raw)
    if not gallery or "sub_images" not in gallery.keys():
        return None
    images = [entry.get("url") for entry in gallery.get("sub_images")]
    for image_url in images:
        download(image_url)
    return {
        "title": title,
        "images": images,
        "url": url
    }

def save_to_mongo(ret_dict):
    """Insert one result dict into the configured MongoDB collection.

    Point 8: connection settings come from the config module.
    Fix: ``Collection.insert()`` is deprecated in pymongo 3.x and removed
    in 4.x — use ``insert_one()`` instead; it returns an InsertOneResult,
    which is truthy on success just like the old return value.

    :return: True when the insert succeeded, False otherwise.
    """
    if db[MONGO_TABLE].insert_one(ret_dict):
        print("插入數據到數據庫成功", ret_dict["title"])
        return True
    return False


def download(url):
    """Fetch one image and hand its raw bytes to saveimg()."""
    print("正在下載圖片", url)
    try:
        resp = requests.get(url, headers=headers, cookies=cookies)
        if resp.status_code == 200:
            # Images are binary: keep .content, never decode.
            saveimg(resp.content)
        return None
    except RequestException:
        print("請求出錯")
        return None

def saveimg(content):
    """Write image bytes to the working directory, named by their MD5 hash.

    Point 9/10: hashing the content gives a stable, de-duplicating
    filename — identical images map to the same path and are written
    only once.
    """
    target = "{}/{}.jpg".format(os.getcwd(), md5(content).hexdigest())
    if os.path.exists(target):
        return
    with open(target, "wb") as fh:
        fh.write(content)


def main():
    """Walk the search result pages and persist every gallery found."""
    keyword = "街拍"
    # START_PAGE/END_PAGE come from config; the API pages in steps of 20.
    for offset in range(START_PAGE, END_PAGE, 20):
        index_html = get_page_index(offset, keyword)
        if not index_html:
            continue
        for article_url in parse_page_index(index_html):
            detail_html = get_page_detial(article_url)
            if not detail_html:
                continue
            result = parse_page_detial(detail_html, article_url)
            if result:
                save_to_mongo(result)


if __name__ == '__main__':
    # Placeholder values — replace "xx" with real browser header/cookie
    # strings, otherwise the API returns an empty result set.
    headers = {
        "User-Agent": "xx"}
    cookies = {
        "Cookie": "xx"}
    main()

試運行爬取所有的街拍:報錯json.decoder.JSONDecodeError,因此代碼還得進行優化,排除異常。

六:知識點總結

urlencode是從urllib.parse中的一個方法:將字典變成url的查詢參數

from urllib.parse import urlencode

data = {"a":1,"b":2}

url = "http://www.baidu.com/?"

print(url + urlencode(data))

http://www.baidu.com/?a=1&b=2

 md5加密的不一致問題

一直以來都是用 hashlib中的md5進行加密,md5.update(二進制) md5.hexdigest(),可能會出現對相同的字符串進行加密,加密結果不一樣的問題,看來是update方法造成的。

# One shared md5 object: update() appends to the running stream, so the
# four printed digests are md5("1"), md5("11"), md5("113"), md5("1133").
from hashlib import md5

fp = md5()
demo = ["1", "1", "3", "3"]
for item in demo:
    fp.update(item.encode("utf-8"))
    print(fp.hexdigest())

# 結果:

c4ca4238a0b923820dcc509a6f75849b
6512bd43d9caa6e02c990b0a82652dca
73278a4a86960eeb576a8fd4c9ec6997
fd06b8ea02fe5b1c2496fe1700e9d16c

# 原因是md5.update()會將上次的串和這次的進行拼接,1,11,113,1133,每次加密的串都不同,結果肯定不同。
所以每加密之前,都對md5進行實例化,才能保證相同內容加密結果一樣,因為以前這個方法都是放在函數里面的,每次調用函數,都會重新實例化md5,因此不存在問題。循環就存在問題
上面代碼可以改為

# A fresh md5 object per item: equal inputs now yield equal digests.
from hashlib import md5

demo = ["1", "1", "3", "3"]
for item in demo:
    fp = md5()
    fp.update(item.encode("utf-8"))
    print(fp.hexdigest())

# 結果為:

c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3

for i in demo:
    print(md5(i.encode("utf-8")).hexdigest()) # 這種方式行,因為每次都重新實例化了 # 結果

c4ca4238a0b923820dcc509a6f75849b
c4ca4238a0b923820dcc509a6f75849b
eccbc87e4b5ce2fe28308fd9f2a7baf3
eccbc87e4b5ce2fe28308fd9f2a7baf3

# 看源碼也沒有理解update真正意圖,只是說用字符串更新對象。 后續解決

 os模塊的使用方法

os的基本用法

1. os.getcwd():查看當前所在路徑。

current_path = os.getcwd()
print(current_path)

# 運行結果
C:\Users\Administrator\AppData\Roaming\Sublime Text 3\Packages\User


2. os.listdir(path):列舉目錄下的所有文件。返回的是列表類型。

dir_list = os.listdir(current)

print(dir_list)

# 運行結果

['11.py', 'cuiqingcai.py', 'Localization.sublime-settings', 'oscrypto-ca-bundle.crt', 'Package Control.cache', 'Package Control.last-run', 'Package Control.merged-ca-bundle', 'Package Control.sublime-settings', 'Package Control.user-ca-bundle', 'Preferences.sublime-settings', 'reids分布式鎖', 'sha1.py', 'test.py', 'untitled.sublime-build']

具體用法見:https://www.cnblogs.com/yufeihlf/p/6179547.html

 Mongo數據庫與python的交互

import pymongo  # 交互模塊

# 第一步,建立客戶端,鏈接mongo服務器,ip和port

 from pymongo import MongoClient
 client = MongoClient(host,port)
 collection = client[db名][集合名]   # db名--相當於數據庫的名稱 集合名---相當於表名稱
  
# 第二步,添加數據

ret = collection.insert_one({"name":"test10010","age":33})
 print(ret) # 通過返回的數據進行判斷
if ret:
  xxxx

示例:
import pymongo

client = pymongo.MongoClient("localhost")

# 鏈接指定數據庫中的指定集合,不存在就新建

collection = client["test"]["new"]

ret = collection.insert({"new":"python"})

print(ret)

# 結果:
5d85ce978a808f42364b045c

插入前:

插入后:
 
        

 正則表達式知識點回顧:

import re

pattern = re.compile("匹配規則", re.S)

re.compile() 返回的就是一個匹配規則。配合search、findall、match等方法使用

import  re

a = """aaaaaaabbbbbbbb
111111ccccc"""

pattern1 = re.compile("aaaaaaa(.*?)cccc")
print(re.search(pattern1,a))

# None  re.S可以匹配全部文本,不擔心換行問題

pattern2 = pattern1 = re.compile("aaaaaaa(.*?)cccc",re.S)
print(re.search(pattern2,a))

# <re.Match object; span=(0, 26), match='aaaaaaabbbbbbbb\n111111cccc'>

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM