爬蟲(八):分析Ajax請求抓取今日頭條街拍美圖


(1):分析網頁

先分析Ajax請求的網址以及所需要的參數。不斷向下拉動滾動條可以發現,請求參數中的offset一直在變化,因此每次只要改變offset的值,就可以發起新的Ajax請求。

(2):上代碼

 a、通過ajax請求獲取頁面數據

def get_page_index(offset, keyword, timeout=10):
    """Request one page of search results from the Ajax endpoint.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request itself fails.
    """
    # Query parameters taken from the page's own Ajax request.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # urlencode turns the dict into a URL query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout surfaces as a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求索引頁錯誤')
        return None
    if response.status_code == 200:
        return response.text
    return None

b、分析ajax請求的返回結果,獲取圖片集的url

def parse_page_index(html):
    """Yield the article URLs contained in an index-page JSON response.

    Args:
        html: JSON text returned by the index request. May be ``None`` or
            empty when that request failed.

    Yields:
        Every non-empty ``article_url`` found in the payload's ``data`` list.
        Nothing is yielded when the input is missing, is not valid JSON, or
        has no usable ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print('解析索引頁數據錯誤')
        return
    if not isinstance(payload, dict):
        return
    # ``data`` can be present but null, so fall back to an empty sequence.
    for item in payload.get('data') or ():
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

c、得到圖集url後獲取圖集的內容

def get_page_detail(url, timeout=10):
    """Download the detail page at ``url``.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request itself fails.
    """
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout surfaces as a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('詳情頁頁錯誤', url)
        return None
    if response.status_code == 200:
        return response.text
    return None

d、其他看完整代碼

完整代碼:

# -*- coding: utf-8 -*-
# @Author  : FELIX
# @Date    : 2018/4/4 12:49

import json

import os
from hashlib import md5

import requests
from urllib.parse import urlencode
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import re
import pymongo
from multiprocessing import Pool


# MongoDB connection settings.
MONGO_URL = 'localhost'

MONGO_DB = 'toutiao'

MONGO_TABLE = 'toutiao'

# Page range to crawl; each index is multiplied by 20 to build the request
# offset in the ``__main__`` section.
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'


# The script fans out work with multiprocessing.Pool. connect=False defers
# the actual connection until the first operation, so it is opened inside
# each worker process instead of being inherited from the parent.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]  # Database handle; MongoDB creates it lazily on first write.

# Request headers sent with every HTTP request (browser User-Agent).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}


def get_page_index(offset, keyword, timeout=10):
    """Request one page of search results from the Ajax endpoint.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request itself fails.
    """
    # Query parameters taken from the page's own Ajax request.
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    # urlencode turns the dict into a URL query string.
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout surfaces as a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('請求索引頁錯誤')
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_index(html):
    """Yield the article URLs contained in an index-page JSON response.

    Args:
        html: JSON text returned by the index request. May be ``None`` or
            empty when that request failed.

    Yields:
        Every non-empty ``article_url`` found in the payload's ``data`` list.
        Nothing is yielded when the input is missing, is not valid JSON, or
        has no usable ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print('解析索引頁數據錯誤')
        return
    if not isinstance(payload, dict):
        return
    # ``data`` can be present but null, so fall back to an empty sequence.
    for item in payload.get('data') or ():
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url, timeout=10):
    """Download the detail page at ``url``.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None`` for
        any other status code or when the request itself fails.
    """
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout surfaces as a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('詳情頁頁錯誤', url)
        return None
    if response.status_code == 200:
        return response.text
    return None


def parse_page_detail(html, url):
    """Extract the title and image URLs from a detail page and download the images.

    Args:
        html: Text of the detail page.
        url: Address the page was fetched from; stored in the result as-is.

    Returns:
        A dict with ``title``, ``url`` and ``images`` keys when the page has
        an ``articleInfo`` section containing at least one image URL;
        ``None`` otherwise.
    """
    # Capture the single-quoted title and content values inside articleInfo.
    article_re = re.compile('articleInfo:.*?title: \'(.*?)\'.*?content.*?\'(.*?)\'', re.S)
    match = article_re.search(html)
    if match is None:
        return None

    title, content = match.group(1), match.group(2)
    # Image addresses are the double-quoted strings starting with "http:".
    image_urls = re.compile('"(http:.*?)"').findall(content)
    if not image_urls:
        return None

    for image_url in image_urls:
        download_img(image_url)  # Download each image as it is found
    return {
        'title': title,
        'url': url,
        'images': image_urls,
    }


def save_to_mongo(result):
    """Insert one parsed article into the MongoDB collection.

    Args:
        result: Document to store. Falsy values (``None``, empty dict)
            are skipped.

    Returns:
        ``True`` when the document was written, ``False`` otherwise.
    """
    if not result:
        return False
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the supported single-document call.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.acknowledged:
        print('存儲成功', result)
        return True
    return False


def download_img(url, timeout=10):
    """Download one image and hand its bytes to ``save_img``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; the image is saved as a side effect when the
        response status code is 200.
    """
    print('正在下載', url)
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout surfaces as a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('下載圖片錯誤', url)
        return None
    if response.status_code == 200:
        save_img(response.content)
    return None


def save_img(content):
    """Write image bytes to ``<cwd>/imgs/<md5-of-content>.jpg``.

    The file name is the MD5 hex digest of the content, so identical images
    map to the same file and are written only once.

    Args:
        content: Raw image bytes.
    """
    img_dir = os.path.join(os.getcwd(), 'imgs')
    # open() does not create missing directories, so make sure the target
    # directory exists before writing.
    os.makedirs(img_dir, exist_ok=True)
    file_path = os.path.join(img_dir, '{}.{}'.format(md5(content).hexdigest(), 'jpg'))
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def main(offset):
    """Crawl one page of search results and store every article found on it.

    Args:
        offset: Search-result offset passed to ``get_page_index``.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # get_page_index returns None when the request fails; there is
        # nothing to parse in that case.
        return
    for url in parse_page_index(index_html):
        if not url:
            # Entries without an article URL cannot be fetched.
            continue
        detail_html = get_page_detail(url)
        print(url,'++++++++++++++++++++++++++++++++++++++++++++++++')
        print(detail_html)
        if detail_html:
            result = parse_page_detail(detail_html, url)
            save_to_mongo(result)


if __name__ == '__main__':
    # One offset per result page: page index times 20.
    groups = [i * 20 for i in range(GROUP_START, GROUP_END + 1)]
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open until interpreter exit.
    with Pool() as pool:
        pool.map(main, groups)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM