今日頭條抓取街拍圖片數據集

本文轉載自查看原文 2018-02-01 23:31 1284

spider1：抓取街拍頁面的所有入口鏈接：

1. 查看返回的數據可以發現，街拍頁面所需的數據都在 data 中：data 是整個響應字典的一個鍵，其值是一個 list，list 中的每個元素都是一個字典。

2. list 中的條目有兩類：一類是圖集，另一類是以單張圖片顯示內容的條目。

3. 對比 list 中的各個條目可以發現，以單張圖片顯示內容的條目帶有 single_mode 這個鍵。

4. 同時查看 data 中的條目可以看到，鍵 'url' 或 'article_url' 所對應的值就是我們需要的入口鏈接；要注意並非每個條目都有這個鍵，使用前需先判斷。

5. 爬下來的網址中有一類是多餘的，需要剔除，例如以 https 開頭的、帶 news 的；也就是說，凡是 url 中不包含 group 的都要剔除。這是跑到 offset=180 時觀察到的結果。

# 爬取今日頭條街拍的數據。
import json

import os
from hashlib import md5

import pymongo
from bs4 import BeautifulSoup
import requests
import re
from urllib.parse import urlencode
from requests.exceptions import RequestException
from config import *  # 將config中的對象全部引入
from multiprocessing import Pool  # 開啟進程池

# Shared MongoDB handles, created once when the module is loaded.
# MONGO_URL and MONGO_DB are presumably supplied by the `config` star-import.
client = pymongo.MongoClient(host=MONGO_URL, port=27017)
db = client[MONGO_DB]
# spider1: collects the entry links of the street-snap search results
def get_all_url(offset, keyword, timeout=10):
    '''
    Yield article entry URLs from one page of the Toutiao search API.

    :param offset: value sent as the ``offset`` query parameter (paging)
    :param keyword: value sent as the ``keyword`` query parameter
    :param timeout: seconds to wait for the HTTP request before giving up
    :return: generator over the ``article_url`` of every result item that
             has that key and has no ``single_mode`` key

    A failed request or a body that is not valid JSON prints a message and
    ends the generator; a non-200 status ends it silently. Nothing is raised
    for these cases.
    '''
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab'  # some are gallery
    }  # query string parameters
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)

    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('請求頁面失敗！')
        return
    if response.status_code != 200:
        return

    try:
        payload = json.loads(response.text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print('請求頁面失敗！')
        return
    if not isinstance(payload, dict):
        return

    items = payload.get('data')
    if not isinstance(items, list):
        # 'data' may be missing or null; there is nothing to yield then.
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        # Items carrying 'single_mode' are skipped; only items that expose
        # an 'article_url' are handed to the caller.
        if 'single_mode' not in item and 'article_url' in item:
            yield item.get('article_url')
        print('請求頁面失敗！')


def url_washing(offset, keyword='街拍'):
    '''
    Collect the entry URLs worth crawling for a series of page offsets.

    :param offset: iterable of paging offsets; each one is passed to
                   get_all_url
    :param keyword: search keyword passed to get_all_url
    :return: list of the yielded URLs that contain the substring 'group';
             values that are not strings (e.g. None) are dropped
    '''
    return [
        link
        for page_offset in offset
        for link in get_all_url(page_offset, keyword)
        # The isinstance check keeps a null article_url from raising
        # TypeError in the substring test.
        if isinstance(link, str) and 'group' in link
    ]


# spider2: 爬取詳情頁的詳細信息
def get_detail(url, timeout=10):
    '''
    Fetch one detail page and extract its title and image URLs.

    :param url: URL of the detail page
    :param timeout: seconds to wait for the HTTP request before giving up
    :return: dict with keys 'title', 'url' and 'image_url' (a list of
             strings), or None when the request fails, the status is not
             200, or no parsable ``gallery: JSON.parse(...)`` payload is
             found in the page
    '''
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print('請求詳情頁失敗！')
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> tag gets an empty title instead of IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    images_pattern = re.compile('gallery: JSON.parse\\((.*?)\\)', re.S)
    match = re.search(images_pattern, response.text)
    if match is None:
        # The page carries no gallery payload, so there are no images.
        return None
    try:
        # The JSON.parse argument is expected to be a JSON string literal;
        # decoding it once yields the inner JSON text.
        images = json.loads(match.group(1))
    except ValueError:
        return None
    if not isinstance(images, str):
        # The regex scan below needs text, not an already-decoded object.
        return None

    # Take every quoted http... value and strip the backslashes left over
    # from JSON's escaped slashes.
    image_urls = [
        found.replace('\\', '')
        for found in re.findall('\\"(http.*?)\\"', images)
    ]
    return {
        'title': title,
        'url': url,
        'image_url': image_urls
    }


def download_image(image_url):
    '''
    Fetch one image and hand its bytes to sava_image.

    :param image_url: URL of a single image taken from a detail page
    :return: None; the image is stored only when the status code is 200,
             and a RequestException is reported with a printed message
    '''
    try:
        reply = requests.get(image_url)
        if reply.status_code != 200:
            return
        # .content is the raw binary body of the response.
        sava_image(reply.content)
    except RequestException:
        print('請求圖片失敗！')

def sava_image(content):
    '''
    Store image bytes in the current working directory.

    The file is named ``<md5 hex digest of content>.jpg``, so identical
    content maps to the same file; an existing file is left untouched.

    :param content: raw bytes of the image
    :return: None
    '''
    folder = os.getcwd()
    digest = md5(content).hexdigest()
    file_path = '{0}/{1}.{2}'.format(folder, digest, 'jpg')
    if os.path.exists(file_path):
        return
    # The with-statement closes the file on exit.
    with open(file_path, 'wb') as out:
        out.write(content)

def sava_to_mongo(data):
    '''
    Upsert one detail-page record into the MONGO_TABLE collection.

    The record is used both as the filter and as the replacement document,
    so an identical record is not stored twice.

    :param data: dict describing one detail page; None or an empty dict is
                 rejected without touching the database
    :return: True when the write was acknowledged, otherwise False
    '''
    if not data:
        return False
    # Collection.update() is deprecated in PyMongo 3 and removed in PyMongo 4;
    # replace_one(..., upsert=True) is the equivalent whole-document upsert.
    result = db[MONGO_TABLE].replace_one(data, data, upsert=True)
    if result.acknowledged:
        print('儲存到MongoDB成功！', data)
        return True
    return False


def main(offset):
    '''
    Run the pipeline: collect entry URLs, fetch each detail page, store it.

    :param offset: iterable of paging offsets, or a single int offset
    :return: None
    '''
    if isinstance(offset, int):
        # Accept one offset as well, so the function can be mapped over a
        # list of offsets one element at a time.
        offset = [offset]
    for link in url_washing(offset):
        data = get_detail(link)
        if not data:
            # get_detail returns None when a page could not be fetched or
            # parsed; there is nothing to store for it.
            continue
        sava_to_mongo(data)
        # To also fetch the pictures, call download_image on each entry of
        # data['image_url'] here.



if __name__ == '__main__':
    # Offsets are the multiples of 20 from Group_start * 20 up to and
    # including Group_end * 20.
    group = list(range(Group_start * 20, (Group_end + 1) * 20, 20))
    main(group)
    # Multiprocessing variant, left disabled:
    # pool = Pool()
    # pool.map(main, group)  # apply_async(main)
    # pool.close()
    # pool.join()

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。