大規模數據爬取 -- Python


Python書寫爬蟲,目的是爬取所有的個人商家商品信息及詳情,並進行數據歸類分析

整個工作流程圖:

 

 

 

第一步:采用自動化的方式從前台頁面獲取所有的頻道

from bs4 import BeautifulSoup
import requests

# Step 1: collect the link of every channel in the left-hand sidebar.
start_url = 'http://hz.58.com/sale.shtml'
url_host = 'http://hz.58.com'

def get_channel_urls(url):
    """Fetch *url* and print the absolute URL of every sidebar channel link.

    Bug fix: the original body ignored the *url* parameter and always
    requested the module-level ``start_url``, so the function could never
    be pointed at a different page.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # 'a[href]' (no quotes) is the valid CSS attribute-presence selector;
    # the original 'a["href"]' is rejected by modern Soup Sieve.
    links = soup.select('ul.ym-mainmnu > li > span > a[href]')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_channel_urls(start_url)

# Hard-coded snapshot of the channel URLs printed by get_channel_urls(),
# so the later crawling steps do not depend on a live fetch of the sidebar.
# Consumed as channel_list.split() (one URL per whitespace-separated token)
# by the multiprocessing entry point.
channel_list = '''
    http://hz.58.com/shouji/
    http://hz.58.com/tongxunyw/
    http://hz.58.com/danche/
    http://hz.58.com/diandongche/
    http://hz.58.com/diannao/
    http://hz.58.com/shuma/
    http://hz.58.com/jiadian/
    http://hz.58.com/ershoujiaju/
    http://hz.58.com/yingyou/
    http://hz.58.com/fushi/
    http://hz.58.com/meirong/
    http://hz.58.com/yishu/
    http://hz.58.com/tushu/
    http://hz.58.com/wenti/
    http://hz.58.com/bangong/
    http://hz.58.com/shebei.shtml
    http://hz.58.com/chengren/
'''

 

第二步:通過第一步獲取的所有頻道去獲取所有的列表詳情,並存入URL_list表中,同時獲取商品詳情信息

from bs4 import BeautifulSoup
import requests
import time
import pymongo

# MongoDB on the default local port. Database 'ceshi' holds two collections:
#   url_list  -- item-detail URLs harvested from the channel listing pages
#   item_info -- parsed item details (title / price / area)
client = pymongo.MongoClient('localhost',27017)
ceshi = client['ceshi']
url_list = ceshi['url_list']
item_info = ceshi['item_info']


def get_links_from(channel, pages, who_sells=0):
    """Fetch one listing page of *channel* and store each item URL in Mongo.

    channel   -- channel base URL, e.g. 'http://hz.58.com/shouji/'
    pages     -- 1-based page number, rendered as the 'pn{pages}' segment
    who_sells -- seller-type path segment (0 = private seller)

    Resulting URL pattern: http://hz.58.com/shouji/0/pn7/
    """
    # str.format converts the ints itself; the explicit str() calls were noise.
    list_view = '{}{}/pn{}/'.format(channel, who_sells, pages)
    wb_data = requests.get(list_view)
    time.sleep(1)  # throttle between page fetches to stay polite
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # A page beyond the last one renders no 'td.t' cells — nothing to store.
    if not soup.find('td', 't'):
        return
    for link in soup.select('td.t > a[onclick]'):
        # Strip the tracking query string; keep only the canonical item URL.
        item_link = link.get('href').split('?')[0]
        # Upsert so re-crawling the same page does not insert duplicate rows
        # (the original insert_one duplicated every URL on each run).
        url_list.update_one({'url': item_link},
                            {'$set': {'url': item_link}},
                            upsert=True)
        print(item_link)


def get_item_info(url):
    """Fetch an item detail page and print its title, price and area.

    Silently skips pages whose item has been taken down.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Bug fix: membership must be tested against the page text. The original
    # `'商品已下架' in soup` checks the BeautifulSoup object's direct children,
    # which is never true for a phrase buried in the markup.
    if '商品已下架' in wb_data.text:
        return
    title = soup.title.text
    # Guard against missing nodes instead of raising IndexError on [0].
    price_nodes = soup.select('span.price_now > i')
    area_nodes = soup.select('div.palce_li > span > i')
    price = price_nodes[0].text if price_nodes else None
    area = area_nodes[0].text if area_nodes else None
    # Persist into the item_info collection when enabled (the original
    # commented-out line mistakenly targeted url_list):
    #item_info.insert_one({'title': title, 'price': price, 'area': area})
    print({'title': title, 'price': price, 'area': area})

#get_links_from('http://hz.58.com/pbdn/',7)
#get_item_info('http://zhuanzhuan.58.com/detail/840577950118920199z.shtml')

 

第三步:采用多進程的方式的main主函數入口

from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from

def get_all_links_from(channel):
    """Scrape listing pages 1-30 of *channel*, storing every item URL."""
    for page in range(1, 31):
        get_links_from(channel, page)

if __name__ == '__main__':
    # Default Pool size = CPU count; each worker crawls whole channels,
    # one channel URL per task from the whitespace-separated list.
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())

 

第四步:實時對獲取到的數據進行監控

from time import sleep
from page_parsing import url_list

# Step 4: live progress monitor — print how many item URLs have been
# collected so far, refreshing every 5 seconds. Runs until interrupted.
while True:
    # count_documents({}) replaces Cursor.count(), which was deprecated in
    # PyMongo 3.7 and removed in PyMongo 4, so the original line crashes
    # on current drivers.
    print(url_list.count_documents({}))
    sleep(5)

 

 

具體運行效果:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM