大規模數據爬取 -- Python


Python書寫爬蟲,目的是爬取所有的個人商家商品信息及詳情,並進行數據歸類分析

整個工作流程圖:

 

 

 

第一步:采用自動化的方式從前台頁面獲取所有的頻道

from bs4 import BeautifulSoup
import requests

# Step 1: collect the link of every channel in the left-hand sidebar.
start_url = 'http://hz.58.com/sale.shtml'
url_host = 'http://hz.58.com'

def get_channel_urls(url):
    """Fetch *url* and print the absolute URL of every sidebar channel link.

    Bug fix: the original body ignored the *url* parameter and always
    requested the module-level ``start_url``, so the function could never
    be pointed at a different page.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # 'a[href]' (no quotes) is the valid CSS attribute-presence selector;
    # the original 'a["href"]' is rejected by modern Soup Sieve.
    links = soup.select('ul.ym-mainmnu > li > span > a[href]')
    for link in links:
        page_url = url_host + link.get('href')
        print(page_url)

get_channel_urls(start_url)

# Hard-coded snapshot of the channel URLs printed by get_channel_urls(),
# so the later crawling steps do not depend on a live fetch of the sidebar.
# Consumed as channel_list.split() (one URL per whitespace-separated token)
# by the multiprocessing entry point.
channel_list = '''
    http://hz.58.com/shouji/
    http://hz.58.com/tongxunyw/
    http://hz.58.com/danche/
    http://hz.58.com/diandongche/
    http://hz.58.com/diannao/
    http://hz.58.com/shuma/
    http://hz.58.com/jiadian/
    http://hz.58.com/ershoujiaju/
    http://hz.58.com/yingyou/
    http://hz.58.com/fushi/
    http://hz.58.com/meirong/
    http://hz.58.com/yishu/
    http://hz.58.com/tushu/
    http://hz.58.com/wenti/
    http://hz.58.com/bangong/
    http://hz.58.com/shebei.shtml
    http://hz.58.com/chengren/
'''

 

第二步:通過第一步獲取的所有頻道去獲取所有的列表詳情,並存入URL_list表中,同時獲取商品詳情信息

from bs4 import BeautifulSoup
import requests
import time
import pymongo

# MongoDB on the default local port. Database 'ceshi' holds two collections:
#   url_list  -- item-detail URLs harvested from the channel listing pages
#   item_info -- parsed item details (title / price / area)
client = pymongo.MongoClient('localhost',27017)
ceshi = client['ceshi']
url_list = ceshi['url_list']
item_info = ceshi['item_info']


def get_links_from(channel, pages, who_sells=0):
    """Fetch one listing page of *channel* and store each item URL in Mongo.

    channel   -- channel base URL, e.g. 'http://hz.58.com/shouji/'
    pages     -- 1-based page number, rendered as the 'pn{pages}' segment
    who_sells -- seller-type path segment (0 = private seller)

    Resulting URL pattern: http://hz.58.com/shouji/0/pn7/
    """
    # str.format converts the ints itself; the explicit str() calls were noise.
    list_view = '{}{}/pn{}/'.format(channel, who_sells, pages)
    wb_data = requests.get(list_view)
    time.sleep(1)  # throttle between page fetches to stay polite
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # A page beyond the last one renders no 'td.t' cells — nothing to store.
    if not soup.find('td', 't'):
        return
    for link in soup.select('td.t > a[onclick]'):
        # Strip the tracking query string; keep only the canonical item URL.
        item_link = link.get('href').split('?')[0]
        # Upsert so re-crawling the same page does not insert duplicate rows
        # (the original insert_one duplicated every URL on each run).
        url_list.update_one({'url': item_link},
                            {'$set': {'url': item_link}},
                            upsert=True)
        print(item_link)


def get_item_info(url):
    """Fetch an item detail page and print its title, price and area.

    Silently skips pages whose item has been taken down.
    """
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Bug fix: membership must be tested against the page text. The original
    # `'商品已下架' in soup` checks the BeautifulSoup object's direct children,
    # which is never true for a phrase buried in the markup.
    if '商品已下架' in wb_data.text:
        return
    title = soup.title.text
    # Guard against missing nodes instead of raising IndexError on [0].
    price_nodes = soup.select('span.price_now > i')
    area_nodes = soup.select('div.palce_li > span > i')
    price = price_nodes[0].text if price_nodes else None
    area = area_nodes[0].text if area_nodes else None
    # Persist into the item_info collection when enabled (the original
    # commented-out line mistakenly targeted url_list):
    #item_info.insert_one({'title': title, 'price': price, 'area': area})
    print({'title': title, 'price': price, 'area': area})

#get_links_from('http://hz.58.com/pbdn/',7)
#get_item_info('http://zhuanzhuan.58.com/detail/840577950118920199z.shtml')

 

第三步:采用多進程的方式的main主函數入口

from multiprocessing import Pool
from channel_extract import channel_list
from page_parsing import get_links_from

def get_all_links_from(channel):
    """Scrape listing pages 1-30 of *channel*, storing every item URL."""
    for page in range(1, 31):
        get_links_from(channel, page)

if __name__ == '__main__':
    # Default Pool size = CPU count; each worker crawls whole channels,
    # one channel URL per task from the whitespace-separated list.
    pool = Pool()
    pool.map(get_all_links_from, channel_list.split())

 

第四步:實時對獲取到的數據進行監控

from time import sleep
from page_parsing import url_list

# Step 4: live progress monitor — print how many item URLs have been
# collected so far, refreshing every 5 seconds. Runs until interrupted.
while True:
    # count_documents({}) replaces Cursor.count(), which was deprecated in
    # PyMongo 3.7 and removed in PyMongo 4, so the original line crashes
    # on current drivers.
    print(url_list.count_documents({}))
    sleep(5)

 

 

具體運行效果:

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM