python:爬蟲獲取淘寶/天貓的商品信息


【需求】輸入關鍵字,如書包,可以搜索出對應商品的信息,包括:商品標題、商品鏈接、價格範圍;且最終的商品信息需要符合:包郵、價格差不會超過某數值

 

#coding=utf-8
"""
The following three settings can be adjusted freely:
search_keyword, page, price_interval_max
"""
# Keyword to search for
search_keyword = "戒指"
# Number of result pages to process; e.g. 10 means the products on the first
# 10 pages of the Taobao search results (Taobao shows 44 products per page by default)
page = 10
# Largest acceptable difference between the highest and the lowest price
price_interval_max = 1000

import re, os, requests, sys, time, shutil
from selenium import webdriver
from lxml import etree
from xlrd import open_workbook
from xlutils.copy import copy
# Python 2-only workaround: makes UTF-8 the implicit codec used wherever byte
# strings and unicode strings are mixed further down in this script.
reload(sys)
sys.setdefaultencoding( "utf-8" )

# Start of the run; used to report the total elapsed time at the end.
time1 = time.time()

# Prefer a phantomjs.exe that sits in the working directory and fall back to
# the fixed install location otherwise. os.path.join supplies the separator
# that plain string concatenation was missing.
phantomjs_path = os.path.join(os.getcwd(), "phantomjs.exe")
if os.path.isfile(phantomjs_path):
    driver_executable = phantomjs_path
else:
    driver_executable = 'D:/Python27/Scripts/phantomjs.exe'
driver = webdriver.PhantomJS(executable_path=driver_executable)

search_url = 'https://s.taobao.com/search'
payload = {'q':search_keyword, 's':'1', 'ie':'utf8'}  # query-string parameters of the search request
payload1 = {'ie':'utf8'}

# Every run starts from a fresh copy of the template workbook. shutil.copy
# replaces an existing destination file, so no separate delete step is needed.
excel_path_ori = os.path.join(os.getcwd(), "result.xls")
excel_path = os.path.join(os.getcwd(), "tb_result.xls")
shutil.copy(excel_path_ori, excel_path)

# Plain-text log of every product that was processed. The name shadows the
# Python 2 builtin, but the crawl and teardown code refer to it by this name.
file = open('taobao_test.txt', 'w')

# Worksheet that receives the results, and the column used for each field.
sheetName = "Sheet1"
url_lineindex = 0
title_lineindex = 1
price_lineindex = 2
price_interval_lineindex = 3
interval_lineindex = 4
fee_lineindex = 5

def Write_Excel(rowIndex, lineIndex, content):
    """
    Write ``content`` into one cell of the result workbook and save the file.

    The workbook at ``excel_path`` is read with xlrd, turned into a writable
    copy with ``xlutils.copy.copy``, changed in the sheet named ``sheetName``
    and saved back to the same path.

    - rowIndex: row of the target cell (anything ``int()`` accepts)
    - lineIndex: column of the target cell (anything ``int()`` accepts)
    - content: value stored in the cell

    Raises ValueError if the workbook has no sheet named ``sheetName``.
    """
    row = int(rowIndex)
    column = int(lineIndex)
    # open_workbook has no file-mode parameter: its second positional argument
    # is the log stream, so only the path is passed. Reading the file once is
    # enough for both the sheet lookup and the writable copy.
    rbook = open_workbook(excel_path)
    sheet_index = rbook.sheet_names().index(sheetName)
    wb = copy(rbook)
    wb.get_sheet(sheet_index).write(row, column, content)
    wb.save(excel_path)

def get_detail_price(url):
    """
    Load a product detail page in the shared driver and extract its price text.

    :param url: detail-page URL. The XPath is chosen by substring: "tmall" is
                checked first, then "taobao".
    :return: list of the text nodes matched by the price XPath. The list is
             empty when nothing matched, or when the URL contains neither
             "tmall" nor "taobao" (in which case no page is loaded).
    """
    if "tmall" in url:
        price_xpath = '//div[@class="tm-promo-price"]/span[@class="tm-price"]/text()'
    elif "taobao" in url:
        price_xpath = '//em[@class="tb-rmb-num"]/text()'
    else:
        # Without this branch the result variable was never assigned and the
        # function died with UnboundLocalError.
        return []

    driver.get(url)
    # Fixed pause after navigation before the page source is read
    # (presumably to let script-rendered prices appear - TODO confirm).
    time.sleep(1)
    selector = etree.HTML(driver.page_source)
    return selector.xpath(price_xpath)

def get_price_interval(price):
    """
    Return the price text of a product and the spread between its bounds.

    Some products are priced as a range such as "12.00-25.00", others carry a
    single price.

    :param price: non-empty sequence of price strings; only the first entry
                  is used.
    :return: tuple ``(price_interval, interval)``: the first price string, and
             the upper bound minus the lower bound rounded to two decimals
             (``0`` when the string contains no "-").
    :raises IndexError: if ``price`` is empty.
    :raises ValueError: if a bound of the range is not a number.
    """
    # Call form with a single argument prints the same on Python 2 and 3.
    print(price)
    # Joining the characters of the first entry yields a plain string copy.
    price_interval = ''.join(price[0])
    if "-" not in price_interval:
        return price_interval, 0

    bounds = price_interval.split("-")
    # Prices carry two decimals; rounding strips binary floating-point noise
    # (19.90 - 9.90 would otherwise come out as 9.999999999999998).
    interval = round(float(bounds[1]) - float(bounds[0]), 2)
    return price_interval, interval

def get_url_test():
    """
    Crawl the search result pages and record the details of every product.

    For each of the first ``page`` result pages, the title, displayed price,
    shipping fee and detail URL of every product are pulled out of the search
    response with regular expressions. Each detail page is then opened to read
    the price range. Every product that could be read is appended to the text
    log ``file``; products whose fee is "0.00" and whose price spread is below
    ``price_interval_max`` are also written to the Excel sheet.

    A product whose detail page cannot be processed is reported and skipped.

    :return: None
    """
    columns = (url_lineindex, title_lineindex, price_lineindex,
               price_interval_lineindex, interval_lineindex, fee_lineindex)
    headers = (u"商品鏈接", u"商品標題", u"最低價格", u"價格范圍", u"價格差", u"運費")

    j = 0   # last Excel row written; row 0 holds the header
    for column, header in zip(columns, headers):
        Write_Excel(j, column, header)

    for k in range(0, page):
        # "s" selects the page: 1 is the first page, 45 the second, 89 the
        # third, and so on in steps of 44 products.
        payload['s'] = 44 * k + 1
        resp = requests.get(search_url, params=payload)
        title = re.findall(r'"raw_title":"([^"]+)"', resp.text, re.I)
        price = re.findall(r'"view_price":"([^"]+)"', resp.text, re.I)
        url = re.findall(r'"detail_url":"([^"]+)"', resp.text, re.I)
        fee = re.findall(r'"view_fee":"([^"]+)"', resp.text, re.I)

        # zip stops at the shortest list, so a field that matched fewer times
        # than the titles can no longer abort the whole run with IndexError.
        for i, (goods_title, goods_price, goods_fee, raw_url) in enumerate(
                zip(title, price, fee, url)):
            print(i)
            print('商品標題:' + goods_title)
            print('最低價格:' + goods_price)
            print('運費:' + goods_fee)
            # The response carries "=" and "&" as literal backslash-u escape
            # sequences; the doubled backslash matches that text on Python 2
            # and 3 alike. The URL is protocol-relative, hence the prefix.
            goods_url = "https:" + raw_url.replace("\\u003d", "=").replace("\\u0026", "&")
            print('商品鏈接:' + goods_url)
            try:
                # Read the price range from the detail page.
                detail_price = get_detail_price(goods_url)
                price_interval, interval = get_price_interval(detail_price)
                print('price_interval:' + price_interval)
                print('interval:' + str(interval))
                # Log every product that could be read.
                file.write(
                    str(k * 44 + i + 1) +
                    '商品鏈接:' + goods_url + '\n' +
                    '商品標題:' + goods_title + '\n' +
                    '最低價格:' + goods_price + '\n' +
                    '價格范圍:' + str(price_interval) + '\n' +
                    '價格差:' + str(interval) + '\n' )
                # Only products that pass the filter go into the spreadsheet.
                if goods_fee == "0.00" and interval < int(price_interval_max):
                    print("該商品符合要求:包郵,且最大價格與最小價格差小於%s" % price_interval_max)
                    j = j + 1
                    row = (goods_url, goods_title, goods_price,
                           price_interval, interval, goods_fee)
                    for column, value in zip(columns, row):
                        Write_Excel(j, column, value)
            except Exception:
                # Exception (not a bare except) keeps Ctrl-C and SystemExit
                # working while still skipping products that fail to load.
                print("該商品信息獲取失敗,跳過")


# Run the crawl, then release the log file and the browser even when the
# crawl aborts with an error.
try:
    get_url_test()
finally:
    file.close()
    try:
        # Let the driver shut down the browser process it started.
        driver.quit()
    finally:
        # Original safety net for a PhantomJS process that survived quit();
        # taskkill exists on Windows only.
        if os.name == "nt":
            os.system("taskkill /im phantomjs.exe")
time2 = time.time()
print(u'ok,結束!')
print(u'總共耗時:' + str((time2 - time1)/60) + '分鍾')

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM