【需求】輸入關鍵字,如書包,可以搜索出對應商品的信息,包括:商品標題、商品鏈接、價格範圍;且最終的商品信息需要符合:包郵、價格差不會超過某數值
#coding=utf-8
"""Search Taobao for a keyword and export the products that match.

For each of the first ``page`` search-result pages the script extracts the
title, detail link, listed price and shipping fee of every product, then
opens the detail page in PhantomJS to read the price range shown there.
Each product whose detail page could be read is written to
``taobao_test.txt``; products with free shipping (fee ``"0.00"``) whose
price spread is below ``price_interval_max`` are also written to
``tb_result.xls`` (a fresh copy of the template ``result.xls``).

Three settings may be adjusted: search_keyword, page, price_interval_max.

NOTE: this is a Python 2 script; it relies on ``sys.setdefaultencoding``.
"""
import re
import os
import requests
import sys
import time
import shutil

from selenium import webdriver
from lxml import etree
from xlrd import open_workbook
from xlutils.copy import copy

# Keyword to search for.
search_keyword = "戒指"
# Number of result pages to scan, e.g. 10 means the products on the first 10
# result pages. Taobao shows 44 products per page by default.
page = 10
# Largest acceptable difference between the highest and the lowest price.
price_interval_max = 1000

# Python 2 only: make implicit str <-> unicode conversions use UTF-8. The
# string concatenations below (byte-string literals + unicode page text)
# depend on this.
reload(sys)
sys.setdefaultencoding("utf-8")

time1 = time.time()

ITEMS_PER_PAGE = 44

# PhantomJS binary actually used, plus a working-directory alternative that
# can be passed as executable_path instead.
phantomjs_executable = 'D:/Python27/Scripts/phantomjs.exe'
phantomjs_path = os.path.join(os.getcwd(), "phantomjs.exe")

search_url = 'https://s.taobao.com/search'
# URL parameters, passed to requests as a dict; 's' is the result offset.
payload = {'q': search_keyword, 's': '1', 'ie': 'utf8'}

# Fields are pulled out of the search page with regular expressions.
_TITLE_RE = re.compile(r'"raw_title":"([^"]+)"', re.I)
_PRICE_RE = re.compile(r'"view_price":"([^"]+)"', re.I)
_URL_RE = re.compile(r'"detail_url":"([^"]+)"', re.I)
_FEE_RE = re.compile(r'"view_fee":"([^"]+)"', re.I)

excel_path_ori = os.path.join(os.getcwd(), "result.xls")
excel_path = os.path.join(os.getcwd(), "tb_result.xls")

sheetName = "Sheet1"
# Column indexes in the result sheet.
url_lineindex = 0
title_lineindex = 1
price_lineindex = 2
price_interval_lineindex = 3
interval_lineindex = 4
fee_lineindex = 5


def Write_Excel(rowIndex, lineIndex, content):
    """Write ``content`` into one cell of the result workbook and save it.

    The workbook at ``excel_path`` is read, copied, modified and saved on
    every call, so each cell is persisted as soon as it is written.

    - rowIndex: row index
    - lineIndex: column index
    - content: value to store in the cell
    """
    rowIndex = int(rowIndex)
    lineIndex = int(lineIndex)
    # Open the workbook once. xlrd's second positional parameter is the log
    # file, not an open mode, so no mode string is passed.
    rbook = open_workbook(excel_path)
    wb = copy(rbook)
    sheetIndex = rbook.sheet_names().index(sheetName)
    wb.get_sheet(sheetIndex).write(rowIndex, lineIndex, content)
    wb.save(excel_path)


def get_detail_price(url):
    """Return the price text nodes found on a product detail page.

    :param url: detail page URL; the XPath used depends on whether it
        contains "tmall" or "taobao"
    :return: list of text nodes (may be empty); an empty list is also
        returned for URLs that match neither site
    """
    driver.get(url)
    # Fixed one-second pause before reading the page source (presumably to
    # let the page finish rendering the price).
    time.sleep(1)
    selector = etree.HTML(driver.page_source)
    if "tmall" in url:
        return selector.xpath(
            '//div[@class="tm-promo-price"]/span[@class="tm-price"]/text()')
    if "taobao" in url:
        return selector.xpath('//em[@class="tb-rmb-num"]/text()')
    return []


def get_price_interval(price):
    """Return the price text and the spread between its two ends.

    Some products show a price range such as ``12.00-25.00``; for those the
    spread is ``end - start``. A single price has a spread of 0.

    :param price: list of price text nodes; only the first one is used
    :return: tuple ``(price_text, spread)``
    :raises ValueError: if ``price`` is empty or the range is not numeric
    """
    print(price)
    if not price:
        raise ValueError("no price text found on the detail page")
    price_interval = ''.join(price[0])
    if "-" in price_interval:
        start_price, end_price = price_interval.split("-")[:2]
        interval = float(end_price) - float(start_price)
    else:
        interval = 0
    return price_interval, interval


def _normalize_detail_url(raw_url):
    """Undo the JSON escaping of '=' and '&' and add the missing scheme."""
    # Raw strings: the page text contains the literal six characters \u003d.
    cleaned = raw_url.replace(r"\u003d", "=").replace(r"\u0026", "&")
    if cleaned.startswith("//"):
        cleaned = "https:" + cleaned
    return cleaned


def get_url_test():
    """Collect product data page by page and write the results.

    Gathers title, link, lowest price, price range, price spread and
    shipping fee for each product.

    :return: None
    """
    j = 0  # last row written in the Excel sheet; row 0 holds the headers
    headers = (
        (url_lineindex, u"商品鏈接"),
        (title_lineindex, u"商品標題"),
        (price_lineindex, u"最低價格"),
        (price_interval_lineindex, u"價格范圍"),
        (interval_lineindex, u"價格差"),
        (fee_lineindex, u"運費"),
    )
    for column, header in headers:
        Write_Excel(j, column, header)

    for k in range(page):
        # 's' is the result offset: 1 is the first page, 45 the second,
        # 89 the third, and so on.
        payload['s'] = ITEMS_PER_PAGE * k + 1
        text = requests.get(search_url, params=payload).text
        # zip() stops at the shortest list, so a field missing from the
        # page can no longer raise IndexError and abort the whole run.
        products = zip(_TITLE_RE.findall(text), _PRICE_RE.findall(text),
                       _URL_RE.findall(text), _FEE_RE.findall(text))
        for i, (title, price, raw_url, fee) in enumerate(products):
            print(i)
            print('商品標題:' + title)
            print('最低價格:' + price)
            print('運費:' + fee)
            # Build the product link.
            url = _normalize_detail_url(raw_url)
            print('商品鏈接:' + url)
            # Read the price range from the detail page.
            try:
                detail_price = get_detail_price(url)
                price_interval, interval = get_price_interval(detail_price)
                print('price_interval:' + price_interval)
                print('interval:' + str(interval))
                # Save the data to the text file.
                log_file.write(
                    str(k * ITEMS_PER_PAGE + i + 1) + '商品鏈接:' + url + '\n'
                    + '商品標題:' + title + '\n'
                    + '最低價格:' + price + '\n'
                    + '價格范圍:' + str(price_interval) + '\n'
                    + '價格差:' + str(interval) + '\n')
                # Write the products that pass the filter to the Excel sheet.
                if fee == "0.00" and interval < int(price_interval_max):
                    print("該商品符合要求:包郵,且最大價格與最小價格差小於%s"
                          % price_interval_max)
                    j = j + 1
                    Write_Excel(j, url_lineindex, url)
                    Write_Excel(j, title_lineindex, title)
                    Write_Excel(j, price_lineindex, price)
                    Write_Excel(j, price_interval_lineindex, price_interval)
                    Write_Excel(j, interval_lineindex, interval)
                    Write_Excel(j, fee_lineindex, fee)
            except Exception as error:
                # Best effort: skip this product, but let KeyboardInterrupt
                # and SystemExit propagate so the run can be stopped.
                print("該商品信息獲取失敗,跳過")
                print(repr(error))
                continue


# Start every run from a fresh copy of the template workbook.
if os.path.exists(excel_path):
    os.remove(excel_path)
shutil.copy(excel_path_ori, excel_path)

log_file = open('taobao_test.txt', 'w')
try:
    driver = webdriver.PhantomJS(executable_path=phantomjs_executable)
    try:
        get_url_test()
    finally:
        # Clean up: shut down the PhantomJS instance started above, even
        # when the crawl fails part-way.
        driver.quit()
finally:
    log_file.close()

time2 = time.time()
print(u'ok,結束!')
print(u'總共耗時:' + str((time2 - time1)/60) + '分鍾')
