python 簡單實現淘寶關鍵字商品爬取


 
         
本文有2個文件
1:taobao_re_xpath
2:taobao_re_xpath_setting

#
1:taobao_re_xpath

#
# -*- coding: utf-8 -*-
# author : yesehngbao
# time: 2018/3/20
"""Crawl Taobao keyword search results.

Saves listing metadata (seller address, shop nick, sales, detail URL) to
MongoDB and downloads each item's main images, colour/variant images and
video into local folders. All settings (keyword, page count, MongoDB
connection, folder names) come from ``taobao_re_xpath_setting``.
"""

import os
import re
import json
import hashlib
from multiprocessing import Pool

import requests
import pymongo
from lxml import etree

from taobao_re_xpath_setting import *

# Build the output folder tree declared in the settings module.
dirname = DIRNAME
if not os.path.exists(dirname):
    os.mkdir(dirname)
dirname1 = DIRNAME1
dirname2 = DIRNAME2
dirname3 = DIRNAME3
for _sub in (dirname1, dirname2, dirname3):
    _path = dirname + '/' + _sub
    if not os.path.exists(_path):
        os.mkdir(_path)

url = 'https://s.taobao.com/search'
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/64.0.3282.186 Safari/537.36"}


def md5(strs):
    """Return a salted MD5 hex digest of *strs*, used as a collision-safe file name."""
    strs = strs + '12sdwz..'
    return hashlib.md5(strs.encode('utf-8')).hexdigest()


def get_html(page):
    """Fetch one page of search results for the configured keyword.

    :param page: item offset (page index * 44), sent as the ``s`` query param
    :return: HTML text on HTTP 200, otherwise None
    """
    parmas = {
        'q': GOODS,
        's': page,
    }
    respons = requests.get(url, headers=headers, params=parmas)
    if respons.status_code == 200:
        return respons.text
    return None


def get_ajax_html():
    """Fetch the extra items that the list page loads via its AJAX endpoint.

    Yields one dict per item with keys ``addr``, ``nick``, ``sales``,
    ``detail``.
    """
    ajax_url = 'https://s.taobao.com/api'
    parmas = {
        # BUG FIX: this was the bare literal 1521612082036_312, which Python
        # 3.6+ parses as the int 1521612082036312 (PEP 515); the endpoint
        # expects the underscored "timestamp_counter" string form.
        '_ksTS': '1521612082036_312',
        'callback': 'jsonp267',
        'ajax': 'true',
        'm': 'customized',
        'q': GOODS,
        's': 36,
        'bcoffset': 0,
        'rn': '4e1dc906143376f8d2e735536fd3ee0c'
    }
    respons = requests.get(ajax_url, headers=headers, params=parmas).text
    # Unwrap the JSONP envelope: jsonpNNN( ...json... )
    comp = re.compile(r'jsonp\d+\((.*?)\)', re.S)
    strs = re.findall(comp, respons)
    if strs:
        strs = json.loads(strs[0])
        commdity_list = strs.get('API.CustomizedApi').get('itemlist').get('auctions')
        if commdity_list:
            for commdity in commdity_list:
                yield {
                    'addr': commdity.get('item_loc'),
                    'nick': commdity.get('nick'),
                    'sales': commdity.get('view_sales'),
                    'detail': 'http:' + commdity.get('detail_url'),
                }


def analysis(html):
    """Parse the item list embedded in the page's ``g_page_config`` JS blob.

    :param html: list-page source (may be None when the fetch failed)
    :return: list of item dicts, or None when nothing could be parsed
    """
    # BUG FIX: get_html returns None on non-200 responses; the original
    # passed that straight into re.findall and crashed.
    if not html:
        return None
    content = []
    comp = re.compile(r'g_page_config = (.*?)g_srp_loadCss', re.S)
    found = re.findall(comp, html)
    # BUG FIX: the original indexed findall(...)[0] unconditionally and its
    # "if comp:" guard tested the compiled pattern (always truthy), not the match.
    if not found:
        return None
    # Strip only the trailing ';' terminator instead of deleting every ';'
    # in the blob — a ';' may legitimately occur inside JSON string values.
    strs = json.loads(found[0].strip().rstrip(';'))
    data = strs.get('mods').get('itemlist').get('data').get('auctions')
    if not data:
        return None
    for i in data:
        detail = i.get('detail_url')
        if not re.match('http', detail):
            detail = 'http:' + detail
        content.append({
            'addr': i.get('item_loc'),
            'nick': i.get('nick'),
            'sales': i.get('view_sales'),
            'detail': detail,
        })
    if len(data) < 44:
        # Fewer than a full page in the static HTML: the remainder is
        # served by the AJAX endpoint.
        for item in get_ajax_html():
            content.append(item)
    else:
        print(len(data))
    return content


def save_img(img_new, page):
    """Download one main product image into DIRNAME1.

    :param img_new: absolute image URL
    :param page: 1-based page number, for log output only
    """
    if img_new:
        img_name = img_new[30:].replace('/', '-')
        respons = requests.get(img_new, headers=headers).content
        with open(dirname + '/' + dirname1 + '/' + md5(img_name) + '.jpg', 'wb') as fp:
            fp.write(respons)
        return print('第%s頁————' % page + '主圖片保存完成: ', img_name)


def save_color_img(color_url, page):
    """Download one colour/variant image into DIRNAME2.

    :param color_url: absolute image URL
    :param page: 1-based page number, for log output only
    """
    if color_url:
        img_name = color_url[30:].replace('/', '-')
        respons = requests.get(color_url, headers=headers).content
        with open(dirname + '/' + dirname2 + '/' + md5(img_name) + '.jpg', 'wb') as fp:
            fp.write(respons)
        return print('第%s頁————' % page + 'color圖片保存完成: ', img_name)


def save_video(detail_url, title, page):
    """Download the item video into DIRNAME3.

    :param detail_url: absolute video URL
    :param title: item title, hashed into the file name
    :param page: 1-based page number, for log output only
    """
    if detail_url:
        respons = requests.get(detail_url, headers=headers).content
        with open(dirname + '/' + dirname3 + '/' + md5(title) + '.mp4', 'wb') as fp:
            fp.write(respons)
        print('第%s頁————' % page + '視頻下載完成 :(' + title + ')')
        return 'download_ok'


def alaysis_detail(respons, page):
    """Extract main images, colour images and the video from a detail page.

    (Function name keeps the historical "alaysis" typo so external callers
    are not broken.)

    :param respons: detail-page HTML source
    :param page: item offset as passed to main(); converted to a page number
    """
    # Convert the item offset back to a 1-based page number for log output.
    if page == 0:
        page = 1
    else:
        page = page // 44 + 1
    doc = etree.HTML(respons)
    # Main thumbnails live in the product gallery list.
    li_list = doc.xpath('.//ul[@class="tb-clearfix" or @id="J_UlThumb"]/li')
    for li in li_list:
        img_old = li.xpath('./a/img/@src')
        if img_old:
            img_old = img_old[0]
            # Swap the 60x60 thumbnail suffix for the 400x400 variant.
            img_new = img_old[-15:].replace('60', '400')
            img_new = 'http:' + img_old[:-15] + img_new
            save_img(img_new, page)
    # Colour / variant thumbnails are referenced from inline CSS backgrounds.
    compi = re.compile(r'style="background:url\((.*?)\)')
    for color in re.findall(compi, respons):
        if color:
            color_url = color[-15:].replace('40', '400').replace('30', '400')
            color_url = 'http:' + color[:-15] + color_url
            save_color_img(color_url, page)
    # Item video, if the page embeds a TShop.Setup({...}) JSON config.
    comp = re.compile(r'TShop.Setup\(\s(.*?)\s\)', re.S)
    strs = re.findall(comp, respons)
    if not strs:
        return '無視頻(或有加密)'
    strs = json.loads(strs[0])
    video_lod = strs.get('itemDO').get('imgVedioUrl')
    if not video_lod:
        return '無視頻'
    video_new = video_lod.replace('e/1', 'e/6').replace('t/8', 't/1')
    title = strs.get('itemDO').get('title')
    save_video('http:' + video_new, title, page)
    return '下載完成'


def get_detail(content, page):
    """Fetch every item's detail page and hand it to alaysis_detail.

    :param content: list of item dicts from analysis() (may be None)
    :param page: item offset, forwarded for log output
    """
    if content:
        for cont in content:
            detail_url = cont.get('detail')
            respons = requests.get(detail_url, headers=headers)
            if respons.status_code == 200:
                alaysis_detail(respons.text, page)
            else:
                print(respons.status_code)
    return '詳情頁爬取完成'


def save_mongo(content):
    """Insert the list-page records into MongoDB.

    :param content: list of item dicts (may be None/empty when parsing failed)
    """
    # BUG FIX: the original called insert() on None/[] and crashed; it also
    # used Collection.insert, which was removed in PyMongo 4.
    if not content:
        return None
    mongo_client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
    db = mongo_client[MONGO_DB]
    coll = db[MONGO_COLL]
    coll.insert_many(content)
    print('數據保存成功: ', content, len(content))
    return None


def main(page):
    """Crawl one result page: list data -> MongoDB, then detail-page media.

    :param page: item offset (page index * 44)
    """
    html = get_html(page)
    content = analysis(html)
    save_mongo(content)
    get_detail(content, page)


if __name__ == '__main__':
    pool = Pool()
    pool.map(main, [page * 44 for page in range(NUM)])
    print('程序結束')





 
          
#
 
          
2:taobao_re_xpath_setting

#
# -*- coding: utf-8 -*-
# author : yesehngbao
# time: 2018/3/21
"""Configuration constants for the Taobao keyword crawler."""

# Keyword (product) to crawl for.
GOODS = '皮鞋'

# Number of result pages to fetch (must not exceed 100).
NUM = 100

# MongoDB connection settings.
MONGO_HOST = 'localhost'
MONGO_PORT = 27017
MONGO_DB = 'test'
MONGO_COLL = 'shoe'

# Root output folder.
DIRNAME = 'taobao'
# Sub-folder for main product images.
DIRNAME1 = 'shoe_park_img'
# Sub-folder for colour (variant) images.
DIRNAME2 = 'shoe_color_img'
# Sub-folder for item videos.
DIRNAME3 = 'shoe_video'

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM