本博文主要是對我之前的一篇博文(https://www.cnblogs.com/tszr/p/12193744.html ,爬取某一大型電商網站的商品數據)中的代碼進行優化和整理。
代碼優化可以提高代碼的可讀性。
import json
import time
import urllib.parse
import urllib.request

import pymongo
import requests

# MongoDB database and collection that receive the crawled records.
client = pymongo.MongoClient('localhost', 27017)
book_qunar = client['qunarr']
sheet_qunar_zyx = book_qunar['qunar_zyxx']

# Seconds to pause after every HTTP request (throttling).
REQUEST_DELAY = 3
# Seconds to wait for a response; without it a stalled connection would
# block the crawl forever.
REQUEST_TIMEOUT = 30
# Step between successive ``limit`` offsets of the product list endpoint.
PAGE_SIZE = 24

DEP_CITIES_URL = 'https://touch.dujia.qunar.com/depCities.qunar'
ARRIVE_URL = (
    'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend'
    '?dep={}&exclude=&extensionImg=255,175'
)
# Placeholders, in order: departure, query, original query, offset, page size.
LIST_URL = (
    'https://touch.dujia.qunar.com/list'
    '?modules=list%2CbookingInfo%2CactivityDetail&dep={}&query={}'
    '&dappDealTrace=true'
    '&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true'
    '&originalquery={}&limit={},{}&includeAD=true&qsact=search'
)


def get_json(url):
    """Request *url*, pause for ``REQUEST_DELAY`` seconds, return the JSON body.

    Every HTTP call in this script goes through this function so that the
    throttling delay and the timeout are applied uniformly.

    Raises whatever ``requests.get`` raises (including a timeout) and
    whatever ``Response.json`` raises when the body is not valid JSON.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    time.sleep(REQUEST_DELAY)
    return response.json()


def _list_url(dep, item, limit):
    """Build the product-list URL for one departure/destination/offset."""
    quoted_item = urllib.parse.quote(item)
    return LIST_URL.format(
        urllib.parse.quote(dep), quoted_item, quoted_item, limit, PAGE_SIZE
    )


def get_list(dep, item):
    """Store every product-list page for departure *dep* and destination *item*.

    The first page is requested to read ``data.limit.routeCount``; the pages
    at offsets ``0, 24, 48, ...`` below that count are then each inserted
    into ``sheet_qunar_zyx`` as one document. Nothing is inserted when the
    count is 0.
    """
    first_page = get_json(_list_url(dep, item, 0))
    # Number of products available for the current destination.
    route_count = int(first_page['data']['limit']['routeCount'])

    for limit in range(0, route_count, PAGE_SIZE):
        # Offset 0 was already downloaded above; reuse it instead of
        # requesting the same URL a second time.
        if limit == 0:
            page = first_page
        else:
            page = get_json(_list_url(dep, item, limit))

        # One document per page of results.
        result = {
            'date': time.strftime('%Y-%m-%d'),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': page,
        }
        sheet_qunar_zyx.insert_one(result)


def _unique_destinations(arrive_dict):
    """Return the ``query`` values found in *arrive_dict*, first-seen order.

    Walks ``data -> subModules -> items`` and drops repeated queries so each
    destination is crawled once per departure city.
    """
    seen = set()
    destinations = []
    for module in arrive_dict['data']:
        for sub_module in module['subModules']:
            for entry in sub_module['items']:
                query = entry['query']
                if query not in seen:
                    seen.add(query)
                    destinations.append(query)
    return destinations


def crawl():
    """Crawl the product lists of every departure/destination pair."""
    dep_dict = get_json(DEP_CITIES_URL)
    for deps in dep_dict['data'].values():
        for dep in deps:
            arrive_dict = get_json(ARRIVE_URL.format(urllib.parse.quote(dep)))
            # Take the destinations of this departure city one by one.
            for item in _unique_destinations(arrive_dict):
                get_list(dep, item)


def monitor(interval=10):
    """Print the number of stored documents every *interval* seconds, forever."""
    while True:
        # Cursor.count() was removed in PyMongo 4; count_documents({})
        # counts every document in the collection.
        print(sheet_qunar_zyx.count_documents({}))
        time.sleep(interval)


if __name__ == '__main__':
    crawl()
    # Periodic progress report. Kept under the __main__ guard so that
    # importing this module from another program does not start the loop.
    monitor()
# Monitoring program: periodically print how many documents have been stored.
# Put this in a separate (new) file and run it there; remember to import the
# module that defines sheet_qunar_zyx (and the time module).
while True:
    # Cursor.count() was removed in PyMongo 4; count_documents({}) counts
    # every document in the collection.
    print(sheet_qunar_zyx.count_documents({}))
    time.sleep(10)