天貓商品數據爬取代碼分享
雖然程序很簡陋,但是寫它的過程讓我學到了一些新的技術,比如 openpyxl 庫的使用、Python 程序的打包、設置圖標等等,收穫還是很多的。
閒話不多說,直接上代碼:
#導入需要的庫
import re
import urllib.parse
import requests
from openpyxl import Workbook
from openpyxl.styles import Font,Alignment
import os
# Create the workbook and use the worksheet that Workbook() creates by default.
wb = Workbook()
ws = wb.active

# Header row: write each column title and style it (centred, 20 pt, bold).
for column_letter, title in zip(
        'ABCDE', ('商品名稱', '商品價格', '產地', '月成交量', '商品鏈接')):
    header_cell = ws[column_letter + '1']
    header_cell.value = title
    header_cell.alignment = Alignment(horizontal='center', vertical='center')
    header_cell.font = Font(size=20, bold=True)

# Ask the user for the product to search for, the number of result pages to
# download and the name of the output file.
print('----歡迎使用----')
keyword = input("請輸入你要查找的商品名稱:")
# Re-prompt until the page count is an integer inside the advertised 1~100
# range, instead of crashing with ValueError on a typo.
while True:
    try:
        frequency = int(input("請輸入你要下載的頁數(1~100):"))
    except ValueError:
        print("請輸入1~100之間的整數")
        continue
    if 1 <= frequency <= 100:
        break
    print("請輸入1~100之間的整數")
name = input("請輸入你要保存的文件名:") + '.xlsx'

# Baseline used by the crawl loop when deciding whether to widen a column.
width = 0

# Freeze the first row so the titles stay visible while scrolling.
ws.freeze_panes = 'A2'

# Fixed column widths for price, region and monthly sales; the name and link
# columns are sized while the rows are written.
for column_letter in ('B', 'C', 'D'):
    ws.column_dimensions[column_letter].width = 20

# Percent-encode the (typically Chinese) keyword so it can be embedded in the
# search URL.
keyword = urllib.parse.quote(keyword)
# Request headers sent with every HTTP request.
# The browser identification must be sent under the standard HTTP header name
# "User-Agent"; a key spelled "user_agent" is transmitted as an unrelated
# header and the library's default agent string is still sent.
# NOTE: the cookie below is a hard-coded login session. The script itself tells
# the user to replace it in the code when access fails, so expect to update it.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.10 Safari/537.36',
'cookie':'cna=dzhnFJcvPFYCAcplZsR6KtPL; hng=CN%7Czh-CN%7CCNY%7C156; lid=su%E3%80%81%E9%9F%A9; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; tk_trace=1; t=cf98821548a85be3261b5d3e02dfc50c; _tb_token_=53317e736e697; cookie2=171ec1fa788a97042140b1dc23ea8cbd; _m_h5_tk=ce5a4b969c5fee0ce43fbecb1c8b5698_1544366646609; _m_h5_tk_enc=08f79f8dd140168b3e95d192521140f3; x=__ll%3D-1%26_ato%3D0; whl=-1%260%260%260; uc1=cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=UIHiLt3xTIkz&cookie15=UtASsssmOIJ0bQ%3D%3D&existShop=false&pas=0&cookie14=UoTYMh2PRIzYCw%3D%3D&tag=8&lng=zh_CN; uc3=vt3=F8dByR1fSHC9rqvO0Hw%3D&id2=UUGk2VnYR7N9og%3D%3D&nk2=saDewT4jhA05glO9&lg2=V32FPkk%2Fw0dUvg%3D%3D; tracknick=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; _l_g_=Ug%3D%3D; ck1=""; unb=2967018108; lgc=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; cookie1=AiVdFlFBrPvLkxJuDQ%2FIWWWqMYV30iZYcqUsqvmxAjc%3D; login=true; cookie17=UUGk2VnYR7N9og%3D%3D; _nk_=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; uss=""; csg=da3a7af3; skt=22f8e8af802abead; enc=P0JAHDOULky9KTinsCWQ4Ib6YVG7q7qPW5KKCJd4YWKlwiYOGGRObgbMOWOpxn4w12VNH34hJK%2FVCxsPmDqs%2FQ%3D%3D; pnm_cku822=098%23E1hvc9vUvbpvUvCkvvvvvjiPR2FWljlUn2qw6jEUPmPZ1jrERFdO1jYUnLS9zjtUiQhvCvvvpZptvpvhvvCvpvGCvvpvvPMMvphvC9mvphvvvvyCvhQv7sg%2FjNpBKBh78BoxfXkXdiYso%2BLpjXe4Vc3Z0f06W3vOJ1kHsfUpeB6AxYjxRLwprj6OfwoKjd8rJm7g%2BfUz%2BsIIHYFpeiQa5javuphvmvvvpoX8LTuKkphvC9hvpyPw1byCvm9vvhCvvvvvvvvvBfIvvvjivvCVB9vv9LvvvhXVvvmCjvvvByOvvUhw; cq=ccp%3D0; swfstore=199766; isg=BG5uugvevlR2SM3ay3guf99Uv8ScezhtaUhC2pg36XEsew_VAP0keVBxNqcy_yqB'}
# Search URL pieces. The crawl loop builds each page URL as
# url1 + <result offset> + url2, so url1 ends at the "s=" query parameter and
# url2 carries the already percent-encoded keyword plus the fixed parameters.
url1 = "".join([
    "https://list.tmall.com/search_product.htm",
    "?spm=a220m.1000858.0.0.36105702i4oQH9",
    "&s=",
])
url2 = "".join([
    "&q=",
    keyword,
    "&sort=s&style=g&from=..pc_1_searchbutton&active=2&type=pc#J_Filte",
])
# Crawl every requested result page: pull product links, names, prices and
# monthly sales out of the listing HTML with regular expressions, visit each
# product page to read its region, then write one worksheet row per product.

# Extraction patterns, compiled once instead of on every page / product.
name_pattern = re.compile('target="_blank" title="(.*?)"', re.S)
url_pattern = re.compile('<div class="productImg-wrap">\n<a href="(.*?)" class="productImg" target="_blank" data-p="', re.S)
# NOTE(review): this pattern ends in the Traditional character 筆; confirm it
# matches the text the site actually serves.
sales_pattern = re.compile('<span>月成交 <em>(.*?)筆', re.S)
price_pattern = re.compile('<em title="(.*?)"><b>¥', re.S)
region_pattern = re.compile('name="region" value="(.*?)"', re.S)

# The listing is paged by result offset in steps of 60; the same step is used
# to give every page its own band of worksheet rows.
page_size = 60
# Seconds to wait for a response before giving up on a request, so a stalled
# connection cannot hang the whole run.
request_timeout = 10
# Longest product name / link seen so far; the name and link columns only ever
# grow to fit the longest value written.
longest_name = 0
longest_link = 0

for i in range(1, frequency + 1):
    # A failure on one page is reported and the crawl moves on to the next.
    try:
        print("----正在爬取第%d頁----" % i)
        url = url1 + str((i - 1) * page_size) + url2
        r = requests.get(url, headers=headers, timeout=request_timeout)
        names = name_pattern.findall(r.text)
        if not names:
            # Nothing usable came back: skip the page instead of reporting it
            # as crawled and written.
            print("訪問失敗,請更改代碼的里cookie,或者明天再使用")
            continue
        print('訪問成功')
        urls = url_pattern.findall(r.text)
        chengjiaoliangs = sales_pattern.findall(r.text)
        moneys = price_pattern.findall(r.text)

        # Visit each product page to read its region. Exactly one entry is
        # appended per product (empty string when the lookup fails) so that
        # wheres stays index-aligned with urls.
        wheres = []
        for x, href in enumerate(urls):
            region = ''
            try:
                w = requests.get('http:' + href, headers=headers,
                                 timeout=request_timeout)
                found = region_pattern.search(w.text)
                if found:
                    region = found.group(1)
            except Exception as er:
                # Best effort: report the problem and keep going.
                print(er)
            wheres.append(region)
            if region:
                print('第%d頁%d條商品信息爬取成功' % (i, x + 1))
        print("----第%d頁爬取成功----" % i)

        print("----第%d頁開始寫入----" % i)
        # Write one row per product, including the last one on the page.
        for y, href in enumerate(urls, start=1):
            # Row 1 holds the titles, so item y of page i lands on this row.
            row = (i - 1) * page_size + y + 1
            # A failure on one row is reported and the remaining rows are
            # still written.
            try:
                product_name = names[y - 1]
                link = 'http:' + href
                # Grow the name and link columns to fit the longest value so
                # far; names are doubled because they are mostly wide
                # characters.
                if len(product_name) > longest_name:
                    longest_name = len(product_name)
                    ws.column_dimensions['A'].width = 2 * longest_name
                if len(link) > longest_link:
                    longest_link = len(link)
                    ws.column_dimensions['E'].width = longest_link
                ws['A%d' % row] = product_name
                ws['B%d' % row] = moneys[y - 1] + '元'
                ws['C%d' % row] = wheres[y - 1]
                ws['D%d' % row] = chengjiaoliangs[y - 1] + '筆'
                ws['E%d' % row] = link
                print('----第%d頁第%d條寫入成功----' % (i, y))
            except Exception as e:
                print('----第%d頁第%d條寫入失敗----' % (i, y))
                print(e)
        print("----第%d頁寫入成功----" % i)
    except Exception as err:
        print(err)
# Output directory for the generated spreadsheets; created on first use.
path = './天貓數據爬取excel文件/'
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the gap between checking for it and creating it.
os.makedirs(path, exist_ok=True)
# Save the workbook under the file name the user chose.
wb.save(path + name)
print('----已經全部寫入----')
print('----感謝使用----')
# Keep the console window open until a key is pressed. 'pause' is a Windows
# shell command, so it is only run there.
if os.name == 'nt':
    os.system('pause')