A Simple Tmall Scraper


Sharing code for scraping Tmall product data

Although it is fairly crude, writing this program taught me some new techniques, such as using the openpyxl library, packaging a Python script into an executable, and setting a custom icon, so it was still a worthwhile exercise.
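The packaging and icon setup mentioned above is not shown in the code below, and the post does not say which tool was used. As a rough sketch only, assuming PyInstaller and an icon.ico file sitting next to the script (the script name tmall_spider.py is just a placeholder), a one-file build with a custom icon would look roughly like this:

pip install pyinstaller
pyinstaller -F -i icon.ico tmall_spider.py

Here -F bundles everything into a single executable and -i sets the executable's icon.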

Without further ado, here is the code.

# Import the required libraries
import re
import urllib.parse
import requests
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment
import os

# Create the workbook object
wb = Workbook()

# Use the default worksheet created with the Workbook object
ws = wb.active

# Header row
ws['A1'] = 'Product Name'
ws['B1'] = 'Price'
ws['C1'] = 'Origin'
ws['D1'] = 'Monthly Sales'
ws['E1'] = 'Product Link'

# Prompt the user for the search keyword, number of pages and output file name
print('---- Welcome ----')
keyword = input("Enter the product keyword to search for: ")
frequency = int(input("Enter the number of pages to download (1-100): "))
name = input("Enter the file name to save as: ") + '.xlsx'

# Widest product name / link seen so far, used to auto-fit the column widths
name_width = 0
url_width = 0

# Freeze the first row
ws.freeze_panes = 'A2'

# Center-align the header cells
ws['A1'].alignment = Alignment(horizontal='center', vertical='center')
ws['B1'].alignment = Alignment(horizontal='center', vertical='center')
ws['C1'].alignment = Alignment(horizontal='center', vertical='center')
ws['D1'].alignment = Alignment(horizontal='center', vertical='center')
ws['E1'].alignment = Alignment(horizontal='center', vertical='center')

# Set the header font to size 20, bold
ws['A1'].font = Font(size=20, bold=True)
ws['B1'].font = Font(size=20, bold=True)
ws['C1'].font = Font(size=20, bold=True)
ws['D1'].font = Font(size=20, bold=True)
ws['E1'].font = Font(size=20, bold=True)

# Set the column widths for the price, origin and monthly-sales columns
ws.column_dimensions['B'].width = 20
ws.column_dimensions['C'].width = 20
ws.column_dimensions['D'].width = 20

# URL-encode the Chinese keyword
keyword = urllib.parse.quote(keyword)

# Build the request headers (the cookie below is the author's session cookie; if requests start failing, replace it with your own)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.10 Safari/537.36',
        'cookie':'cna=dzhnFJcvPFYCAcplZsR6KtPL; hng=CN%7Czh-CN%7CCNY%7C156; lid=su%E3%80%81%E9%9F%A9; otherx=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0; tk_trace=1; t=cf98821548a85be3261b5d3e02dfc50c; _tb_token_=53317e736e697; cookie2=171ec1fa788a97042140b1dc23ea8cbd; _m_h5_tk=ce5a4b969c5fee0ce43fbecb1c8b5698_1544366646609; _m_h5_tk_enc=08f79f8dd140168b3e95d192521140f3; x=__ll%3D-1%26_ato%3D0; whl=-1%260%260%260; uc1=cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=UIHiLt3xTIkz&cookie15=UtASsssmOIJ0bQ%3D%3D&existShop=false&pas=0&cookie14=UoTYMh2PRIzYCw%3D%3D&tag=8&lng=zh_CN; uc3=vt3=F8dByR1fSHC9rqvO0Hw%3D&id2=UUGk2VnYR7N9og%3D%3D&nk2=saDewT4jhA05glO9&lg2=V32FPkk%2Fw0dUvg%3D%3D; tracknick=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; _l_g_=Ug%3D%3D; ck1=""; unb=2967018108; lgc=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; cookie1=AiVdFlFBrPvLkxJuDQ%2FIWWWqMYV30iZYcqUsqvmxAjc%3D; login=true; cookie17=UUGk2VnYR7N9og%3D%3D; _nk_=%5Cu4E00%5Cu5207%5Cu968F%5Cu7F18%5Cu4E28%5Cu4E36; uss=""; csg=da3a7af3; skt=22f8e8af802abead; enc=P0JAHDOULky9KTinsCWQ4Ib6YVG7q7qPW5KKCJd4YWKlwiYOGGRObgbMOWOpxn4w12VNH34hJK%2FVCxsPmDqs%2FQ%3D%3D; pnm_cku822=098%23E1hvc9vUvbpvUvCkvvvvvjiPR2FWljlUn2qw6jEUPmPZ1jrERFdO1jYUnLS9zjtUiQhvCvvvpZptvpvhvvCvpvGCvvpvvPMMvphvC9mvphvvvvyCvhQv7sg%2FjNpBKBh78BoxfXkXdiYso%2BLpjXe4Vc3Z0f06W3vOJ1kHsfUpeB6AxYjxRLwprj6OfwoKjd8rJm7g%2BfUz%2BsIIHYFpeiQa5javuphvmvvvpoX8LTuKkphvC9hvpyPw1byCvm9vvhCvvvvvvvvvBfIvvvjivvCVB9vv9LvvvhXVvvmCjvvvByOvvUhw; cq=ccp%3D0; swfstore=199766; isg=BG5uugvevlR2SM3ay3guf99Uv8ScezhtaUhC2pg36XEsew_VAP0keVBxNqcy_yqB'}

# Build the search URL (the "s" query parameter is the result offset; each page holds 60 products)
url1 = "https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.36105702i4oQH9&s="
url2 = "&q=" + keyword + "&sort=s&style=g&from=..pc_1_searchbutton&active=2&type=pc#J_Filte"

# Crawl each page in a loop and pull out the product link, name, price and sales volume with regular expressions
for i in range(1, frequency + 1):

    # Skip any exception raised while requesting the page and print the error message
    try:
        print("---- Crawling page %d ----" % i)
        url = url1 + str((i - 1) * 60) + url2
        r = requests.get(url, headers=headers)
        names = re.compile('target="_blank" title="(.*?)"', re.S).findall(r.text)
        if names:
            print('Request succeeded')
        else:
            print("Request failed, please update the cookie in the code or try again tomorrow")
        urls = re.compile('<div class="productImg-wrap">\n<a href="(.*?)" class="productImg" target="_blank" data-p="', re.S).findall(r.text)
        chengjiaoliangs = re.compile('<span>月成交 <em>(.*?)筆', re.S).findall(r.text)
        moneys = re.compile('<em title="(.*?)"><b>&yen', re.S).findall(r.text)

        # Visit each product page and scrape the place of origin
        wheres = []
        for x in range(len(urls)):

            # Skip any exception raised while fetching the product's region and print the error
            try:
                wurl = 'http:' + urls[x]
                w = requests.get(wurl, headers=headers)
                wheres.append((re.compile('name="region" value="(.*?)"', re.S).findall(w.text))[0])
                if wheres[x]:
                    print('Page %d, item %d scraped successfully' % (i, x + 1))
            except Exception as er:
                print(er)
                # Append a placeholder so the list stays aligned with the other fields
                wheres.append('')
        if names:
            print("---- Page %d crawled successfully ----" % i)
        print("---- Writing page %d ----" % i)

        # Write the results into the Excel sheet row by row
        for y in range(1, len(urls) + 1):

            # Skip any exception raised while writing to the sheet and print the error
            try:

                # Auto-fit the column widths of the product-name and product-link columns
                if 2 * len(names[y - 1]) > name_width:
                    name_width = 2 * len(names[y - 1])
                    ws.column_dimensions['A'].width = name_width
                if len(urls[y - 1]) > url_width:
                    url_width = len(urls[y - 1])
                    ws.column_dimensions['E'].width = url_width

                # Write the individual fields
                ws['A%d' % ((i - 1) * 60 + y + 1)] = names[y - 1]
                ws['B%d' % ((i - 1) * 60 + y + 1)] = moneys[y - 1] + ' yuan'
                ws['C%d' % ((i - 1) * 60 + y + 1)] = wheres[y - 1]
                ws['D%d' % ((i - 1) * 60 + y + 1)] = chengjiaoliangs[y - 1] + ' orders'
                ws['E%d' % ((i - 1) * 60 + y + 1)] = 'http:' + urls[y - 1]
                print('---- Page %d item %d written ----' % (i, y))
            except Exception as e:
                print('---- Page %d item %d failed to write ----' % (i, y))
        print("---- Page %d written ----" % i)
    except Exception as err:
        print(err)

# Create the output directory if it does not exist
path = './tmall_excel_data/'
if not os.path.exists(path):
    os.mkdir(path)

# Save the workbook to the chosen path
wb.save(path + name)

print('---- All data has been written ----')
print('---- Thanks for using this tool ----')
os.system('pause')

