介紹: 本次數據爬取只進行一些簡單數據的爬取,如商品標題、價格、圖片鏈接以及詳情頁中的銷量、評價和送的天貓積分,相信看過這個博客后的小伙伴,一定可以把功能更加完善。
一、淘寶登錄
有關登錄這部分的話,不做講解,想要知道的小伙伴可以參考我的另一篇博客Python爬蟲:Selenium和動作鏈實現淘寶模擬登錄,分析的很清楚。
二、准備
1.創建Scrapy的Taobao項目
# Create the Scrapy project, enter its directory, then generate the spider
# (spider name "taobao", restricted to the taobao.com domain).
scrapy startproject Taobao
cd Taobao
scrapy genspider taobao "taobao.com"

有了這個文件,整個scrapy項目可以直接右鍵start.py運行,不用到命令行輸入命令啟動。
start.py
from scrapy import cmdline cmdline.execute("scrapy crawl taobao".split())
2.更改setting配置文件

'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'


三、數據爬取、分析
分析以注釋形式存在。
#數據爬取、分析 def parse(self, response): #由於我的start_urls = ['https://s.taobao.com/search?q=java&s=0'],直接請求會被攔截需要登錄,此時的response格式為 <200 xxx.com> ,而xxx.com就是淘寶登錄的網址,把它提取出來就ok response = str(response).split(" ")[1].replace(">","") bro = self.login(response) #傳入登陸網址進行模擬登錄 # print(response.text) num = 0 for i in range(2): #進行多頁數據爬取 url = "https://s.taobao.com/search?q=java&s=" + str(num) #請求鏈接格式分析可參考上圖1 num += 44 bro.get(url) #get方式進行請求 html = bro.page_source soup = BeautifulSoup(html, 'lxml') #使用BeautifulSoup進行分析、爬取 data_list = soup.find_all(class_='item J_MouserOnverReq') #根據class拿到全部標簽,參考圖二 for data in data_list: #遍歷 data_soup = BeautifulSoup(str(data), 'lxml') # 圖片鏈接 #參考圖三,根據class找到圖片拿到其中的data-src屬性數據 #涉及到圖片懶加載問題,data-src時真正存放圖片地址的地方 img_url = "http:" + data_soup.find(class_='J_ItemPic img')['data-src'] # 圖片價格,根據標簽拿值,參考圖四 # 拿到標簽中的文本內容要在后面加上.string price = data_soup.find('strong').string # 圖片標題 # 參考圖五,根據class拿到img中的alt屬性 title = data_soup.find(class_='J_ItemPic img')['alt'] # 詳情頁 #參考圖六,根據class拿到data-href原因與拿data-src一樣 detail_url = "https:" + data_soup.find(class_="pic-link J_ClickStat J_ItemPicA")["data-href"] bro.get(detail_url) #請求詳情頁 time.sleep(1) html_second = bro.page_source soup = BeautifulSoup(html_second, 'lxml') #因為有的商品是有銷量、評價數量、積分的,但有的商品缺一個兩個的。 #由於find的特性,取不到值就會報異常,則我們使用try-except進行包裹,沒有值時賦值為0 #參考圖七 try: #月銷量 svolume = soup.find(class_="tm-ind-item tm-ind-sellCount").text.replace("月銷量", "") except: svolume = 0 try: #評價 evaluate = soup.find(class_="tm-ind-item tm-ind-reviewCount canClick tm-line3").text.replace("累計評價", "") except: evaluate = 0 try: #贈送的積分 integral = soup.find(class_="tm-ind-item tm-ind-emPointCount").text.replace("送天貓積分", "") except: integral = 0 item = TaobaoItem(img_url=img_url, price=price, title=title, svolume=svolume, evaluate=evaluate, integral=integral, detail_url=detail_url) yield item
圖一、

圖二、

圖三、

圖四、

圖五、

圖六、

圖七、

四、完整代碼
taobao.py
# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
import time
from PIL import Image
from selenium.webdriver import ActionChains
from bs4 import BeautifulSoup
from Taobao.items import TaobaoItem


class TaobaoSpider(scrapy.Spider):
    """Scrape Taobao search results for "java": title, price and image URL
    from the listing page, plus sales volume, review count and Tmall bonus
    points from each product's detail page."""

    name = 'taobao'
    # allowed_domains = ['xxx.com']
    start_urls = ['https://s.taobao.com/search?q=java&s=0']

    def login(self, url):
        """Open Chrome on the Taobao login page and switch to QR-code login.

        The user scans the QR code manually during the sleep; the logged-in
        webdriver is returned and reused for all subsequent page loads.
        """
        bro = webdriver.Chrome()
        bro.maximize_window()
        time.sleep(1)
        bro.get(url)
        time.sleep(1)
        # Switch to the QR-code tab; scanning happens manually.
        bro.find_element_by_class_name("icon-qrcode").click()
        time.sleep(3)
        # NOTE: a commented-out password + slider-captcha login variant was
        # removed here; the standalone script at the end of the article
        # demonstrates that approach in full.
        return bro

    def parse(self, response):
        """Crawl two result pages and yield one TaobaoItem per product.

        The first request is redirected to the login page, so ``response``
        prints as ``<200 <login url>>`` — the URL is pulled from that repr.
        """
        login_url = str(response).split(" ")[1].replace(">", "")
        bro = self.login(login_url)
        num = 0
        for i in range(2):
            # `s` paginates in steps of 44 items.
            url = "https://s.taobao.com/search?q=java&s=" + str(num)
            num += 44
            bro.get(url)
            html = bro.page_source
            soup = BeautifulSoup(html, 'lxml')
            data_list = soup.find_all(class_='item J_MouserOnverReq')
            for data in data_list:
                data_soup = BeautifulSoup(str(data), 'lxml')
                # Lazy-loaded image: the real URL is in `data-src`.
                img_url = "http:" + data_soup.find(class_='J_ItemPic img')['data-src']
                # Price text.
                price = data_soup.find('strong').string
                # Title from the img tag's `alt` attribute.
                title = data_soup.find(class_='J_ItemPic img')['alt']
                # Detail-page URL, also lazy-loaded (`data-href`).
                detail_url = "https:" + data_soup.find(class_="pic-link J_ClickStat J_ItemPicA")["data-href"]
                bro.get(detail_url)
                time.sleep(1)
                html_second = bro.page_source
                soup = BeautifulSoup(html_second, 'lxml')
                # Missing figures make `find` return None, so `.text` raises
                # AttributeError — default each field to 0 in that case.
                try:
                    svolume = soup.find(class_="tm-ind-item tm-ind-sellCount").text.replace("月銷量", "")
                except AttributeError:
                    svolume = 0
                try:
                    evaluate = soup.find(class_="tm-ind-item tm-ind-reviewCount canClick tm-line3").text.replace("累計評價", "")
                except AttributeError:
                    evaluate = 0
                try:
                    integral = soup.find(class_="tm-ind-item tm-ind-emPointCount").text.replace("送天貓積分", "")
                except AttributeError:
                    integral = 0
                # Package the cleaned data for the pipeline.
                item = TaobaoItem(img_url=img_url, price=price, title=title,
                                  svolume=svolume, evaluate=evaluate,
                                  integral=integral, detail_url=detail_url)
                yield item
items.py
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://docs.scrapy.org/en/latest/topics/items.html import scrapy class TaobaoItem(scrapy.Item): img_url = scrapy.Field() price = scrapy.Field() title = scrapy.Field() svolume = scrapy.Field() evaluate = scrapy.Field() integral = scrapy.Field() detail_url = scrapy.Field()
pipelines.py
保存數據到mysql

數據庫建表語句
CREATE TABLE `taobao` ( `id` int(11) NOT NULL AUTO_INCREMENT, `img_url` varchar(255) DEFAULT NULL, `title` varchar(255) DEFAULT NULL, `price` decimal(10,2) DEFAULT NULL, `svolume` varchar(255) DEFAULT NULL, `evaluate` varchar(255) DEFAULT NULL, `integral` varchar(255) DEFAULT NULL, `detail_url` varchar(255) DEFAULT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB AUTO_INCREMENT=7 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
# -*- coding: utf-8 -*- # Define your item pipelines here # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html import pymysql class TaobaoPipeline: def __init__(self): dbparams = { 'host': '127.0.0.1', 'port': 3306, 'user': '賬號', 'password': '密碼', 'database': '數據庫名', 'charset': 'utf8' } self.conn = pymysql.connect(**dbparams) self.cursor = self.conn.cursor() self._sql = None def process_item(self, item, spider): self.cursor.execute(self.sql,(item['img_url'],item['title'],item['price'], item['svolume'],item['evaluate'],item['integral'],item['detail_url'])) self.conn.commit() return item @property def sql(self): if not self._sql: self._sql = """ insert into taobao(id,img_url,title,price,svolume,evaluate,integral,detail_url) values(null ,%s,%s,%s,%s,%s,%s,%s) """ return self._sql return self._sql
附:登錄及滑塊驗證
一、滑塊驗證碼
from selenium import webdriver import time from PIL import Image from selenium.webdriver import ActionChains #初始 def main(): bro = webdriver.Chrome() bro.maximize_window() bro.get("https://login.taobao.com/member/login.jhtml") time.sleep(1) bro.find_element_by_name("fm-login-id").send_keys("淘寶賬號") time.sleep(1) bro.find_element_by_name("fm-login-password").send_keys("淘寶密碼") time.sleep(1) time.sleep(10) if __name__ == "__main__": main()
想要破解滑塊驗證碼其實不難,大體來說肯定是要找到滑塊驗證碼然后使用動作鏈進行點擊滑動,然后再點擊登錄就OK了。
那接下來開始代碼分析。
第一部分
#初始 def main(): bro = webdriver.Chrome() bro.maximize_window() bro.get("https://login.taobao.com/member/login.jhtml") #get方式請求淘寶登陸頁面 time.sleep(1) #休眠一秒,不要太快 bro.find_element_by_name("fm-login-id").send_keys("淘寶賬號") 根據name找到賬號input標簽 time.sleep(1) bro.find_element_by_name("fm-login-password").send_keys("淘寶密碼") 根據name找到密碼input標簽 time.sleep(1) GetImage(bro) #得到滑塊截圖
第二部分
#獲取 def GetImage(bro): # save_screenshot 就是將當前頁面進行截圖且保存 bro.save_screenshot('taobao.png') code_img_ele = bro.find_element_by_xpath("//*[@id='nc_1__scale_text']/span") #根據xpath語法找到滑塊驗證碼 Action(bro,code_img_ele) #執行
注意截圖時機,登錄頁面一開始加載后滑塊驗證碼並不會出現,等到賬號和密碼輸入后才會出現,所以截圖的時機要放在賬號和密碼輸入之后。
本來這個找到滑塊驗證碼我一開始是想用clss值找的,但class相同的有兩個,所以我選擇了根據xpath語法找。
第三部分
做到這里其實就沒多少了,接下來要做的就是要點擊滑塊並右移實現驗證。
#執行 def Action(bro,code_img_ele): # 動作鏈 action = ActionChains(bro) # 長按且點擊 action.click_and_hold(code_img_ele) # move_by_offset(x,y) x水平方向,y豎直方向 # perform()讓動作鏈立即執行 action.move_by_offset(300, 0).perform() #填寫300的原因可看下圖 time.sleep(0.5) # 釋放動作鏈 action.release() # 登錄 bro.find_element_by_xpath("//*[@id='login-form']/div[4]/button").click() #根據xpath語法找到登錄按鈕點擊登錄 time.sleep(10) bro.quit() #關閉瀏覽器
到這里就算完結了,也可以實現自動登錄功能了,還是有小小的成就感的!
完整代碼如下:
from selenium import webdriver import time from PIL import Image from selenium.webdriver import ActionChains #初始 def main(): bro = webdriver.Chrome() bro.maximize_window() bro.get("https://login.taobao.com/member/login.jhtml") time.sleep(1) bro.find_element_by_name("fm-login-id").send_keys("賬號") time.sleep(1) bro.find_element_by_name("fm-login-password").send_keys("密碼") time.sleep(1) GetImage(bro) #=================================================================================== #獲取 def GetImage(bro): # save_screenshot 就是將當前頁面進行截圖且保存 bro.save_screenshot('taobao.png') code_img_ele = bro.find_element_by_xpath("//*[@id='nc_1__scale_text']/span") Action(bro,code_img_ele) #=================================================================================== #執行 def Action(bro,code_img_ele): # 動作鏈 action = ActionChains(bro) # 長按且點擊 action.click_and_hold(code_img_ele) # move_by_offset(x,y) x水平方向,y豎直方向 # perform()讓動作鏈立即執行 action.move_by_offset(300, 0).perform() time.sleep(0.5) # 釋放動作鏈 action.release() # 登錄 bro.find_element_by_xpath("//*[@id='login-form']/div[4]/button").click() time.sleep(10) bro.quit() #關閉瀏覽器 if __name__ == "__main__": main()