--***Tested and working as of 2019-03-27***----
Step 1:
Open cmd and run scrapy startproject taobao_s to create a new project.
Then cd into the project folder and run scrapy genspider taobao www.taobao.com to create a spider.
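In full, the two commands (with the cd in between) look like this:

scrapy startproject taobao_s
cd taobao_s
scrapy genspider taobao www.taobao.com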
The spider file ends up looking like this. tools is a utility module I wrote; it holds a data-cleaning function and the Selenium login function.
import time
import random

import scrapy
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from taobao_s.items import TaobaoSItem
from taobao_s.tools import data_cleaning, register


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    # allowed_domains = ['www.taobao.com']
    base_url = ['https://s.taobao.com/search?q=']
    pages = 100
    re_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'referer': 'https://www.taobao.com/',
        'accept-encoding': 'gzip, deflate, br',
    }
    i = 1

    def start_requests(self):
        keys = self.settings.get('KEYS')  # the search keyword to crawl
        self.browser, cookies = register()  # log in via Selenium; returns the browser and a cookies dict
        self.browser.get(self.base_url[0] + keys)  # open the Taobao product search page in the browser
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # run JS to scroll to the bottom
        url_i = self.browser.current_url  # remember the browser's current URL for error recovery
        html = self.browser.page_source  # grab the rendered page source
        yield scrapy.Request(url=self.base_url[0] + keys, headers=self.re_headers, cookies=cookies,
                             callback=self.parse, meta={'html': html, 'i': self.i, 'url': url_i})

    def parse(self, response):
        time.sleep(5)  # wait; adjust as needed
        html = response.meta.get('html')
        i = response.meta.get('i')
        url_i = response.meta.get('url')
        i += 1
        if i > 100:  # we only crawl 100 pages, so stop looping after that
            return
        try:
            soup = BeautifulSoup(html, 'html.parser')
            lists = soup.select('#mainsrp-itemlist > div > div > div > div')
            for goods in lists:  # parse out the data for each listing
                item = TaobaoSItem()
                url = goods.select('a[class="pic-link J_ClickStat J_ItemPicA"]')[0].attrs.get('href', '')
                name = goods.select("a[class='J_ClickStat']")[0].get_text().strip()
                name = data_cleaning(name)
                price = goods.select('div[class="price g_price g_price-highlight"] strong')[0].get_text()
                num = goods.select('div[class="deal-cnt"]')[0].get_text()
                shop_name = goods.select("a[class='shopname J_MouseEneterLeave J_ShopInfo']")[0].get_text().strip()
                shop_name = data_cleaning(shop_name)
                item['url'] = url
                item['name'] = name
                item['price'] = price
                item['num'] = num
                item['shop_name'] = shop_name
                yield item
            # From page 2 onward there are two elements with this class, "previous page"
            # and "next page", so take the last one, which is "next page".
            button = self.browser.find_elements(By.XPATH, '//a[@class="J_Ajax num icon-tag"]')[-1]
            button.click()  # click through to the next page
            time.sleep(random.random() * 2)
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll down again
            html = self.browser.page_source
            yield scrapy.Request(url=response.url, headers=self.re_headers, callback=self.parse,
                                 meta={'html': html, 'i': i, 'url': url_i}, dont_filter=True)
        except Exception as e:  # if Taobao catches us, log in again and resume from the saved URL
            time.sleep(10)
            print(e)
            self.browser.close()
            self.browser, cookies = register()
            self.browser.get(url=url_i)
            time.sleep(random.random() * 2)
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            html = self.browser.page_source
            yield scrapy.Request(url=response.url, headers=self.re_headers, callback=self.parse,
                                 meta={'html': html, 'i': i, 'url': url_i}, dont_filter=True)

    def close(spider, reason):  # runs when the spider finishes; shuts down the browser process we opened
        spider.browser.close()
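The post doesn't show items.py, but the spider above fills exactly five fields, so a matching item definition would look roughly like this (a minimal sketch reconstructed from those fields, not the repo's actual file):

import scrapy


class TaobaoSItem(scrapy.Item):
    url = scrapy.Field()        # product page link
    name = scrapy.Field()       # product title
    price = scrapy.Field()      # listed price
    num = scrapy.Field()        # deal count
    shop_name = scrapy.Field()  # shop name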
Here is tools:
import re
import time
import random

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# USER and PASSWORD are assumed to live in settings.py (the original post
# doesn't show where they are defined; see the settings sketch below)
from taobao_s.settings import USER, PASSWORD


def data_cleaning(data):  # clean up the scraped text
    if ' ' in data:
        data = re.sub(' ', '', data)
    if "'" in data:
        data = re.sub("'", '', data)
    if r'\n' in data:
        data = re.sub(r'\\n', '', data)
    return data


def register():  # the login function
    while True:  # Taobao can detect Selenium, so login sometimes fails and we retry
        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')  # headless browser
        browser = webdriver.Firefox(firefox_options=options)  # selenium 3 keyword; newer Selenium uses options=
        # browser = webdriver.Firefox()
        browser.get('https://login.taobao.com/member/login.jhtml')  # open the login page
        try:
            # the login page sometimes defaults to QR-code login,
            # so we may need to click to switch to password login
            switch = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'forget-pwd.J_Quick2Static')))
            switch.click()
        except Exception as e:  # sometimes the page opens on password login already, so no click is needed
            print(e)
        user = browser.find_element(By.ID, 'TPL_username_1')  # username input box
        password = browser.find_element(By.ID, 'TPL_password_1')  # password input box
        user.send_keys(USER)  # type the username, then pause briefly
        time.sleep(random.random() * 2)
        password.send_keys(PASSWORD)  # type the password, then pause briefly
        time.sleep(random.random() * 1)
        # Taobao detects Selenium mainly through navigator.webdriver, which reports True
        # in a Selenium-driven browser, so overriding it to false gets past the check
        browser.execute_script("Object.defineProperties(navigator,{webdriver:{get:() => false}})")
        action = ActionChains(browser)
        time.sleep(random.random() * 1)
        butt = browser.find_element(By.ID, 'nc_1_n1z')
        browser.switch_to.frame(browser.find_element(By.ID, '_oid_ifr_'))
        browser.switch_to.default_content()
        action.click_and_hold(butt).perform()
        action.reset_actions()
        action.move_by_offset(285, 0).perform()  # drag the slider CAPTCHA that appears after entering credentials
        time.sleep(random.random() * 1)
        button = browser.find_element(By.ID, 'J_SubmitStatic')  # login button
        time.sleep(random.random() * 2)
        button.click()
        time.sleep(random.random() * 2)
        # browser.get('https://www.taobao.com/')
        # Grab the cookies. The original plan was to use Selenium only for login and do the
        # rest with plain Scrapy, but I couldn't work out the JS on the product search page
        # and ran out of time, so I didn't write that part.
        cookie = browser.get_cookies()
        cookies = {}  # Scrapy expects cookies as a dict
        for cookiez in cookie:
            name = cookiez['name']
            value = cookiez['value']
            cookies[name] = value
        if len(cookies) > 10:
            break
        else:
            browser.close()
    return browser, cookies
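start_requests() reads KEYS from the Scrapy settings, and register() needs USER and PASSWORD. The post doesn't show where these are defined, so here is one plausible settings.py arrangement; the keyword value and the credential placeholders are made-up examples:

# settings.py -- sketch; this file is not shown in the original post
BOT_NAME = 'taobao_s'

KEYS = '女裝'  # search keyword read by start_requests() (example value)
USER = 'your_taobao_account'  # credentials used by tools.register() (placeholders)
PASSWORD = 'your_password'

ROBOTSTXT_OBEY = False  # Taobao's robots.txt disallows the search pages

ITEM_PIPELINES = {
    'taobao_s.pipelines.TaobaoSPipeline': 300,  # enable the txt pipeline below
}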
Next, saving the data:
class TaobaoSPipeline(object):
    def open_spider(self, spider):  # runs when Scrapy starts; open (or create) a txt file in the current directory
        self.f = open('淘寶店鋪數據.txt', 'w')  # "Taobao shop data.txt"

    def process_item(self, item, spider):  # save each item as a dict; swap in a database or CSV if you prefer
        data = {}
        data['url'] = item['url']
        data['name'] = item['name']
        data['price'] = item['price']
        data['num'] = item['num']
        data['shop_name'] = item['shop_name']
        self.f.write(str(data) + '\n')
        return item

    def close_spider(self, spider):  # runs when Scrapy finishes; close the file
        self.f.close()
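With the pipeline registered in ITEM_PIPELINES as sketched above, the crawl is started from the project root with:

scrapy crawl taobao

The scraped items then land in 淘寶店鋪數據.txt in the current directory.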
Code: https://github.com/18370652038/taobao.git