Selenium+Chrome實現淘寶自動登錄和商品信息的爬取


思路

- 由於在未登錄的情況下進行搜索商品信息操作,頁面會自動跳轉到登錄界面,所以我們首先要解決自動登錄的問題。經過測試發現,通過微博登錄比較方便,所以我就通過微博登錄了;
- 登錄成功後,搜索相關的商品信息並存儲到MongoDB中。

代碼

```
# -*- coding: utf-8 -*-
"""
@Time : 2020/1/19 15:13
@Auth : peter
@File :food.py
@IDE :PyCharm
@Motto:ABC(Always Be Coding)
"""
import io
import sys

# Change the default encoding of standard output to GBK.
# errors='replace' keeps print() from raising UnicodeEncodeError when scraped
# text contains characters that GBK cannot represent.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gbk', errors='replace')
import time
import re
import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
from config import *
# MongoDB connection; MONG_URL and MONGO_DB come from config.py.
client = pymongo.MongoClient(MONG_URL)
db = client[MONGO_DB]

options = webdriver.ChromeOptions()
# Important: exclude the "enable-automation" switch so that websites are less
# likely to recognise the browser as being driven by Selenium.
options.add_experimental_option('excludeSwitches', ['enable-automation'])

# Image loading is left enabled because login() crops the captcha out of a
# page screenshot. To skip images on pages that do not need them, add:
#     options.add_argument("blink-settings=imagesEnabled=false")
# NOTE(review): this flag was active in the listing as published — confirm
# whether it was meant to be commented out.

# One shared driver for the whole script. It is created exactly once, with the
# options above; a second bare webdriver.Chrome() would discard the configured
# driver and leave its browser window running.
brower = webdriver.Chrome(options=options)

# 封裝一個函數,用來判斷元素是否存在

def isElementPresent(brower, by, value):
    """Report whether an element matching the locator exists on the page.

    Args:
        brower: object exposing ``find_element(by=..., value=...)`` (the
            Selenium driver in this script).
        by: locator strategy, e.g. ``By.CSS_SELECTOR``.
        value: locator expression.

    Returns:
        ``True`` when the lookup succeeds, ``False`` when it raises.
    """
    try:
        element = brower.find_element(by=by, value=value)
    except Exception:
        # The lookup raises (NoSuchElementException in Selenium) when nothing
        # matches; any lookup failure is treated as "not present".
        return False
    print('存在該元素', element.text)
    # Return a real boolean: an element with empty text (an <img>, for
    # example) exists too, and returning its text would make it look absent.
    return True

def _login_once():
    """Run a single pass of the Taobao login flow through Weibo.

    Returns the shared ``brower`` driver once the already-logged-in Weibo
    account has been confirmed or the login form has been submitted. Selenium
    timeouts and lookup errors propagate to the caller.
    """
    brower.get("https://login.taobao.com/member/login.jhtml?redirectURL=https%3A%2F%2Fwww.taobao.com%2F")
    # Switch from the QR-code panel to the password login panel.
    submit1 = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_QRCodeLogin > div.login-links > a.forget-pwd.J_Quick2Static")))
    submit1.click()
    # Choose "log in with Weibo".
    submit2 = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_OtherLogin > a.weibo-login")))
    submit2.click()

    # Weibo may already be logged in while the page has not redirected yet;
    # in that case only the existing account needs to be clicked.
    if isElementPresent(brower, By.CSS_SELECTOR, '#pl_login_logged > div.W_login_form.logged_status > div.logged_info > div.info_list > div:nth-child(1) > a > span'):
        submit_login = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_logged > div.W_login_form.logged_status > div.logged_info > div.info_list > div:nth-child(1) > a')))
        submit_login.click()
        return brower

    input_user = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.NAME, 'username')))
    input_pwd = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.NAME, "password")))
    username = input("請輸入微博賬號:")
    pwd = input("請輸入密碼:")
    input_user.send_keys(username)
    input_pwd.send_keys(pwd)
    time.sleep(2)

    # Check whether a captcha is required. find_elements() returns an empty
    # list when nothing matches, so existence is tested directly; testing the
    # <img>'s text would always be falsy because an image has no text.
    captcha_xpath = '//*[@id="pl_login_logged"]/div/div[4]/div/a[1]/img'
    if brower.find_elements(By.XPATH, captcha_xpath):
        brower.save_screenshot("1.png")
        img = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.XPATH, captcha_xpath)))
        time.sleep(2)
        location = img.location
        print(location)
        size = img.size
        print(size)
        # NOTE(review): these crop offsets are kept exactly as published; they
        # look empirically tuned (possibly for display scaling) — confirm.
        left = location['x'] + size['width']
        top = location['y'] + size['height']
        right = left + size['width'] + size['width'] + 40
        bottom = top + size['height'] + size['height']
        print(right, bottom)
        # Show the cropped captcha to the user; the context manager closes the
        # screenshot file afterwards.
        with Image.open('1.png') as page:
            image_obj = page.crop((left, top, right, bottom))
            image_obj.show()
        code = input('請輸入驗證碼:')
        input_code = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(4) > div > input')))
        input_code.send_keys(code)

    submit3 = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(7) > div:nth-child(1) > a')))
    submit3.click()
    time.sleep(2)
    return brower


def login():
    """Log in to Taobao via Weibo, retrying until one attempt goes through.

    Returns:
        The shared ``brower`` driver, also after one or more failed attempts.
    """
    # Retry in a loop rather than by recursion: the result of the successful
    # attempt is returned, and repeated failures cannot exhaust the call stack.
    while True:
        try:
            return _login_once()
        except Exception as e:
            print(e)

def search():
    """Prompt for a keyword, run the search and scrape the first result page.

    Returns:
        The text of the pager's "total" element, which main() parses for the
        page count.
    """
    # Retry in a loop rather than by recursion so the value of the successful
    # attempt reaches the caller; the recursive retry returned None.
    while True:
        try:
            input_key = WebDriverWait(brower, 7).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            submit = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            key = input("請輸入要搜索的商品:")
            # Clear the box first so a retry does not append to the old keyword.
            input_key.clear()
            input_key.send_keys(key)
            submit.click()
            total = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            total_text = total.text
            print(total_text)
            get_products()
            return total_text
        except Exception as e:
            print(e)

def next_page(page_number):
    """Jump to result page ``page_number`` and scrape it.

    Args:
        page_number: 1-based page index typed into the pager's jump box.

    On any failure the page is refreshed and the jump is retried.
    """
    # Retry in a loop rather than by recursion so a long run of failures
    # cannot exhaust the call stack.
    while True:
        try:
            # Named page_input so the builtin input() is not shadowed.
            page_input = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
            submit = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(str(page_number))
            time.sleep(2)
            submit.click()
            # Wait until the pager highlights the requested page number.
            WebDriverWait(brower, 10).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
            get_products()
            return
        except Exception as e:
            print('網絡出錯刷新一次', e)
            brower.refresh()

def get_products():
    """Scrape every product card on the current result page and store it.

    Waits for at least one product card to be present, parses the page source
    with PyQuery and passes one dict per card to save_to_mongo().
    """
    card_selector = '#mainsrp-itemlist .items .item'
    wait = WebDriverWait(brower, 10)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, card_selector)))

    page = pq(brower.page_source)
    for card in page(card_selector).items():
        record = {}
        # Image link
        record['image'] = card.find('.pic .img').attr('src')
        # Price
        record['price'] = card.find('.price').text()
        # Number of deals: the last three characters of the text are dropped
        # (presumably a unit suffix — verify against the live page).
        record['deal'] = card.find('.deal-cnt').text()[:-3]
        # Title
        record['title'] = card.find('.title').text()
        # Shop name
        record['shop'] = card.find('.shop').text()
        # Location
        record['location'] = card.find('.location').text()
        # Shop URL
        record['shop-addr'] = card.find('.shop .shopname').attr('href')
        save_to_mongo(record)

def save_to_mongo(result):
    """Store scraped data in the MongoDB collection named by MONG_TABLE.

    Args:
        result: a single document (dict) or a list of documents.

    Failures are reported on stdout and not raised, so one bad record does not
    stop the crawl.
    """
    try:
        collection = db[MONG_TABLE]
        if isinstance(result, dict):
            # get_products() passes one dict per product; insert_many() only
            # accepts a list of documents and raises on a dict.
            collection.insert_one(result)
        else:
            collection.insert_many(result)
        print('存儲到MONGODB成功', result)
    except Exception as e:
        print('存儲到MONGODB失敗', e, result)

def main():
    """Log in, run a search and crawl every result page.

    Raises:
        ValueError: if no page count can be parsed from the pager text.
    """
    try:
        login()
        total = search()
        print(total)
        # The pager text contains the page count as its first run of digits.
        match = re.search(r'(\d+)', total or '')
        if match is None:
            raise ValueError('could not parse page count from {!r}'.format(total))
        # Page 1 was already scraped by search().
        for page_number in range(2, int(match.group(1)) + 1):
            next_page(page_number)
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and finally makes sure this also runs on errors.
        brower.quit()

# Run the crawler only when the file is executed as a script. The double
# underscores are required: a bare `name` raises NameError.
if __name__ == "__main__":
    main()

<h3>配置文件</h3>
**config.py**

# MongoDB address
MONG_URL = 'localhost'

# Database name
MONGO_DB = 'taobao'

# Collection ("table") name
MONG_TABLE = 'kouzhao'


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM