思路
- 由於在未登錄的情況下進行搜索商品信息操作時,頁面會自動跳轉到登錄界面,所以我們首先要解決自動登錄的問題。經過測試發現,通過微博登錄比較方便,所以我就通過微博登錄了;
- 登錄成功後,搜索相關的商品信息並存儲到MongoDB中。

代碼
```python
# -*- coding: utf-8 -*-
"""
@Time : 2020/1/19 15:13
@Auth : peter
@File : food.py
@IDE : PyCharm
@Motto: ABC(Always Be Coding)
"""
import io
import sys
# 改變標准輸出的默認編碼
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gbk')
import time
import re
import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from PIL import Image
from config import *
# MongoDB connection and target database; MONG_URL and MONGO_DB come from config.py.
client = pymongo.MongoClient(MONG_URL)
db = client[MONGO_DB]

options = webdriver.ChromeOptions()
# Drop Chrome's "enable-automation" switch. The original author notes this step
# is important to keep websites from detecting that Selenium is in use.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_argument("blink-settings=imagesEnabled=false")  # do not load images

# Create exactly one driver, using the options configured above. A second
# unconfigured webdriver.Chrome() call used to overwrite this one, which
# discarded the options and left the first browser process running.
brower = webdriver.Chrome(options=options)
# Helper that reports whether an element exists on the page.
def isElementPresent(brower, by, value):
    """Report whether an element can be found on the current page.

    Args:
        brower: Object exposing ``find_element(by=..., value=...)``.
        by: Locator strategy, forwarded to ``find_element``.
        value: Locator expression, forwarded to ``find_element``.

    Returns:
        ``False`` when the lookup raises. Otherwise a truthy value: the
        element's text when it is non-empty, else ``True``. Returning the raw
        text alone made text-less elements (for example ``<img>``) look
        absent, because an empty string is falsy.
    """
    try:
        element = brower.find_element(by=by, value=value)
    except Exception:
        # The lookup failed (the original comment names NoSuchElementException):
        # treat the element as not present.
        return False
    text = element.text
    print('存在該元素', text)
    # Keep returning the text for callers that use it, but never a falsy value
    # for an element that was actually found.
    return text or True
def login():
    """Log in to Taobao through the Weibo third-party login.

    Opens the Taobao login page, clicks the password-login link and then the
    Weibo login link. If the Weibo page already shows a logged-in account, its
    link is clicked. Otherwise the Weibo account and password are read from the
    console and typed into the form, a captcha is handled when one is shown,
    and the form is submitted.

    Any exception is printed and the whole flow is started again.

    Returns:
        The module-level ``brower`` driver once the login steps have run.
    """
    logged_in_text = '#pl_login_logged > div.W_login_form.logged_status > div.logged_info > div.info_list > div:nth-child(1) > a > span'
    logged_in_link = '#pl_login_logged > div.W_login_form.logged_status > div.logged_info > div.info_list > div:nth-child(1) > a'
    captcha_xpath = '//*[@id="pl_login_logged"]/div/div[4]/div/a[1]/img'

    # Retry in a loop rather than by recursion so repeated failures do not grow
    # the call stack and the successful attempt's result reaches the caller.
    while True:
        try:
            brower.get("https://login.taobao.com/member/login.jhtml?redirectURL=https%3A%2F%2Fwww.taobao.com%2F")
            # Click "password login".
            submit1 = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_QRCodeLogin > div.login-links > a.forget-pwd.J_Quick2Static")))
            submit1.click()
            # Click "Weibo login".
            submit2 = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_OtherLogin > a.weibo-login")))
            submit2.click()

            # Handles the case where Weibo is already logged in but the page
            # has not redirected yet.
            if isElementPresent(brower, By.CSS_SELECTOR, logged_in_text):
                submit_login = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, logged_in_link)))
                submit_login.click()
                return brower

            input_user = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.NAME, 'username')))
            input_pwd = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.NAME, "password")))
            username = input("請輸入微博賬號:")
            pwd = input("請輸入密碼:")
            input_user.send_keys(username)
            input_pwd.send_keys(pwd)
            time.sleep(2)

            # Is a captcha required? isElementPresent may return the element's
            # text, which is empty for an <img>; compare against False so a
            # present image is not mistaken for a missing one.
            if isElementPresent(brower, By.XPATH, captcha_xpath) is not False:
                _enter_captcha(captcha_xpath)

            # NOTE(review): indentation was lost in the source. The submit click
            # is placed outside the captcha branch so the form is submitted
            # whether or not a captcha was shown -- confirm.
            submit3 = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(7) > div:nth-child(1) > a')))
            submit3.click()
            time.sleep(2)
            return brower
        except Exception as e:
            print(e)


def _enter_captcha(captcha_xpath):
    """Show the captcha to the user and type their answer into the login form."""
    brower.save_screenshot("1.png")
    img = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.XPATH, captcha_xpath)))
    time.sleep(2)
    location = img.location
    print(location)
    size = img.size
    print(size)
    # NOTE(review): the crop box is shifted by the element's own size and
    # enlarged; presumably this compensates for display scaling of the
    # screenshot -- confirm on the target machine.
    left = location['x'] + size['width']
    top = location['y'] + size['height']
    right = left + size['width'] + size['width'] + 40
    bottom = top + size['height'] + size['height']
    print(right, bottom)
    # Close the screenshot file once the cropped region has been displayed.
    with Image.open('1.png') as page:
        image_obj = page.crop((left, top, right, bottom))
        image_obj.show()
    code = input('請輸入驗證碼:')
    input_code = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#pl_login_logged > div > div:nth-child(4) > div > input')))
    input_code.send_keys(code)
def search():
    """Search Taobao for a keyword read from the console.

    Types the keyword into the search box, submits the form, waits for the
    pager's total-pages element, prints its text, and stores the first result
    page through ``get_products()``.

    Any exception is printed and the search is attempted again.

    Returns:
        The text of the pager's "total" element.
    """
    # Retry in a loop so the successful attempt's value is returned to the
    # caller. The recursive retry dropped it and returned None.
    while True:
        try:
            input_key = WebDriverWait(brower, 7).until(EC.presence_of_element_located((By.CSS_SELECTOR, "#q")))
            submit = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")))
            key = input("請輸入要搜索的商品:")
            # Clear first so a retry does not append to text already in the box.
            input_key.clear()
            input_key.send_keys(key)
            submit.click()
            total = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')))
            total_text = total.text
            print(total_text)
            get_products()
            return total_text
        except Exception as e:
            print(e)
def next_page(page_number):
    """Jump to a result page through the pager form and store its products.

    Types ``page_number`` into the pager's input, submits it, waits until the
    active pager item shows that number, then calls ``get_products()``.

    On any exception the error is printed, the page is refreshed, and the same
    page is attempted again.

    Args:
        page_number: Page to open.
    """
    # Retry in a loop rather than by recursion so repeated failures do not
    # grow the call stack.
    while True:
        try:
            # Named page_input so the builtin input() is not shadowed.
            page_input = WebDriverWait(brower, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')))
            submit = WebDriverWait(brower, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')))
            page_input.clear()
            page_input.send_keys(page_number)
            time.sleep(2)
            submit.click()
            WebDriverWait(brower, 10).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > ul > li.item.active > span'), str(page_number)))
            get_products()
            return
        except Exception as e:
            print('網絡出錯刷新一次', e)
            brower.refresh()
def get_products():
    """Read every product on the current result page and store each one.

    Waits until at least one result item is present, parses the page source
    with PyQuery, and passes one dict per item to ``save_to_mongo``.
    """
    item_selector = '#mainsrp-itemlist .items .item'
    WebDriverWait(brower, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, item_selector))
    )
    listing = pq(brower.page_source)
    for entry in listing(item_selector).items():
        record = {}
        record['image'] = entry.find('.pic .img').attr('src')  # image link
        record['price'] = entry.find('.price').text()  # price
        # Deal count: the last three characters of the text are dropped.
        record['deal'] = entry.find('.deal-cnt').text()[:-3]
        record['title'] = entry.find('.title').text()  # title
        record['shop'] = entry.find('.shop').text()  # text of the .shop element
        record['location'] = entry.find('.location').text()  # location
        record['shop-addr'] = entry.find('.shop .shopname').attr('href')  # shop link
        save_to_mongo(record)
def save_to_mongo(result):
    """Store one product document in the MongoDB collection ``MONG_TABLE``.

    Failures are printed and not raised, so one bad record does not stop the
    crawl.

    Args:
        result: A single product dict.
    """
    try:
        # A single document needs insert_one. insert_many expects a list of
        # documents and rejects a bare dict.
        db[MONG_TABLE].insert_one(result)
        print('存儲到MONGODB成功', result)
    except Exception as e:
        print('存儲到MONGODB失敗', e, result)
def main():
    """Run the crawl: log in, search, then walk the remaining result pages.

    The first page is stored by ``search()``. Pages 2 up to the page count
    parsed from the pager text are visited with ``next_page()``. The browser is
    shut down when the crawl finishes or fails.

    Raises:
        ValueError: If no number can be found in the pager text.
    """
    try:
        login()
        total = search()
        print(total)
        found = re.search(r'(\d+)', total or '')
        if found is None:
            raise ValueError('no page count found in pager text: %r' % (total,))
        for page_number in range(2, int(found.group(1)) + 1):
            next_page(page_number)
    finally:
        # quit() ends the whole driver session; close() only closes the current
        # window. Doing it in finally also covers the failure path.
        brower.quit()
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
```

<h3>配置文件</h3>

**config.py**
# MongoDB server address
MONG_URL = 'localhost'
# Database name
MONGO_DB = 'taobao'
# Collection ("table") name
MONG_TABLE = 'kouzhao'