This article uses the selenium library under Python 3.4 to open a browser, log in, and save that session's login cookies to a local file, so that later runs can reuse the cookies directly instead of logging in again.
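The core pattern is small: selenium only handles the interactive login, the cookies are then serialized to a text file, and plain requests calls reuse them. Here is a minimal sketch of just that pattern; the file name, URLs and search query are illustrative placeholders, not the values the full script uses.

# Minimal sketch: capture cookies once with selenium, reuse them with requests.
# "cookie.txt" and the URLs below are placeholders, not the paths used by the full script.
from selenium import webdriver
import requests

browser = webdriver.Chrome()
browser.get("https://sellercentral.amazon.com/")   # log in by hand in the window that opens
input("Press Enter once the login has completed...")

# Serialize the cookies as "name:value" pairs joined by ";" (the same format the script uses)
cookiestr = ";".join(c["name"] + ":" + c["value"] for c in browser.get_cookies())
with open("cookie.txt", "w") as f:
    f.write(cookiestr)
browser.quit()

# Next time: load the saved cookies into an ordinary requests call, no browser needed
cookies = dict(pair.split(":", 1) for pair in open("cookie.txt").read().split(";"))
html = requests.get("https://sellercentral.amazon.com/productsearch?q=dog",
                    cookies=cookies, timeout=60).text

The full script follows exactly this flow, adding interactive login handling, category selection, and an Excel export: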
# !/usr/bin/python3.4
# -*- coding: utf-8 -*-

from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import os
import re
import random
import xlsxwriter


# Find all files with the given suffix (.xml by default) under a directory, optionally recursing
def listfiles(rootdir, prefix='.xml', iscur=False):
    file = []
    for parent, dirnames, filenames in os.walk(rootdir):
        if parent == rootdir:
            for filename in filenames:
                if filename.endswith(prefix):
                    file.append(filename)
            if not iscur:
                return file
        else:
            if iscur:
                for filename in filenames:
                    if filename.endswith(prefix):
                        file.append(filename)
            else:
                pass
    return file


# Regex that extracts the ASIN following /dp/ in a product URL
def getdp(string):
    reg = r'(http.+?/dp/)(.+)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex that extracts the filter parameter from a search URL
# e.g. https://sellercentral.amazon.com/productsearch?filter=grocery&q=fish
def getfilter(string):
    reg = r'(https.+?filter=)(.+?)(&)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex that extracts the maximum page count (the number inside parentheses)
def getpagenum(string):
    reg = r'(.+?\()(\d+)(\))'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Create a directory, ignoring the error if it already exists
def createjia(path):
    try:
        os.makedirs(path)
    except:
        pass


# Convert a duration in seconds into a readable string
def timetochina(longtime, formats='{} days {} hours {} minutes {} seconds'):
    day = 0
    hour = 0
    minutue = 0
    second = 0
    try:
        if longtime > 60:
            second = longtime % 60
            minutue = longtime // 60
        else:
            second = longtime
        if minutue > 60:
            hour = minutue // 60
            minutue = minutue % 60
        if hour > 24:
            day = hour // 24
            hour = hour % 24
        return formats.format(day, hour, minutue, second)
    except:
        raise Exception('Invalid time value')


# Open the browser, log in, and capture the cookies
def openbrowser(url):
    # Launch Chrome (Firefox() would also work)
    browser = webdriver.Chrome()
    # browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')
    # Open the login page
    browser.get(url)
    # Optionally wait for the browser to finish opening
    # print("Waiting 10 seconds for the browser to open...")
    # time.sleep(10)

    # Locate the id="ap_email" and id="ap_password" boxes and clear them
    browser.find_element_by_id("ap_email").clear()
    browser.find_element_by_id("ap_password").clear()

    # Type in the account and password
    inputemail = input("Enter the account: ")
    inputpassword = input("Enter the password: ")
    browser.find_element_by_id("ap_email").send_keys(inputemail)
    browser.find_element_by_id("ap_password").send_keys(inputpassword)

    # Click the sign-in button (id="signInSubmit")
    browser.find_element_by_id("signInSubmit").click()

    # Optionally wait 10 seconds for the login to finish
    # print('Waiting 10 seconds for the login...')
    # time.sleep(10)
    print("Waiting for the page to finish loading...")

    select = input("Check the browser: is the site logged in? (y/n): ")
    while 1:
        if select == "y" or select == "Y":
            print("Login successful!")
            # Grab the cookies
            cookie = [item["name"] + ":" + item["value"] for item in browser.get_cookies()]
            cookiestr = ';'.join(item for item in cookie)
            print("Copying the page cookies...")

            # Write them to a local txt file
            if "jp" in url:
                path = "../data/Japcookie.txt"
            else:
                path = "../data/Amecookie.txt"

            filecookie = open(path, "w")
            filecookie.write(cookiestr)
            filecookie.close()

            time.sleep(1)
            print("Closing the browser...")
            browser.quit()
            # print(cookiestr)
            break

        elif select == "n" or select == "N":
            selectno = input("Press 0 if the account/password was wrong, 1 if a captcha appeared... ")
            # Wrong account or password: enter them again
            if selectno == "0":

                # Locate the id="ap_email" and id="ap_password" boxes and clear them
                browser.find_element_by_id("ap_email").clear()
                browser.find_element_by_id("ap_password").clear()

                # Type in the account and password
                inputemail = input("Enter the account: ")
                inputpassword = input("Enter the password: ")
                browser.find_element_by_id("ap_email").send_keys(inputemail)
                browser.find_element_by_id("ap_password").send_keys(inputpassword)
                # Click the sign-in button (id="signInSubmit")
                browser.find_element_by_id("signInSubmit").click()

            elif selectno == "1":
                # The captcha box has id="ap_captcha_guess"
                input("Enter the captcha in the browser, log in, then press Enter here...")
                select = input("Check the browser: is the site logged in? (y/n): ")

        else:
            print('Please enter "y" or "n"!')
            select = input("Check the browser: is the site logged in? (y/n): ")

    return cookiestr


def gethtml(url):
    # Read the saved cookies into a dict
    mycookie = {}
    if "jp" in url:
        path = "../data/Japcookie.txt"
    else:
        path = "../data/Amecookie.txt"

    try:
        filecookie = open(path, "r")
        cookies = filecookie.read().split(";")
        for items in cookies:
            # split only on the first ":" so values containing ":" stay intact
            item = items.split(":", 1)
            mycookie[item[0]] = item[1]
        # print(mycookie)
        filecookie.close()
    except:
        print("The cookie file is empty...")

    if "jp" in url:
        referer = "https://sellercentral.amazon.co.jp/"
        host = "www.amazon.co.jp"
    else:
        referer = "https://sellercentral.amazon.com/"
        host = "www.amazon.com"

    # Build the request headers
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Referer': referer,
        'Host': host,
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br'
    }

    htmlget = requests.get(url=url, headers=header, cookies=mycookie, timeout=60)
    htmlcontent = htmlget.content.decode("UTF-8", "ignore")

    return htmlcontent


def getinfo(html, Loginurl):
    # Parse the page with BeautifulSoup
    soups = BeautifulSoup(html, "html.parser")
    # Select the product divs
    sellyours = soups.find_all("div", attrs={"class": "a-box product"})
    information = []
    for item in sellyours:
        # Filter the products one by one
        # First pass: keep products that have a "Sell yours" button
        temp = item.find("a", attrs={"class": "a-button-text"})

        if temp != None:
            if "sellYoursClick" in temp["data-csm"]:
                # Second pass: keep products that show no offer count and no "new" label
                temp = item.find("span", attrs={"class": "offerCountDetails"})
                if temp == None:
                    temp = item.find("div", attrs={"class": "a-fixed-right-grid-col description a-col-left"})

                    # Detail page URL
                    hrefurl = temp.find('a').get('href')
                    # All text under the current element: title, UPC, EAN, Rank
                    try:
                        spans = temp.get_text()
                    except:
                        spans = "Nothing"
                    # Split the text into a list
                    temparr = spans.strip().split("\n")
                    # Extract the ASIN with the regex
                    asin = getdp(hrefurl)
                    temparr.append(asin)
                    temparr.append(hrefurl)

                    # Keep a copy in a txt file so nothing is lost if the program is interrupted
                    txtcontent = ' '.join(temparr)
                    filename = time.strftime('%Y%m%d', time.localtime())
                    path = "../xls/" + filename
                    createjia(path)
                    file = open(path + "/" + filename + ".txt", "a")
                    file.write("\n" + txtcontent)
                    file.close()

                    # Parse the detail page, grab the star rating and review count,
                    # store them in the list, and later write everything to Excel
                    htmldetail = gethtml(hrefurl)

                    if 'id="words"' in htmldetail or 'ap_email' in htmldetail or "Amazon.com Page Not Found" in htmldetail:
                        print("Scraping too fast! Need to log in again...")
                        openbrowser(Loginurl)
                        htmldetail = gethtml(hrefurl)

                    # Parse the detail page with BeautifulSoup
                    soups = BeautifulSoup(htmldetail, "html.parser")
                    # Select the centerCol block
                    centerCols = soups.findAll('div', attrs={'id': "centerCol"})
                    if centerCols:
                        for item in centerCols:
                            temp = item.find("td", attrs={"id": "priceblock_ourprice_lbl"})
                            if temp == None:
                                # Star rating
                                star = item.find("a", attrs={"id": "reviewStarsLinkedCustomerReviews"}).get_text()
                                # Number of reviews
                                reviews = item.find("span", attrs={"id": "acrCustomerReviewText"}).get_text()
                                # Append what was scraped to the list
                                if star:
                                    temparr.append(star.strip().replace(" out of 5 stars", ""))
                                else:
                                    temparr.append("")
                                if reviews:
                                    temparr.append(reviews.strip().replace(" customer reviews", ""))
                                else:
                                    temparr.append("")

                                information.append(temparr)
                                print(information)
                    else:
                        temparr.append("")
                        temparr.append("")
                        information.append(temparr)
                        print(information)
    return information


def begin():
    taoyanbai = '''
    -----------------------------------------
    |   Welcome to the backend scraper      |
    |   Date: 2016-10-21                    |
    |   By: the tech department             |
    -----------------------------------------
    '''
    print(taoyanbai)


if __name__ == "__main__":

    a = time.clock()

    while 1:
        try:
            LoginWhere = int(input("Press 0 for the US site, 1 for Japan: "))
            if LoginWhere == 0:
                Loginurl = "https://sellercentral.amazon.com/"
                break
            elif LoginWhere == 1:
                Loginurl = "https://sellercentral.amazon.co.jp/"
                break
        except:
            print("Please enter 0 or 1!")
            LoginWhere = int(input("Press 0 for the US site, 1 for Japan: "))

    keywords = input("Enter the search keyword: ")
    keyword = keywords.replace(" ", "+")

    print("Checking the login status...")

    if "jp" in Loginurl:
        seekurl = "https://sellercentral.amazon.co.jp/productsearch?q=" + str(keyword)
    else:
        seekurl = "https://sellercentral.amazon.com/productsearch?q=" + str(keyword)

    try:
        htmlpage = gethtml(seekurl)
    except Exception as err:
        input("There seems to be a network problem...")
        print(err)
        exit()

    while 1:
        if 'ap_email' in htmlpage or "Amazon.com Page Not Found" in htmlpage or "<title>404" in htmlpage:
            print("The cookie has expired, need to log in again...")
            print("Waiting for the page to open...")
            openbrowser(Loginurl)
            htmlpage = gethtml(seekurl)
        else:
            print("Logging in directly with the saved cookie...")
            break

    # Parse the page with BeautifulSoup
    soups = BeautifulSoup(htmlpage, "html.parser")
    # Select the categories and their URLs
    categorys = soups.findAll('ul', attrs={'class': "a-nostyle a-vertical"})
    categoryurl = []
    categoryname = ""
    pagenum = []
    filtername = []

    for item in categorys:
        for temp in item.find_all("a"):
            hrefurl = temp.get('href')
            categoryurl.append(hrefurl)

        for temp in item.find_all("span", attrs={"class": "a-color-tertiary"}):
            spantext = temp.get_text()
            pagenum.append(getpagenum(spantext))
    for i in range(0, len(categoryurl)):
        name = getfilter(categoryurl[i])
        filtername.append(name)
        categoryname = categoryname + "press " + str(i) + " to scrape (" + str(name) + "), "

    # Choose which category to scrape
    try:
        print(categoryname)
        selectcategory = int(input("Enter the number of the category to scrape: "))
    except:
        print("Please enter one of the numbers listed above!")
        print(categoryname)
        selectcategory = int(input("Enter the number of the category to scrape: "))

    filter = filtername[selectcategory]
    mustpage = int(pagenum[selectcategory]) // 10

    try:
        print("Note: (1) the backend only shows 1000 pages... (2) the category you chose has about " + str(mustpage) + " pages...")
        page = int(input("How many pages do you want to scrape? (default 15): "))
        if page > 1000:
            print("The backend shows at most 1000 pages!")
            page = int(input("The backend only shows 1000 pages! The category you chose has about " + str(mustpage) + " pages! How many pages do you want to scrape? (default 15): "))
    except:
        page = 15

    # Storage for everything scraped
    information = []
    temparr = []

    for i in range(0, page):
        try:
            if "jp" in Loginurl:
                # https://sellercentral.amazon.co.jp/productsearch?filter=sporting&q=空気入れ&page=2
                openurl = "https://sellercentral.amazon.co.jp/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)
            else:
                # https://sellercentral.amazon.com/productsearch?filter=pets&q=dog
                openurl = "https://sellercentral.amazon.com/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)

            print("Start scraping: " + str(openurl))
            openhtml = gethtml(openurl)

            # Parse the page with BeautifulSoup
            soups = BeautifulSoup(openhtml, "html.parser")
            # Select the product divs
            sellyours = soups.findAll('div', attrs={'class': "product"})

            if 'ap_email' in openhtml or "Amazon.com Page Not Found" in openhtml:
                print("Scraping too fast! Need to log in again...")
                openbrowser(Loginurl)
                openhtml = gethtml(openurl)

            elif not sellyours:  # findAll returns an empty list once past the last page
                print("Reached the last page...")
                break
            temparr = getinfo(openhtml, Loginurl)
        except Exception as err:
            print(err)
            print("A minor error occurred while scraping...")
            print("Pausing 20 seconds to note the bug and try to recover...")
            time.sleep(20)

        if temparr:
            information.append(temparr[0])
        loadtime = random.randint(5, 10)
        print("Pausing " + str(loadtime) + " seconds to avoid anti-scraping measures...")
        time.sleep(loadtime)

    print("The scraped list is:")
    print(information)

    # Write the results to Excel
    # Create the output directory
    filename = time.strftime('%Y%m%d', time.localtime())
    path = "../xls/" + filename
    createjia(path)

    # Write the Excel file
    timename = time.strftime('%Y%H%M%S', time.localtime())
    with xlsxwriter.Workbook(path + "/" + timename + '.xlsx') as workbook:
        worksheet = workbook.add_worksheet()

        first = ['title', 'UPC', 'EAN', 'Rank', 'Nothing', 'ASIN', 'DetailUrl', 'Star', 'Reviews']
        # Write the header row
        for i in range(0, len(first)):
            worksheet.write(0, i, first[i])
        # Write the remaining rows
        for m in range(0, len(information)):
            for n in range(0, len(information[m])):
                insert = str(information[m][n]).replace("UPC: ", "").replace("EAN: ", "").replace(
                    "Sales Rank:", "").replace("customer reviews", "").replace("out of 5 stars", "")
                worksheet.write(m + 1, n, insert)

    b = time.clock()
    print('Run time: ' + timetochina(b - a))
    input('Press Enter to close the window')  # keep the console open so the run time stays visible
Note that selenium drives Chrome through a separate driver executable, and the driver only supports matching browser versions; for the Chrome setup used in this article you need to download that driver and place it in the directory C:\Python34 (which is already on the PATH), and selenium will pick it up from there.
The driver is chromedriver.exe; search for it online, there are plenty of places to download it.
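If you would rather not rely on the PATH, the driver location can also be passed to selenium directly, as the commented-out line in the script above shows (the path here is just an example):

from selenium import webdriver

# In the selenium 2.x/3.x API used in this article, the driver path is given via executable_path
browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')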