This article uses the selenium library under Python 3.4 to open a browser, log in, and save that session's login cookies to a local file, so that later runs can reuse the cookies directly instead of logging in again.
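The core pattern is small: selenium only handles the interactive login, the cookies are then serialized to a text file, and plain requests calls reuse them. Here is a minimal sketch of just that pattern; the file name, URLs and search query are illustrative placeholders, not the values the full script uses.

# Minimal sketch: capture cookies once with selenium, reuse them with requests.
# "cookie.txt" and the URLs below are placeholders, not the paths used by the full script.
from selenium import webdriver
import requests

browser = webdriver.Chrome()
browser.get("https://sellercentral.amazon.com/")   # log in by hand in the window that opens
input("Press Enter once the login has completed...")

# Serialize the cookies as "name:value" pairs joined by ";" (the same format the script uses)
cookiestr = ";".join(c["name"] + ":" + c["value"] for c in browser.get_cookies())
with open("cookie.txt", "w") as f:
    f.write(cookiestr)
browser.quit()

# Next time: load the saved cookies into an ordinary requests call, no browser needed
cookies = dict(pair.split(":", 1) for pair in open("cookie.txt").read().split(";"))
html = requests.get("https://sellercentral.amazon.com/productsearch?q=dog",
                    cookies=cookies, timeout=60).text

The full script follows exactly this flow, adding interactive login handling, category selection, and an Excel export: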
# !/usr/bin/python3.4
# -*- coding: utf-8 -*-

from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import os
import re
import random
import xlsxwriter


# Find all files with the given suffix (.xml by default) under a directory, optionally recursing
def listfiles(rootdir, prefix='.xml', iscur=False):
    file = []
    for parent, dirnames, filenames in os.walk(rootdir):
        if parent == rootdir:
            for filename in filenames:
                if filename.endswith(prefix):
                    file.append(filename)
            if not iscur:
                return file
        else:
            if iscur:
                for filename in filenames:
                    if filename.endswith(prefix):
                        file.append(filename)
            else:
                pass
    return file


# Regex that extracts the ASIN following /dp/ in a product URL
def getdp(string):
    reg = r'(http.+?/dp/)(.+)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex that extracts the filter parameter from a search URL
# e.g. https://sellercentral.amazon.com/productsearch?filter=grocery&q=fish
def getfilter(string):
    reg = r'(https.+?filter=)(.+?)(&)'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Regex that extracts the maximum page count (the number inside parentheses)
def getpagenum(string):
    reg = r'(.+?\()(\d+)(\))'
    all = re.compile(reg)
    alllist = re.findall(all, string)
    return alllist[0][1]


# Create a directory, ignoring the error if it already exists
def createjia(path):
    try:
        os.makedirs(path)
    except:
        pass


# Convert a duration in seconds into a readable string
def timetochina(longtime, formats='{} days {} hours {} minutes {} seconds'):
    day = 0
    hour = 0
    minutue = 0
    second = 0
    try:
        if longtime > 60:
            second = longtime % 60
            minutue = longtime // 60
        else:
            second = longtime
        if minutue > 60:
            hour = minutue // 60
            minutue = minutue % 60
        if hour > 24:
            day = hour // 24
            hour = hour % 24
        return formats.format(day, hour, minutue, second)
    except:
        raise Exception('Invalid time value')


# Open the browser, log in, and capture the cookies
def openbrowser(url):
    # Launch Chrome (Firefox() would also work)
    browser = webdriver.Chrome()
    # browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')
    # Open the login page
    browser.get(url)
    # Optionally wait for the browser to finish opening
    # print("Waiting 10 seconds for the browser to open...")
    # time.sleep(10)

    # Locate the id="ap_email" and id="ap_password" boxes and clear them
    browser.find_element_by_id("ap_email").clear()
    browser.find_element_by_id("ap_password").clear()

    # Type in the account and password
    inputemail = input("Enter the account: ")
    inputpassword = input("Enter the password: ")
    browser.find_element_by_id("ap_email").send_keys(inputemail)
    browser.find_element_by_id("ap_password").send_keys(inputpassword)

    # Click the sign-in button (id="signInSubmit")
    browser.find_element_by_id("signInSubmit").click()

    # Optionally wait 10 seconds for the login to finish
    # print('Waiting 10 seconds for the login...')
    # time.sleep(10)
    print("Waiting for the page to finish loading...")

    select = input("Check the browser: is the site logged in? (y/n): ")
    while 1:
        if select == "y" or select == "Y":
            print("Login successful!")
            # Grab the cookies
            cookie = [item["name"] + ":" + item["value"] for item in browser.get_cookies()]
            cookiestr = ';'.join(item for item in cookie)
            print("Copying the page cookies...")

            # Write them to a local txt file
            if "jp" in url:
                path = "../data/Japcookie.txt"
            else:
                path = "../data/Amecookie.txt"

            filecookie = open(path, "w")
            filecookie.write(cookiestr)
            filecookie.close()

            time.sleep(1)
            print("Closing the browser...")
            browser.quit()
            # print(cookiestr)
            break

        elif select == "n" or select == "N":
            selectno = input("Press 0 if the account/password was wrong, 1 if a captcha appeared... ")
            # Wrong account or password: enter them again
            if selectno == "0":

                # Locate the id="ap_email" and id="ap_password" boxes and clear them
                browser.find_element_by_id("ap_email").clear()
                browser.find_element_by_id("ap_password").clear()

                # Type in the account and password
                inputemail = input("Enter the account: ")
                inputpassword = input("Enter the password: ")
                browser.find_element_by_id("ap_email").send_keys(inputemail)
                browser.find_element_by_id("ap_password").send_keys(inputpassword)
                # Click the sign-in button (id="signInSubmit")
                browser.find_element_by_id("signInSubmit").click()

            elif selectno == "1":
                # The captcha box has id="ap_captcha_guess"
                input("Enter the captcha in the browser, log in, then press Enter here...")
                select = input("Check the browser: is the site logged in? (y/n): ")

        else:
            print('Please enter "y" or "n"!')
            select = input("Check the browser: is the site logged in? (y/n): ")

    return cookiestr


def gethtml(url):
    # Read the saved cookies into a dict
    mycookie = {}
    if "jp" in url:
        path = "../data/Japcookie.txt"
    else:
        path = "../data/Amecookie.txt"

    try:
        filecookie = open(path, "r")
        cookies = filecookie.read().split(";")
        for items in cookies:
            # split only on the first ":" so values containing ":" stay intact
            item = items.split(":", 1)
            mycookie[item[0]] = item[1]
        # print(mycookie)
        filecookie.close()
    except:
        print("The cookie file is empty...")

    if "jp" in url:
        referer = "https://sellercentral.amazon.co.jp/"
        host = "www.amazon.co.jp"
    else:
        referer = "https://sellercentral.amazon.com/"
        host = "www.amazon.com"

    # Build the request headers
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0',
        'Referer': referer,
        'Host': host,
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br'
    }

    htmlget = requests.get(url=url, headers=header, cookies=mycookie, timeout=60)
    htmlcontent = htmlget.content.decode("UTF-8", "ignore")

    return htmlcontent


def getinfo(html, Loginurl):
    # Parse the page with BeautifulSoup
    soups = BeautifulSoup(html, "html.parser")
    # Select the product divs
    sellyours = soups.find_all("div", attrs={"class": "a-box product"})
    information = []
    for item in sellyours:
        # Filter the products one by one
        # First pass: keep products that have a "Sell yours" button
        temp = item.find("a", attrs={"class": "a-button-text"})

        if temp != None:
            if "sellYoursClick" in temp["data-csm"]:
                # Second pass: keep products that show no offer count and no "new" label
                temp = item.find("span", attrs={"class": "offerCountDetails"})
                if temp == None:
                    temp = item.find("div", attrs={"class": "a-fixed-right-grid-col description a-col-left"})

                    # Detail page URL
                    hrefurl = temp.find('a').get('href')
                    # All text under the current element: title, UPC, EAN, Rank
                    try:
                        spans = temp.get_text()
                    except:
                        spans = "Nothing"
                    # Split the text into a list
                    temparr = spans.strip().split("\n")
                    # Extract the ASIN with the regex
                    asin = getdp(hrefurl)
                    temparr.append(asin)
                    temparr.append(hrefurl)

                    # Keep a copy in a txt file so nothing is lost if the program is interrupted
                    txtcontent = ' '.join(temparr)
                    filename = time.strftime('%Y%m%d', time.localtime())
                    path = "../xls/" + filename
                    createjia(path)
                    file = open(path + "/" + filename + ".txt", "a")
                    file.write("\n" + txtcontent)
                    file.close()

                    # Parse the detail page, grab the star rating and review count,
                    # store them in the list, and later write everything to Excel
                    htmldetail = gethtml(hrefurl)

                    if 'id="words"' in htmldetail or 'ap_email' in htmldetail or "Amazon.com Page Not Found" in htmldetail:
                        print("Scraping too fast! Need to log in again...")
                        openbrowser(Loginurl)
                        htmldetail = gethtml(hrefurl)

                    # Parse the detail page with BeautifulSoup
                    soups = BeautifulSoup(htmldetail, "html.parser")
                    # Select the centerCol block
                    centerCols = soups.findAll('div', attrs={'id': "centerCol"})
                    if centerCols:
                        for item in centerCols:
                            temp = item.find("td", attrs={"id": "priceblock_ourprice_lbl"})
                            if temp == None:
                                # Star rating
                                star = item.find("a", attrs={"id": "reviewStarsLinkedCustomerReviews"}).get_text()
                                # Number of reviews
                                reviews = item.find("span", attrs={"id": "acrCustomerReviewText"}).get_text()
                                # Append what was scraped to the list
                                if star:
                                    temparr.append(star.strip().replace(" out of 5 stars", ""))
                                else:
                                    temparr.append("")
                                if reviews:
                                    temparr.append(reviews.strip().replace(" customer reviews", ""))
                                else:
                                    temparr.append("")

                                information.append(temparr)
                                print(information)
                    else:
                        temparr.append("")
                        temparr.append("")
                        information.append(temparr)
                        print(information)
    return information


def begin():
    taoyanbai = '''
    -----------------------------------------
    |   Welcome to the backend scraper      |
    |   Date: 2016-10-21                    |
    |   By: the tech department             |
    -----------------------------------------
    '''
    print(taoyanbai)


if __name__ == "__main__":

    a = time.clock()

    while 1:
        try:
            LoginWhere = int(input("Press 0 for the US site, 1 for Japan: "))
            if LoginWhere == 0:
                Loginurl = "https://sellercentral.amazon.com/"
                break
            elif LoginWhere == 1:
                Loginurl = "https://sellercentral.amazon.co.jp/"
                break
        except:
            print("Please enter 0 or 1!")
            LoginWhere = int(input("Press 0 for the US site, 1 for Japan: "))

    keywords = input("Enter the search keyword: ")
    keyword = keywords.replace(" ", "+")

    print("Checking the login status...")

    if "jp" in Loginurl:
        seekurl = "https://sellercentral.amazon.co.jp/productsearch?q=" + str(keyword)
    else:
        seekurl = "https://sellercentral.amazon.com/productsearch?q=" + str(keyword)

    try:
        htmlpage = gethtml(seekurl)
    except Exception as err:
        input("There seems to be a network problem...")
        print(err)
        exit()

    while 1:
        if 'ap_email' in htmlpage or "Amazon.com Page Not Found" in htmlpage or "<title>404" in htmlpage:
            print("The cookie has expired, need to log in again...")
            print("Waiting for the page to open...")
            openbrowser(Loginurl)
            htmlpage = gethtml(seekurl)
        else:
            print("Logging in directly with the saved cookie...")
            break

    # Parse the page with BeautifulSoup
    soups = BeautifulSoup(htmlpage, "html.parser")
    # Select the categories and their URLs
    categorys = soups.findAll('ul', attrs={'class': "a-nostyle a-vertical"})
    categoryurl = []
    categoryname = ""
    pagenum = []
    filtername = []

    for item in categorys:
        for temp in item.find_all("a"):
            hrefurl = temp.get('href')
            categoryurl.append(hrefurl)

        for temp in item.find_all("span", attrs={"class": "a-color-tertiary"}):
            spantext = temp.get_text()
            pagenum.append(getpagenum(spantext))
    for i in range(0, len(categoryurl)):
        name = getfilter(categoryurl[i])
        filtername.append(name)
        categoryname = categoryname + "press " + str(i) + " to scrape (" + str(name) + "), "

    # Choose which category to scrape
    try:
        print(categoryname)
        selectcategory = int(input("Enter the number of the category to scrape: "))
    except:
        print("Please enter one of the numbers listed above!")
        print(categoryname)
        selectcategory = int(input("Enter the number of the category to scrape: "))

    filter = filtername[selectcategory]
    mustpage = int(pagenum[selectcategory]) // 10

    try:
        print("Note: (1) the backend only shows 1000 pages... (2) the category you chose has about " + str(mustpage) + " pages...")
        page = int(input("How many pages do you want to scrape? (default 15): "))
        if page > 1000:
            print("The backend shows at most 1000 pages!")
            page = int(input("The backend only shows 1000 pages! The category you chose has about " + str(mustpage) + " pages! How many pages do you want to scrape? (default 15): "))
    except:
        page = 15

    # Storage for everything scraped
    information = []
    temparr = []

    for i in range(0, page):
        try:
            if "jp" in Loginurl:
                # https://sellercentral.amazon.co.jp/productsearch?filter=sporting&q=空気入れ&page=2
                openurl = "https://sellercentral.amazon.co.jp/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)
            else:
                # https://sellercentral.amazon.com/productsearch?filter=pets&q=dog
                openurl = "https://sellercentral.amazon.com/productsearch?filter=" + str(filter) + "&q=" + str(
                    keyword) + "&page=" + str(i + 1)

            print("Start scraping: " + str(openurl))
            openhtml = gethtml(openurl)

            # Parse the page with BeautifulSoup
            soups = BeautifulSoup(openhtml, "html.parser")
            # Select the product divs
            sellyours = soups.findAll('div', attrs={'class': "product"})

            if 'ap_email' in openhtml or "Amazon.com Page Not Found" in openhtml:
                print("Scraping too fast! Need to log in again...")
                openbrowser(Loginurl)
                openhtml = gethtml(openurl)

            elif not sellyours:  # findAll returns an empty list once past the last page
                print("Reached the last page...")
                break
            temparr = getinfo(openhtml, Loginurl)
        except Exception as err:
            print(err)
            print("A minor error occurred while scraping...")
            print("Pausing 20 seconds to note the bug and try to recover...")
            time.sleep(20)

        if temparr:
            information.append(temparr[0])
        loadtime = random.randint(5, 10)
        print("Pausing " + str(loadtime) + " seconds to avoid anti-scraping measures...")
        time.sleep(loadtime)

    print("The scraped list is:")
    print(information)

    # Write the results to Excel
    # Create the output directory
    filename = time.strftime('%Y%m%d', time.localtime())
    path = "../xls/" + filename
    createjia(path)

    # Write the Excel file
    timename = time.strftime('%Y%H%M%S', time.localtime())
    with xlsxwriter.Workbook(path + "/" + timename + '.xlsx') as workbook:
        worksheet = workbook.add_worksheet()

        first = ['title', 'UPC', 'EAN', 'Rank', 'Nothing', 'ASIN', 'DetailUrl', 'Star', 'Reviews']
        # Write the header row
        for i in range(0, len(first)):
            worksheet.write(0, i, first[i])
        # Write the remaining rows
        for m in range(0, len(information)):
            for n in range(0, len(information[m])):
                insert = str(information[m][n]).replace("UPC: ", "").replace("EAN: ", "").replace(
                    "Sales Rank:", "").replace("customer reviews", "").replace("out of 5 stars", "")
                worksheet.write(m + 1, n, insert)

    b = time.clock()
    print('Run time: ' + timetochina(b - a))
    input('Press Enter to close the window')  # keep the console open so the run time stays visible
Note that selenium drives Chrome through a separate driver executable, and the driver only supports matching browser versions; for the Chrome setup used in this article you need to download that driver and place it in the directory C:\Python34 (which is already on the PATH), and selenium will pick it up from there.
The driver is chromedriver.exe; search for it online, there are plenty of places to download it.
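If you would rather not rely on the PATH, the driver location can also be passed to selenium directly, as the commented-out line in the script above shows (the path here is just an example):

from selenium import webdriver

# In the selenium 2.x/3.x API used in this article, the driver path is given via executable_path
browser = webdriver.Chrome(executable_path='C:/Python34/chromedriver.exe')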