一、編程思路
1、模擬登錄:採用 selenium + PhantomJS。若採用 Chrome、Firefox 這些瀏覽器,我的電腦無法截取驗證碼位置,讀者可以自行嘗試。驗證碼識別可採用 tesserocr,我採用手動輸入。
2、查詢,獲取搜索框,用戶輸入關鍵字並查詢
3、頁面信息:F12 查看即可。若採用 find_element_by_xpath() 查詢,需注意 find_element 返回的是第一個匹配節點,find_elements 返回的是一個列表。
4、書本具體信息、F12查看,后面操作很簡單
5、文獻傳遞頁面:這個地方是最難的。右鍵查看「文獻傳遞」這個按鈕,直接點擊其中的 href 是無法進入的,這個只是一種綁定關係;需要仔細觀察進入文獻傳遞頁面前後 network 中第一個請求的信息,裡面存在很多 url,
只有 Referer 對應的 url 點擊可以進入。分析 Referer url 裡面的元素,可在進入前那個頁面的 url 中找到,後面採用切片即可。
6、下載書名頁等頁面:此處我採用的是觀察圖片鏈接之間的關係,從而獲取圖片地址。這個地方需要注意的是,需要不斷地滑動滾動條才能加載圖片,否則無法下載。
7、保存圖片:注意 'w' 和 'wb' 的區別,圖片是二進制數據,需使用 'wb' 模式寫入。
8、最后需要注意爬取頻率,否則很容易被發現。
# Interactive scraper for the UCDRS (superlib.net) library catalogue.
#
# Flow: log in (captcha typed by the user) -> search by title -> pick a
# result -> either request document delivery by e-mail or download the
# preview images (title / copyright / preface / contents / body pages).
#
# All functions drive the module-level ``browser`` (a selenium WebDriver)
# and ``wait`` (a WebDriverWait bound to it), which ``main()`` creates.

import getpass
import os
import time
from urllib.parse import quote

# import tesserocr
import pytesseract
import requests
from lxml import etree
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
# from pyquery import PyQuery as pq

HOME_URL = "http://www.ucdrs.superlib.net/"
LOGIN_URL = "http://www.ucdrs.superlib.net/login/login.action"
DELIVERY_BASE_URL = "http://book.ucdrs.superlib.net/gofirstdrs.jsp?"

# Root folder for downloaded page images; one sub-folder per page kind.
DOWNLOAD_ROOT = "D:/pycharm/實戰案例"
# Query string appended to every page-image URL.
IMAGE_QUERY = "?zoom=0&f=0"
# Every page-image name is 6 characters: a prefix plus a zero-padded number
# (bok001, leg001, fow001, !00001, 000001).
IMAGE_NAME_WIDTH = 6
REQUEST_TIMEOUT = 30   # seconds, per image request
DOWNLOAD_DELAY = 2     # seconds between image requests, to stay unobtrusive
PAGE_JUMP_RETRIES = 3  # attempts at jumping to another result page

# XPath of the link on a book's detail page that opens the online reader.
READER_LINK_XPATH = '//*[@id="libinfo"]/div[1]/div/a[1]'

# Set by main(); every function below uses these two globals.
browser = None
wait = None


def _prompt_int(prompt):
    """Ask the user with *prompt* until an integer is entered; return it."""
    while True:
        try:
            return int(input(prompt).strip())
        except ValueError:
            print("請輸入數字")


def login(username=None, password=None):
    """Fill in the login form, let the user type the captcha, and submit.

    Credentials are no longer hard-coded: when not passed in they are read
    from the ``UCDRS_USERNAME`` / ``UCDRS_PASSWORD`` environment variables,
    falling back to an interactive prompt.

    The captcha is cropped out of a full-page screenshot and saved as
    ``screen4.png`` so the user can read it.

    Returns the URL the browser is on after submitting, or ``None`` when the
    captcha field or submit button could not be used.
    NOTE(review): success is not verified against the page content.
    """
    if username is None:
        username = os.environ.get("UCDRS_USERNAME") or input("請輸入用戶名")
    if password is None:
        password = (os.environ.get("UCDRS_PASSWORD")
                    or getpass.getpass("請輸入密碼"))

    browser.find_element(By.ID, "userName").send_keys(username)
    time.sleep(2)
    browser.find_element(By.ID, "passWord").send_keys(password)
    time.sleep(2)
    # Pick the <option> whose value is "7320" in the "gid" drop-down.
    Select(browser.find_element(By.ID, "gid")).select_by_value("7320")

    # Locate the captcha image and crop it out of the screenshot.
    browser.save_screenshot("screen.png")
    code_element = browser.find_element(By.ID, "vimg")
    print(code_element.location)
    left = code_element.location['x']
    top = code_element.location['y']
    right = left + code_element.size['width']
    bottom = top + code_element.size['height']
    with Image.open("screen.png") as screenshot:
        screenshot.crop((left, top, right, bottom)).save("screen4.png")

    try:
        input_verify = wait.until(
            EC.element_to_be_clickable((By.ID, "verifyCode"))
        )
        input_verify.send_keys(input("請輸入驗證碼"))
        wait.until(EC.element_to_be_clickable((By.ID, "submit"))).click()
    except WebDriverException:
        # Covers TimeoutException too; Ctrl-C is deliberately NOT swallowed.
        print(" Enter Error")
        return None

    print("登錄成功")
    browser.save_screenshot("screen6.png")
    return browser.current_url


def index_page(url):
    """Search for a title, list page 1, then optionally jump to another page.

    *url* is accepted for compatibility with existing callers but is not
    used: the search box of the page the browser is currently on is used.

    Returns the URL of the result page the browser ends up on.
    """
    book_name = input("請輸入查找的書名")
    browser.find_element(By.ID, "sw").send_keys(book_name)
    browser.find_element(By.XPATH, '//*[@id="f2"]/div[2]/input[1]').click()
    print("當前頁數為第一頁")
    all_book_information()

    page = _prompt_int("請輸入想看頁數:")
    print("...正在爬取第" + str(page) + "頁")
    current_url = browser.current_url  # URL of the first result page

    if page > 1:
        for _ in range(PAGE_JUMP_RETRIES):
            try:
                browser.get(current_url)
                print(current_url)
                # Page-number input box and its "go" button.
                input_page = wait.until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "#jpage"))
                )
                submit = wait.until(
                    EC.element_to_be_clickable(
                        (By.CSS_SELECTOR,
                         '#pageinfo > input[type=button]:nth-child(13)'))
                )
                input_page.clear()
                input_page.send_keys(str(page))
                submit.click()
                break
            except TimeoutException:
                # WebDriverWait raises selenium's TimeoutException, not the
                # builtin TimeoutError; retry the jump a bounded number of
                # times instead of restarting the whole search.
                print("翻頁超時,正在重試")
        else:
            print("翻頁失敗,停留在當前頁")
        all_book_information()

    return browser.current_url


def all_book_information():
    """Print every book entry on the current result page, numbered from 1."""
    tables = browser.find_elements(By.XPATH, '//table[@class="book1"]')
    book_list = [table.text for table in tables]
    for numbered_book in enumerate(book_list, start=1):
        print(numbered_book)


def get_detail_book(url):
    """Open the result page *url*, let the user pick a book, print its details.

    Re-prompts when the entered number does not match a book on the page.
    Returns the URL of the chosen book's detail page.
    """
    browser.get(url)
    while True:
        number = _prompt_int("請輸入你想要了解書的編號:")
        xpath = '//table[{}][@class="book1"]//a[@class="px14"]'.format(number)
        try:
            book_url = browser.find_element(By.XPATH, xpath).get_attribute(
                "href")
        except NoSuchElementException:
            print("找不到該編號的圖書,請重新輸入")
            continue
        break

    browser.get(book_url)
    for field in browser.find_elements(By.XPATH,
                                       '//div[@class="tubox"]//dd'):
        print(field.text)
    return browser.current_url


def _reader_url():
    """Return the online-reader link of the detail page currently loaded."""
    return browser.find_element(By.XPATH, READER_LINK_XPATH).get_attribute(
        "href")


def sent_book_emial(url):
    """Offer the per-book menu for the detail page at *url* and act on it.

    Menu: 1 request document delivery by e-mail, 2 quit, 3 search for
    another book, 4-8 download title / preface / copyright / contents /
    body preview pages.  Expects the browser to be on the detail page.
    """
    all_page_name(_reader_url())

    downloaders = {
        4: smy_img,
        5: qyy_img,
        6: bqy_img,  # was a call to the local string ``bqy_url`` (TypeError)
        7: mly_img,
        8: zwy_img,
    }

    answer = _prompt_int(
        "是否需要下載此書 是請輸入是1 否2 看其他書按3 ,下載書名頁4 下載前言頁5 下載版權頁6 下載目錄頁7 下載正文頁8")
    while answer not in range(1, 9):
        answer = _prompt_int("請輸入1到8之間的數字")

    if answer == 1:
        browser.get(url)
        # The delivery link's href cannot be followed directly.  The page
        # that works is DELIVERY_BASE_URL plus the href's query string,
        # which runs from the first "?" up to the first "'".
        sent_href = browser.find_element(
            By.XPATH, '//*[@id="libinfo"]/div[2]//a').get_attribute("href")
        query = (sent_href or "").partition("?")[2].partition("'")[0]
        if not query:
            print("無法解析文獻傳遞鏈接")
            return
        browser.get(DELIVERY_BASE_URL + query)
        browser.save_screenshot("screen5.png")
        book_download()
    elif answer == 2:
        print("\n")
        print("本次查詢結束,歡迎下次使用!")
    elif answer == 3:
        # Start over from the home page with a new search.
        browser.get(HOME_URL)
        all_book_url_page = index_page(HOME_URL)
        detail_book_url = get_detail_book(all_book_url_page)
        sent_book_emial(detail_book_url)
    else:
        browser.get(url)
        base_url = bqy_download(_reader_url())
        downloaders[answer](base_url)


def all_page_name(url):
    """Open the reader page *url* and print which sections can be previewed."""
    browser.get(url)
    sections = browser.find_element(By.ID, "pagejump").text.replace(" ", "")
    print("該書可看部分僅有:" + sections + "請按照此選擇下載,否則可能導致下載錯誤")


def book_download():
    """Fill in and submit the document-delivery form on the current page.

    Asks the user for the page range, e-mail address and captcha.
    """
    all_page = browser.find_element(
        By.XPATH, '//*[@id="content"]/form/ul/li[3]/p[1]').text
    print(all_page)
    print("每本圖書咨詢每次不超過50頁")
    first_page = input("請輸入想看的書初始頁")
    last_page = input("請輸入想看的書的末頁")
    browser.find_element(By.ID, "frompage").send_keys(first_page)
    browser.find_element(By.ID, "endpage").send_keys(last_page)
    email = input("請輸入你的郵箱賬號")
    browser.find_element(By.ID, "email").send_keys(email)
    verify_code = input("請輸入驗證碼")
    browser.find_element(By.ID, "verifycode").send_keys(verify_code)
    browser.find_element(By.XPATH, '//li[@class="sumbit"]').click()


def bqy_download(url):
    """Open the reader page *url* and return the common prefix of image URLs.

    The prefix is the first page image's ``src`` minus its last 13
    characters.
    NOTE(review): presumably the page name plus "?zoom=0" -- confirm.
    """
    browser.get(url)
    print(url)
    # Give the reader time to load; otherwise the image link is missing.
    time.sleep(4)
    browser.save_screenshot("screen8.png")
    first_img_url = browser.find_element(
        By.XPATH, '//*[@id="reader"]/div/div[1]/input').get_attribute("src")
    print(first_img_url)
    base_url = first_img_url[0:-13]
    print(base_url)
    return base_url


def _download_pages(base_url, prefix, count, folder):
    """Download pages 1..*count* of one section into DOWNLOAD_ROOT/*folder*.

    Each image URL is *base_url* + a 6-character page name (*prefix* plus the
    zero-padded page number) + IMAGE_QUERY.  Files are saved as ``<n>.png``.
    A failed request is reported and skipped rather than saved as an image.
    """
    target_dir = os.path.join(DOWNLOAD_ROOT, folder)
    os.makedirs(target_dir, exist_ok=True)
    digits = IMAGE_NAME_WIDTH - len(prefix)

    for page in range(1, count + 1):
        img_url = base_url + prefix + str(page).zfill(digits) + IMAGE_QUERY
        print(img_url)
        try:
            response = requests.get(img_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print("下載失敗:", img_url, exc)
        else:
            with open(os.path.join(target_dir, str(page) + '.png'),
                      "wb") as f:
                f.write(response.content)
            print("success download")
        # Throttle so the crawl is not flagged for request frequency.
        time.sleep(DOWNLOAD_DELAY)


def smy_img(base_url):
    """Download the title page (1 page) into the 書名頁 folder."""
    print("僅下載1頁")
    # Was saved into the preface folder (前言頁), mixing it with qyy_img's.
    _download_pages(base_url, 'bok', 1, "書名頁")


def bqy_img(base_url):
    """Download the copyright page (1 page) into the 版權頁 folder."""
    print("僅下載1頁")
    _download_pages(base_url, 'leg', 1, "版權頁")


def qyy_img(base_url):
    """Download the preface pages (5 pages) into the 前言頁 folder."""
    print("僅下載5頁")
    _download_pages(base_url, 'fow', 5, "前言頁")


def mly_img(base_url):
    """Download the table-of-contents pages (3 pages) into the 目錄頁 folder."""
    print("僅下載3頁")
    _download_pages(base_url, '!', 3, "目錄頁")


def zwy_img(base_url):
    """Download the body pages (15 pages) into the 正文頁 folder.

    Previously stopped after page 11 and built 7-character names for pages
    10 and up; names are now always zero-padded to 6 digits.
    """
    print("僅下載15頁")
    _download_pages(base_url, '', 15, "正文頁")


def main():
    """Start the browser, log in, and run one search/detail/download round."""
    global browser, wait
    browser = webdriver.PhantomJS()
    try:
        browser.get(LOGIN_URL)
        wait = WebDriverWait(browser, 8)
        print("歡迎使用圖書查詢小程序")
        if login() is None:  # captcha is typed in by hand
            return
        all_book_url_page = index_page(LOGIN_URL)  # pick a result page
        detail_book_url = get_detail_book(all_book_url_page)  # pick a book
        sent_book_emial(detail_book_url)
    finally:
        # Always shut the headless browser process down.
        browser.quit()


if __name__ == '__main__':
    main()
若有錯誤,請留言告訴我,謝謝!