1. Selenium 模擬登錄
2.定位進入高級搜索頁面
3.對高級搜索進行定位,設置。
4.代碼實現
import time
from hashlib import md5

import requests
from lxml import etree
from PIL import Image
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36',
}


# Chaojiying (captcha-recognition service)
class Chaojiying_Client(object):
    """HTTP client for the Chaojiying captcha service.

    Args:
        username: Chaojiying account name.
        password: Account password in plain text; only its MD5 hex digest
            is stored and sent (as the ``pass2`` form field).
        soft_id: Software ID sent as the ``softid`` form field.
        timeout: Seconds to wait for each HTTP request before
            ``requests`` raises a timeout error. Defaults to 60.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        # Without a timeout a stalled connection would block the script forever.
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        Args:
            im: Image content as bytes.
            codetype: Captcha type code; see
                http://www.chaojiying.com/price.html

        Returns:
            The JSON body of the response, parsed by ``requests``.
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the JSON response.

        Args:
            im_id: Image ID of the captcha being reported.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()


# Recognise the captcha
def recognize(bro, scale=1.25):
    """Screenshot the page, crop the captcha image and have it recognised.

    Writes ``weibo.png`` (full screenshot) and ``code.png`` (cropped
    captcha) into the working directory.

    Args:
        bro: The Selenium WebDriver showing the login form.
        scale: Factor applied to the element's location and size to get
            screenshot pixel coordinates.
            NOTE(review): 1.25 presumably matches a 125% display-scaling
            setting -- confirm on the target machine.

    Returns:
        The ``pic_str`` field of the Chaojiying response.
    """
    bro.save_screenshot('weibo.png')
    pic = bro.find_element(
        By.XPATH, '//div[@class="login_innerwrap"]/div[3]/div[3]/a/img')
    location = pic.location
    size = pic.size
    rangle = (
        location['x'] * scale,
        location['y'] * scale,
        (location['x'] + size['width']) * scale,
        (location['y'] + size['height']) * scale,
    )
    code_img_name = 'code.png'  # file name of the cropped image
    # Context managers make sure both image files are closed again.
    with Image.open('./weibo.png') as screenshot:
        frame = screenshot.crop(rangle)  # crop to the captcha region
        frame.save(code_img_name)
    # NOTE(review): the soft_id literal starts with a space -- confirm
    # whether that is intended.
    chaojiying = Chaojiying_Client('超級鷹賬號', '密碼', ' 905993')
    with open('./code.png', 'rb') as code_file:
        im = code_file.read()
    return chaojiying.PostPic(im, 3005)['pic_str']


def _login(bro):
    """Fill in the user name, password and captcha, then click log in."""
    # Enter user name and password
    bro.find_element(By.ID, 'loginname').send_keys('你的賬號')
    time.sleep(2)
    bro.find_element(
        By.CSS_SELECTOR,
        ".info_list.password input[node-type='password']",
    ).send_keys("你的密碼")
    time.sleep(1)

    # Enter the captcha.
    # Weibo's first login click may not succeed, so try several times to
    # make sure the login goes through.
    for _ in range(5):
        try:
            bro.find_element(
                By.XPATH,
                '//div[@class="login_innerwrap"]/div[3]/div[3]/div/input',
            ).send_keys(recognize(bro))
            bro.find_element(
                By.XPATH,
                '//div[@class="login_innerwrap"]/div[3]/div[6]/a',
            ).click()
        except Exception:
            # Deliberate best effort: any failure just moves on to the next
            # attempt (presumably the form is gone once login succeeded).
            continue
        time.sleep(5)


def _pick_date(bro, input_index, month, year, day_cell):
    """Open one date input of the advanced search and pick a date.

    Args:
        bro: The Selenium WebDriver.
        input_index: 1-based position of the date ``<input>`` to click.
        month: Visible text of the month option to select.
        year: Visible text of the year option to select.
        day_cell: 1-based position of the ``<li>`` to click in the
            calendar's second ``<ul>``.
    """
    bro.find_element(
        By.XPATH,
        '//div[@class="m-adv-search"]/div[1]/dl[4]//input[{}]'.format(input_index),
    ).click()  # click the input box to open the calendar
    month_select = bro.find_element(
        By.XPATH, '//div[@class="m-caldr"]/div/select[1]')
    Select(month_select).select_by_visible_text(month)
    year_select = bro.find_element(
        By.XPATH, '//div[@class="m-caldr"]/div/select[2]')
    Select(year_select).select_by_visible_text(year)
    bro.find_element(
        By.XPATH,
        '//div[@class="m-caldr"]/ul[2]/li[{}]'.format(day_cell),
    ).click()


def _run_advanced_search(bro):
    """Open the advanced-search dialog, fill it in and submit it."""
    # Go to the advanced-search page
    bro.find_element(
        By.XPATH, '//div[@class="gn_header clearfix"]/div[2]/a').click()
    bro.find_element(By.XPATH, '//div[@class="m-search"]/div[3]/a').click()

    # Fill in the keyword
    key_word = bro.find_element(
        By.XPATH, '//div[@class="m-layer"]/div[2]/div/div[1]/dl//input')
    key_word.clear()
    key_word.send_keys('霧霾')

    # Fill in the location
    province = bro.find_element(
        By.XPATH, '//div[@class="m-adv-search"]/div/dl[5]//select[1]')
    city = bro.find_element(
        By.XPATH, '//div[@class="m-adv-search"]/div/dl[5]//select[2]')
    Select(province).select_by_visible_text('陝西')
    Select(city).select_by_visible_text('西安')

    # Fill in the time range: start date, then end date
    _pick_date(bro, 1, '一月', '2019', 3)
    _pick_date(bro, 2, '一月', '2019', 6)
    hour_select = bro.find_element(
        By.XPATH, '//div[@class="m-adv-search"]/div[1]/dl[4]//select[2]')
    Select(hour_select).select_by_visible_text('8時')  # hour

    bro.find_element(By.XPATH, '//div[@class="btn-box"]/a[1]').click()


def _format_post(div):
    """Build the text record for one result ``<div>``.

    Raises:
        IndexError: If any expected node is missing from the post.
    """
    user_id = div.xpath('./div/div[1]/div[2]/div[1]/div[2]/a[1]/text()')[0]
    # Without a third <p>, the content is in p[1] and the time/client links
    # are in p[2]; otherwise they are in p[2] and p[3].
    if not div.xpath('./div/div[1]/div[2]/p[3]'):
        content_idx, meta_idx = 1, 2
    else:
        content_idx, meta_idx = 2, 3
    content = div.xpath(
        './div/div[1]/div[2]/p[{}]'.format(content_idx))[0].xpath('string(.)')
    # Named post_time so the imported `time` module is not shadowed.
    post_time = div.xpath(
        './div/div[1]/div[2]/p[{}]/a/text()'.format(meta_idx))[0]
    client = div.xpath(
        './div/div[1]/div[2]/p[{}]/a[2]/text()'.format(meta_idx))[0]
    up = div.xpath('./div/div[2]/ul/li[4]/a')[0].xpath('string(.)')  # likes
    transfer = div.xpath('./div/div[2]/ul/li[2]/a/text()')[0]  # reposts
    comment = div.xpath('./div/div[2]/ul/li[3]/a/text()')[0]  # comments
    return ''.join((
        '\n', user_id, '\n', content, '\n', post_time, client,
        ' ', transfer, ' ', comment, ' ', '贊', up, '\n', '\n',
    ))


def _scrape_results(bro, out_path='2019.txt', max_pages=50):
    """Scrape result pages and append one record per post to ``out_path``.

    Each record holds user ID, post content, time, client, repost count,
    comment count and like count. Posts with a missing field are skipped.
    Stops after ``max_pages`` pages or when no next-page link is found.
    """
    with open(out_path, 'a', encoding='utf-8') as f:
        for page_num in range(1, max_pages + 1):
            tree = etree.HTML(bro.page_source)
            for div in tree.xpath('//div[@id="pl_feedlist_index"]/div[2]/div'):
                try:
                    f.write(_format_post(div))
                except IndexError:
                    continue

            if page_num == max_pages:
                break
            # The next-page link sits at a different position on page one.
            if page_num == 1:
                next_xpath = '//div[@class="m-page"]/div/a'
            else:
                next_xpath = '//div[@class="m-page"]/div/a[@class="next"]'
            next_links = bro.find_elements(By.XPATH, next_xpath)
            if not next_links:
                # Fewer pages than max_pages: stop instead of raising.
                break
            next_links[0].click()


option = ChromeOptions()
option.add_experimental_option('excludeSwitches', ['enable-automation'])
url = 'https://www.weibo.com/'
# NOTE(review): Selenium >= 4.10 no longer accepts `executable_path`; pass a
# `Service` object there instead.
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=option)
try:
    bro.maximize_window()
    bro.get(url=url)
    time.sleep(10)  # adjust to the network speed
    _login(bro)
    _run_advanced_search(bro)
    _scrape_results(bro)
finally:
    # Always shut the browser and chromedriver down, even after an error.
    bro.quit()