(十)selenium實現微博高級搜索信息爬取

本文轉載自查看原文 2020-07-26 22:39 614 爬蟲實戰系列

1.selenium模擬登陸

2.定位進入高級搜索頁面

3.對高級搜索進行定位，設置。

4.代碼實現

import time
from hashlib import md5

import requests
from lxml import etree
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.select import Select
# Request headers carrying a desktop-Chrome (81, Windows 10 x64) User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.92 Safari/537.36'
    ),
}


# Chaojiying captcha-recognition service
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    Args:
        username: Chaojiying account name, sent as ``user``.
        password: Plain-text account password. Only its MD5 hex digest is
            kept on the instance and sent (as ``pass2``).
        soft_id: Software ID, sent as ``softid`` with every request.
        timeout: Seconds ``requests`` waits on each HTTP call before raising
            a timeout error. Defaults to 60; pass ``None`` to wait
            indefinitely (the behaviour before this parameter existed).
    """

    UPLOAD_URL = 'http://upload.chaojiying.net/Upload/Processing.php'
    REPORT_ERROR_URL = 'http://upload.chaojiying.net/Upload/ReportError.php'

    def __init__(self, username, password, soft_id, timeout=60):
        self.username = username
        # Never keep the plain-text password around: store the digest only.
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Credentials merged into the form data of every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image for recognition.

        Args:
            im: Raw bytes of the image.
            codetype: Captcha type code; see
                http://www.chaojiying.com/price.html for the list.

        Returns:
            The service's response decoded from JSON.
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image travels as a multipart upload in the ``userfile`` field.
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(self.UPLOAD_URL, data=params, files=files,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha.

        Args:
            im_id: ID of the image whose recognition result was wrong.

        Returns:
            The service's response decoded from JSON.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(self.REPORT_ERROR_URL, data=params,
                          headers=self.headers, timeout=self.timeout)
        return r.json()

# Start Chrome, open Weibo, then type the account name and the password.
option = ChromeOptions()
# Exclude Chrome's "enable-automation" switch.
# NOTE(review): presumably done to make the browser look less like an
# automated session -- confirm.
option.add_experimental_option('excludeSwitches', ['enable-automation'])
url = 'https://www.weibo.com/'
bro = webdriver.Chrome(options=option, executable_path='./chromedriver.exe')
bro.maximize_window()
bro.get(url)
time.sleep(10)  # adjust to your network speed

# Account name
account_box = bro.find_element_by_id('loginname')
account_box.send_keys('你的賬號')
time.sleep(2)

# Password
password_css = ".info_list.password input[node-type='password']"
password_box = bro.find_element_by_css_selector(password_css)
password_box.send_keys("你的密碼")
time.sleep(1)

# Recognise the captcha
def recognize(bro, scale=1.25):
    """Crop the login captcha out of a page screenshot and have it recognised.

    Saves a full screenshot to ``weibo.png`` and the cropped captcha to
    ``code.png`` (both in the current working directory), then uploads the
    crop to Chaojiying.

    Args:
        bro: Selenium WebDriver currently showing the Weibo login form.
        scale: Factor that converts the element's reported location and size
            into screenshot pixel coordinates. Defaults to 1.25, the value
            that used to be hard-coded.
            NOTE(review): presumably this matches a 125 % display scaling on
            the author's machine -- adjust for other setups.

    Returns:
        The ``pic_str`` field of the Chaojiying response.
    """
    bro.save_screenshot('weibo.png')
    pic = bro.find_element_by_xpath('//div[@class="login_innerwrap"]/div[3]/div[3]/a/img')
    location = pic.location
    size = pic.size
    # (left, upper, right, lower) of the captcha inside the screenshot.
    rangle = (location['x'] * scale, location['y'] * scale,
              (location['x'] + size['width']) * scale,
              (location['y'] + size['height']) * scale)
    code_img_name = 'code.png'  # file name of the cropped captcha
    # Context managers make sure both the screenshot and the crop file are
    # closed instead of being left to the garbage collector.
    with Image.open('./weibo.png') as screenshot:
        screenshot.crop(rangle).save(code_img_name)
    with open('./code.png', 'rb') as code_file:
        im = code_file.read()
    # NOTE(review): the soft ID literal carries leading spaces; left untouched
    # because it is sent to the service verbatim -- confirm it is intended.
    chaojiying = Chaojiying_Client('超級鷹賬號', '密碼', '    905993')
    # 3005 is the Chaojiying captcha type code used for this captcha.
    return chaojiying.PostPic(im, 3005)['pic_str']

# Enter the captcha and submit the login form.
# Weibo's first click on "log in" may not succeed, so try up to five times.
for _ in range(5):
    try:
        captcha_box = bro.find_element_by_xpath('//div[@class="login_innerwrap"]/div[3]/div[3]/div/input')
    except NoSuchElementException:
        # The login form is no longer on the page (presumably because the
        # previous attempt logged us in), so there is nothing left to submit.
        break
    try:
        # send_keys() appends to the field, so remove whatever an earlier
        # attempt typed before entering the fresh code.
        captcha_box.clear()
        captcha_box.send_keys(recognize(bro))
        bro.find_element_by_xpath('//div[@class="login_innerwrap"]/div[3]/div[6]/a').click()
    except Exception as exc:
        # Still best-effort: any failure just leads to another attempt, but
        # it is reported instead of being swallowed silently.
        print('login attempt failed, retrying:', exc)
    # Give the page time to react before checking the form again.
    time.sleep(2)

time.sleep(5)


def _click(xpath):
    """Find the element matching *xpath* and click it."""
    bro.find_element_by_xpath(xpath).click()


def _choose(xpath, label):
    """Find the <select> matching *xpath*, pick the option shown as *label*, return the element."""
    dropdown = bro.find_element_by_xpath(xpath)
    Select(dropdown).select_by_visible_text(label)
    return dropdown


# Open the advanced-search page.
_click('//div[@class="gn_header clearfix"]/div[2]/a')
_click('//div[@class="m-search"]/div[3]/a')

# Keyword
key_word = bro.find_element_by_xpath('//div[@class="m-layer"]/div[2]/div/div[1]/dl//input')
key_word.clear()
key_word.send_keys('霧霾')

# Location: both dropdowns are located first, then province and city are set.
province = bro.find_element_by_xpath('//div[@class="m-adv-search"]/div/dl[5]//select[1]')
city = bro.find_element_by_xpath('//div[@class="m-adv-search"]/div/dl[5]//select[2]')
Select(province).select_by_visible_text('陝西')
Select(city).select_by_visible_text('西安')

# Time range -- start
_click('//div[@class="m-adv-search"]/div[1]/dl[4]//input[1]')  # click the date input
sec_1 = _choose('//div[@class="m-caldr"]/div/select[1]', '一月')  # month
sec_2 = _choose('//div[@class="m-caldr"]/div/select[2]', '2019')  # year
_click('//div[@class="m-caldr"]/ul[2]/li[3]')  # start day

# Time range -- end
_click('//div[@class="m-adv-search"]/div[1]/dl[4]//input[2]')  # click the date input
sec_1 = _choose('//div[@class="m-caldr"]/div/select[1]', '一月')  # month
sec_2 = _choose('//div[@class="m-caldr"]/div/select[2]', '2019')  # year
_click('//div[@class="m-caldr"]/ul[2]/li[6]')  # day

# Hour dropdown (second <select> of the time row), then submit via the first
# button in the button box.
sec_3 = _choose('//div[@class="m-adv-search"]/div[1]/dl[4]//select[2]', '8時')  # hour
_click('//div[@class="btn-box"]/a[1]')


# Scrape user ID, post text, post time, client, repost count, comment count
# and like count from up to 50 result pages and append them to 2019.txt.
page_num = 1
with open('2019.txt', 'a', encoding='utf-8') as f:
    while page_num <= 50:
        page_text = bro.page_source
        tree = etree.HTML(page_text)
        div_list = tree.xpath('//div[@id="pl_feedlist_index"]/div[2]/div')
        for div in div_list:
            try:
                user_id = div.xpath('./div/div[1]/div[2]/div[1]/div[2]/a[1]/text()')[0]  # user ID
                # When a third <p> exists, the text sits in p[2] and the
                # time/client links in p[3]; otherwise they are p[1] and p[2].
                if div.xpath('./div/div[1]/div[2]/p[3]'):
                    content_p, meta_p = 'p[2]', 'p[3]'
                else:
                    content_p, meta_p = 'p[1]', 'p[2]'
                body = './div/div[1]/div[2]/'
                content = div.xpath(body + content_p)[0].xpath('string(.)')  # post text
                # Named post_time so the imported ``time`` module is not
                # shadowed (it is needed for the pause below).
                post_time = div.xpath(body + meta_p + '/a/text()')[0]  # publish time
                client = div.xpath(body + meta_p + '/a[2]/text()')[0]  # client
                up = div.xpath('./div/div[2]/ul/li[4]/a')[0].xpath('string(.)')  # likes
                transfer = div.xpath('./div/div[2]/ul/li[2]/a/text()')[0]  # reposts
                comment = div.xpath('./div/div[2]/ul/li[3]/a/text()')[0]  # comments
            except IndexError:
                # An expected node is missing from this entry: skip it.
                continue
            f.write('\n' + user_id + '\n' + content + '\n' + post_time + client
                    + ' ' + transfer + ' ' + comment + ' ' + '贊' + up + '\n' + '\n')
        # The "next page" link sits at a different position on the first page.
        if page_num == 1:
            next_link_xpath = '//div[@class="m-page"]/div/a'
        else:
            next_link_xpath = '//div[@class="m-page"]/div/a[@class="next"]'
        try:
            bro.find_element_by_xpath(next_link_xpath).click()
        except NoSuchElementException:
            # No further page: finish cleanly instead of crashing.
            break
        page_num += 1
        # Short pause before reading the next page's source; adjust to your
        # network speed.
        time.sleep(2)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找 新浪微博搜索頁用戶信息爬取 Python-爬取微博信息爬取某人的微博信息爬取今日熱榜微博的前十名熱點信息數據用selenium爬取某人的微博數據，面向過程方式數據爬蟲爬取微博上的個人所有信息運用Python爬取新浪微博用戶的信息爬取微博文章內容，關鍵字搜索爬取微博內容爬取 nodejs實現定時爬取微博熱搜