[Python爬蟲] 之四:Selenium 抓取微博數據


抓取代碼:

# coding=utf-8
import os
import re
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import IniFile
class weibo:
    """Scrapes hot-topic data from weibo.com using a Selenium IE WebDriver.

    Configuration is read from ``config.conf`` in the working directory:
    the ``IEDriverServer`` path and an optional per-page delay.
    """

    def __init__(self):
        # Resolve the IEDriverServer.exe path from the config file.
        configfile = os.path.join(os.getcwd(), 'config.conf')
        cf = IniFile.ConfigFile(configfile)
        IEDriverServer = cf.GetValue("section", "IEDriverServer")
        # Delay after each scraped page, in seconds (default 5).
        self.pageDelay = 5
        pageInteralDelay = cf.GetValue("section", "pageInteralDelay")
        if pageInteralDelay:
            self.pageDelay = int(pageInteralDelay)

        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def scroll_top(self):
        """Scroll the browser window to the top of the page.

        :return: result of the injected JavaScript snippet.
        """
        # Chrome scrolls via document.body; IE/others via documentElement.
        if self.driver.name == "chrome":
            js = "var q=document.body.scrollTop=0"
        else:
            js = "var q=document.documentElement.scrollTop=0"
        return self.driver.execute_script(js)

    def scroll_foot(self):
        """Scroll the browser window towards the bottom of the page.

        :return: result of the injected JavaScript snippet.
        """
        if self.driver.name == "chrome":
            js = "var q=document.body.scrollTop=10000"
        else:
            js = "var q=document.documentElement.scrollTop=10000"
        return self.driver.execute_script(js)

    def printTopic(self, topic):
        """Parse one topic's raw text and print its fields.

        Expected layout (as produced by the Weibo topic list): the topic
        text, then the last ``@`` introduces "author timestamp", and a
        ``ñ`` character separates that from "likes comments shares".

        :param topic: raw ``element.text`` of a single topic node.
        """
        print('原始數據: %s' % topic)
        print(' ')
        # The author/stats section starts at the LAST '@' in the text.
        author_time_nums_index = topic.rfind('@')
        ht = topic[:author_time_nums_index].replace('\n', '')
        print('話題: %s' % ht)

        author_time_nums = topic[author_time_nums_index:]
        author_time = author_time_nums.split('ñ')[0]
        nums = author_time_nums.split('ñ')[1]
        # Matches "N分鍾前", "今天 HH:MM" or "M月D日 HH:MM".
        pattern1 = re.compile(r'\d{1,2}分鍾前|今天\s{1}\d{2}:\d{2}|\d{1,2}月\d{1,2}日\s{1}\d{2}:\d{2}')
        time1 = re.findall(pattern1, author_time)

        print('話題作者: %s' % author_time.split(' ')[0])
        # Guard against timestamps the pattern does not recognise
        # (the original indexed time1[0] unconditionally and could raise).
        print('時間: %s' % (time1[0] if time1 else author_time))
        print('點贊量: %s' % nums.split(' ')[0])
        print('評論量: %s' % nums.split(' ')[1])
        print('轉發量: %s' % nums.split(' ')[2])
        print(' ')

    def CatchData(self, listClass, firstUrl):
        """Scrape topic data from the given start page.

        :param listClass: XPath expressions locating the topic <li> nodes.
        :param firstUrl: URL of the first page to load.
        """
        start = time.perf_counter()  # time.clock() was removed in Python 3.8
        wait = ui.WebDriverWait(self.driver, 20)
        self.driver.get(firstUrl)
        print(self.driver.title)

        # Scroll the page 5 times so lazily-loaded topics are rendered.
        # NOTE(review): the original paste's indentation was lost; this
        # reconstruction scrolls first, then scans once — confirm intent.
        Scrollcount = 5
        while Scrollcount > 0:
            Scrollcount = Scrollcount - 1
            self.scroll_foot()

        total = 0
        try:
            for className in listClass:
                time.sleep(10)
                wait.until(lambda driver: self.driver.find_elements_by_xpath(className))
                Elements = self.driver.find_elements_by_xpath(className)
                for element in Elements:
                    print(' ')
                    # element.text is already str in Python 3; no encode().
                    self.printTopic(element.text)
                    total = total + 1
        finally:
            # Always release the browser, even if scraping fails midway.
            self.driver.close()
            self.driver.quit()
        end = time.perf_counter()

        print(' ')
        print("共抓取了: %d 個話題" % total)
        print("整個過程用時間: %f 秒" % (end - start))

# #測試抓取微博數據
# Test: scrape Weibo topic data.
# Guarded so importing this module does not launch a browser.
if __name__ == "__main__":
    obj = weibo()
    # The topic rows alternate between two CSS classes:
    #   pt_li pt_li_1 S_bg2  /  pt_li pt_li_2 S_bg2
    # firstUrl = "http://weibo.com/?category=0"
    firstUrl = "http://weibo.com/?category=1760"
    listClass = [
        "//li[@class='pt_li pt_li_1 S_bg2']",
        "//li[@class='pt_li pt_li_2 S_bg2']",
    ]
    obj.CatchData(listClass, firstUrl)

  登錄窗口

  

  

 def longon(self):
     """Log in to weibo.com through the browser form.

     :return: True if every login step executed without raising,
              False otherwise (any Selenium error is swallowed).
     """
     flag = True
     try:
         self.driver.get('https://weibo.com/')
         self.driver.maximize_window()
         time.sleep(2)
         accname = self.driver.find_element_by_id("loginname")
         # NOTE(review): credentials are hard-coded placeholders;
         # they should come from config.conf, not source code.
         accname.send_keys('username')
         accpwd = self.driver.find_element_by_name("password")
         accpwd.send_keys('password')
         submit = self.driver.find_element_by_xpath("//div[@class='info_list login_btn']/a")
         submit.click()
         time.sleep(2)
     except Exception:
         # Broad by design: any failure just reports an unsuccessful login.
         # (The original bound the exception to an unused local; removed.)
         flag = False
     return flag

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM