要求:
1、題目、url、作者、相對時間以及評論數量
2、存入mongoDB
3、模擬Chrome下拉異步加載新聞
4、相對時間的轉換(1分鍾前。。。。。)
連接mongoDB,設置數據庫名和集合名

實例化Chrome,隱式等待5秒,點擊科技新聞

execute_script 加載js命令運行,兩個循環往下拉下去(這里設定了2000條信息)
時間轉換,我這里比較簡單用了正則匹配數字再利用時間戳去量化時間點

簡簡單單的獲取字段,這里需要注意的是href的提取,以及時間的轉化函數

簡簡單單將信息存入mongodb里面

最后上代碼
from selenium import webdriver
import time
import datetime
from dateutil import parser
import re
import pymongo
client = pymongo.MongoClient()  # connect to MongoDB on the default localhost:27017
db = client['db']  # database name
toutiao = db['toutiao']  # collection name
browser = webdriver.Chrome()  # assumes chromedriver is on PATH — TODO confirm
url = 'https://www.toutiao.com/'
browser.get(url)
browser.implicitly_wait(5)  # implicit wait: poll up to 5 s for elements to appear
browser.find_element_by_link_text('科技').click()  # open the "Tech" news tab
browser.implicitly_wait(3)
# Five parallel lists; save_info() zips them into one document per article.
title_list, url_list, comments_list, pubtime_list, author_list = [], [], [], [], [],
# Scrape the tech page: title, url, author, comment count, relative time
def get_page():
    """Scroll the page to trigger the async infinite-scroll loader and
    harvest items until at least 2000 titles are collected, then close
    the browser.

    NOTE(review): each get_info() pass re-reads every item currently in
    the DOM, so the lists accumulate duplicates and the 2000 threshold
    counts duplicates too — dedupe downstream if unique rows are needed.
    """
    time.sleep(3)  # let the tab finish its first render
    while len(title_list) < 2000:
        # Scroll down in 50 steps to fire the async loader.
        for step in range(50):
            js = "var q=document.documentElement.scrollTop={}".format(step * 200)
            browser.execute_script(js)
            time.sleep(1)  # give newly loaded items time to appear
        get_info()
    # The original used `while...else`; with no `break` in the loop the
    # `else` clause always runs on exit, so a plain statement is equivalent.
    browser.close()
def transform_time(t):
    """Convert a relative Chinese timestamp ('剛剛', 'N分鍾前', 'N小時前',
    'N天前') into an absolute local-time string like
    '2024年01月02日03時04分05秒'.

    Returns None when *t* matches none of the known suffixes or carries
    no digits (the original indexed re.findall()[0] unguarded, raising
    IndexError on such input, and shadowed the builtin `min`).
    """
    if u'剛剛' in t:
        # "just now" — the current time is the publication time
        return time.strftime('%Y年%m月%d日%H時%M分%S秒', time.localtime(time.time()))
    digits = re.findall(r'\d+', t)  # raw string for the regex
    if not digits:
        return None
    amount = int(digits[0])
    if u'分鍾前' in t:
        c = time.time() - amount * 60  # minutes ago
    elif u'小時前' in t:
        c = time.time() - amount * 60 * 60  # hours ago
    elif u'天前' in t:
        c = time.time() - amount * 60 * 60 * 24  # days ago
    else:
        return None
    return time.strftime('%Y年%m月%d日%H時%M分%S秒', time.localtime(c))
def get_info():
    """Read every article currently in the DOM and append its fields to
    the module-level parallel lists.

    The original queried the same title-box XPath twice (once for the
    text, once for the href); a single query halves the DOM round trips
    and prevents title/url skew if the page mutates between queries.
    """
    # Title and href live on the same <a> inside the title box.
    for anchor in browser.find_elements_by_xpath('//div[@class="title-box"]/a'):
        title_list.append(anchor.text)
        url_list.append(anchor.get_attribute('href'))
    for source in browser.find_elements_by_xpath('//a[@class="lbtn source"]'):
        author_list.append(source.text)
    for comment in browser.find_elements_by_xpath('//a[@class="lbtn comment"]'):
        comments_list.append(comment.text)
    for stamp in browser.find_elements_by_xpath('//span[@class="lbtn"]'):
        # Convert the relative label ('5分鍾前', ...) to an absolute string.
        pubtime_list.append(transform_time(stamp.text))
def save_info():
    """Zip the five parallel lists into one document per article, insert
    each into the `toutiao` collection, and echo it to stdout.

    Changes vs. the original: dead commented-out parser code removed,
    the unused insert result dropped, and the module-level `toutiao`
    collection handle reused instead of re-looking up db['toutiao'].
    """
    infos = zip(title_list, url_list, author_list, comments_list, pubtime_list)
    for title, link, source, comment_count, pub_time in infos:
        data = {
            '標題': title,
            'url': link,
            '來源': source,
            '評論': comment_count,
            '時間': pub_time,
        }
        toutiao.insert_one(data)
        print(data)
    print('done')
def main():
    """Drive the scrape, then persist everything that was collected."""
    get_page()
    save_info()


if __name__ == '__main__':
    main()
