要求:
1、題目、url、作者、相對時間以及評論數量
2、存入mongoDB
3、模擬Chrome下拉異步加載新聞
4、相對時間的轉換(1分鍾前。。。。。)
連接mongoDB,設置數據庫名和集合名
實例化Chrome,隱式等待5秒,點擊科技新聞
execute_script 加載js命令運行,兩個循環往下拉下去(這里設定了2000條信息)
時間轉換,我這里比較簡單用了正則匹配數字再利用時間戳去量化時間點
簡簡單單的獲取字段,這里需要注意的是href的提取,以及時間的轉化函數
簡簡單單將信息存入mongodb里面
最后上代碼
"""Scrape the Toutiao tech channel (title, url, source, comment count,
relative publish time) by scrolling a Chrome session, and store each
item as a document in MongoDB (db 'db', collection 'toutiao')."""
from selenium import webdriver
import time
import datetime
from dateutil import parser
import re
import pymongo

# MongoDB connection: database 'db', collection 'toutiao'.
client = pymongo.MongoClient()
db = client['db']
toutiao = db['toutiao']

# Launch Chrome, open the Toutiao front page and click into the tech channel.
browser = webdriver.Chrome()
url = 'https://www.toutiao.com/'
browser.get(url)
browser.implicitly_wait(5)  # implicit wait so elements have time to appear
browser.find_element_by_link_text('科技').click()
browser.implicitly_wait(3)

# Parallel accumulators shared by get_page / get_info / save_info.
title_list, url_list, comments_list, pubtime_list, author_list = [], [], [], [], []


def get_page():
    """Scroll the page in steps (triggering lazy loading), harvesting the
    visible items after each scroll pass, until ~2000 titles are collected;
    then close the browser."""
    time.sleep(3)
    while len(title_list) < 2000:
        for i in range(50):
            # Scroll down incrementally so the page async-loads more items.
            js = "var q=document.documentElement.scrollTop={}".format(i * 200)
            browser.execute_script(js)
            time.sleep(1)
        get_info()
    browser.close()


def transform_time(t):
    """Convert a relative time string such as '5分鍾前' into an absolute
    'YYYY年MM月DD日HH時MM分SS秒' timestamp string.

    Returns None for strings that are not a recognised relative time.
    """
    if u'剛剛' in t:  # "just now" -> current local time
        return time.strftime('%Y年%m月%d日%H時%M分%S秒', time.localtime(time.time()))
    # Raw-string regex; guard against inputs with no digits, which previously
    # raised IndexError before the final fallback could return None.
    digits = re.findall(r'\d+', t)
    if not digits:
        return None
    amount = int(digits[0])  # renamed from 'min', which shadowed the builtin
    if u'分鍾前' in t:        # N minutes ago
        c = time.time() - amount * 60
    elif u'小時前' in t:      # N hours ago
        c = time.time() - amount * 60 * 60
    elif u'天前' in t:        # N days ago
        c = time.time() - amount * 60 * 60 * 24
    else:
        return None
    return time.strftime('%Y年%m月%d日%H時%M分%S秒', time.localtime(c))


def get_info():
    """Scrape title / url / source / comment-count / publish-time fields from
    the current DOM and append them to the module-level accumulators."""
    # A single query serves both the title text and its href (the original
    # ran the same XPath twice).
    for anchor in browser.find_elements_by_xpath('//div[@class="title-box"]/a'):
        title_list.append(anchor.text)
        url_list.append(anchor.get_attribute('href'))
    for source in browser.find_elements_by_xpath('//a[@class="lbtn source"]'):
        author_list.append(source.text)
    for comment in browser.find_elements_by_xpath('//a[@class="lbtn comment"]'):
        comments_list.append(comment.text)
    for pubtime in browser.find_elements_by_xpath('//span[@class="lbtn"]'):
        pubtime_list.append(transform_time(pubtime.text))


def save_info():
    """Zip the parallel field lists into documents and insert them one by one
    into the 'toutiao' collection."""
    for title, link, source, comment, pub in zip(
            title_list, url_list, author_list, comments_list, pubtime_list):
        data = {
            '標題': title,
            'url': link,
            '來源': source,
            '評論': comment,
            '時間': pub,
        }
        # Use the collection handle defined at module level instead of
        # re-resolving db['toutiao'] on every insert.
        toutiao.insert_one(data)
        print(data)
    print('done')


def main():
    """Entry point: crawl the page, then persist the collected items."""
    get_page()
    save_info()


if __name__ == '__main__':
    main()