環境:py3.4.4 32位
需要安裝的第三方庫:selenium、beautifulsoup4(導入名為 bs4)、xlwt
# coding = utf-8
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import re #re模塊提供正則表達式支持
import xlwt
# Scrape every page of a Tencent Video channel listing with Selenium and
# write the video titles (column 0) and play counts (column 1) to an .xls
# workbook named after today's date.

url = 'http://v.qq.com/vplus/huilanyujia/videos'

# Seconds to wait after a navigation/click before reading page_source.
# Reading page_source right after click() returned the *previous* page's
# markup, so the wait has to happen between the click and the read.
# NOTE(review): a fixed sleep is the simplest fix; an explicit Selenium wait
# on the list content would be more robust if the site is slow.
PAGE_LOAD_DELAY = 2

# Numbered pager buttons have ids pager_num_0_2 ... pager_num_0_21; the final
# page uses a different id.
FIRST_PAGER_NUM = 2
LAST_PAGER_NUM = 21
LAST_PAGE_ID = 'pager_last_0'

# Rows 0 and 1 of the sheet hold notices; data starts on row 3.
FIRST_DATA_ROW = 3

# Compiled once instead of once per scraped element.
TITLE_PATTERN = re.compile(r'(target="_blank">)(.+)(</a>)')
# Accept both the traditional (萬) and simplified (万) "ten thousand" unit so
# the unit is not silently dropped from the play count.
PLAY_COUNT_PATTERN = re.compile(r'\d+\.?[萬万]?')


def extract_title(markup):
    """Return the link text found in a title element's HTML.

    markup: HTML of one title element, as a string.
    Returns the text between ``target="_blank">`` and ``</a>``, or an empty
    string when the markup has no such link (instead of raising).
    """
    match = TITLE_PATTERN.search(markup)
    return match.group(2) if match else ''


def extract_play_count(markup):
    """Return the play count found in an info element's HTML.

    markup: HTML of one info element, as a string.
    Returns every numeric fragment (with optional trailing dot / unit)
    concatenated in order, e.g. ``'12.5萬'``; empty string if none is found.
    """
    return ''.join(PLAY_COUNT_PATTERN.findall(markup))


def scrape_current_page(driver, sheet, title_row, count_row):
    """Write the titles and play counts of the page currently shown.

    driver: the Selenium webdriver whose page_source is parsed.
    sheet: the xlwt worksheet to write into.
    title_row / count_row: next free row for column 0 / column 1.
    Returns the updated ``(title_row, count_row)`` pair.
    """
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Coarse filtering with BeautifulSoup, fine extraction with the regexes.
    for tag in soup.find_all("strong", class_="figure_title figure_title_two_row"):
        # Fall back to the element's plain text so an unexpected title layout
        # still yields a row and keeps both columns aligned.
        title = extract_title(str(tag)) or tag.get_text(strip=True)
        sheet.write(title_row, 0, title)
        title_row += 1

    for tag in soup.find_all("span", class_="info_inner"):
        sheet.write(count_row, 1, extract_play_count(str(tag)))
        count_row += 1

    return title_row, count_row


def main():
    """Walk through all listing pages and save the collected data."""
    tudou = webdriver.Firefox()
    try:
        tudou.get(url)

        # Create the workbook and sheet (note the capital W in Workbook).
        workbook = xlwt.Workbook()
        sheet1 = workbook.add_sheet('優酷', cell_overwrite_ok=True)
        # NOTE(review): the first notice predates the wait-after-click fix;
        # drop it once the last page is confirmed to be captured on the
        # live site.
        sheet1.write(0, 0, '由於bug,暫時!!最后一頁數據需要自己手動統計')
        sheet1.write(1, 0, '如有技術問題,請聯系陳鼎,微信chending2012')

        count1 = count2 = FIRST_DATA_ROW

        # Page 1 is already displayed after get(); scrape it before paging.
        time.sleep(PAGE_LOAD_DELAY)
        count1, count2 = scrape_current_page(tudou, sheet1, count1, count2)

        # The last page is handled by the same loop; only its id differs.
        pager_ids = ['pager_num_0_' + str(num)
                     for num in range(FIRST_PAGER_NUM, LAST_PAGER_NUM + 1)]
        pager_ids.append(LAST_PAGE_ID)

        for pager_id in pager_ids:
            tudou.find_element_by_id(pager_id).click()
            # Let the new page replace the old one before reading it.
            time.sleep(PAGE_LOAD_DELAY)
            count1, count2 = scrape_current_page(tudou, sheet1, count1, count2)

        # Save the workbook; an existing file with the same name is overwritten.
        excel_name = time.strftime('%Y-%m-%d') + '.xls'
        workbook.save(excel_name)
        print('done')
    finally:
        # Always close the browser, even when scraping fails part-way.
        tudou.quit()


if __name__ == '__main__':
    main()
暫時寫這麼多,後期會優化代碼、編寫界面。
這裡有一個bug:selenium 翻頁以後,獲取到的網頁內容為前一頁的,而非當前頁。望大神指點。
2017.2.22 -------------------------------------
bug已經解決:點擊最後一頁的頁碼之後,再任意點擊前面的某一頁,此時獲取到的即是最後一頁的數據!
--------------------------------------------------
-------------------------附錄
1. python高手之路python處理excel文件(方法匯總) http://www.jb51.net/article/77626.htm
2. python模塊介紹- xlwt 創建xls文件(excel) http://www.cnblogs.com/snake-hand/p/3153158.html
3. Selenium WebDriver(Python)第三版 http://wenku.baidu.com/view/cd580331b6360b4c2e3f5727a5e9856a5612268d
4. Beautiful Soup 中文文檔 https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html