Python采集網頁數據保存到excel


使用urllib讀取網頁,然後用xlwt(python-excel專案的寫入模組)將數據寫入excel。

import urllib
from xlwt import Workbook
import datetime
def FetchData():
    """Scrape 100 consecutive daily pages and store the results in an .xls file.

    Starting from 2010-01-01, one URL per day is requested.  For every day the
    date string is written to column 0 of row ``i`` (rows 1..100) and the
    value extracted by ``getdata('date_west', ...)`` from the first matching
    line of the page is written to column 1.  The workbook is saved as
    ``simple.xls`` in the current directory once all pages were processed.

    If a page cannot be opened the error is printed and the function returns
    immediately WITHOUT saving the workbook (same as the original behaviour).
    """
    # Local import: the file never imports ``sys`` at module level, which made
    # the original raise NameError on the first successfully opened page.
    import sys

    book = Workbook(encoding='gbk')     # needed when the scraped data contains Chinese text
    sheet1 = book.add_sheet('Sheet 2')  # in-memory sheet, flushed by book.save()
    # Loop-invariant, so look it up once (and do not shadow the builtin ``type``).
    fs_encoding = sys.getfilesystemencoding()
    theday = datetime.date(2009, 12, 31)
    # Scenario: fetch 100 pages, each URL carrying a date.
    for i in range(1, 101):
        theday = theday + datetime.timedelta(days=1)
        print(theday)
        theday_str = str(theday)
        sheet1.write(i, 0, theday_str)  # column 0: the date
        check_url = r'http://www.xxx.com/index?date=' + theday_str  # page address
        try:
            checkfile = urllib.urlopen(check_url)  # file-like response object
        except Exception as e:
            print(e)
            return
        try:
            for line in checkfile:
                # The page is served as UTF-8; re-encode to the local encoding.
                line = line.decode("UTF-8").encode(fs_encoding)
                date_west = getdata('date_west', line)  # extract the wanted value
                if date_west is not False:
                    sheet1.write(i, 1, date_west)
                    # xlwt refuses to overwrite an already written cell by
                    # default, so stop at the first match on this page.
                    break
        finally:
            checkfile.close()  # do not leak the connection
    book.save('simple.xls')  # write the excel file to disk
    print('finish!')
def getdata(keywords, line):
    """Return the text between the first '>' and the following '</' of *line*.

    The extraction only happens when *keywords* occurs somewhere in *line*.

    Args:
        keywords: substring that marks the line as interesting.
        line: one line of page markup.

    Returns:
        The extracted text (possibly an empty string), or ``False`` when
        *keywords* is not in *line* or when the line has no '>' followed by
        a '</' to delimit the value.
    """
    if keywords not in line:
        return False
    start = line.find('>')
    if start == -1:
        # No opening delimiter: slicing with -1 would return garbage.
        return False
    end = line.find('</', start)
    if end == -1:
        # No closing delimiter: slicing with -1 would drop the last character.
        return False
    return line[start + 1:end]

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM