2018-7-12python爬取歷史天氣數據

python 爬蟲天氣數據

需求

需要幾個城市的歷史天氣數據。為了方便最後入庫，需要的字段為城市、溫度、天氣；最好能生成一個完整的 csv 文件用於導入數據。

import glob  # collects the per-city CSV files for concatenation
import http.client
import http.cookiejar
import re
import time
import urllib
import urllib.request

from bs4 import BeautifulSoup as bsp
# xlwt writes Excel workbooks and can add multiple sheets
import xlwt

# Cookie handling: an empty cookie jar is wrapped in a cookie processor,
# and the opener built from that processor is what later requests go through.
cookie = http.cookiejar.CookieJar()
cj = urllib.request.HTTPCookieProcessor(cookiejar=cookie)
opener = urllib.request.build_opener(cj)

# Send browser-like headers with every request made through this opener.
opener.addheaders = [
    ('User-agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:37.0) Gecko/20100101 Firefox/37.0'),
    ('Connection', ' keep-alive'),
]

# Make this opener the default one used by urllib.request.urlopen().
urllib.request.install_opener(opener)

# Index page that lists every city with historical weather data.
root_url='http://www.tianqihoubao.com/lishi/'

# Fetch the index page. The bytes are decoded as GBK (undecodable bytes are
# dropped instead of aborting the script), the response is closed as soon as
# it has been read, and the parser is named explicitly so the parse result
# does not depend on which optional parsers happen to be installed.
req = urllib.request.Request(root_url)
with urllib.request.urlopen(req) as resp:
    u = bsp(resp.read().decode('gbk', errors='ignore'), 'html.parser')

# Locate the city table (element with class "citychk").
u1=u.find(class_="citychk")

# Every <dd> inside the table; empty when the table is missing, so a layout
# change on the site does not crash the script with an AttributeError here.
u2=u1.find_all('dd') if u1 is not None else []


# URL slug and display name of every city to crawl (parallel lists).
city_urls=[]
city_names=[]

# Alternative: crawl every city listed on the index page instead of the
# hand-picked ones below. Uncomment to enable.
#for uu in u2:
#    for u3u in uu.find_all('a'):
#        city_urls.append(re.split(r'\.|/',u3u.attrs['href'])[2])
#        city_names.append(u3u.text)



#### shg 2018/07/12 ###############
#### Put the pinyin slug and the Chinese name of each wanted city into the
#### two lists below (same order in both); list the wanted months likewise.
city_urls=["nanjing","beijing"]
city_names=["南京","北京"]
months=['201712','201801','201802','201803','201804','201805','201806','201807']
#### shg 2018/07/12 ###############



# Matches any whitespace character; compiled once because it is applied to
# every table cell of every page.
whitespace = re.compile(r'\s')

# Crawl every configured month of every configured city, one CSV per city.
for i,city_url in enumerate(city_urls):
    # Pause 2 seconds before each city.
    time.sleep(2)
    with open('D:/天氣數據/'+str(i)+city_names[i]+'.csv','w') as f:
        # Only the first city's CSV gets the header row, so that the
        # concatenated file ends up with a single header.
        if i == 0:
            f.write('日期,天氣狀況,氣溫,城市,風力風向\n')
        for month in months:
            url_month='http://www.tianqihoubao.com/lishi/'+city_url+'/month/'+month+'.html'
            req = urllib.request.Request(url_month)

            # Retry until the page has been downloaded, waiting 5 minutes
            # after each failure. Only network-level errors are retried;
            # KeyboardInterrupt and programming errors propagate instead of
            # being swallowed into an endless sleep loop.
            while True:
                try:
                    with urllib.request.urlopen(req) as resp:
                        raw = resp.read()
                    break
                except (OSError, http.client.HTTPException) as err:
                    print('連接失敗（%s），5分鐘後重試：%s' % (err, url_month))
                    time.sleep(300)

            # Decode outside the retry loop and drop undecodable bytes: a
            # single bad byte must not trigger another download attempt.
            u = bsp(raw.decode('gbk', errors='ignore'), 'html.parser')

            # A page without a table has nothing to export; skip the month.
            if u.table is None:
                print('未找到數據表格，已跳過：%s' % url_month)
                continue

            # Text of every cell with all whitespace removed.
            u1 = [whitespace.sub('', x.text) for x in u.table.find_all('td')]

            # The first 4 cells are the table header. The remaining cells are
            # consumed 4 at a time; an incomplete trailing group is dropped
            # rather than written as a partial line.
            cells = u1[4:]
            for start in range(0, len(cells) - 3, 4):
                first, second, third, fourth = cells[start:start + 4]
                # The city name is inserted before the 4th cell, matching the
                # column order of the header written above.
                f.write(','.join((first, second, third, city_names[i], fourth)) + '\n')
                    
                    
                    
#### shg 2018/07/12 #################
##### Concatenate the per-city CSV files into one file.
##### NOTE: adjust the directory below to match where the files were written.
# glob returns paths in arbitrary order, so sort them: the file prefixed with
# "0" is the only one written with a header row and must come first.
csvx_list = sorted(glob.glob('D:/天氣數據/*.csv'))
print('總共發現%s個CSV文件'% len(csvx_list))
time.sleep(2)
# Open the output once in 'w' mode so that re-running the script replaces the
# previous result instead of appending a second copy of all the data.
with open('weather_history.csv','w') as f:
    for i in csvx_list:
        # Context manager closes each input file right after it is read.
        with open(i,'r') as src:
            f.write(src.read())
print('拼接完畢！')
#### shg 2018/07/12 #################

在做的過程中遇到了幾個問題，記錄如下：

1 寫入excel中，分成多個sheet

導入xlwt包。

import xlwt
# Create the workbook; the text encoding is set explicitly.
workbook = xlwt.Workbook(encoding='utf-8')
# Add both sheets up front. cell_overwrite_ok=True is what allows the same
# cell to be written more than once.
booksheet1, booksheet2 = (
    workbook.add_sheet(title, cell_overwrite_ok=True)
    for title in ("beijing", "jing")
)
# Write one value into the cell at row index 1, column index 1 of each sheet.
booksheet1.write(1, 1, "beijing")
booksheet2.write(1, 1, "jing")
# Save the workbook to disk.
workbook.save('weather.xls')

2 拼接多個csv文件。

縱向的拼接，需要兩個csv文件的列相同，注意兩點：

如果是循環跑出來的表，每個表都有表頭，拼接的時候表頭不會被自動去掉（所以只應在第一個文件中寫表頭）。
拼接的順序就是文件列表的順序；glob 本身不保證返回順序，若要按文件名從上到下依次排列，請先用 sorted() 對列表排序。

import glob
import time

# glob returns paths in arbitrary order; sort them so the files are
# concatenated in file-name order.
csvx_list = sorted(glob.glob('D:/天氣數據/*.csv'))
print('總共發現%s個CSV文件'% len(csvx_list))
time.sleep(2)
print('正在處理............')
# Open the output once in 'w' mode so that a second run replaces the previous
# result instead of appending a duplicate copy of the data.
with open('csv_to_csv.csv','w') as f:
    for i in csvx_list:
        # Context manager closes each input file right after it is read.
        with open(i,'r') as src:
            f.write(src.read())
        print('寫入成功！')
print('寫入完畢！')
print('10秒鍾自動關閉程序！')

3 獲取網頁上表格的方法

# Text of every <td> in the page's first table, with all whitespace removed.
# The pattern is a raw string so that \s reaches the regex engine unchanged
# instead of being an invalid string escape.
[re.sub(r'\s','',x.text) for x in u.table.find_all('td')]

re.sub("被替換的內容", "替換成的內容", 需替換的字符串)

替換的內容可以使用正則表達式，\ 為轉義字符；| 為或，可以拼接多個條件；例如：

import re
# Strip the square brackets and double quotes. A raw-string character class
# replaces the original alternation, which used invalid string escapes and
# listed the double quote twice; the result is unchanged.
re.sub(r'[\[\]"]','','["88585465","64325165","1685654"]')

>>> '88585465,64325165,1685654'

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python爬取天氣網歷史天氣數據爬蟲爬取全國歷史天氣數據 python爬取天氣數據的實例詳解天氣數據爬取+pyechart可視化 python獲取天氣數據初識python 之爬蟲：爬取中國天氣網數據 python3 - 抓取全國天氣數據並存入excel python讀取txt天氣數據並使用matplotlib模塊繪圖 python爬取天氣后報網 python爬取中國天氣網站數據並對其進行數據可視化