爬蟲爬取全國歷史天氣數據


一段很簡單的爬蟲程序,爬取的網站為 http://www.tianqihoubao.com。可以自行修改要爬取的城市以及月份,此處示例爬取的是 2018 年 1 月到 7 月的數據。

from bs4 import BeautifulSoup
import requests
import pymysql
import warnings
# import pinyin
# from pinyin import PinYin
from pypinyin import pinyin, lazy_pinyin
import pypinyin
warnings.filterwarnings("ignore")

# Open a MySQL connection to the local `test2` database. Both `conn` and
# `cursor` are module-level globals shared by get_temperature() below;
# commits are issued by the __main__ loop, once per city.
# NOTE(review): credentials are hard-coded — move to config/env for real use.
conn = pymysql.connect(host='localhost', user='root', passwd='root', db='test2', port=3306, charset='utf8')
cursor = conn.cursor()
def get_temperature(url, city):
    """Scrape one month of daily weather records from a tianqihoubao.com
    history page and insert them into the `weather` table.

    Parameters
    ----------
    url : str
        Monthly history page, e.g. .../lishi/<city>/month/201801.html.
    city : str
        City name (Chinese) stored verbatim in the `city` column.

    Rows are inserted through the module-level `cursor`; the caller is
    responsible for committing the transaction.
    """
    # Browser-like User-Agent so the site does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,  like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # A timeout keeps the crawler from hanging forever on a dead connection.
    response = requests.get(url, headers=headers, timeout=30).content
    soup = BeautifulSoup(response, "lxml")

    for info in soup.findAll('div', class_='wdetail'):
        # The first <tr> is the table header row — skip it.
        for tr in info.find_all('tr')[1:]:
            td_list = tr.find_all('td')
            if len(td_list) < 4:
                # Malformed or empty row — skip instead of raising IndexError.
                continue

            # Each cell may contain embedded newlines; weather/temperature/wind
            # cells hold a "day / night" pair — keep only the daytime value.
            date = td_list[0].text.strip().replace("\n", "")
            weather = td_list[1].text.strip().replace("\n", "").split("/")[0].strip()
            temperature = td_list[2].text.strip().replace("\n", "").split("/")[0].strip()
            wind = td_list[3].text.strip().replace("\n", "").split("/")[0].strip()

            print(city, date, weather, wind, temperature)
            # Parameterized query — values are escaped by the driver.
            cursor.execute(
                'insert into weather(city, date, weather, wind, temp) values(%s, %s, %s, %s, %s)',
                (city, date, weather, wind, temperature))
if __name__ == '__main__':
    # Cities to crawl (Chinese names). The trailing "市" suffix is dropped
    # before pinyin conversion, which is how tianqihoubao.com keys its URLs.
    citys1 = ["開原市", "北票市", "凌源市", "興城市"]

    for city in citys1:
        city_pinyin = ''.join(lazy_pinyin(city[:-1]))
        print(city_pinyin)
        # January–July 2018: generate the monthly URLs instead of hand-writing
        # seven nearly identical strings; widen the range to crawl more months.
        urls = [
            'http://www.tianqihoubao.com/lishi/%s/month/2018%02d.html' % (city_pinyin, month)
            for month in range(1, 8)
        ]
        for url in urls:
            get_temperature(url, city)
        # Commit once per city so a mid-run failure only loses the current city.
        conn.commit()

    # Release the MySQL connection when all cities are done.
    conn.close()



 

 

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM