Scrape the air quality reports from https://www.aqistudy.cn/historydata and write the scraped data to a CSV file.
Create the Scrapy project:
scrapy startproject air_quality
Generate the spider:
scrapy genspider api_history_spider https://www.aqistudy.cn/historydata/index.php
The resulting project directory looks like this:
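(The original screenshot is replaced here by the standard layout that scrapy startproject generates; the spider file appears under spiders/ after the genspider command above is run.)

air_quality/
├── scrapy.cfg
└── air_quality/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── api_history_spider.py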
settings.py
ITEM_PIPELINES = {
    'air_quality.pipelines.AirQualityPipeline': 300,
}
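Depending on how the site treats crawlers, a few extra settings are often useful in the same file. These are optional assumptions on my part, not part of the original configuration:

# Optional politeness/compatibility settings (not required by the tutorial)
ROBOTSTXT_OBEY = False       # the history pages may be disallowed for generic robots
DOWNLOAD_DELAY = 1           # throttle to roughly one request per second
USER_AGENT = 'Mozilla/5.0'   # some sites reject Scrapy's default user agent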
items.py
import scrapy


class AirQualityItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    city_name = scrapy.Field()      # city name
    record_date = scrapy.Field()    # monitoring date
    aqi_val = scrapy.Field()        # AQI
    range_val = scrapy.Field()      # AQI range
    quality_level = scrapy.Field()  # air quality level
    pm2_5_val = scrapy.Field()      # PM2.5
    pm10_val = scrapy.Field()       # PM10
    so2_val = scrapy.Field()        # SO2
    co_val = scrapy.Field()         # CO
    no2_val = scrapy.Field()        # NO2
    o3_val = scrapy.Field()         # O3
    rank = scrapy.Field()           # rank
pipelines.py
from scrapy.exporters import CsvItemExporter


class AirQualityPipeline(object):

    def open_spider(self, spider):
        # CsvItemExporter writes bytes, so the file must be opened in binary mode
        self.file = open('air_quality.csv', 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
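By default CsvItemExporter takes its column order from the first item it exports. If you want a fixed column order, the exporter accepts a fields_to_export argument; a minimal sketch of that variant (the field list simply mirrors items.py):

FIELDS = ['city_name', 'record_date', 'aqi_val', 'range_val',
          'quality_level', 'pm2_5_val', 'pm10_val', 'so2_val',
          'co_val', 'no2_val', 'o3_val', 'rank']

# inside open_spider, instead of the call above:
# self.exporter = CsvItemExporter(self.file, fields_to_export=FIELDS)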
api_history_spider.py
# -*- coding: utf-8 -*-
import scrapy
from urllib import parse
from air_quality.items import AirQualityItem

base_url = 'https://www.aqistudy.cn/historydata/'


class ApiHistorySpiderSpider(scrapy.Spider):
    name = 'api_history_spider'
    allowed_domains = ["aqistudy.cn"]
    start_urls = ['https://www.aqistudy.cn/historydata/']

    def parse(self, response):
        """
        Parse the start page and collect the URL for every city.
        """
        city_url_list = response.xpath('//div[@class="all"]//div[@class="bottom"]//a//@href')

        for city_url in city_url_list:
            # Visit each city's monthly-data page
            city_month_url = base_url + city_url.extract()
            request = scrapy.Request(city_month_url, callback=self.parse_city_month)
            yield request

    def parse_city_month(self, response):
        """
        Parse a city's page and collect the URL for every month.
        """
        month_url_list = response.xpath('//table[@class="table table-condensed '
                                        'table-bordered table-striped table-hover '
                                        'table-responsive"]//a//@href')

        for month_url in month_url_list:
            # Visit each month's daily-data page
            city_day_url = base_url + month_url.extract()
            request = scrapy.Request(city_day_url, callback=self.parse_city_day)
            yield request

    def parse_city_day(self, response):
        """
        Parse one month of daily records for a city.
        """
        url = response.url
        # The city name sits URL-encoded in the query string (?city=...&month=...)
        city_url_name = url[url.find('=') + 1:url.find('&')]

        day_record_list = response.xpath('//table[@class="table table-condensed '
                                         'table-bordered table-striped table-hover '
                                         'table-responsive"]//tr')
        for i, day_record in enumerate(day_record_list):
            if i == 0:
                # Skip the table header row
                continue
            td_list = day_record.xpath('.//td')

            # Build a fresh item per row so a yielded item is never mutated afterwards
            item = AirQualityItem()
            item['city_name'] = parse.unquote(city_url_name)  # decode the URL-encoded city name
            item['record_date'] = td_list[0].xpath('text()').extract_first()           # monitoring date
            item['aqi_val'] = td_list[1].xpath('text()').extract_first()               # AQI
            item['range_val'] = td_list[2].xpath('text()').extract_first()             # AQI range
            item['quality_level'] = td_list[3].xpath('.//div/text()').extract_first()  # air quality level
            item['pm2_5_val'] = td_list[4].xpath('text()').extract_first()             # PM2.5
            item['pm10_val'] = td_list[5].xpath('text()').extract_first()              # PM10
            item['so2_val'] = td_list[6].xpath('text()').extract_first()               # SO2
            item['co_val'] = td_list[7].xpath('text()').extract_first()                # CO
            item['no2_val'] = td_list[8].xpath('text()').extract_first()               # NO2
            item['o3_val'] = td_list[9].xpath('text()').extract_first()                # O3
            item['rank'] = td_list[10].xpath('text()').extract_first()                 # rank

            yield item
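To make the city_name decoding concrete, here is what the slicing plus parse.unquote does for a typical daily-data URL. The example URL is illustrative, following the site's daydata.php?city=...&month=... pattern assumed above:

from urllib import parse

url = 'https://www.aqistudy.cn/historydata/daydata.php?city=%E5%8C%97%E4%BA%AC&month=2014-01'
city_url_name = url[url.find('=') + 1:url.find('&')]
print(city_url_name)                 # %E5%8C%97%E4%BA%AC
print(parse.unquote(city_url_name))  # 北京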
Run the spider:
scrapy crawl api_history_spider
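As an aside, Scrapy's built-in feed export can write the same CSV without the custom pipeline; this is an equivalent alternative to the setup above, not what this tutorial uses:

scrapy crawl api_history_spider -o air_quality.csv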
The crawl produces the air_quality.csv file.
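The screenshot of the file is not reproducible here, but the header row follows from the fields defined in items.py; assuming fields_to_export is set as sketched earlier, the first line of the file would be:

city_name,record_date,aqi_val,range_val,quality_level,pm2_5_val,pm10_val,so2_val,co_val,no2_val,o3_val,rank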