数据分析~案例:中国五大城市PM2.5数据分析


config.py

import os

# Directory that holds the input CSV data sets.
dataset_path = './data'

# Directory that receives the generated result files.
output_path = './output'

# Create the output directory at import time so writers can rely on it.
# exist_ok=True removes the check-then-create race of
# os.path.exists() + os.mkdir(), and makedirs also creates any missing
# parent directories should output_path become a nested path.
os.makedirs(output_path, exist_ok=True)

# Columns that are read from every city's data file.
common_cols = ['year', 'month']

# Data file and station names to analyse for each city,
# stored as {city: (file name, [station names])}.
data_config_dict = {
    'beijing': ('BeijingPM20100101_20151231.csv',
                ['Dongsi', 'Dongsihuan', 'Nongzhanguan']),
    'chengdu': ('ChengduPM20100101_20151231.csv',
                ['Caotangsi', 'Shahepu']),
    'guangzhou': ('GuangzhouPM20100101_20151231.csv',
                  ['City Station', '5th Middle School']),
    'shanghai': ('ShanghaiPM20100101_20151231.csv',
                 ['Jingan', 'Xuhui']),
    'shenyang': ('ShenyangPM20100101_20151231.csv',
                 ['Taiyuanjie', 'Xiaoheyan']),
}

====================================================================================================================

main.py

"""

案例:中国五大城市PM2.5数据分析
任务:
- 五城市污染状态
- 五城市每个区空气质量的月度差异

数据集来源:https://www.kaggle.com/uciml/pm25-data-for-five-chinese-cities

"""
import csv
import os
import numpy as np
import config

def load_data(data_file, usecols):
    '''
    Read a CSV data file and return the selected columns as a float array.

    Every value is parsed as a float; the literal string 'NA' is treated as
    a missing value. A row is kept only when none of the selected columns
    is missing.

    :param data_file: path of the CSV file to read
    :param usecols: names of the columns to extract, in output order
    :return: data_arr: 2-D ndarray with one row per complete record and one
             column per entry of usecols; shape (0, len(usecols)) when no
             record is complete
    '''
    data = []
    # The csv module expects files opened with newline='' so that it can
    # handle line endings (and newlines embedded in quoted fields) itself.
    with open(data_file, 'r', newline='') as csvfile:
        data_reader = csv.DictReader(csvfile)
        for row in data_reader:
            # csv yields every field as a string: convert to float and map
            # the 'NA' marker to nan.
            row_data = [
                np.nan if row[col] == 'NA' else float(row[col])
                for col in usecols
            ]
            # Keep the record only if it has no missing value.
            if not any(np.isnan(row_data)):
                data.append(row_data)

    if not data:
        # np.array([]) would be 1-D; keep the result 2-D so that column
        # indexing such as arr[:, 2:] still works on an empty result.
        return np.empty((0, len(usecols)))

    data_arr = np.array(data)
    return data_arr


def get_polluted_perc(data_arr):
    '''
    Compute the share of hours that fall into each pollution level.

    Columns from index 2 onwards are averaged per row to obtain one value
    per hour, which is then classified as:
        heavy           value > 150
        medium    75  < value <= 150
        light     35  < value <= 75
        good            value <= 35

    :param data_arr: 2-D array whose columns from index 2 onwards hold the
                     PM readings
    :return: polluted_perc_list: [heavy, medium, light, good] as fractions
             of the total number of rows; four nan values when data_arr
             contains no rows
    '''
    if data_arr.size == 0:
        # No data: the ratios are undefined. Return nan instead of failing
        # with ZeroDivisionError (or IndexError for a 1-D empty array).
        return [float('nan')] * 4

    # Row-wise mean over the reading columns gives the hourly value.
    hour_val = np.mean(data_arr[:, 2:], axis=1)
    n_hours = hour_val.shape[0]

    # Count the hours in each band directly from the boolean masks.
    n_heavy_hours = np.count_nonzero(hour_val > 150)
    n_medium_hours = np.count_nonzero((hour_val > 75) & (hour_val <= 150))
    n_light_hours = np.count_nonzero((hour_val > 35) & (hour_val <= 75))
    n_good_hours = np.count_nonzero(hour_val <= 35)

    polluted_perc_list = [
        n_heavy_hours / n_hours,
        n_medium_hours / n_hours,
        n_light_hours / n_hours,
        n_good_hours / n_hours,
    ]
    return polluted_perc_list

def get_avg_pm_per_month(data_arr):
    '''
    Average every reading column per calendar month.

    Column 0 is used as the year, column 1 as the month, and the columns
    from index 2 onwards are averaged over all rows of the same
    (year, month) pair.

    :param data_arr: 2-D array laid out as [year, month, reading, ...]
    :return: results_arr: array with one row per (year, month) in ascending
             order; each row is a 'YYYY-MM' label followed by the column
             means
    '''
    year_col = data_arr[:, 0]
    monthly_rows = []

    for yr in np.unique(year_col):
        # Restrict to the rows of this year, then walk its months.
        in_year = data_arr[year_col == yr]
        month_col = in_year[:, 1]

        for mon in np.unique(month_col):
            # Readings of every row that belongs to this month.
            readings = in_year[month_col == mon, 2:]
            label = '{:.0f}-{:02.0f}'.format(yr, mon)
            monthly_rows.append([label, *readings.mean(axis=0).tolist()])

    return np.array(monthly_rows)

def save_stats_to_csv(results_arr, save_file, headers):
    '''
    Write a header row followed by the rows of results_arr to a CSV file.

    :param results_arr: array of result rows; converted with .tolist()
    :param save_file: path of the CSV file to write (overwritten if present)
    :param headers: sequence of column titles written as the first row
    :return: None
    '''
    with open(save_file, 'w', newline='') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(headers)
        csv_writer.writerows(results_arr.tolist())
def main():
    '''
    Run the analysis for every city listed in config.data_config_dict.

    For each city: load its data, print a preview, compute the pollution
    level ratios, compute the monthly averages and save them to
    '<city>_month_stats.csv'. After all cities are processed, the pollution
    ratios of all cities are saved to 'polluted_percentage.csv'.
    '''
    city_pollution_rows = []

    for city, (csv_name, stations) in config.data_config_dict.items():
        # === Data loading + preprocessing ===
        source_csv = os.path.join(config.dataset_path, csv_name)
        # The station columns are read under the name 'PM_<station>'.
        pm_columns = ['PM_' + station for station in stations]
        city_data = load_data(source_csv, config.common_cols + pm_columns)

        print('{}共有{}行有效数据'.format(city, city_data.shape[0]))
        # Preview the first 10 rows.
        print('{}的前10行数据:'.format(city))
        print(city_data[:10])

        # === Analysis ===
        # Pollution state: share of hours in each pollution level.
        level_ratios = get_polluted_perc(city_data)
        city_pollution_rows.append([city] + level_ratios)
        print('{}的污染小时数百分比{}'.format(city, level_ratios))

        # Monthly differences: mean value per month for each station.
        monthly_stats = get_avg_pm_per_month(city_data)
        print('{}的每月平均PM值预览:'.format(city))
        print(monthly_stats[:10])

        # === Output ===
        # Save the monthly statistics of this city to its own CSV file.
        monthly_csv = os.path.join(config.output_path,
                                   city + '_month_stats.csv')
        save_stats_to_csv(monthly_stats, monthly_csv,
                          headers=['month'] + stations)
        print('月度统计结果已保存至{}'.format(monthly_csv))

    # Save the pollution ratios of all cities in a single CSV file.
    summary_csv = os.path.join(config.output_path, 'polluted_percentage.csv')
    with open(summary_csv, 'w', newline='') as out_file:
        summary_writer = csv.writer(out_file)
        summary_writer.writerow(['city', 'heavy', 'medium', 'light', 'good'])
        summary_writer.writerows(city_pollution_rows)
    print('污染状态结果已保存至{}'.format(summary_csv))


if __name__ == '__main__':
    main()
 
====================================================================================================================

/usr/local/bin/python3.6 /Users/apple/PycharmProjects/xiaoxiang02/main.py
beijing共有19613行有效数据
beijing的前10行数据:
[[2013. 3. 117. 166. 140.]
[2013. 3. 131. 165. 152.]
[2013. 3. 141. 173. 128.]
[2013. 3. 169. 182. 3.]
[2013. 3. 169. 169. 3.]
[2013. 3. 174. 183. 163.]
[2013. 3. 194. 195. 192.]
[2013. 3. 208. 212. 203.]
[2013. 3. 213. 207. 195.]
[2013. 3. 203. 198. 185.]]
beijing的污染小时数百分比[0.1723346759802172, 0.26956610411461784, 0.24611227247233977, 0.3119869474328252]
beijing的每月平均PM值预览:
[['2013-03' '117.99354838709678' '128.4725806451613' '116.1774193548387']
['2013-04' '64.298937784522' '63.165402124430955' '56.88770864946889']
['2013-05' '91.35816618911174' '101.55014326647564' '77.11174785100286']
['2013-06' '110.01160092807424' '119.17169373549883'
'108.27146171693735']
['2013-07' '72.19110378912686' '85.35090609555189' '74.67051070840198']
['2013-08' '63.986301369863014' '69.77168949771689' '64.64687975646879']
['2013-09' '83.79607250755286' '82.89577039274924' '80.97129909365559']
['2013-10' '102.78525641025641' '101.52403846153847' '94.6923076923077']
['2013-11' '83.16338028169014' '84.2338028169014' '83.55211267605634']
['2013-12' '87.7453505007153' '92.02718168812589' '89.99570815450643']]
月度统计结果已保存至./output/beijing_month_stats.csv
chengdu共有23816行有效数据
chengdu的前10行数据:
[[2.013e+03 1.000e+00 1.210e+02 1.380e+02]
[2.013e+03 1.000e+00 1.340e+02 1.590e+02]
[2.013e+03 1.000e+00 2.030e+02 1.620e+02]
[2.013e+03 1.000e+00 2.170e+02 1.570e+02]
[2.013e+03 1.000e+00 2.200e+02 1.700e+02]
[2.013e+03 1.000e+00 2.140e+02 2.250e+02]
[2.013e+03 1.000e+00 2.090e+02 2.440e+02]
[2.013e+03 1.000e+00 2.280e+02 2.420e+02]
[2.013e+03 1.000e+00 2.190e+02 2.770e+02]
[2.013e+03 1.000e+00 2.250e+02 2.810e+02]]
chengdu的污染小时数百分比[0.10971615720524018, 0.2613789049378569, 0.394902586496473, 0.23400235136042996]
chengdu的每月平均PM值预览:
[['2013-01' '170.09582689335394' '189.5625965996909']
['2013-02' '126.59324758842443' '118.9807073954984']
['2013-03' '141.24685534591194' '139.7059748427673']
['2013-04' '102.12990196078431' '94.19607843137256']
['2013-05' '77.12660944206009' '66.92703862660944']
['2013-06' '52.236486486486484' '47.11711711711712']
['2013-07' '50.69642857142857' '40.565934065934066']
['2013-08' '66.55602240896359' '56.627450980392155']
['2013-09' '60.584' '58.364']
['2013-10' '100.51994301994301' '99.68518518518519']]
月度统计结果已保存至./output/chengdu_month_stats.csv
guangzhou共有20074行有效数据
guangzhou的前10行数据:
[[2.013e+03 1.000e+00 8.300e+01 7.800e+01]
[2.013e+03 1.000e+00 9.500e+01 7.000e+01]
[2.013e+03 1.000e+00 5.500e+01 6.600e+01]
[2.013e+03 1.000e+00 6.000e+01 6.900e+01]
[2.013e+03 1.000e+00 4.100e+01 5.100e+01]
[2.013e+03 1.000e+00 4.200e+01 3.900e+01]
[2.013e+03 1.000e+00 4.000e+01 3.700e+01]
[2.013e+03 1.000e+00 4.000e+01 3.800e+01]
[2.013e+03 1.000e+00 3.500e+01 3.400e+01]
[2.013e+03 1.000e+00 4.200e+01 3.400e+01]]
guangzhou的污染小时数百分比[0.01225465776626482, 0.14715552455913122, 0.4265716847663645, 0.4140181329082395]
guangzhou的每月平均PM值预览:
[['2013-01' '83.84602076124567' '85.5363321799308']
['2013-02' '60.82752613240418' '56.825783972125436']
['2013-03' '67.9199372056515' '62.71742543171115']
['2013-04' '72.91483516483517' '65.43406593406593']
['2013-05' '37.05223880597015' '39.65422885572139']
['2013-06' '25.188432835820894' '27.89179104477612']
['2013-07' '15.283018867924529' '25.58490566037736']
['2013-09' '40.171140939597315' '42.285234899328856']
['2013-11' '30.181818181818183' '36.45454545454545']
['2013-12' '62.295121951219514' '70.60487804878049']]
月度统计结果已保存至./output/guangzhou_month_stats.csv
shanghai共有23993行有效数据
shanghai的前10行数据:
[[2.013e+03 1.000e+00 6.600e+01 7.100e+01]
[2.013e+03 1.000e+00 6.700e+01 7.200e+01]
[2.013e+03 1.000e+00 7.300e+01 7.400e+01]
[2.013e+03 1.000e+00 7.500e+01 7.700e+01]
[2.013e+03 1.000e+00 7.300e+01 8.000e+01]
[2.013e+03 1.000e+00 7.400e+01 7.700e+01]
[2.013e+03 1.000e+00 7.300e+01 8.400e+01]
[2.013e+03 1.000e+00 7.700e+01 8.700e+01]
[2.013e+03 1.000e+00 7.300e+01 9.100e+01]
[2.013e+03 1.000e+00 8.200e+01 8.800e+01]]
shanghai的污染小时数百分比[0.0504313758179469, 0.18809652815404493, 0.3728587504688868, 0.3886133455591214]
shanghai的每月平均PM值预览:
[['2013-01' '97.96923076923076' '96.23230769230769']
['2013-02' '64.3262839879154' '62.24773413897281']
['2013-03' '65.05007587253414' '64.90136570561457']
['2013-04' '66.57551669316375' '61.32273449920509']
['2013-05' '62.2625' '57.384375']
['2013-06' '56.86453576864536' '58.00304414003044']
['2013-07' '45.73089171974522' '45.99203821656051']
['2013-08' '34.78417266187051' '35.93237410071942']
['2013-09' '31.261755485893417' '31.976489028213166']
['2013-10' '35.68104776579353' '37.707241910631744']]
月度统计结果已保存至./output/shanghai_month_stats.csv
shenyang共有24115行有效数据
shenyang的前10行数据:
[[2.013e+03 1.000e+00 1.450e+02 1.480e+02]
[2.013e+03 1.000e+00 1.500e+02 1.330e+02]
[2.013e+03 1.000e+00 1.420e+02 1.210e+02]
[2.013e+03 1.000e+00 1.050e+02 1.100e+02]
[2.013e+03 1.000e+00 1.540e+02 1.070e+02]
[2.013e+03 1.000e+00 1.760e+02 1.230e+02]
[2.013e+03 1.000e+00 1.400e+02 1.110e+02]
[2.013e+03 1.000e+00 9.300e+01 7.600e+01]
[2.013e+03 1.000e+00 5.300e+01 5.600e+01]
[2.013e+03 1.000e+00 2.300e+01 2.900e+01]]
shenyang的污染小时数百分比[0.11909599834128136, 0.24242172921418204, 0.33278042712004974, 0.30570184532448685]
shenyang的每月平均PM值预览:
[['2013-01' '200.24801271860096' '207.59777424483306']
['2013-02' '93.0326797385621' '93.11437908496733']
['2013-03' '85.57299843014128' '74.7032967032967']
['2013-04' '62.97513812154696' '58.08839779005525']
['2013-05' '75.40425531914893' '74.39574468085107']
['2013-06' '57.67380560131796' '53.85172981878089']
['2013-07' '47.89235569422777' '32.42745709828393']
['2013-08' '56.172821270310195' '43.90546528803545']
['2013-09' '48.861759425493716' '42.72351885098743']
['2013-10' '84.93227665706051' '82.7478386167147']]
月度统计结果已保存至./output/shenyang_month_stats.csv
污染状态结果已保存至./output/polluted_percentage.csv

Process finished with exit code 0

 

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM