Python二維數據分析

本文轉載自查看原文 2017-10-12 16:39 1648

一.numpy二維數組

1.聲明

 1 import numpy as np
 2 
 3 #每一個[]代表一行
 4 ridership = np.array([
 5     [   0,    0,    2,    5,    0],
 6     [1478, 3877, 3674, 2328, 2539],
 7     [1613, 4088, 3991, 6461, 2691],
 8     [1560, 3392, 3826, 4787, 2613],
 9     [1608, 4802, 3932, 4477, 2705],
10     [1576, 3933, 3909, 4979, 2685],
11     [  95,  229,  255,  496,  201],
12     [   2,    0,    1,   27,    0],
13     [1438, 3785, 3589, 4174, 2215],
14     [1342, 4043, 4009, 4665, 3033]
15 ])

View Code

2.取值

# Indexing is [row, column]; both positions are zero-based.
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(ridership[1, 3])                    # one element: second row, fourth column
print(ridership[1:3, 3:5])                # sub-array: rows 1-2, columns 3-4
print(ridership[1, :])                    # the whole second row
print(ridership[0, :] + ridership[1, :])  # element-wise sum of the first and second rows
print(ridership[:, 0] + ridership[:, 1])  # element-wise sum of the first and second columns

View Code

3.numpy二維數組之間相加

根據線性代數的規則進行相加

a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
# Adding two arrays of the same shape adds the elements position by position.
print(a + b)

View Code

4.numpy二維數組的求和

a = np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]
    ])

print(a.sum())        # sum of every element: 45
# axis=0 collapses the rows, giving one total per column: [12, 15, 18]
print(a.sum(axis=0))
# axis=1 collapses the columns, giving one total per row: [6, 15, 24]
print(a.sum(axis=1))

View Code

例:

1.找出第一天車流量最大的車站,求出該車站的平均車流量,並與總體平均車流量一起返回

def mean_riders_for_max_station(ridership):
    """Compare the overall mean with the mean of the busiest column.

    The "busiest" column is the one holding the largest value in the first
    row of ``ridership``.

    Args:
        ridership: 2-D numpy array. In the article's example each row is a
            day and each column a station.

    Returns:
        Tuple ``(overall_mean, mean_for_max)``: the mean of every element,
        and the mean of the selected column.
    """
    # Column position of the largest value in the first row.
    max_station = ridership[0, :].argmax()
    overall_mean = ridership.mean()
    # Mean over all rows of that one column.
    mean_for_max = ridership[:, max_station].mean()
    return (overall_mean, mean_for_max)

View Code

2.求出各車站平均每日車流量中的最大值和最小值

def min_and_max_riders_per_day(ridership):
    """Return the largest and smallest per-column mean of ``ridership``.

    Args:
        ridership: 2-D numpy array. In the article's example each row is a
            day and each column a station, so a column mean is that
            station's mean riders per day.

    Returns:
        Tuple ``(max_daily_ridership, min_daily_ridership)``. Note the order:
        despite the function name, the maximum comes first.
    """
    # axis=0 averages down the rows, leaving one mean per column.
    mean_per_station = ridership.mean(axis=0)
    max_daily_ridership = mean_per_station.max()
    min_daily_ridership = mean_per_station.min()
    return (max_daily_ridership, min_daily_ridership)

View Code

二.pandas二維數組

1.聲明

 1 import pandas as pd
 2 ridership_df = pd.DataFrame(
 3     data=[[   0,    0,    2,    5,    0],
 4           [1478, 3877, 3674, 2328, 2539],
 5           [1613, 4088, 3991, 6461, 2691],
 6           [1560, 3392, 3826, 4787, 2613],
 7           [1608, 4802, 3932, 4477, 2705],
 8           [1576, 3933, 3909, 4979, 2685],
 9           [  95,  229,  255,  496,  201],
10           [   2,    0,    1,   27,    0],
11           [1438, 3785, 3589, 4174, 2215],
12           [1342, 4043, 4009, 4665, 3033]],
13     index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
14            '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
15     columns=['R003', 'R004', 'R005', 'R006', 'R007']
16 )

View Code

2.取值

# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(ridership_df)                    # the whole table
print(ridership_df.iloc[0])            # first row, selected by position
print(ridership_df.loc['05-05-11'])    # row selected by its index label
print(ridership_df['R003'])            # one column, selected by its label
print(ridership_df.iloc[1, 3])         # single value at row position 1, column position 3
print(ridership_df[['R003', 'R005']])  # several columns, passed as a list of labels

View Code

3.行列展示

# A dict is read column by column: each key becomes a column label.
df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
print(df_1)
# A list of lists is read row by row; column labels are given separately.
df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
print(df_2)

View Code

例:

1.找出第一天車流量最大的車站,求出該車站的平均車流量,並與總體平均車流量一起返回(pandas版本)

def mean_riders_for_max_station(ridership):
    """Compare the overall mean with the mean of the busiest column.

    The "busiest" column is the one holding the largest value in the first
    row of ``ridership``.

    Args:
        ridership: pandas DataFrame. In the article's example each row is a
            date and each column a station ID.

    Returns:
        Tuple ``(overall_mean, mean_for_max)``: the mean of every value in
        the frame, and the mean of the selected column.
    """
    # idxmax() returns the column *label* of the largest first-row value.
    # Series.argmax() returns an integer position in current pandas, which
    # is not a valid key for ridership[...] when the columns are labelled.
    max_station = ridership.iloc[0].idxmax()
    # .values drops the labels so mean() averages every cell at once.
    overall_mean = ridership.values.mean()
    mean_for_max = ridership[max_station].mean()
    return (overall_mean, mean_for_max)

View Code

2.相關性

概念:相關性的值在[-1,+1]之間,越接近+1表示兩個量呈正相關,越接近-1表示呈負相關,越接近0表示兩者的線性相關性越弱

# The CSV is expected in the working directory; the original article says
# the file is provided in its download area.
subway_df = pd.read_csv('nyc-subway-weather.csv')
def correlation(x, y):
    """Return the correlation between two equally indexed Series.

    Both inputs are standardized (mean subtracted, divided by the standard
    deviation) and the mean of the element-wise product is returned.

    Args:
        x: pandas Series of numbers.
        y: pandas Series of numbers, aligned with ``x``.

    Returns:
        A float; +1 for a perfect increasing linear relationship and -1 for
        a perfect decreasing one.
    """
    # ddof=0 selects the population standard deviation (divide by N). This
    # matches the plain .mean() below, which also divides by N.
    std_x = (x - x.mean()) / x.std(ddof=0)
    std_y = (y - y.mean()) / y.std(ddof=0)
    return (std_x * std_y).mean()

View Code

 1 entries = subway_df['ENTRIESn_hourly'] #獲取出對應列的值
 2 cum_entries = subway_df['ENTRIESn']
 3 rain = subway_df['meanprecipi']
 4 temp = subway_df['meantempi']
 5 
 6 #找出兩個變量之間的關聯度
 7 print correlation(entries,rain)
 8 print correlation(entries,temp)
 9 print correlation(rain,temp)
10 print correlation(entries,cum_entries)

View Code

3.DataFrame相關操作

DataFrame相加時會先按照行索引和列名對齊,再把對應位置的元素相加;只出現在其中一個DataFrame里的行或列,結果用NaN值填充

 1 df1=pd.DataFrame({'a':[1,2,3],'b':[4,5,6],'c':[7,8,9]})
 2 df2=pd.DataFrame({'a':[10,20,30],'b':[40,50,60],'c':[70,80,90]})
 3 print df1+df2
 4 
 5 df1 = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
 6 df2 = pd.DataFrame({'d': [10, 20, 30], 'c': [40, 50, 60], 'b': [70, 80, 90]})
 7 print df1 + df2
 8 
 9 df1=pd.DataFrame({'a':[1,2,3],'b':[4,5,6],'c':[7,8,9]},index=['row1','row2','row3'])
10 df2=pd.DataFrame({'a':[10,20,30],'b':[40,50,60],'c':[70,80,90]},index=['row1','row2','row3'])
11 print df1+df2

View Code

例:

1.求出每小時的地鐵的流量

# Cumulative entry and exit counter readings, one row per reading.
entries_and_exits = pd.DataFrame({
    'ENTRIESn': [3144312, 3144335, 3144353, 3144424, 3144594,
                 3144808, 3144895, 3144905, 3144941, 3145094],
    'EXITSn': [1088151, 1088159, 1088177, 1088231, 1088275,
               1088317, 1088328, 1088331, 1088420, 1088753]
})


def get_hourly_entries_and_exits(entries_and_exits):
    """Turn cumulative counts into per-interval counts.

    Args:
        entries_and_exits: DataFrame of cumulative counter readings, one row
            per reading.

    Returns:
        DataFrame of the same shape in which each row is the difference from
        the previous row. The first row has no predecessor and is NaN.
    """
    # shift(1) moves every row down by one, so the subtraction pairs each
    # reading with the one before it.
    return entries_and_exits - entries_and_exits.shift(1)

View Code

4.applymap方法,將DataFrame的所有值通過自定義方法得以修改

 1 df = pd.DataFrame({
 2     'a': [1, 2, 3],
 3     'b': [10, 20, 30],
 4     'c': [5, 10, 15]
 5 })
 6 
 7 def add_one(x):
 8     return x + 1
 9 
10 
11 print df.applymap(add_one)

View Code

例:

1.將學生的成績轉換為等級(A,B,C,D,F)

 1 grades_df = pd.DataFrame(
 2     data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
 3           'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
 4     index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
 5            'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
 6 )
 7 def convert_grade(grade):
 8     if grade>=90:
 9         return 'A'
10     elif grade>=80:
11         return 'B'
12     elif grade>=70:
13         return 'C'
14     elif grade>=60:
15         return 'D'
16     else:
17         return 'F'
18 def convert_grades(grades):
19     return grades.applymap(convert_grade)

View Code

2.對每一列進行標准化(減去平均值后除以標准差)

def standardize_column(df):
    """Standardize one column: subtract its mean, divide by its std.

    Args:
        df: a single column (pandas Series); ``standardize`` passes one
            column at a time through ``DataFrame.apply``.

    Returns:
        Series of the same length, expressed in standard deviations from the
        column mean.
    """
    # ddof=0 divides by N (population standard deviation), i.e. Bessel's
    # correction is NOT applied. The pandas default, ddof=1, divides by N - 1.
    return (df - df.mean()) / df.std(ddof=0)


def standardize(df):
    """Standardize every column of ``df`` independently."""
    return df.apply(standardize_column)
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(standardize(grades_df))

View Code

3.獲取每一列中第二大的值

 1 df = pd.DataFrame({
 2     'a': [4, 5, 3, 1, 2],
 3     'b': [20, 10, 40, 50, 30],
 4     'c': [25, 20, 5, 15, 10]
 5 })
 6 
 7 def second_largest_column(column):
 8     #將數據以降序進行排列,下標為1的就是第二大的數
 9     sorted_column = column.sort_values(ascending=False)
10     return sorted_column.iloc[1]
11 
12 def senond_large(df):
13     return df.apply(second_largest_column)

View Code

5.DataFrame和Series操作

s = pd.Series([1, 2, 3, 4])

# DataFrame + Series matches the Series index against the DataFrame's column
# labels, so s[k] is added to every value in column k.
df = pd.DataFrame({0: [10, 20, 30, 40], 1: [50, 60, 70, 80],
                   2: [90, 100, 110, 120], 3: [130, 140, 150, 160]})
print(df + s)

# One row, four columns: the single row becomes [11, 22, 33, 44].
df = pd.DataFrame({0: [10], 1: [20], 2: [30], 3: [40]})
print(df + s)

# One column, four rows: only column 0 matches a Series label, so it becomes
# [11, 21, 31, 41]. Labels 1-3 have no matching column and show up as
# all-NaN columns. The Series is NOT added down the column here.
df = pd.DataFrame({0: [10, 20, 30, 40]})
print(df + s)

View Code

 1 s = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
 2 df = pd.DataFrame({
 3     'a': [10, 20, 30, 40],
 4     'b': [50, 60, 70, 80],
 5     'c': [90, 100, 110, 120],
 6     'd': [130, 140, 150, 160]
 7 })
 8 print df+s #默認進行行相加
 9 print df.add(s,axis='index') #列相加,此處不匹配,顯示NaN值
10 print df.add(s,axis='columns') #指定進行行相加

View Code

例:

1.對學生的成績進行標准化(分別按列和按行)

 1 grades_df = pd.DataFrame(
 2     data={'exam1': [43, 81, 78, 75, 89, 70, 91, 65, 98, 87],
 3           'exam2': [24, 63, 56, 56, 67, 51, 79, 46, 72, 60]},
 4     index=['Andre', 'Barry', 'Chris', 'Dan', 'Emilio', 
 5            'Fred', 'Greta', 'Humbert', 'Ivan', 'James']
 6 )
 7 
 8 #計算每個學生的標准偏差
 9 def standardize(df):
10     return (df-df.mean())/df.std(ddof=0)
11 
12 def standardize_row(df):
13     #1.計算每個學生考試成績和平均成績的差
14     #2.再計算每個學生的樣本偏差
15     mean_diffs = df.sub(df.mean(axis='columns'),axis='index')
16     return mean_diffs.div(df.std(axis='columns',ddof=0),axis='index')

View Code

6.groupby分組

 1 values = np.array([1, 3, 2, 4, 1, 6, 4])
 2 example_df = pd.DataFrame({
 3     'value': values,
 4     'even': values % 2 == 0,
 5     'above_three': values > 3 
 6 }, index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])
 7 
 8 print example_df
 9 
10 #根據單列進行分組
11 grouped_data = example_df.groupby('even')
12 print grouped_data.groups
13 #根據多列進行分組
14 grouped_data = example_df.groupby(['even','above_three'])
15 print grouped_data.groups
16 
17 grouped_data = example_df.groupby('even')
18 print grouped_data.sum() #分組后求和
19 print grouped_data.sum()['value'] #先求和再根據value分組
20 print grouped_data['value'].sum() #繼續根據value分組在求和

View Code

例:

1.在每個分組內對value進行標准化

def standardize(xs):
    """Return ``xs`` with its mean subtracted, divided by its std.

    ddof=0 selects the population standard deviation (divide by N).
    """
    return (xs - xs.mean()) / xs.std(ddof=0)
3 
# Group the rows by the 'even' column.
grouped_data = example_df.groupby('even')

# Standardize 'value' within each group separately: every value is compared
# with the mean and standard deviation of its own group only.
print(grouped_data['value'].apply(standardize))

View Code

2.畫出車站每周每小時的使用的平均值

1 %pylab inline
2 import seaborn as sns
3 subway_df = pd.read_csv('nyc-subway-weather.csv')
4 #根據day_week分組,然后獲取平均值,最后獲取ENTRIESn_hourly列的值
5 ridership_by_day = subway_df.groupby('day_week').mean()['ENTRIESn_hourly']
6 ridership_by_day.plot()

View Code

3.獲取每個地鐵站每個小時的流量

 1 ridership_df = pd.DataFrame({
 2     'UNIT': ['R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051', 'R079', 'R051'],
 3     'TIMEn': ['00:00:00', '02:00:00', '04:00:00', '06:00:00', '08:00:00', '10:00:00', '12:00:00', '14:00:00', '16:00:00'],
 4     'ENTRIESn': [3144312, 8936644, 3144335, 8936658, 3144353, 8936687, 3144424, 8936819, 3144594],
 5     'EXITSn': [1088151, 13755385,  1088159, 13755393,  1088177, 13755598, 1088231, 13756191,  1088275]
 6 })
 7 def hour_by_group(entries_and_exits):
 8     return entries_and_exits-entries_and_exits.shift(1)
 9 #獲取每個車站的每小時的進出口
10 def get_hourly_entries_and_exits(entries_and_exits):
11     #根據UNIT字段分組,然后,獲取相應的列,最后調用自定義的方法得出結論
12     return entries_and_exits.groupby('UNIT')['ENTRIESn','EXITSn'].apply(hour_by_group)

View Code

7.merge組合,將兩個結果集根據某些字段進行組合,整合為一個結果集

 1 subway_df = pd.DataFrame({
 2     'UNIT': ['R003', 'R003', 'R003', 'R003', 'R003', 'R004', 'R004', 'R004',
 3              'R004', 'R004'],
 4     'DATEn': ['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
 5               '05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11'],
 6     'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 7     'ENTRIESn': [ 4388333,  4388348,  4389885,  4391507,  4393043, 14656120,
 8                  14656174, 14660126, 14664247, 14668301],
 9     'EXITSn': [ 2911002,  2911036,  2912127,  2913223,  2914284, 14451774,
10                14451851, 14454734, 14457780, 14460818],
11     'latitude': [ 40.689945,  40.689945,  40.689945,  40.689945,  40.689945,
12                   40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ,  40.69132 ],
13     'longitude': [-73.872564, -73.872564, -73.872564, -73.872564, -73.872564,
14                   -73.867135, -73.867135, -73.867135, -73.867135, -73.867135]
15 })
16 
17 weather_df = pd.DataFrame({
18     'DATEn': ['05-01-11', '05-01-11', '05-02-11', '05-02-11', '05-03-11',
19               '05-03-11', '05-04-11', '05-04-11', '05-05-11', '05-05-11'],
20     'hour': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
21     'latitude': [ 40.689945,  40.69132 ,  40.689945,  40.69132 ,  40.689945,
22                   40.69132 ,  40.689945,  40.69132 ,  40.689945,  40.69132 ],
23     'longitude': [-73.872564, -73.867135, -73.872564, -73.867135, -73.872564,
24                   -73.867135, -73.872564, -73.867135, -73.872564, -73.867135],
25     'pressurei': [ 30.24,  30.24,  30.32,  30.32,  30.14,  30.14,  29.98,  29.98,
26                    30.01,  30.01],
27     'fog': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
28     'rain': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
29     'tempi': [ 52. ,  52. ,  48.9,  48.9,  54. ,  54. ,  57.2,  57.2,  48.9,  48.9],
30     'wspdi': [  8.1,   8.1,   6.9,   6.9,   3.5,   3.5,  15. ,  15. ,  15. ,  15. ]
31 })
32 
def combine_dfs(subway_df, weather_df):
    """Join subway rows with the weather rows for the same time and place.

    Args:
        subway_df: DataFrame containing 'DATEn', 'hour', 'latitude' and
            'longitude' columns.
        weather_df: DataFrame containing the same four columns.

    Returns:
        DataFrame holding the columns of both inputs. how='inner' keeps only
        the rows whose four key values appear in both frames.
    """
    return subway_df.merge(weather_df,
                           on=['DATEn', 'hour', 'latitude', 'longitude'],
                           how='inner')

# If the key columns are named differently in the two frames, replace on=
# with left_on= (names in the left frame) and right_on= (names in the right
# frame).

View Code

例:

1.做出地鐵站位置的散點圖,通過點的大小展示哪里的車站人流最高

 1 %pylab inline
 2 import matplotlib.pyplot as plt
 3 import numpy as np
 4 import pandas as pd
 5 import seaborn as sns
 6 subway_df = pd.read_csv('nyc-subway-weather.csv')
 7 #根據經度和緯度分組,求出平均數,注意as_index會將字段本身不作為索引,避免出錯
 8 data_for_location = subway_df.groupby(['latitude','longitude'],as_index=False).mean()
 9 #求出每小時的標准偏差,作為圖片大小
10 scaled_entries = 
11 data_for_location['ENTRIESn_hourly']/data_for_location['ENTRIESn_hourly'].std(ddof=0)
12 #根據緯度為x軸,經度為y軸,s的教正系數,做出散點圖
13 plt.scatter(data_for_location['latitude'],data_for_location['longitude'],s=scaled_entries)

View Code

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 Python數據分析 Python數據分析 python數據分析與展示（一） python數據分析畫圖體驗 Python數據分析流程 Python數據分析基礎PDF Python數據分析實戰 Python 數據分析 Matplotlib python數據分析及展示（一） python數據分析與算法之五算法