1.Numpy數組
numpy的數組只能存放同一種數據類型,使用的方式和Python列表類似
1.1 聲明:

import numpy as np

# Country names; a NumPy array holds elements of a single data type.
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina',
])

# Employment values, aligned by position with `countries`.
employment = np.array([
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
])
1.2 獲取&切片

# Single elements are accessed by position, as with a Python list.
print(countries[0])
print(countries[3])

# Slices also follow list syntax: [start:stop], with either end optional.
print(countries[0:3])
print(countries[3:])
print(countries[17])
print(countries[:])
1.3 for循環

# Walk both arrays in lockstep; zip pairs the elements by position, which
# avoids indexing each array with a manual counter.
for country, country_employment in zip(countries, employment):
    print('Country {} has employeement {}'.format(country, country_employment))
1.4 數據類型

# Every NumPy array has one element type, exposed as `.dtype`.
print(countries.dtype)
print(employment.dtype)

# The dtype is inferred from the values used to build the array.
print(np.array([0, 1, 2, 3]).dtype)
print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
print(np.array([True, False, True]).dtype)
print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)
1.5 計算統計值

# Summary statistics are available as array methods.
print(employment.mean())
print(employment.std())
print(employment.max())
print(employment.min())
1.6 獲取就業率最高的國家

def max_employeement(countries, employment):
    """Return the country with the largest employment value.

    Args:
        countries: NumPy array of country names.
        employment: NumPy array of employment values, aligned by
            position with ``countries``.

    Returns:
        A ``(country, value)`` tuple for the position at which
        ``employment`` is largest.
    """
    # argmax gives the position of the maximum; the same position selects
    # the matching name from the parallel array.
    i = employment.argmax()
    return (countries[i], employment[i])
1.7 numpy向量與向量運算

# 1. Arithmetic between two NumPy vectors
a = np.array([1, 2, 3, 4])
b = np.array([1, 2, 1, 2])

# Every operator is applied element by element; for + and - this matches
# vector addition and subtraction in linear algebra.
# NOTE: on integer arrays `/` is floor division under Python 2 (unless
# `from __future__ import division` is in effect) and true division under
# Python 3.
print(a + b)
print(a - b)
print(a * b)
print(a / b)
print(a ** b)
1.8 向量與值的運算

# Arithmetic between a vector and a single value: the value is combined
# with every element of the vector.
a = np.array([1, 2, 3, 4])
b = 2
print(a + b)
print(a - b)
print(a * b)
# NOTE: with an integer array and an integer scalar, `/` is floor division
# under Python 2 and true division under Python 3.
print(a / b)
print(a ** b)
1.9 向量的邏輯運算

a = np.array([True, True, False, False])
b = np.array([True, False, True, False])

# On boolean arrays &, | and ~ act as element-wise and, or and not.
print(a & b)
print(a | b)
print(~a)

# A single boolean is combined with every element of the array.
print(a & True)
print(a & False)

print(a | True)
print(b | False)
1.10 向量之間的比較

a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])

# Elements are compared pairwise by position; each result is a boolean
# array of the same length.
print(a > b)
print(a >= b)
print(a < b)
print(a <= b)
print(a == b)
print(a != b)
1.11 向量和數值比較

a = np.array([1, 2, 3, 4])
b = 2

# Every element of a is compared with b; each result is a boolean array.
print(a > b)
print(a >= b)
print(a < b)
print(a <= b)
print(a == b)
print(a != b)
1.12 向量之間可以直接進行相加再進行別的運算

# Female school completion values (presumably for the same year and
# countries as the male data below -- confirm against the data source).
female_completion = np.array([
    97.35583, 104.62379, 103.02998, 95.14321, 103.69019,
    98.49185, 100.88828, 95.43974, 92.11484, 91.54804,
    95.98029, 98.22902, 96.12179, 119.28105, 97.84627,
    29.07386, 38.41644, 90.70509, 51.7478, 95.45072,
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
    95.47622, 100.66476, 99.7926, 91.48936, 103.22096,
    97.80458, 103.81398, 88.11736, 93.55611, 87.76347,
    102.45714, 98.73953, 92.22388, 115.3892, 98.70502,
    37.00692, 45.39401, 91.22084, 62.42028, 90.66958,
])


def overall_completion_rate(female_completion, male_completion):
    """Return the element-wise mean of the two completion arrays.

    Args:
        female_completion: NumPy array of female completion values.
        male_completion: NumPy array of male completion values, aligned
            by position with ``female_completion``.

    Returns:
        A NumPy array whose i-th element is the average of the i-th
        female and male values.
    """
    # The arrays are added position by position, then every sum is halved.
    return (female_completion + male_completion) / 2
1.13 數據標准化(計算每個值偏離均值多少個標准差)

country_name = 'United States'


def standardize_data(values):
    """Convert each value to its distance from the mean in standard deviations.

    Args:
        values: NumPy array of numbers.

    Returns:
        A NumPy array of the same length holding
        ``(value - mean) / std`` for every element.
    """
    # One vectorized expression transforms the whole array at once.
    standardized_values = (values - values.mean()) / values.std()
    return standardized_values
1.14 numpy的索引向量

a = np.array([1, 2, 3, 4])
b = np.array([True, True, False, False])

# Indexing with a boolean array returns the elements of a at the
# positions where the index array is True.
print(a[b])
print(a[np.array([True, False, True, False])])
1.15 numpy索引向量可以支持表達式

a = np.array([1, 2, 3, 2, 1])
# The comparison produces a boolean array that can be used as an index.
b = (a >= 2)

print(a[b])
# The comparison can also be written inline; the result is the selected
# elements of a, not the boolean array itself.
print(a[a >= 2])
1.16 numpy可返回索引向量

a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 2, 1])

# Comparing b with a value yields a boolean index array ...
print(b == 2)
# ... which can select elements from a different array of the same length.
print(a[b == 2])
1.17 計算每周學習時間的平均數

# Time spent per student (units are not stated here).
time_spent = np.array([
    12.89697233, 0.0, 64.55043217, 0.0,
    24.2315615, 39.991625, 0.0, 0.0,
    147.20683783, 0.0, 0.0, 0.0,
    45.18261617, 157.60454283, 133.2434615, 52.85000767,
    0.0, 54.9204785, 26.78142417, 0.0,
])

# Days to cancel for 20 students
days_to_cancel = np.array([
    4, 5, 37, 3, 12, 4, 35, 38, 5, 37, 3, 3, 68,
    38, 98, 2, 249, 2, 127, 35,
])


def mean_time_for_paid_students(time_spent, days_to_cancel):
    """Return the mean time spent by students who stayed at least 7 days.

    Args:
        time_spent: NumPy array of time spent per student.
        days_to_cancel: NumPy array of days until cancellation, aligned
            by position with ``time_spent``.

    Returns:
        The mean of the ``time_spent`` entries whose ``days_to_cancel``
        is 7 or more (presumably the students who became paying
        customers -- confirm the threshold against the data source).
    """
    # The comparison builds a boolean mask that keeps only matching rows.
    return time_spent[days_to_cancel >= 7].mean()
2.Pandas數組
pandas數組是在numpy的數組上做了一次封裝,可以支持更多的統計分析功能,而且也具有和Python字典類似的功能,key的值可以是任意類型
2.1 pandas數組常用功能

import pandas as pd

life_expectancy_values = [
    74.7, 75.0, 83.4, 57.6, 74.6, 75.4, 72.3, 81.5, 80.2,
    70.3, 72.1, 76.4, 68.1, 75.2, 69.8, 79.4, 70.8, 62.7,
    67.3, 70.6,
]

gdp_values = [
    1681.61390973, 2155.48523109, 21495.80508273, 562.98768478,
    13495.1274663, 9388.68852258, 1424.19056199, 24765.54890176,
    27036.48733192, 1945.63754911, 21721.61840978, 13373.21993972,
    483.97086804, 9783.98417323, 2253.46411147, 25034.66692293,
    3680.91642923, 366.04496652, 1175.92638695, 1132.21387981,
]

# Wrap the plain lists in pandas Series.
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

# Element access and slicing
print(life_expectancy[0])
print(gdp[3:6])

# Iterating over a Series yields its values.
for country_life_expectancy in life_expectancy:
    print('Examining life expectancy {} '.format(country_life_expectancy))

# Statistics methods
print(life_expectancy.mean())
print(life_expectancy.std())
print(gdp.max())
print(gdp.sum())
2.2 pandas數組向量運算

a = pd.Series([1, 2, 3, 4])
b = pd.Series([1, 2, 1, 2])

# Series support the same vectorized operations as NumPy arrays:
# arithmetic, comparison, and selection with a boolean Series.
print(a + b)
print(a * 2)
print(a >= 3)
print(a[a >= 3])
2.3 獲取壽命和gdp的關系

def variable_correlation(variable1, variable2):
    """Count how often two variables fall on the same side of their means.

    Args:
        variable1: pandas Series of numbers.
        variable2: pandas Series of numbers, aligned with ``variable1``.

    Returns:
        A ``(num_same_direction, num_different_direction)`` tuple.
        ``num_same_direction`` counts the positions where both values are
        strictly above, or both strictly below, their own mean;
        ``num_different_direction`` is every other position (including
        any value exactly equal to its mean).
    """
    # Each mean is needed twice, so compute it once.
    mean1 = variable1.mean()
    mean2 = variable2.mean()

    both_above = (variable1 > mean1) & (variable2 > mean2)
    both_below = (variable1 < mean1) & (variable2 < mean2)

    # A position moves in the same direction if either condition holds.
    is_same_direction = both_above | both_below

    # Summing a boolean Series counts its True entries.
    num_same_direction = is_same_direction.sum()
    num_different_direction = len(variable1) - num_same_direction
    return (num_same_direction, num_different_direction)
說明:函數返回 (同向的數量, 反向的數量)。如果第一個數字較大,第二個數字較小(大部分數據的方向相同),說明兩個變量是正相關的(一個值較大,另一個值也較大)
如果第一個數字較小,第二個數字較大,說明變量是負相關的
如果兩個數字大致相等,說明這兩個變量基本無關
2.4 pandas數組索引訪問

countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola',
    'Argentina', 'Armenia', 'Australia', 'Austria',
    'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize',
    'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
]


employment_values = [
    55.70000076, 51.40000153, 50.5, 75.69999695,
    58.40000153, 40.09999847, 61.5, 57.09999847,
    60.90000153, 66.59999847, 60.40000153, 68.09999847,
    66.90000153, 53.40000153, 48.59999847, 56.79999924,
    71.59999847, 58.40000153, 70.40000153, 41.20000076,
]

# Build a Series whose values are the employment figures and whose index
# labels are the country names.
employment = pd.Series(employment_values, index=countries)


def max_employment(employment):
    """Return the country with the highest employment value.

    Args:
        employment: pandas Series of employment values indexed by
            country name.

    Returns:
        A ``(country, value)`` tuple for the largest value in the Series.
    """
    # idxmax returns the index *label* of the maximum. Series.argmax
    # returns the integer position in current pandas, which cannot be
    # passed to .loc on a string-labelled index.
    max_country = employment.idxmax()
    max_value = employment.loc[max_country]
    return (max_country, max_value)
2.5 去除NaN值

# Arithmetic between Series matches elements by index label, not by
# position -- comparable to Python keyword arguments.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s1 + s2)

# Same labels in a different order: values are still paired by label.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
print(s1 + s2)

# A label that exists in only one of the two Series has nothing to pair
# with, so the result holds NaN for that label.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
print(s1 + s2)

# No labels in common: every entry of the result is NaN.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
print(s1 + s2)

# fill_value substitutes 0 for the missing side, so each label keeps the
# value from the Series that does contain it instead of becoming NaN.
print(s1.add(s2, fill_value=0))
2.6 自定義方法,可以執行pandas中沒有的邏輯

# Custom per-element logic with apply
names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane',
])


def reverse_name(name):
    """Turn ``'First Last'`` into ``'Last, First'``.

    Args:
        name: String with the first and last name separated by one space.

    Returns:
        The second space-separated part, a comma and a space, then the
        first part.

    Raises:
        IndexError: If ``name`` contains no space.
    """
    split_name = name.split(' ')
    first_name = split_name[0]
    last_name = split_name[1]
    return last_name + ', ' + first_name


def reverse_names(names):
    """Apply ``reverse_name`` to every element of a Series.

    Args:
        names: pandas Series of ``'First Last'`` strings.

    Returns:
        A new Series of ``'Last, First'`` strings with the same index.
    """
    # apply calls the function once per element and collects the results.
    return names.apply(reverse_name)
2.7 使用pandas畫圖

1 %pylab inline 2 #讀取csv文件 3 employment = pd.read_csv('employment-above-15.csv',index_col='Country') 4 #根據index獲取value 5 employment_us = employment.loc['United States'] 6 #畫散點圖 7 employment_us.plot()
3 注意事項:
在numpy數組中,要注意+=,和+的區別
3.1 +=
+=是在原有的數組上進行修改,因為a,b共享一塊內存,所以a修改會導致b也修改

import numpy as np

a = np.array([1, 2, 3, 4])
b = a  # b is a second name for the same array, not a copy
a += np.array([1, 1, 1, 1])  # += modifies the existing array in place
print(b)

# a and b refer to the same memory, so the change made through a is
# visible through b as well.
# Output: [2 3 4 5]
3.2 +
+表示a數組重新開辟了一塊內存空間存放新的數組,本質上a,b是占用兩個不同的內存空間

import numpy as np

a = np.array([1, 2, 3, 4])
b = a
a = a + np.array([1, 1, 1, 1])  # + builds a new array and rebinds a to it
print(b)

# a now refers to a separate array, so b still holds the original values.
# Output: [1 2 3 4]
3.3 切片
切片出來的子集是原先數組的視圖,與原數組共享一塊內存空間,所以會連帶修改

import numpy as np

a = np.array([1, 2, 3, 4, 5])
# NOTE: the name `slice` shadows Python's builtin of the same name; it is
# kept here so the variable name in this example stays as it was.
slice = a[:3]
slice[0] = 100
print(slice)

# A basic slice is a view onto the original array's data, so writing
# through the slice also changes a (a[0] is now 100).
# Output: [100   2   3]