# Load the food nutrition table and inspect its basic structure.
import pandas

food_info = pandas.read_csv("food_info.csv")
print(type(food_info))  # <class 'pandas.core.frame.DataFrame'>, a matrix-like structure
print(food_info.dtypes)  # common dtypes: int64, float64, object (strings), datetime, bool
help(pandas.read_csv)  # help() prints the documentation itself and returns None

num = 5  # number of rows to preview; head()/tail() default to 5 when omitted
food_info.head(num)  # first `num` rows
food_info.tail(num)  # last `num` rows
food_info.columns  # column labels (an Index; call .tolist() for a plain list)
food_info.shape  # dimensions of the data as (rows, columns)
food_info.loc[0]  # the row whose index label is 0
food_info.loc[3:6]  # label-based slice; both endpoints are included
# Select a single column by name. The key must be a plain string, not a tuple.
# NOTE(review): confirm the exact spelling/case of this column in the CSV header.
food_info["NDB_NO"]
# Filtering column names
# Keep only the columns whose name ends with "(g)" and preview the result.
col_names = food_info.columns.tolist()  # all column names as a plain list
gram_columns = [c for c in col_names if c.endswith('(g)')]
gram_df = food_info[gram_columns]
gram_df.head(3)
# Arithmetic on columns (add, subtract, multiply, divide)
# Element-wise arithmetic on columns; + - * / all work the same way.
iron_grams = food_info["Iron_(mg)"] / 1000  # scale the Iron_(mg) column by 1/1000
# Column-by-column arithmetic requires both operands to have the same length.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Create a new column. It holds the scaled iron values, so the "Iron_(g)"
# label matches its contents (previously it was filled with water_energy).
food_info["Iron_(g)"] = iron_grams
# Maximum and minimum values
# Largest value in the energy column; use .min() for the smallest.
# The column key is spelled "Energ_Kcal", matching its other use in this file.
max_calories = food_info["Energ_Kcal"].max()
# Sorting
# Sort the table in place by sodium content, smallest first.
food_info.sort_values(by="Sodium_(mg)", ascending=True, inplace=True)
# Sort again in place, this time largest first (descending).
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
# Load the Titanic training data and work with missing values in "Age".
import pandas as pd
import numpy as np

titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()
age = titanic_survival["Age"]
age_is_null = pd.isnull(age)  # boolean mask: True where Age is missing, False otherwise
age_null_true = age[age_is_null]  # only the missing entries of Age
age_null_count = len(age_null_true)  # number of missing ages
good_ages = titanic_survival["Age"][~age_is_null]  # ages with the missing entries removed
correct_mean_age = sum(good_ages) / len(good_ages)  # mean age computed by hand on the non-missing values
# Same mean via the built-in; Series.mean() skips missing values by default.
correct_mean_age = titanic_survival["Age"].mean()
# Average fare per passenger class, computed with an explicit loop.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # All rows belonging to the current passenger class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]  # the fares paid in that class
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class  # mean fare for this class
# The loop above is equivalent to the single pivot_table call below
# Pivot tables: group by `index`, aggregate the `values` column(s) with `aggfunc`.
# Aggregators are given by name ("mean", "sum"), which pandas accepts directly.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
# aggfunc defaults to the mean, so this gives the same table as the line above.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Fare")
# Several value columns are passed as a list; here each is summed per port of embarkation.
passenger_survival = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
# Dropping missing data, label lookup, sorting and re-indexing.
drop_na_columns = titanic_survival.dropna(axis=1)  # drop every column that contains a missing value
# Drop a row when either its Age or its Sex is missing.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
row_index_83_age = titanic_survival.loc[83, "Age"]  # Age at index label 83
# NOTE: this rebinds new_titanic_survival to the full table sorted by Age (descending).
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
# After sorting, discard the old index labels and renumber the rows from 0.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
# Custom functions
def hundredth_row(column):
    """Return the item of *column* stored under index label 99.

    The lookup is label-based (``.loc``), so it is the 100th row only when
    the column still carries the default 0..n-1 index. A ``KeyError`` is
    raised when label 99 is absent.
    """
    hundredth_item = column.loc[99]
    return hundredth_item
# Run hundredth_row on every column. The result gets its own name so the
# function is not overwritten and this statement can be re-run safely.
hundredth_row_values = titanic_survival.apply(hundredth_row)
def not_null_count(column):
    """Return how many entries of *column* are missing.

    Despite the name, this counts the null entries, not the non-null ones.
    The name is kept so existing callers continue to work.
    """
    # Summing the boolean mask counts the True (missing) entries.
    return int(pd.isnull(column).sum())
# The function is passed without arguments; apply() calls it once per column
# (axis=0), giving the number of missing entries in each column.
column_null_count = titanic_survival.apply(not_null_count, axis=0)
# Series
import pandas as pd
# Build a Series of Rotten Tomatoes scores indexed by film name.
from pandas import Series

fandango = pd.read_csv('fandango_score_comparison.csv')
series_film = fandango['FILM']
series_rt = fandango['RottenTomatoes']
print(type(series_film))  # <class 'pandas.core.series.Series'>
film_names = series_film.values  # the underlying array of film names
rt_scores = series_rt.values  # the underlying array of scores
series_custom = Series(rt_scores, index=film_names)  # scores, with film_names as the index
series_custom[5:10]  # slice with integer bounds
# Reorder the custom Series, first by hand and then with the built-in sorters.
original_index = series_custom.index.tolist()
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)  # rows rearranged into sorted label order
sc2 = series_custom.sort_index()  # sort by index label
sc2 = series_custom.sort_values()  # sort by value (rebinds sc2)
# Critic and user scores, each indexed by the FILM column, and their average.
rt_critics = Series(data=fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(data=fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
# Addition aligns the two Series on their index before halving.
rt_sum = rt_critics + rt_users
rt_mean = rt_sum / 2
# Use the FILM column as the index; drop=False also keeps it as a regular column.
fandango_films = fandango.set_index('FILM', drop=False)
print(fandango_films.index)
# Label-based slice, analogous to slicing with integer positions.
# NOTE(review): labels must match the FILM values character for character;
# spacing and capitalisation were normalised here, so confirm against the CSV.
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]