import pandas  # harmless if pandas was already imported under this name

# Load the food table from CSV into a DataFrame (a 2-D, table-like structure).
food_info = pandas.read_csv("food_info.csv")
print(type(food_info))  # <class 'pandas.core.frame.DataFrame'>
# Commonly seen column dtypes: int64, float64, object (strings), datetime, bool.
print(food_info.dtypes)
# help() prints the documentation itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
help(pandas.read_csv)
num = 3  # rows to preview; head()/tail() show 5 rows when no count is given
food_info.head(num)  # first `num` rows
food_info.tail(num)  # last `num` rows
food_info.columns  # column labels as a pandas Index (call .tolist() for a list)
food_info.shape  # (number of rows, number of columns)
food_info.loc[0]  # the row whose index label is 0
food_info.loc[3:6]  # label-based slice of rows 3 through 6
# Select one column by name. A trailing comma inside the brackets would turn
# the key into a tuple, which is not a column label.
food_info["NDB_NO"]
# 过滤列名 (filter columns by name)
# Keep only the columns whose label ends with "(g)".
col_names = food_info.columns.tolist()  # all column labels as a plain list
gram_columns = [c for c in col_names if c.endswith('(g)')]
gram_df = food_info[gram_columns]
gram_df.head(3)
# 加减乘除 (arithmetic: add, subtract, multiply, divide)
# Arithmetic between a column and a scalar is element-wise; +, -, * and /
# all behave this way.
iron_grams = food_info["Iron_(mg)"] / 1000
# Column-by-column arithmetic; both operands should have the same length.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Store the converted iron values in a new column. The previous code stored
# water_energy here, which did not match the column name.
food_info["Iron_(g)"] = iron_grams
# 最大值和最小值 (maximum and minimum)
# Largest value in the column; .min() gives the smallest.
# The label is spelled "Energ_Kcal" where this column is used elsewhere in
# the file, so the same spelling is used here.
max_calories = food_info["Energ_Kcal"].max()
# 排序 (sorting)
# Reorder food_info in place by the sodium column, smallest value first.
food_info.sort_values(by="Sodium_(mg)", ascending=True, inplace=True)
# Same column again, this time largest value first.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
import pandas as pd
import numpy as np

titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()

age = titanic_survival["Age"]
age_is_null = pd.isnull(age)  # True where the age is missing, False otherwise
age_null_true = age[age_is_null]  # only the missing entries
age_null_count = len(age_null_true)  # how many ages are missing

good_ages = titanic_survival["Age"][~age_is_null]  # ages that are present
# Mean age computed by hand over the present values ...
correct_mean_age = sum(good_ages) / len(good_ages)
# ... and via Series.mean(), which skips missing values by default.
correct_mean_age = titanic_survival["Age"].mean()
# Mean fare for each passenger class, keyed by the class number.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows belonging to this class only.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]  # the Fare values of those rows
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
# 上面的代码等于下面这一句 (the loop above is equivalent to the single statement below)
# Mean Fare per Pclass value, with the aggregation named explicitly.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc="mean")
# Same result: pivot_table aggregates with the mean when aggfunc is omitted.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Fare")
# Sum of the Fare and Survived columns for each Embarked value.
passenger_survival = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
# Drop every column that contains a missing value.
drop_na_columns = titanic_survival.dropna(axis=1)
# Drop every row in which Age or Sex is missing.
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
# Age value of the row whose index label is 83.
row_index_83_age = titanic_survival.loc[83, "Age"]
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
# drop=True discards the old index, so the sorted rows are renumbered from 0.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
# 自定义函数 (user-defined functions)
def hundredth_row(column):
    """Return the entry of ``column`` stored under index label 99.

    The lookup uses ``.loc``, so it is label-based: on a default 0-based
    index this is the 100th row.

    Args:
        column: a pandas Series (for example one column handed in by
            ``DataFrame.apply``).

    Returns:
        The value found at label 99.

    Raises:
        KeyError: if ``column`` has no label 99.
    """
    hundredth_item = column.loc[99]
    return hundredth_item
# Run hundredth_row on every column. The result gets its own name so that
# the hundredth_row function is not overwritten and stays callable.
hundredth_row_values = titanic_survival.apply(hundredth_row)
def not_null_count(column):
    """Return how many entries of ``column`` are missing.

    Despite its name, this counts the null entries, not the non-null ones.
    The name is kept so existing callers continue to work.

    Args:
        column: a pandas Series (for example one column handed in by
            ``DataFrame.apply``).

    Returns:
        int: the number of entries for which ``pd.isnull`` is true.
    """
    column_null = pd.isnull(column)
    # Summing a boolean mask counts its True entries.
    return int(column_null.sum())
# apply() receives the function object itself (no call parentheses) and runs
# it once per column, giving the missing-value count of each column.
column_null_count = titanic_survival.apply(not_null_count, axis=0)
# Series
import pandas as pd
from pandas import Series

fandango = pd.read_csv('fandango_score_comparison.csv')
# Selecting a single column of a DataFrame yields a Series.
series_film = fandango['FILM']
series_rt = fandango['RottenTomatoes']
print(type(series_film))  # <class 'pandas.core.series.Series'>
# Underlying value arrays of the two columns: film names and scores.
film_names, rt_scores = series_film.values, series_rt.values
# Build a Series of scores whose index is the film names.
series_custom = Series(data=rt_scores, index=film_names)
series_custom[5:10]
# Reorder the Series so that its index labels are in sorted order.
original_index = list(series_custom.index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(index=sorted_index)
# The Series is named series_custom; "series_customer" was a typo that would
# raise NameError.
sc2 = series_custom.sort_index()  # ordered by index label
sc2 = series_custom.sort_values()  # ordered by value (rebinds sc2)
# Two Series built from the RottenTomatoes and RottenTomatoes_User columns,
# both indexed by the FILM column.
rt_critics = Series(data=fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(data=fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
# Element-wise average of the two Series.
rt_mean = rt_critics.add(rt_users).div(2)
# Use the FILM column as the index; drop=False keeps FILM as a regular
# column as well.
fandango_films = fandango.set_index('FILM', drop=False)
print(fandango_films.index)
# Label-based slice between two film titles, analogous to slicing with
# integer positions. The attribute access previously used a full-width "。",
# which is a syntax error.
# NOTE(review): the titles were respelled with standard spacing and
# capitalisation on the assumption that this matches the CSV - confirm against
# the data, since a label missing from the index raises KeyError.
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
