數據預處理以及探索性分析(EDA)


1.根據某個列進行groupby,判斷是否存在重復列。

# Per-player count of distinct values in every player-level column.  A count
# above 1 means the column is not constant within that player's rows (for
# example, two different weights recorded for the same player).
all_cols_unique_players = (
    df.groupby('playerShort')
      .agg(dict.fromkeys(player_cols, 'nunique'))
)

 其中針對.agg函數:

DataFrame.agg(func=None, axis=0, *args, **kwargs)

Aggregate using one or more operations over the specified axis.

例子:

>>> df = pd.DataFrame([[1, 2, 3],
...                    [4, 5, 6],
...                    [7, 8, 9],
...                    [np.nan, np.nan, np.nan]],
...                   columns=['A', 'B', 'C'])


>>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
        A    B
max   NaN  8.0
min   1.0  2.0
sum  12.0  NaN

2.獲取一個子表

其主要實現的是: """Helper function that creates a sub-table from the columns and runs a quick uniqueness test."""

# Grouping key: the short player identifier.
player_index = 'playerShort'

# Player-level attributes carried into the sub-table.  The full-name column
# 'player' is deliberately left out because the grouping key already
# identifies each player uniquely.
player_cols = [
    'birthday', 'height', 'weight', 'position',
    'photoID', 'rater1', 'rater2',
]
def get_subgroup(dataframe, g_index, g_columns):
    """Build a one-row-per-group sub-table and warn if it is not unique.

    Args:
        dataframe: Table to group.
        g_index: Column name (or list of names) to group by.
        g_columns: Columns expected to hold a single value within each group.

    Returns:
        DataFrame indexed by ``g_index`` holding the per-group maximum of
        every column in ``g_columns``.
    """
    grouped = dataframe.groupby(g_index)
    distinct_counts = grouped.agg({col: 'nunique' for col in g_columns})
    # Warn when ANY column has more than one distinct value in ANY group.
    # Masking with ``counts[counts > 1]`` followed by ``dropna()`` would only
    # keep groups in which every column is non-unique, hiding most violations.
    if (distinct_counts > 1).to_numpy().any():
        print("Warning: you probably assumed this had all unique values but it doesn't.")
    return grouped.agg({col: 'max' for col in g_columns})
# Build the per-player sub-table (one row per `playerShort`) and preview it.
players = get_subgroup(df, player_index, player_cols)
players.head()

3.將整理出的子表存儲至CSV文件

def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
    """Write a sub-table to a gzipped CSV and check that it reads back equal.

    The file is named ``<prefix><subgroup_name>.csv.gz``.  After writing, the
    file is read back with ``g_index`` as the index column and compared with
    ``dataframe``; the outcome of that comparison is printed.

    Args:
        dataframe: Sub-table to save.
        g_index: Index column name(s) used when reading the file back.
        subgroup_name: Middle part of the output file name.
        prefix: Leading part of the output file name.
    """
    target_path = prefix + subgroup_name + ".csv.gz"
    dataframe.to_csv(target_path, compression='gzip', encoding='UTF-8')

    # Round trip: reload what was just written and compare with the input.
    reloaded = pd.read_csv(target_path, compression='gzip',
                           index_col=g_index, encoding='UTF-8')
    if not dataframe.equals(reloaded):
        print("Warning -- equivalence test!!! Double-check.")
        return
    print("Test-passed: we recover the equivalent subgroup dataframe.")

4.數據具體指標查看---缺失值查看

import missingno as msno
import pandas_profiling
# missingno matrix plot of missing values, drawn for a random sample of
# 500 players.
msno.matrix(players.sample(500),
            figsize=(16, 7),
            width_ratios=(15, 1))
# missingno heatmap, drawn for a second, independently drawn sample of 500.
msno.heatmap(players.sample(500),
            figsize=(16, 7),)

5.數據透視表

轉至https://blog.csdn.net/Dorisi_H_n_q/article/details/82288092

透視表概念:pd.pivot_table()

透視表是各種電子表格程序和其他數據分析軟件中一種常見的數據匯總工具。它根據一個或多個鍵對數據進行聚合,並根據行和列上的分組鍵將數據分配到各個矩形區域中。

 透視表:根據特定條件進行分組計算,查找數據,進行計算
# Pivot `df`: rows keyed by 'hand', columns keyed by 'male', each cell holding
# the minimum of the values that fall into that (hand, male) group.
pd.pivot_table(df,index=['hand'],columns=['male'],aggfunc='min')

交叉表概念:pd.crosstab(index, columns)

交叉表是一種用於計算分組頻率的特殊透視圖,對數據進行匯總。

# Contingency table of the two raters' scores (shown as the cell output).
rater_table = pd.crosstab(players.rater1, players.rater2)
rater_table

# The same table drawn as a heatmap annotated with the integer counts.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(rater_table, ax=ax, annot=True, fmt='d', cmap='Blues')
ax.set_title("Correlation between Rater 1 and Rater 2\n")
fig.tight_layout()

創建一個新列,新列的值是另外兩列的平均值

# New column: row-wise mean of the two raters' scores.
players['skintone'] = players[['rater1', 'rater2']].mean(axis='columns')
players.head()

6.對離散category數值進行處理

# Discretise the continuous weight variable.
# Labels for the bins, ordered from lightest to heaviest.
weight_categories = [
    "vlow_weight",
    "low_weight",
    "mid_weight",
    "high_weight",
    "vhigh_weight",
]
# pd.qcut cuts on quantiles: one bin per label, with the label list passed
# as the third positional argument (`labels`).
players['weightclass'] = pd.qcut(players['weight'],
                                 len(weight_categories),
                                 weight_categories)

 

 (Create higher level categories)

# Distinct values of the `position` column (NaN included).
position_types = players.position.unique()
position_types
# Recorded output, kept as a plain string literal so the cell stays valid
# Python (ASCII triple quotes, not typographic ones).
"""
array(['Center Back', 'Attacking Midfielder', 'Right Midfielder',
       'Center Midfielder', 'Goalkeeper', 'Defensive Midfielder',
       'Left Fullback', nan, 'Left Midfielder', 'Right Fullback',
       'Center Forward', 'Left Winger', 'Right Winger'], dtype=object)
"""

# Fine-grained positions grouped into four broader roles.
defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback']
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder']
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']
keeper = 'Goalkeeper'

# Modify the dataframe in place: write the aggregated role into the
# categorical `position_agg`.  Rows whose position matches none of the groups
# are not assigned.
for role_mask, role_label in (
    (players['position'].isin(defense), "Defense"),
    (players['position'].isin(midfield), "Midfield"),
    (players['position'].isin(forward), "Forward"),
    (players['position'].eq(keeper), "Keeper"),
):
    players.loc[role_mask, 'position_agg'] = role_label

繪制value_counts()圖片

# Figure size used for this medium-sized plot.
MIDSIZE = (12, 8)
fig, ax = plt.subplots(figsize=MIDSIZE)
# Horizontal bar chart of how many players fall into each aggregated position.
# dropna=False also counts players without a position_agg value;
# ascending=True orders the counts from smallest to largest.
players['position_agg'].value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
ax.set_ylabel("position_agg")
ax.set_xlabel("Counts")
fig.tight_layout()

7.繪制多變量之間的關系圖

from pandas.plotting import scatter_matrix
# Pairwise scatter plots of height, weight and skintone, with a histogram of
# each variable on the diagonal.
fig, ax = plt.subplots(figsize=(10, 10))
scatter_matrix(players[['height', 'weight', 'skintone']], alpha=0.2, diagonal='hist', ax=ax);

# Perhaps you want to see a particular relationship more clearly

# Scatter plot of height against weight with a fitted regression line.
fig, ax = plt.subplots(figsize=MIDSIZE)
# x and y are passed by keyword: seaborn >= 0.12 makes them keyword-only, and
# older releases accept the keyword form as well.
sns.regplot(x='weight', y='height', data=players, ax=ax)
ax.set_ylabel("Height [cm]")
ax.set_xlabel("Weight [kg]")
fig.tight_layout()

8.連續變量離散化(Create quantile bins for continuous variables)

# Labels for the weight bins, ordered from lightest to heaviest.
weight_categories = [
    "vlow_weight", "low_weight", "mid_weight", "high_weight", "vhigh_weight",
]

# Cut `weight` into as many quantile bins as there are labels and store the
# resulting label for each player.
players['weightclass'] = pd.qcut(
    players['weight'],
    q=len(weight_categories),
    labels=weight_categories,
)

9.數據報表查看

# Build a pandas_profiling report for the players table.
# NOTE(review): presumably rendered inline as the notebook cell output -- confirm.
pandas_profiling.ProfileReport(players)

10.出生日期等時間格式處理

# Parse the day.month.year birthday strings into timestamps.
players['birth_date'] = pd.to_datetime(players.birthday, format='%d.%m.%Y')
# Age in years on 2013-01-01: elapsed days divided by 365.25.
players['age_years'] = ((pd.to_datetime("2013-01-01") - players['birth_date']).dt.days)/365.25
players['age_years']
# Select specific columns
players_cleaned_variables = players.columns.tolist()
players_cleaned_variables
# Join the dyad table onto the players table index-to-index; the dyad table is
# re-indexed by 'playerShort' first.
# NOTE(review): assumes clean_players is indexed by playerShort as well -- confirm.
player_dyad = (clean_players.merge(agg_dyads.reset_index().set_index('playerShort'),
                                   left_index=True,
                                   right_index=True))
# groupby + sort_values + rename:
# sum every column per value of index level 1, order the groups by their
# 'redcard' total (largest first), give that column a readable name and show
# the first five rows.
(tidy_dyads.groupby(level=1)
           .sum()
           .sort_values('redcard', ascending=False)
           .rename(columns={'redcard':'total redcards received'})).head()

針對時間統計可以分別列出,年、月、日以及相應的小時等數據,查看年的、月的、季度的、每天時間段的各個統計量。 

11.數據偏度與峰度

正態分布的偏度應為零。負偏度表示偏左,正偏表示右偏。

正態分布的(超額)峰度同樣應為零;峰度明顯大於零說明分布的尾部比正態分布更厚。我們的數據肯定有一些異常值!

對於分布極度不均衡(嚴重偏斜)的數據,可以采用log變換的方式,使變量的分布更接近正態分布。

具體的代碼采用的是np.log

# Skewness of total_pop after taking the natural log of every value.
recent[['total_pop']].apply(np.log).apply(scipy.stats.skew)

# Plotting helper
def plot_hist(df, variable, bins=20, xlabel=None, by=None,
              ylabel=None, title=None, logx=False, ax=None):
    """Draw a histogram of ``df[variable]``, optionally on a log-scaled x axis.

    Args:
        df: DataFrame holding the data.  It is not modified.
        variable: Name of the column to plot.
        bins: Forwarded to ``ax.hist``.  With ``logx=True`` it must be an
            integer and is used as the number of log-spaced bin edges.
        xlabel: Optional x-axis label.
        by: Unused; kept so existing callers keep working.
        ylabel: Optional y-axis label.
        title: Optional axes title.
        logx: When true, use log-spaced bins and a log x axis.  Data with a
            minimum <= 0 is shifted so that the minimum becomes 1, and a
            warning reporting the shift is printed.
        ax: Axes to draw on.  A new 12x8 figure is created when omitted.

    Returns:
        The axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(12, 8))

    values = df[variable]
    if logx:
        lowest = values.min()
        if lowest <= 0:
            # A log axis needs strictly positive data.  The shift is computed
            # before it is applied so the message reports the real offset,
            # and it is applied to a new Series so the caller's frame is
            # left untouched.
            shift = 1 - lowest
            values = values + shift
            print('Warning: data <=0 exists, data transformed by %0.2g before plotting' % shift)
        bins = np.logspace(np.log10(values.min()),
                           np.log10(values.max()),
                           bins)
        ax.set_xscale("log")

    ax.hist(values.dropna().values, bins=bins)
    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)
    return ax
# Histogram of total_pop on a log-scaled x axis (bins=25, logx=True).
plot_hist(recent, 'total_pop', bins=25, logx=True, 
          xlabel='Log of total population', ylabel='Number of countries',
          title='Distribution of total population of countries 2013-2017');
 
         

查看單個變量之間的變化規律時,在出現變量數值不均衡,有的數值很大,有的很小時,可以采用變量的增長速度比率進行表示,查看其變化情況。例子:

原數據

# Plot the raw total_pop series of every North American country, using a
# 23-colour diverging palette for the lines.
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    # Country names ordered by total_pop within the '1958-1962' slice.
    # NOTE(review): subregion / time_slice / time_series are helpers defined
    # elsewhere; their exact return types are assumed here -- confirm.
    north_america = time_slice(subregion(data, 'North America'), '1958-1962').sort_values('total_pop').index.tolist()
    for country in north_america:
        plt.plot(time_series(data, country, 'total_pop'), label=country);
        # The labels and title are re-applied on every iteration.
        plt.xlabel('Year');
        plt.ylabel('Population');
        plt.title('North American populations over time');
    plt.legend(loc=2,prop={'size':10});

 

比率處理后的數據可視化

# Same plot after normalisation: each country's series is divided by its own
# minimum and multiplied by 100, so countries of very different size become
# comparable.  Relies on `north_america` having been built beforehand.
with sns.color_palette(sns.diverging_palette(220, 280, s=85, l=25, n=23)):
    for country in north_america:
        ts = time_series(data, country, 'total_pop')
        # 100 corresponds to the series minimum.
        # NOTE(review): the labels say "percent increase ... from 1960", which
        # assumes the minimum falls in 1960 -- confirm.
        ts['norm_pop'] = ts.total_pop/ts.total_pop.min()*100
        plt.plot(ts['norm_pop'], label=country);
        plt.xlabel('Year');
        plt.ylabel('Percent increase in population');
        plt.title('Percent increase in population from 1960 in North American countries'); 
# The legend is added after the palette context has been left.
plt.legend(loc=2,prop={'size':10});

# Box plot of total_pop_access_drinking, one box per gdp_bin category.
recent[['gdp_bin','total_pop_access_drinking']].boxplot(by='gdp_bin');
# plt.ylim([0,100000]);
plt.title('Distribution of percent of total population with access to drinking water across gdp per capita categories');
plt.xlabel('GDP per capita quintile');
# The plotted column is the access percentage named in the title, not a
# population count, so the y label describes that quantity.
plt.ylabel('Percent of population with access to drinking water');

 12.對特定列進行處理

# Lookup from the detailed region path to a coarse continent-level label.
simple_regions = {
    # Asia
    'World | Asia': 'Asia',
    # North America (Central America and the Caribbean included)
    'Americas | Central America and Caribbean | Central America': 'North America',
    'Americas | Central America and Caribbean | Greater Antilles': 'North America',
    'Americas | Central America and Caribbean | Lesser Antilles and Bahamas': 'North America',
    'Americas | Northern America | Northern America': 'North America',
    'Americas | Northern America | Mexico': 'North America',
    # South America
    'Americas | Southern America | Guyana': 'South America',
    'Americas | Southern America | Andean': 'South America',
    'Americas | Southern America | Brazil': 'South America',
    'Americas | Southern America | Southern America': 'South America',
    # Remaining continents
    'World | Africa': 'Africa',
    'World | Europe': 'Europe',
    'World | Oceania': 'Oceania',
}

# Replace every region value with its coarse label.  A value that is missing
# from the lookup raises KeyError.
data.region = data.region.apply(simple_regions.__getitem__)

 

  


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM