Data analysis on a tabular dataset usually follows these steps:
1. Data overview
- Read the dataset and check its size and the raw feature dimensions
- Check the data types and basic statistics of the features
2. Missing values and unique values
- Check the missing-value situation
- Check for features with a single unique value
3. Digging into the data
- Categorical features
- Numerical features (discrete and continuous)
4. Correlations in the data
- Between features
- Between features and the target variable
5. Generate a data report with pandas_profiling
Let's walk through the analysis using a retail credit-risk binary-classification competition dataset as the example.
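Before starting, load the training and test sets and import the libraries used throughout this post. The file names here are placeholders for the actual competition files.

import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Placeholder file names; replace with the actual competition files
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')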
1. Overview
print(train.shape)
print(test.shape)
print(train.columns)
train.info()
train.describe()
(800000, 47)
(200000, 46)
Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade', 'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership', 'annualIncome', 'verificationStatus', 'issueDate', 'isDefault', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'], dtype='object')
For a binary classification problem, we should also check the ratio of positive to negative samples to see whether there is a class-imbalance problem.
train['isDefault'].value_counts()
0    640390
1    159610
Name: isDefault, dtype: int64
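The same counts expressed as shares show that roughly 20% of the samples default, so the classes are imbalanced but not extremely so:

train['isDefault'].value_counts(normalize=True)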
2. Missing and unique values
Check how many columns contain missing values and what their missing rates are. Columns with a very high missing rate can be considered for removal, while columns with only a few missing values can be imputed (a rough sketch follows the missing-rate plot below).
train.isnull().any().sum()
22
missing = train.isnull().sum() / len(train)
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()
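As a rough sketch of the drop-or-impute rule mentioned above (the 50% threshold and the median/mode fills are assumptions, not choices made in the original analysis):

# Work on a copy so the rest of the EDA still sees the original data
train_clean = train.copy()

# Drop columns whose missing rate exceeds an (assumed) 50% threshold
train_clean = train_clean.drop(columns=missing[missing > 0.5].index)

# Impute the remaining gaps: mode for object columns, median for numeric ones
for col in missing[missing <= 0.5].index:
    if train_clean[col].dtype == 'object':
        train_clean[col] = train_clean[col].fillna(train_clean[col].mode()[0])
    else:
        train_clean[col] = train_clean[col].fillna(train_clean[col].median())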
Check whether any feature has only a single unique value; if so, consider dropping it during the later feature engineering.
one_value_col = [col for col in train.columns if train[col].nunique() <= 1]
one_value_col
['policyCode']
3. Digging into the data types
First check how many columns hold numerical variables and how many hold categorical ones.
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea, list(train.columns)))
print(len(numerical_fea))
print(len(category_fea))
print(category_fea)
print(numerical_fea)
42
5
['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'isDefault', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
Of these columns, only 5 are of object type; they will need to be converted during feature engineering (a minimal conversion sketch follows the value-count plots below).
# Plot the value counts of each categorical feature
for col in category_fea:
    temp = train[col].value_counts()
    plt.figure(figsize=(10, 6))
    sns.barplot(x=temp.index, y=temp.values)
    plt.xlabel(str(col))
    plt.show()
    plt.close('all')
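As a preview of that later conversion, here is a minimal sketch; the grade mapping and the employmentLength parser are assumptions for illustration, and issueDate is handled separately in section 6.

# grade is an ordered category, so an ordinal mapping is a natural encoding
grade_num = train['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})

# employmentLength can be mapped to the number of years it represents
def employment_length_to_int(s):
    if pd.isnull(s):
        return np.nan
    if s == '< 1 year':
        return 0
    if s == '10+ years':
        return 10
    return int(s.split()[0])

employment_num = train['employmentLength'].map(employment_length_to_int)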
Next, the numerical variables need to be split into discrete and continuous ones. Here a variable with no more than 10 distinct values is treated as discrete.
def get_numerical_serial_fea(data, feas):
    # Split numerical features into continuous (serial) and discrete (noserial) ones
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        temp = data[fea].nunique()
        if temp <= 10:
            numerical_noserial_fea.append(fea)
            continue
        numerical_serial_fea.append(fea)
    return numerical_serial_fea, numerical_noserial_fea

numerical_serial_fea, numerical_noserial_fea = get_numerical_serial_fea(train, numerical_fea)
For discrete variables, the main thing to check is whether the distribution is severely skewed, i.e. whether most samples take the same value.
# Plot the distribution of each discrete numerical variable
for col in numerical_noserial_fea:
    temp = train[col].value_counts()
    plt.figure()
    sns.barplot(x=temp.index, y=temp.values)
    plt.xlabel(str(col))
    plt.show()
    plt.close('all')
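To turn the "most samples take the same value" check into a number rather than a visual judgment, one option (the 95% cutoff is an assumption) is to look at the share of the single most frequent value:

# Flag discrete variables dominated by one value
for col in numerical_noserial_fea:
    top_share = train[col].value_counts(normalize=True).iloc[0]
    if top_share > 0.95:
        print(f'{col}: {top_share:.2%} of samples take the same value')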
For continuous variables, the main thing to check is whether the data follows a normal (or log-normal) distribution; heavily skewed data may affect the prediction results.
# Plot the distribution of each continuous numerical variable
f = pd.melt(train, value_vars=numerical_serial_fea)
g = sns.FacetGrid(f, col="variable", col_wrap=4, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")
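The skew can also be measured numerically, and a log1p transform compared for the right-skewed columns (the threshold of 1 and the use of log1p are assumptions; columns with negative values are clipped at 0 just for this check):

# Skewness before and after log1p for the most skewed continuous variables
skewness = train[numerical_serial_fea].skew().sort_values(ascending=False)
for col in skewness[skewness > 1].index:
    logged = np.log1p(train[col].clip(lower=0))
    print(col, f'skew={skewness[col]:.2f}', f'log1p skew={logged.skew():.2f}')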
4. Correlations in the data
First explore the correlations between features and between each feature and the target variable.
corr = train.corr()
sns.heatmap(corr, annot=False)
plt.show()
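The heatmap mostly shows feature-feature relationships; for the feature-target side, the isDefault column of the same matrix can be ranked by absolute value:

# Correlation of each numeric feature with the target, strongest first
target_corr = corr['isDefault'].drop('isDefault')
target_corr.reindex(target_corr.abs().sort_values(ascending=False).index)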
We can also compare the distribution of each variable across the different values of y.
loan_def = train.loc[train['isDefault'] == 1]
loan_nodef = train.loc[train['isDefault'] == 0]
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 8))
loan_def.groupby('grade').grade.count().plot(kind='barh', ax=ax1, title='Count of grade - Default')
loan_nodef.groupby('grade').grade.count().plot(kind='barh', ax=ax2, title='Count of grade - Not Default')
loan_def.groupby('employmentLength').employmentLength.count().plot(kind='barh', ax=ax3, title='Count of employmentLength - Default')
loan_nodef.groupby('employmentLength').employmentLength.count().plot(kind='barh', ax=ax4, title='Count of employmentLength - Not Default')
plt.show()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
train.loc[train['isDefault'] == 1]['loanAmnt'].plot(
    kind='hist', bins=100, title='Loan Amt - Default', color='r', ax=ax1)
train.loc[train['isDefault'] == 0]['loanAmnt'].plot(
    kind='hist', bins=100, title='Loan Amt - Not Default', color='b', ax=ax2)
5. Generate a data report with pandas_profiling
import pandas_profiling

pfr = pandas_profiling.ProfileReport(train)
pfr.to_file("./example.html")
6. Miscellaneous
Inspect the date-formatted columns.
# Convert the format: the derived value is the number of days between issueDate
# and the earliest date in the dataset (2007-06-01)
day = pd.to_datetime(train['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
day = day.apply(lambda x: x - startdate).dt.days
sns.histplot(day)
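If this day count is meant to be used as a feature later, it can be stored as an issueDateDT column on both sets (assuming test has issueDate in the same format):

# Persist the derived feature on both train and test
for df in [train, test]:
    df['issueDateDT'] = (pd.to_datetime(df['issueDate'], format='%Y-%m-%d') - startdate).dt.days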
Pivot table
pivot = pd.pivot_table(train, index=['grade'], columns=['employmentLength'],
                       values=['loanAmnt'], aggfunc=np.mean)
pivot
Mean loanAmnt by grade (rows) and employmentLength (columns):

| grade | 1 year | 10+ years | 2 years | 3 years | 4 years | 5 years | 6 years | 7 years | 8 years | 9 years | < 1 year |
|---|---|---|---|---|---|---|---|---|---|---|---|
| A | 13335.898602 | 14639.775562 | 13388.455829 | 13486.865618 | 13553.706061 | 13496.995001 | 13775.733802 | 13935.131159 | 14192.510565 | 14072.881276 | 13560.589568 |
| B | 12486.311108 | 14191.576561 | 12655.767868 | 12852.254751 | 12997.182897 | 13048.167405 | 13135.203245 | 13318.696946 | 13460.523945 | 13513.865997 | 12994.001504 |
| C | 13093.052072 | 15527.287529 | 13383.550540 | 13587.211321 | 13731.955067 | 13860.520936 | 14098.561372 | 14395.124677 | 14413.680358 | 14699.868190 | 13483.717789 |
| D | 14204.809266 | 16918.674549 | 14418.175926 | 14476.062066 | 14837.774220 | 14834.854212 | 15224.665884 | 15742.203467 | 15625.839781 | 15967.309875 | 14230.622259 |
| E | 16304.007848 | 19339.688764 | 16762.507469 | 16840.061266 | 17080.681138 | 17478.838499 | 17938.082852 | 17567.287968 | 17981.827812 | 18108.666970 | 16209.714997 |
| F | 17570.015699 | 20787.572663 | 17880.975030 | 18417.187500 | 18881.518876 | 19196.168342 | 19050.279018 | 19315.302691 | 19507.407407 | 19630.162338 | 18335.909091 |
| G | 18475.923295 | 22099.393271 | 20240.042827 | … | … | … | … | … | … | … | … |