1.查看數據的類型概況
# Collect every column name of the training frame into a plain list.
cols = list(train.columns)
print('Number of features: {}'.format(len(cols)))
print('Feature types:')
# Count how many columns share each dtype. The result is printed explicitly
# because a bare expression is only echoed in an interactive/notebook session;
# run as a script it would leave the "Feature types:" header with nothing under it.
print(train[cols].dtypes.value_counts())
結果如下:
Number of features: 376
Feature types:
2.查看特徵的數值範圍
# Bucket the columns by how many distinct values each one holds:
#   counts[0] -> constant columns (exactly one distinct value)
#   counts[1] -> binary columns (exactly two distinct values and int64 dtype,
#                typically 0/1 flags)
#   counts[2] -> everything else (reported below as "categorical")
counts = [[], [], []]
for c in cols:
    typ = train[c].dtype
    # nunique(dropna=False) treats a missing value as one distinct value and,
    # unlike len(np.unique(...)), does not sort the column, so object columns
    # that mix strings with NaN do not raise TypeError.
    uniq = train[c].nunique(dropna=False)
    if uniq == 1:
        counts[0].append(c)
    elif uniq == 2 and typ == np.int64:
        # NOTE: only int64 columns qualify; a two-valued column of any other
        # dtype falls through to the last bucket.
        counts[1].append(c)
    else:
        counts[2].append(c)

# Summary of bucket sizes, followed by the names in the two small buckets.
print('Constant features: {}\n Binary features: {} \nCategorical features: {}\n'.format(*[len(c) for c in counts]))
print('Constant features:', counts[0])
print('Categorical features:', counts[2])
結果如下:
Constant features: 12
Binary features: 356
Categorical features: 10
Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['ID', 'y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']
3.畫出類別特徵值的分布情況
# Draw one bar chart per column in the last bucket (counts[2]), showing how
# many rows carry each distinct value of that column.
pal = sns.color_palette()
for c in counts[2]:
    value_counts = train[c].value_counts()
    positions = range(len(value_counts))  # one bar slot per distinct value

    fig, ax = plt.subplots(figsize=(10, 5))
    cardinality = len(np.unique(train[c]))
    ax.set_title('Categorical feature {} - Cardinality {}'.format(c, cardinality))
    ax.set_xlabel('Feature value')
    ax.set_ylabel('Occurences')
    ax.bar(positions, value_counts.values, color=pal[1])
    # Label each bar with the value it represents; vertical text avoids
    # overlapping labels when a column has many distinct values.
    ax.set_xticks(positions)
    ax.set_xticklabels(value_counts.index, rotation='vertical')
    plt.show()