Feature Selection
Main idea: wrapper methods repeatedly pick candidate feature subsets from the initial feature set, train the learner on each subset, and judge the subsets by the learner's performance until the best subset is found. Wrapper feature selection therefore optimizes directly for the given learner.
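To make the wrapper idea concrete, here is a minimal sketch (not part of the original text): every candidate subset is scored by cross-validating the learner itself, and the best-scoring subset wins. For brevity it enumerates all subsets of the small iris dataset; practical wrappers search greedily (SFS/SBS) or exhaustively, as in the cases below.

from itertools import combinations

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=3)

# Score every candidate subset with the learner itself (cross-validated
# accuracy) and remember the best one -- the essence of a wrapper method.
best_score, best_subset = 0.0, None
for k in range(1, X.shape[1] + 1):
    for subset in combinations(range(X.shape[1]), k):
        score = cross_val_score(knn, X[:, list(subset)], y, cv=5).mean()
        if score > best_score:
            best_score, best_subset = score, subset

print('Best subset:', best_subset, 'accuracy: %.3f' % best_score)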
Case 1: The wrapper method
Common implementation: sequential feature selection.
- Sequential Forward Selection (SFS)
- Sequential Backward Selection (SBS)
Illustration of the SFS process:
Example:
SequentialFeatureSelector(estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, cv=5, n_jobs=1, pre_dispatch='2*n_jobs', clone_estimator=True)
Load the dataset
# Load the dataset
from mlxtend.feature_selection import SequentialFeatureSelector as SFS  # SFS
from mlxtend.data import wine_data  # dataset
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = wine_data()
X.shape  # (178, 13)
Data preprocessing
# Data preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=1)
std = StandardScaler()
X_train_std = std.fit_transform(X_train)
Sequential forward feature selection
# Sequential forward feature selection
knn = KNeighborsClassifier(n_neighbors=3)
sfs = SFS(estimator=knn, k_features=4, forward=True, floating=False,
          verbose=2, scoring='accuracy', cv=0)
sfs.fit(X_train_std, y_train)  # X and y must not be DataFrames here
Inspect the feature indices at each step
# Inspect the feature indices at each step
sfs.subsets_
{1: {'feature_idx': (6,),
'cv_scores': array([0.86290323]),
'avg_score': 0.8629032258064516},
2: {'feature_idx': (6, 9),
'cv_scores': array([0.95967742]),
'avg_score': 0.9596774193548387},
3: {'feature_idx': (6, 9, 11),
'cv_scores': array([0.99193548]),
'avg_score': 0.9919354838709677},
4: {'feature_idx': (6, 8, 9, 11),
'cv_scores': array([0.98387097]),
'avg_score': 0.9838709677419355}}
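As a short addition (not in the original snippet), the finally selected subset can be read off and applied with the standard mlxtend SFS attributes:

print(sfs.k_feature_idx_)   # indices of the final 4-feature subset, e.g. (6, 8, 9, 11)
print(sfs.k_score_)         # score of that subset
X_train_sfs = sfs.transform(X_train_std)   # keep only the selected columns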
Visualization #1: Plotting the results
%matplotlib inline
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')
The output of sfs.get_metric_dict() is as follows:
{1: {'feature_idx': (6,),
'cv_scores': array([0.86290323]),
'avg_score': 0.8629032258064516,
'ci_bound': nan,
'std_dev': 0.0,
'std_err': nan},
2: {'feature_idx': (6, 9),
'cv_scores': array([0.95967742]),
'avg_score': 0.9596774193548387,
'ci_bound': nan,
'std_dev': 0.0,
'std_err': nan},
3: {'feature_idx': (6, 9, 11),
'cv_scores': array([0.99193548]),
'avg_score': 0.9919354838709677,
'ci_bound': nan,
'std_dev': 0.0,
'std_err': nan},
4: {'feature_idx': (6, 8, 9, 11),
'cv_scores': array([0.98387097]),
'avg_score': 0.9838709677419355,
'ci_bound': nan,
'std_dev': 0.0,
'std_err': nan}}
Visualization #2: Selecting the “best” feature combination in a k-range
knn = KNeighborsClassifier(n_neighbors=3)
sfs2 = SFS(estimator=knn, k_features=(3, 10), forward=True, floating=True,
           verbose=0, scoring='accuracy', cv=5)
sfs2.fit(X_train_std, y_train)
fig = plot_sfs(sfs2.get_metric_dict(), kind='std_err')
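The best combination found inside the (3, 10) range can then be inspected the same way; this follow-up is an addition, using the same mlxtend attributes as above:

print('Best subset (indices):', sfs2.k_feature_idx_)
print('Best CV score: %.3f' % sfs2.k_score_)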
The complete code:

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 11 10:12:48 2020
@author: Admin
"""
# Load the dataset
from mlxtend.feature_selection import SequentialFeatureSelector as SFS  # SFS
from mlxtend.data import wine_data  # dataset
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X, y = wine_data()
X.shape  # (178, 13)

# Data preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=1)
std = StandardScaler()
X_train_std = std.fit_transform(X_train)

# Sequential forward feature selection
knn = KNeighborsClassifier(n_neighbors=3)
sfs = SFS(estimator=knn, k_features=4, forward=True, floating=False,
          verbose=2, scoring='accuracy', cv=0)
sfs.fit(X_train_std, y_train)

# Inspect the feature indices at each step
sfs.subsets_

# Visualization #1: plotting the results
%matplotlib inline
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
fig = plot_sfs(sfs.get_metric_dict(), kind='std_err')

# Visualization #2: selecting the "best" feature combination in a k-range
knn = KNeighborsClassifier(n_neighbors=3)
sfs2 = SFS(estimator=knn, k_features=(3, 10), forward=True, floating=True,
           verbose=0, scoring='accuracy', cv=5)
sfs2.fit(X_train_std, y_train)
fig = plot_sfs(sfs2.get_metric_dict(), kind='std_err')
Case 2: Exhaustive feature selection (a wrapper method)
Exhaustive feature selection is a wrapper whose search strategy simply tries every possible feature combination, compares the model's performance on each combination, and picks the best-performing subset. The number of candidate subsets grows combinatorially with the number of features, so this is only practical for small feature sets; the iris data below has only 4 features, i.e. 15 non-empty subsets.
Import the required libraries
# Import the required libraries
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
Load the dataset
# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target
Exhaustive feature selection
# Exhaustive feature selection
knn = KNeighborsClassifier(n_neighbors=3)
efs = EFS(knn, min_features=1, max_features=4,
          scoring='accuracy', print_progress=True, cv=5)
efs = efs.fit(X, y)
Inspect the best feature subset
# Inspect the best feature subset
print('Best accuracy score: %.2f' % efs.best_score_)   # Best accuracy score: 0.97
print('Best subset (indices):', efs.best_idx_)          # Best subset (indices): (0, 2, 3)
print('Best subset (corresponding names):', efs.best_feature_names_)  # not available in some mlxtend versions
Metrics
# Metrics
efs.get_metric_dict()

import pandas as pd
df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df
Visualization
# Visualization
import matplotlib.pyplot as plt

# Mean scores
metric_dict = efs.get_metric_dict()
k_feat = sorted(metric_dict.keys())
avg = [metric_dict[k]['avg_score'] for k in k_feat]

# Shaded region (mean +/- standard deviation)
fig = plt.figure()
upper, lower = [], []
for k in k_feat:
    # bounds
    upper.append(metric_dict[k]['avg_score'] + metric_dict[k]['std_dev'])
    lower.append(metric_dict[k]['avg_score'] - metric_dict[k]['std_dev'])
plt.fill_between(k_feat, upper, lower, alpha=0.2, color='blue', lw=1)

# Line plot
plt.plot(k_feat, avg, color='blue', marker='o')

# Axis labels -- this part did not run for the author, kept commented out
'''
plt.ylabel('Accuracy +/- Standard Deviation')
plt.xlabel('Number of Features')
feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
plt.xticks(k_feat, [str(metric_dict[k]['feature_names']) for k in k_feat], rotation=90)
plt.show()
'''
The complete code:

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 11 10:12:48 2020
@author: Admin
"""
# Import the required libraries
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

# Load the dataset
iris = load_iris()
X = iris.data
y = iris.target

# Exhaustive feature selection
knn = KNeighborsClassifier(n_neighbors=3)
efs = EFS(knn, min_features=1, max_features=4,
          scoring='accuracy', print_progress=True, cv=5)
efs = efs.fit(X, y)

# Inspect the best feature subset
print('Best accuracy score: %.2f' % efs.best_score_)   # Best accuracy score: 0.97
print('Best subset (indices):', efs.best_idx_)          # Best subset (indices): (0, 2, 3)
print('Best subset (corresponding names):', efs.best_feature_names_)  # not available in some mlxtend versions

# Metrics
efs.get_metric_dict()
import pandas as pd
df = pd.DataFrame.from_dict(efs.get_metric_dict()).T
df.sort_values('avg_score', inplace=True, ascending=False)
df

# Visualization
import matplotlib.pyplot as plt

# Mean scores
metric_dict = efs.get_metric_dict()
k_feat = sorted(metric_dict.keys())
avg = [metric_dict[k]['avg_score'] for k in k_feat]

# Shaded region (mean +/- standard deviation)
fig = plt.figure()
upper, lower = [], []
for k in k_feat:
    # bounds
    upper.append(metric_dict[k]['avg_score'] + metric_dict[k]['std_dev'])
    lower.append(metric_dict[k]['avg_score'] - metric_dict[k]['std_dev'])
plt.fill_between(k_feat, upper, lower, alpha=0.2, color='blue', lw=1)

# Line plot
plt.plot(k_feat, avg, color='blue', marker='o')

# Axis labels -- this part did not run for the author, kept commented out
'''
plt.ylabel('Accuracy +/- Standard Deviation')
plt.xlabel('Number of Features')
feature_min = len(metric_dict[k_feat[0]]['feature_idx'])
feature_max = len(metric_dict[k_feat[-1]]['feature_idx'])
plt.xticks(k_feat, [str(metric_dict[k]['feature_names']) for k in k_feat], rotation=90)
plt.show()
'''
Case 3: The filter method
Example 1
Variance thresholding (VarianceThreshold) is a simple feature selection method: it removes features whose variance does not reach a given threshold. By default it removes zero-variance features, i.e. features that take the same value in every sample.
Suppose we have a dataset of Boolean features and want to drop features that are 0 (or 1) in more than 80% of the samples. A Boolean feature is a Bernoulli random variable with variance p(1 - p), so the appropriate threshold is 0.8 × (1 - 0.8) = 0.16.
With variance-based selection, we first compute each feature's variance and then keep the features whose variance exceeds the threshold, using the VarianceThreshold class from sklearn.feature_selection.
Variance-based selection returns the data after feature selection; the threshold parameter is the variance threshold.
from sklearn.feature_selection import VarianceThreshold

X = [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
print(X)
# [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]

sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
sel.fit_transform(X)
'''
array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])
'''
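To see why the first column is dropped, the fitted selector exposes the per-feature variances (this check is an addition to the original example):

# Per-feature variances of the Boolean matrix above: the first column is 1 in
# only 1 of 6 samples, so its variance (1/6 * 5/6 ≈ 0.139) falls below 0.16.
print(sel.variances_)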
Example 2
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
print(X)
# [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]

selector = VarianceThreshold()  # default threshold=0: removes zero-variance features
selector.fit_transform(X)
'''
array([[2, 0],
       [1, 4],
       [1, 1]])
'''
Case 4: The embedded method
Rank the coefficients, i.e. the feature weights, and then select a subset of features according to some threshold.
The feature weights are obtained while the model is being trained, so feature selection is completed as part of training. Folding feature selection into model training in this way, so that it happens automatically during training, is called "embedded" feature selection.
In filter and wrapper methods, feature selection is clearly separated from learner training, whereas embedded feature selection happens automatically while the learner is trained. The most common embedded approaches are L1 and L2 regularization: adding these penalties to a linear regression model yields Lasso regression (L1) and ridge regression (L2), respectively.
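As an illustrative sketch (not from the original text, using sklearn's diabetes dataset), the difference is easy to see: at a comparable penalty strength, L1 regularization typically drives some coefficients exactly to zero, which is the embedded selection effect, while L2 only shrinks them.

from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso, Ridge
from sklearn.preprocessing import StandardScaler

X, y = load_diabetes(return_X_y=True)
X = StandardScaler().fit_transform(X)

lasso = Lasso(alpha=10.0).fit(X, y)   # L1 penalty: sparse coefficients
ridge = Ridge(alpha=10.0).fit(X, y)   # L2 penalty: shrunk but non-zero coefficients

print('Lasso coefficients:', lasso.coef_)
print('Ridge coefficients:', ridge.coef_)
print('Features kept by Lasso:', (lasso.coef_ != 0).sum(), 'of', X.shape[1])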
Example 1
XGBoost provides feature_importances_ out of the box.
# Load the dataset
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# XGBoost feature importances
from xgboost import XGBClassifier
model = XGBClassifier()  # classifier
model.fit(X, y)
model.feature_importances_  # feature importances
# array([0.01251974, 0.03348068, 0.59583396, 0.35816565], dtype=float32)

# Visualization
%matplotlib inline
from xgboost import plot_importance
plot_importance(model)
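As a hypothetical follow-up (not part of the original example), the trained model's importances can drive selection directly through sklearn's SelectFromModel, reusing the fitted XGBClassifier via prefit=True:

from sklearn.feature_selection import SelectFromModel

# Keep only the features whose importance is at least the median importance.
sfm = SelectFromModel(model, threshold='median', prefit=True)
X_selected = sfm.transform(X)
print(X_selected.shape)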
Example 2
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

# Load the boston dataset.
X, y = load_boston(return_X_y=True)

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]

# Plot the selected two features from X.
plt.title("Features selected from Boston using SelectFromModel with "
          "threshold %0.3f." % sfm.threshold)
feature1 = X_transform[:, 0]
feature2 = X_transform[:, 1]
plt.plot(feature1, feature2, 'r.')
plt.xlabel("Feature number 1")
plt.ylabel("Feature number 2")
plt.ylim([np.min(feature2), np.max(feature2)])
plt.show()
Example 3
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

X = [[ 0.87, -1.34,  0.31],
     [-2.79, -0.02, -0.85],
     [-1.34, -0.48, -2.55],
     [ 1.92,  1.48,  0.65]]
y = [0, 1, 0, 1]

selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)

# The base estimator from which the transformer is built.
print(selector.estimator_.coef_)
# [[-0.32857694  0.83411609  0.46668853]]

# The threshold value used for feature selection.
print(selector.threshold_)
# 0.5431271870420732

# Get a mask, or integer index, of the features selected.
print(selector.get_support())

# Reduce X to the selected features.
selector.transform(X)
'''
array([[-1.34],
       [-0.02],
       [-0.48],
       [ 1.48]])
'''