在處理具有很多特徵的數據時,往往需要找出對結果 Y 權重最大的幾個特徵,便於做降維。
於是我們可以用以下這段代碼:
GitHub:https://github.com/chenjunhaolefa/AI/blob/master/MachineLearning/FeatureSelection.py
# coding=utf-8
"""Feature selection via tree-ensemble feature importances.

Fits an ExtraTreesClassifier on (X, Y), ranks the features of X by their
importance for predicting Y, prints the ranking, and shows a bar chart
with inter-tree standard-deviation error bars.
"""
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier

# Work around garbled Chinese characters in matplotlib figures.
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False


def FeatureSelection(X, Y):
    """Rank and plot the importance of each feature of X for predicting Y.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix.
    Y : array-like of shape (n_samples,)
        Target labels.

    Side effects: prints the ranking to stdout and shows a matplotlib
    bar chart (blocks until the window is closed).
    """
    # Alternative: build a synthetic classification task instead of using
    # the caller's data.
    # X, Y = make_classification(n_samples=10, n_features=10,
    #                            n_informative=3, n_redundant=0,
    #                            n_repeated=0, n_classes=2,
    #                            random_state=0, shuffle=False)

    # Fit an extra-trees ensemble; the fixed random_state keeps the
    # ranking reproducible across runs.
    forest = ExtraTreesClassifier(n_estimators=10, random_state=0)
    # forest = RandomForestClassifier(n_estimators=10)  # alternative model
    forest.fit(X, Y)

    importances = forest.feature_importances_
    # Per-feature std-dev of importances across the individual trees,
    # used as error bars in the plot.
    std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                 axis=0)
    indices = np.argsort(importances)[::-1]  # most important feature first
    print(indices)

    # Print the feature ranking.
    print(u"特征排名 :")
    for f in range(X.shape[1]):
        print("%d. feature %d (%f)" % (f + 1, indices[f],
                                       importances[indices[f]]))

    # Plot the feature importances of the forest.
    plt.figure()
    plt.title(u"特征選擇")
    plt.bar(range(X.shape[1]), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(X.shape[1]), indices)
    plt.xlim([-1, X.shape[1]])
    plt.show()


if __name__ == "__main__":
    # Smoke test: a random 3x3 feature matrix with labels 1..3.
    X = np.random.random((3, 3))
    Y = np.arange(1, 4)
    print(X, Y)
    FeatureSelection(X, Y)
