Feature Selection: Logistic Regression (L2 Regularization) and Random Forest


1. Logistic Regression

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# Load the data
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',header=None)
df_wine.columns=['Class label','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline']
print ('class labels:',np.unique(df_wine['Class label']))
#print (df_wine.head(5))
# Split into training and test sets
X,y=df_wine.iloc[:,1:].values,df_wine.iloc[:,0].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
# Feature scaling
# Normalization (min-max scaling)
mms=MinMaxScaler()
X_train_norm=mms.fit_transform(X_train)
X_test_norm=mms.transform(X_test)  # transform only: reuse the scaling fitted on the training set
# Standardization (z-score scaling)
stdsc=StandardScaler()
X_train_std=stdsc.fit_transform(X_train)
X_test_std=stdsc.transform(X_test)  # transform only, to avoid test-set leakage
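# Note (added for reference): with default settings, MinMaxScaler rescales each
# feature to (x - x_min) / (x_max - x_min), i.e. into [0, 1], while StandardScaler
# applies the z-score (x - mean) / std, giving zero mean and unit variance per feature.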
 
# Logistic regression with L2 regularization
lr=LogisticRegression(penalty='l2',C=0.1)
lr.fit(X_train_std,y_train)
print ('Training accuracy:',lr.score(X_train_std, y_train))
print ('Test accuracy:',lr.score(X_test_std, y_test))  # compare training vs. test accuracy to check for overfitting
print (lr.intercept_)  # intercepts: one per class (three classes)
print (lr.coef_)  # weight coefficients; with an L1 penalty they become sparse, which effectively performs feature selection
 
# Regularization path: decreasing the parameter C increases the penalty strength, driving all feature weights toward 0
fig=plt.figure()
ax=plt.subplot(111)
colors=['blue','green','red','cyan','magenta','yellow','black','pink','lightgreen','lightblue','gray','indigo','orange']
weights,params=[],[]
for c in np.arange(-4,6,dtype=float):
    lr=LogisticRegression(penalty='l2',C=10**c,random_state=0)
    lr.fit(X_train_std,y_train)
    weights.append(lr.coef_[0])  # lr.coef_ has one row per class (three classes); track the first class
    params.append(10**c)
weights=np.array(weights)
for column,color in zip(range(weights.shape[1]),colors):
    plt.plot(params,weights[:,column],label=df_wine.columns[column+1],color=color)
plt.axhline(0,color='black',linestyle='--',linewidth=3)
plt.xlim([10**(-5),10**5])
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.xscale('log')
ax.legend(loc='upper center',bbox_to_anchor=(1.38,1.03),ncol=1,fancybox=True)  # place the legend outside the plot area
plt.show()
(Figure: weight coefficient of each feature versus C on the wine data)
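Since the comments above note that an L1 penalty drives coefficients to exactly zero, here is a minimal sketch (not part of the original code) of turning that sparsity into actual feature selection with sklearn's SelectFromModel; the C value of 0.1 is an illustrative assumption:

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
# liblinear is one of the solvers that supports the L1 penalty
lr_l1=LogisticRegression(penalty='l1',C=0.1,solver='liblinear')
selector=SelectFromModel(lr_l1)
X_train_sel=selector.fit_transform(X_train_std,y_train)  # fit on training data only
X_test_sel=selector.transform(X_test_std)
print ('Kept features:',list(df_wine.columns[1:][selector.get_support()]))
print ('Shape before/after:',X_train_std.shape,'->',X_train_sel.shape)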

2. Random Forest

## Random forest feature-importance ranking on the wine data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
# Load the data
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',header=None)
df_wine.columns=['Class label','Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline']
df_wine.shape
#df_wine.head()

print ('class labels:',np.unique(df_wine['Class label']))
# Split into training and test sets
X,y=df_wine.iloc[:,1:].values,df_wine.iloc[:,0].values
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=0)
# Feature scaling (standardization) is left commented out: tree-based models do not require it
#stdsc=StandardScaler()
#X_train_std=stdsc.fit_transform(X_train)
#X_test_std=stdsc.transform(X_test)
# Estimate feature importance with a random forest
feat_labels=df_wine.columns[1:]
forest=RandomForestClassifier(n_estimators=10000,n_jobs=-1,random_state=0)
forest.fit(X_train,y_train)
importances=forest.feature_importances_
indices=np.argsort(importances)[::-1]
for f in range(X_train.shape[1]):
    # importance is the mean impurity decrease averaged over the 10000 trees
    print ("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]],importances[indices[f]]) )
# Visualize feature importance (mean impurity decrease)
plt.title('Feature Importance-RandomForest')
plt.bar(range(X_train.shape[1]),importances[indices],color='lightblue',align='center')
plt.xticks(range(X_train.shape[1]),feat_labels[indices],rotation=90)  # reorder the labels to match the sorted bars
plt.xlim([-1,X_train.shape[1]])
plt.tight_layout()
plt.show()
(Figure: random-forest feature importances on the wine data, sorted in descending order)
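To go from ranking features to actually selecting them, sklearn's SelectFromModel can threshold the fitted forest's importances. A minimal sketch, where the 0.1 threshold is an illustrative assumption:

from sklearn.feature_selection import SelectFromModel
# prefit=True reuses the already-fitted forest; keep features with importance above 0.1
sfm=SelectFromModel(forest,threshold=0.1,prefit=True)
X_train_sel=sfm.transform(X_train)
print ('Number of features meeting the threshold:',X_train_sel.shape[1])
for f in range(X_train_sel.shape[1]):
    # the selected features are the top entries of the sorted ranking above
    print ("%2d) %-*s %f" % (f+1,30,feat_labels[indices[f]],importances[indices[f]]) )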

 

