今天介紹一個機器學習包:sklearn。其功能模塊有regression、classification、clustering、dimensionality reduction、data preprocessing、model selection。
對我來說,常用的主要有regression(SVR)和classification(SVC)兩個部分。
首先介紹一下用sklearn來做回歸(依次為線性回歸LinearRegression、多項式回歸,以及基於sklearn.svm.SVR的非線性回歸),如下:
1)多元線性回歸
import numpy as np
from sklearn.linear_model import LinearRegression

# Local RandomState with a fixed seed so the synthetic data is reproducible.
rng = np.random.RandomState(10)

# 50 samples x 3 features, uniform random values scaled by 100.
x = 100 * rng.rand(50, 3)

# Take each feature as a (50, 1) column. Slicing with i:i+1 keeps the column
# axis, which avoids mutating `.shape` in place on a view of x.
x1 = x[:, 0:1]
x2 = x[:, 1:2]
x3 = x[:, 2:3]

# Known linear relationship plus standard-normal noise; the fitted
# coefficients can be checked against 1.25, 2.5, 3 and the intercept 10.
y = 1.25 * x1 + 2.5 * x2 + 3 * x3 + 10 + rng.randn(50, 1)

model = LinearRegression(fit_intercept=True)
model.fit(x, y)

# 1000 evenly spaced values from 0 to 50, used as inputs to try the model on.
a = np.linspace(0, 50, 1000)
x1_fit = a[:, np.newaxis]  # turn the 1-D array into a column
x2_fit = a[:, np.newaxis]
x3_fit = a[:, np.newaxis]
x_fit = np.hstack((x1_fit, x2_fit, x3_fit))  # stack the three columns side by side
y_fit = model.predict(x_fit)  # predictions for the generated inputs

print("Model slope: ", model.coef_[0])
print("Model intercept:", model.intercept_)
# score() returns the coefficient of determination (R^2) on the given data.
print('方程的判定系數(R^2): %.2f' % model.score(x, y))
2)多項式回歸
import random

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

x_data, y_data = [], []
# Generate 30 points on a noisy parabola, one for each integer x in [-10, 20).
for x in range(-10, 20):
    y = - x ** 2 + 5 * x - 10 + random.random() * 20
    x_data.append([x])
    y_data.append([y])

# Feature construction: expand each x into degree-2 polynomial features.
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(x_data)

# An ordinary linear model fitted on the polynomial features.
linear_reg = LinearRegression()
linear_reg.fit(x_poly, y_data)

plt.plot(x_data, y_data, 'b.')
# Predict from the already-built polynomial features. The transformer was
# fitted above, so it is not fitted a second time here; for new inputs use
# poly_reg.transform(...) rather than fit_transform(...).
plt.plot(x_data, linear_reg.predict(x_poly), 'r')
plt.show()
3)非線性回歸(一元為例)
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV  # cross-validated hyperparameter search
from sklearn.tree import DecisionTreeRegressor  # decision tree (alternative model below)
from sklearn.ensemble import RandomForestRegressor  # random forest (alternative model below)
import numpy as np
import matplotlib.pyplot as plt

# Features must be 2-D: reshape(-1, 1) turns the values into [[v1], [v2], ...].
x = np.array([68.67, 54.351, 92.991, 80.39, 64.46]).reshape(-1, 1)
# Targets are kept 1-D, which is the shape SVR expects for y.
y = np.array([68.67, 54.351, 92.991, 80.39, 64.46])

# Choose a model. Alternatives that can be swapped in:
# model = SVR(kernel='rbf')
# model = DecisionTreeRegressor()
# model = RandomForestRegressor()
model = GridSearchCV(
    SVR(),
    param_grid={
        "kernel": ("linear", 'rbf', 'sigmoid'),
        "C": np.logspace(-3, 3, 7),
        "gamma": np.logspace(-3, 3, 7),
    },
    # With only five samples the default 5-fold split leaves one sample per
    # validation fold, where the default R^2 score is undefined; score by
    # negative mean squared error instead.
    scoring="neg_mean_squared_error",
)
model.fit(x, y)

xneed = np.array([[1.2], [3.6]])
y_pre = model.predict(xneed)  # run the prediction

plt.scatter(x, y, c='k', label='data', zorder=1)
plt.plot(xneed, y_pre, c='r', label='SVR_fit')
plt.legend()  # show the labels set on the two artists above
plt.show()

print(model.best_params_)
補充:
1.如果要划分訓練樣本和測試樣本數據集。
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3)  # hold out 30% of the samples as the test set
2.為了消除各特征之間量綱(數值尺度)差異對模型的影響,通常先對數據進行預處理,如標準化(使每個特征均值為0、方差為1)。
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_std = scaler.fit_transform(x)  # fit the scaler on x and return the standardized data
3.可以用GridSearchCV通過交叉驗證自動選擇最佳超參數組合(即最佳模型)
from sklearn.model_selection import GridSearchCV

# Search over param_grid for the estimator svc with 3-fold cross-validation;
# n_jobs=-1 runs the search on all available CPU cores.
# NOTE(review): svc and param_grid are not defined in this snippet and are
# presumably created by the reader beforehand - confirm.
grid = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
4.模型保存
# joblib saves and loads fitted models as .pkl files. It is imported as the
# standalone package because sklearn.externals.joblib is no longer available
# in current scikit-learn releases.
import joblib

joblib.dump(model, 'svr.pkl')  # save the model to disk
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
svr = joblib.load('svr.pkl')  # load the model back
過兩天補充一下sklearn.svm.SVC...
