# Inspect the dimensions of the training frame as (rows, columns).
df_train.shape

(10886, 12)

准備訓練集數據,測試集數據:
1. df_train_target：目標,也就是count字段。
2. df_train_data：用於產出特徵的數據

# Separate the regression target (the 'count' column) from the remaining
# columns, keep both as NumPy arrays, and report their shapes
# (target first, then features).
target_column = 'count'
df_train_target = df_train[target_column].values
df_train_data = df_train.drop([target_column], axis=1).values
for prepared in (df_train_target, df_train_data):
    print(prepared.shape)

(10886,)
(10886, 11)

算法
咱們依舊會使用交叉驗證的方式（交叉驗證集約占全部數據的20%）來看看模型的效果,
我們會試 支持向量回歸/Support Vector Regression, 嶺回歸/Ridge Regression 和
隨機森林回歸/Random Forest Regressor。每個模型會跑3趟看平均的結果。

from sklearn import linear_model
from sklearn import cross_validation
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.learning_curve import learning_curve
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import explained_variance_score

# Split the data into train / validation parts: three random shuffles, each
# holding out 20% of the rows. random_state is fixed so the splits repeat.
cv = cross_validation.ShuffleSplit(len(df_train_data), n_iter=3, test_size=0.2,
    random_state=0)

# Try each model in turn. Every entry pairs the heading that is printed with a
# factory, so that each split is fitted on a brand-new, unfitted estimator.
candidate_models = [
    ("嶺回歸",
     lambda: linear_model.Ridge()),
    ("支持向量回歸/SVR(kernel='rbf',C=10,gamma=.001)",
     lambda: svm.SVR(kernel='rbf', C=10, gamma=.001)),
    ("隨機森林回歸/Random Forest(n_estimators = 100)",
     lambda: RandomForestRegressor(n_estimators=100)),
]

for heading, make_model in candidate_models:
    print(heading)
    for train, test in cv:
        # Select the rows of this split once instead of re-indexing per call.
        fit_X, fit_y = df_train_data[train], df_train_target[train]
        held_X, held_y = df_train_data[test], df_train_target[test]

        model = make_model().fit(fit_X, fit_y)
        # score() is the estimator's default metric (R^2 for scikit-learn
        # regressors); a large train/test gap hints at over-fitting.
        print("train score: {0:.3f}, test score: {1:.3f}\n".format(
            model.score(fit_X, fit_y), model.score(held_X, held_y)))

嶺回歸
train score: 0.339, test score: 0.332

train score: 0.330, test score: 0.370

train score: 0.342, test score: 0.320

支持向量回歸/SVR(kernel='rbf',C=10,gamma=.001)
train score: 0.417, test score: 0.408

train score: 0.406, test score: 0.452

train score: 0.419, test score: 0.390

隨機森林回歸/Random Forest(n_estimators = 100)
train score: 0.981, test score: 0.867

train score: 0.981, test score: 0.880

train score: 0.981, test score: 0.869

隨機森林回歸獲得了最佳結果
不過,參數設置得是不是最好的,這個我們可以用GridSearch來幫助測試,找最好的參數

X = df_train_data
y = df_train_target

# Hold out 20% of the rows; the grid search below is fitted on the other 80%.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)

# Candidate forest sizes to compare.
tuned_parameters = [{'n_estimators': [10, 100, 500, 550]}]

# Scoring metrics to optimise for; one grid search is run per entry.
scores = ['r2']

for score in scores:

    print(score)

    # 5-fold cross-validated search over tuned_parameters, ranked by `score`.
    clf = GridSearchCV(RandomForestRegressor(), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("最佳參數找到了：")
    print("")
    # best_estimator_ returns the best estimator chosen by the search
    print(clf.best_estimator_)
    print("")
    print("得分分別是:")
    print("")
    # Each entry of grid_scores_ unpacks into:
    #    * a dict of parameter settings
    #    * the mean score over the cross-validation folds
    #    * the list of scores for each fold
    # The per-fold scores are bound to `fold_scores` so that the outer
    # `scores` list (the one being iterated above) is not overwritten.
    for params, mean_score, fold_scores in clf.grid_scores_:
        # NOTE(review): the "+/-" band printed here is half the standard
        # deviation of the fold scores, kept as in the original.
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, fold_scores.std() / 2, params))
    print("")

r2
最佳參數找到了：

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=550, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

得分分別是:

0.846 (+/-0.006) for {'n_estimators': 10}
0.862 (+/-0.005) for {'n_estimators': 100}
0.863 (+/-0.005) for {'n_estimators': 500}
0.864 (+/-0.005) for {'n_estimators': 550}

Grid Search幫忙挑參數還是蠻方便的, 而且還要看看模型的狀態是不是過擬合 or 欠擬合。
我們發現n_estimators=500,550時,擬合得最好。

	datetime	season	weather	temp	atemp	humidity	casual	registered	count	hour	day	month
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16	0	5	1
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40	1	5	1
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32	2	5	1
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13	3	5	1
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1	4	5	1

	season	weather	temp	atemp	humidity	count	month	day	hour
0	1	1	9.84	14.395	81	16	1	5	0
1	1	1	9.02	13.635	80	40	1	5	1
2	1	1	9.02	13.635	80	32	1	5	2
3	1	1	9.84	14.395	75	13	1	5	3
4	1	1	9.84	14.395	75	1	1	5	4

Kaggle 自行車租賃預測比賽項目實現

免責聲明！