step_1: Define the goal
Using the questionnaire survey data, select several groups of variables and use them to predict the respondents' rating of their own happiness.
step_2: Acquire the data
Link:
https://tianchi.aliyun.com/competition/entrance/231702/information
Download:
train_set: happiness_train_complete.csv
test_set: happiness_test_complete.csv
index: a file mapping each variable to its questionnaire item, including the meaning of each value the variable can take
survey: the original questionnaire behind the data, provided as a supplement to help understand the background of the questions
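Before cleaning anything, it is worth loading the two CSV files and skimming the variable index. A minimal sketch; the index filename happiness_index.xlsx is an assumption based on the competition page, and reading it requires openpyxl:

import pandas as pd

# The competition CSVs are GBK-encoded; utf-8 raises UnicodeDecodeError
train_data = pd.read_csv('happiness_train_complete.csv', encoding='gbk')
test_data = pd.read_csv('happiness_test_complete.csv', encoding='gbk')
print(train_data.shape)  # (8000, 140)
print(test_data.shape)   # (2968, 139) -- no happiness label

# Assumed filename for the variable index (maps variables to questions)
index = pd.read_excel('happiness_index.xlsx')
print(index.head())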
step_3: Clean and organize train_set
Use matplotlib.pyplot to draw a scatter plot of id against every other column (see the sketch after this list).
Based on those plots, treat the data as follows:
- happiness is the sample label (the ground truth for the model). The questionnaire only allows the categories 1, 2, 3, 4 and 5, but the plots reveal some -8 values; these noisy samples should be deleted.
- Drop the id, survey_time, edu_other, join_party, property_other and invest_other columns.
- In every remaining column, set all values below 0 and all missing values to -8.
- Standardize the features to zero mean and unit variance.
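The plotting pass is not part of the preprocessing code below, so here is a minimal sketch of it. It assumes train_data has already been loaded as in the next block, and it restricts itself to numeric columns so that scatter() is not fed the free-text ones:

import matplotlib.pyplot as plt

# The questionnaire allows only the labels 1-5, but -8 also shows up
print(train_data.happiness.value_counts())

# One scatter plot of id against every other numeric column;
# invalid codes such as -8 appear as bands below zero
for col in train_data.select_dtypes('number').columns.drop('id'):
    plt.scatter(train_data['id'], train_data[col], s=2)
    plt.xlabel('id')
    plt.ylabel(col)
    plt.show()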

# Run in a Jupyter notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the training and test sets
# encoding='gbk' is required; utf-8 fails on these files
train_data = pd.read_csv('happiness_train_complete.csv', encoding='gbk')
test_data = pd.read_csv('happiness_test_complete.csv', encoding='gbk')

# The training set has 8000 samples with 140 features each;
# the test set has 2968 samples with 139 features each
train_data.shape
test_data.shape

# Remove the samples whose label is -8
train_data = train_data[train_data.happiness > 0]
train_data.shape

# Training labels
y = train_data.happiness

# Drop the specified columns from the training set
ind1 = ['id', 'happiness', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
X = train_data.drop(ind1, axis=1)

# Drop the specified columns from the test set (it has no happiness column)
ind2 = ['id', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
X_test_data = test_data.drop(ind2, axis=1)

# Convert the DataFrames to np.array
y = np.array(y, dtype=int)
X = np.array(X, dtype=float)
X_test_data = np.array(X_test_data, dtype=float)

# Set all values below 0 to -8
X[X < 0] = -8
X_test_data[X_test_data < 0] = -8

from sklearn.impute import SimpleImputer
# Set the missing values to -8; strategy='constant' is required here,
# otherwise fill_value is ignored and the column mean is used instead
X = SimpleImputer(strategy='constant', fill_value=-8).fit_transform(X)
X_test_data = SimpleImputer(strategy='constant', fill_value=-8).fit_transform(X_test_data)

from sklearn.model_selection import train_test_split
# The real test set has no labels, so split the training set for local validation
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

# Standardize (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler
std = StandardScaler().fit(X_train)
X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

std_1 = StandardScaler().fit(X)
X_std = std_1.transform(X)
X_test_data = std_1.transform(X_test_data)
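As a quick sanity check after this preprocessing (a minimal sketch; X is the imputed, pre-standardization training matrix from the block above):

import numpy as np

assert not np.isnan(X).any()       # the imputer removed every NaN
assert (X[X < 0] == -8).all()      # every negative code collapsed to -8
print(X.shape, X_test_data.shape)  # feature counts must match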
step_4: Choose an algorithm and build the model
This is a classification problem, so the initial choice is to model it with the KNN algorithm.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]

# Grid search over the hyperparameters
knn_clf_grid = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf_grid, param_grid, n_jobs=-1, verbose=2).fit(X_train_std, y_train)

# Best hyperparameters: {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
grid_search.best_estimator_
grid_search.best_params_
grid_search.best_score_

# Refit with the best hyperparameters on the full standardized training set,
# then predict on the real test set
knn = KNeighborsClassifier(n_neighbors=10, p=1, weights='distance').fit(X_std, y)
y_pre = knn.predict(X_test_data)

# Write the predictions to a submission file
df = pd.DataFrame({'id': test_data.id, 'happiness': y_pre})
df.to_csv('forecast_3.csv', index=None)
Submit the predictions to Tianchi and wait for the evaluation; the resulting score is 0.6814.
The results have been submitted four times:
1st: score = 1.3260
2nd: standardization, score = 0.9629
3rd: standardization + grid-searched hyperparameters, score = 0.6814
4th: standardization + PCA + multinomial logistic regression, score = 0.6099
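On this leaderboard a lower score is better; the metric appears to be the mean squared error between the predicted and true happiness levels, which is also what the search loop in the next block minimizes locally. A minimal sketch of that local check, assuming the grid-searched KNN and the held-out split from step_3 are still in scope:

import numpy as np

def local_mse(model, X_val, y_val):
    # Mimics the leaderboard score under the assumption that it is MSE
    # (lower is better); y_val holds the true 1-5 happiness levels
    y_pred = model.predict(X_val)
    return np.mean((y_pred - y_val) ** 2)

# Example: score the grid-searched KNN on the held-out split from step_3
print(local_mse(grid_search.best_estimator_, X_test_std, y_test))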

import numpy as np
import pandas as pd

# Load train_set and test_set; encoding='gbk' is required, utf-8 fails
train_set = pd.read_csv('happiness_train_complete.csv', encoding='gbk')
test_set = pd.read_csv('happiness_test_complete.csv', encoding='gbk')

# Remove the samples with the invalid label -8
train_set = train_set[train_set.happiness > 0]
y_label = train_set.happiness

ind1 = ['id', 'happiness', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
X_train_set = train_set.drop(ind1, axis=1)
ind2 = ['id', 'survey_time', 'edu_other', 'join_party', 'property_other', 'invest_other']
X_test_set = test_set.drop(ind2, axis=1)

y_label = np.array(y_label, dtype=int)
X_train_set = np.array(X_train_set, dtype=float)
X_test_set = np.array(X_test_set, dtype=float)

from sklearn.impute import SimpleImputer
# Set the missing values to -1; strategy='constant' is required here,
# otherwise fill_value is ignored and the column mean is used instead
X_train_set = SimpleImputer(strategy='constant', fill_value=-1).fit_transform(X_train_set)
X_test_set = SimpleImputer(strategy='constant', fill_value=-1).fit_transform(X_test_set)

# Set all values below 0 to -1
X_train_set[X_train_set < 0] = -1
X_test_set[X_test_set < 0] = -1

from sklearn.preprocessing import StandardScaler
# Standardize (zero mean, unit variance)
std = StandardScaler().fit(X_train_set)
X_train_std = std.transform(X_train_set)
X_test_std = std.transform(X_test_set)

# Reduce dimensionality with PCA, keeping 95% of the variance
from sklearn.decomposition import PCA
pca = PCA(0.95)
pca.fit(X_train_std)
X_train_pca = pca.transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_pca, y_label, random_state=666)

from sklearn.linear_model import LogisticRegression
# Search C for the lowest local MSE (the competition metric)
best_c = 0.
best_score = 0.
best_sum = 10.
for c in np.arange(0.001, 0.3, 0.001):
    log_reg2 = LogisticRegression(C=c, multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
    y_pre = log_reg2.predict(X_test)
    s = np.mean((y_pre - y_test) ** 2)  # local MSE
    score = log_reg2.score(X_test, y_test)
    if best_sum > s:
        best_sum = s
        best_c = c
        best_score = score
print('c:', best_c)
print('score:', best_score)
print('sum:', best_sum)

# Refit with the best C found by the search (0.01 here) and predict on the real test set
log_reg = LogisticRegression(C=0.01, multi_class='multinomial', solver='newton-cg').fit(X_train, y_train)
y_pre2 = log_reg.predict(X_test_pca)

df = pd.DataFrame({'id': test_set.id, 'happiness': y_pre2})
df.to_csv('log_reg_pca.csv', index=None)