Modeling, Prediction, and Evaluation with scikit-learn: Titanic Survival Prediction


# coding: utf-8

# In[142]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[143]:

# Load the training data
titanic = pd.read_csv('train.csv')
titanic.head(5)
# print(titanic.describe())


# In[144]:

# Fill missing ages with the median age
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
print(titanic.describe())


# In[145]:

print(titanic['Sex'].unique())

# Replace all the occurrences of 'male' with the number 0 and 'female' with 1,
# converting the string attribute into a numeric one
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1


# In[146]:

# Port of embarkation
print(titanic['Embarked'].unique())
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2


# In[147]:

# Import the linear regression class
from sklearn.linear_model import LinearRegression
# sklearn also has a helper that makes it easy to do cross-validation
from sklearn.cross_validation import KFold

# The columns we'll use to predict the target
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross-validation folds for the titanic dataset.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using to train the algorithm.
    # Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm
    train_target = titanic['Survived'].iloc[train]
    # Train the algorithm using the predictors and target
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)


# In[148]:

# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0

# Evaluate the model: the fraction of predictions that match the true labels
accuracy = sum(predictions == titanic['Survived']) / len(predictions)
print(accuracy)
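# A note for newer environments (not in the original notebook): the
# sklearn.cross_validation module used above was deprecated in scikit-learn
# 0.18 and removed in 0.20. A minimal sketch of the same fold loop for
# scikit-learn >= 0.20, where KFold takes n_splits and yields indices through
# .split(); the aliased import keeps the old KFold name used below intact:
from sklearn.model_selection import KFold as ModernKFold

modern_kf = ModernKFold(n_splits=3, shuffle=True, random_state=1)
modern_predictions = []
for train, test in modern_kf.split(titanic[predictors]):
    alg.fit(titanic[predictors].iloc[train, :], titanic['Survived'].iloc[train])
    modern_predictions.append(alg.predict(titanic[predictors].iloc[test, :]))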
# In[149]:

from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross-validation folds
# (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


# ### Random forest

# In[150]:

# Apply the same preprocessing to the test set. Note that missing ages are
# filled with the *training* median, and missing fares with the test median.
titanic_test = pd.read_csv('test.csv')
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())
titanic_test.loc[titanic_test['Sex'] == 'male', 'Sex'] = 0
titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
titanic_test.loc[titanic_test['Embarked'] == 'S', 'Embarked'] = 0
titanic_test.loc[titanic_test['Embarked'] == 'C', 'Embarked'] = 1
titanic_test.loc[titanic_test['Embarked'] == 'Q', 'Embarked'] = 2


# In[151]:

from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Select some features
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Initialize our algorithm with the default parameters:
# - random_state=1 makes repeated runs produce the same random behavior;
#   without it, two runs can give different results
# - n_estimators is the number of decision trees in the forest
# - min_samples_split is the minimum number of samples a node must hold
#   before it may be split further (a node with only 2 samples stops splitting)
# - min_samples_leaf is the minimum number of samples allowed in a leaf
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)

# Compute the accuracy score for all the cross-validation folds
# (much simpler than what we did before)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


# In[152]:

# Use 100 trees, with stricter split and leaf sizes to limit overfitting
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


# ## On feature extraction (crucial)
# - Extract as many candidate features as possible
# - Compare how different features perform
# - Feature extraction is a very important part of data mining
# - The features used so far were already in the data; in real data mining we
#   often have no ready-made features and must build our own

# In[153]:

# Generate a FamilySize column: siblings/spouses plus parents/children
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']

# The .apply method generates a new series: the length of each name
# (reportedly, wealthier families tended to use longer names)
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))


# In[154]:

import re

# A function to get the title from a name
def get_title(name):
    # Use a regular expression to search for a title.
    # Titles always consist of capital and lowercase letters and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles and print how often each one occurs.
# Different social classes used different titles; titles with similar meaning
# or very low counts will share the same numeric code below.
titles = titanic['Name'].apply(get_title)
print(pd.value_counts(titles))
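# An aside (not in the original notebook): pandas can do the same extraction
# in one vectorized call with Series.str.extract. A sketch; unmatched names
# come back as NaN rather than "", though every name in this dataset carries
# a title, so the two results should agree:
titles_vectorized = titanic['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print((titles_vectorized == titles).all())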
# Map each title to an integer. Some titles are very rare or equivalent,
# and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
                 "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    # Replace each title string with its numeric code
    titles[titles == k] = v

# Verify that we converted everything
print(pd.value_counts(titles))

# Add in the title column
titanic['Title'] = titles


# In[155]:

# Feature selection: measure how much each feature affects the final result.
# One way to gauge a feature's importance: record the error rate with the
# column intact (error1), then replace that column with noise while leaving
# every other column unchanged and record the error rate again (error2);
# the gap between the two error rates reflects how much the model relied on
# that feature (a runnable sketch of this noise-swap check follows this cell).
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pylab as plt

# Select some features
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'FamilySize', 'Title', 'NameLength']

# Perform feature selection with a univariate F-test
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])

# Get the raw p-values for each feature, and transform them into scores
scores = -np.log10(selector.pvalues_)

# Plot the scores. See how 'Pclass', 'Sex', 'Title', and 'Fare' are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Based on the importance analysis above, pick only the four best features
# and rerun the random forest
predictors = ['Pclass', 'Sex', 'Fare', 'Title']

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)

# Cross-validate
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# The score does not improve here; the point is to practice feature selection
# with random forests, which matters in real data mining
print(scores.mean())
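# A minimal sketch of the noise-swap (permutation) check described above,
# not in the original notebook. It assumes the alg and predictors defined in
# the previous cell; in practice the drop would be measured on held-out data
# rather than the training set:
alg.fit(titanic[predictors], titanic['Survived'])
base_score = alg.score(titanic[predictors], titanic['Survived'])
rng = np.random.RandomState(1)
for col in predictors:
    scrambled = titanic[predictors].copy()
    # Replace one column with noise by shuffling its values;
    # all other columns stay unchanged
    scrambled[col] = rng.permutation(scrambled[col].values)
    # A large drop in score means the model relied heavily on this feature
    print(col, base_score - alg.score(scrambled, titanic['Survived']))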
# ### Ensembling multiple algorithms (to reduce overfitting)

# In[156]:

# A trick commonly used in competitions: ensemble several algorithms and
# average their predictions to reduce overfitting.
from sklearn.ensemble import GradientBoostingClassifier

# GradientBoostingClassifier is also a tree-based ensemble: it combines many
# weak classifiers (shallow trees) into a strong one.
# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression,
# and everything with the gradient boosting classifier.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']],
    [LogisticRegression(random_state=1),
     ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']]
]

# Initialize the cross-validation folds
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The astype(float) is necessary to convert the dataframe
        # to all floats, which avoids a scikit-learn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme: average the two algorithms' predictions
    # to get the final classification
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over 0.5 is assumed to be a 1 prediction, and 0.5 or below a 0
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > 0.5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array
predictions = np.concatenate(predictions, axis=0)

accuracy = sum(predictions == titanic['Survived']) / len(predictions)
print(accuracy)


# In[157]:

# Apply the same title feature to the test set
titles = titanic_test['Name'].apply(get_title)
print(pd.value_counts(titles))

# Map each title to an integer. Some titles are very rare or equivalent,
# and are compressed into the same codes as other titles. The test set also
# contains the Spanish honorific 'Dona', which never occurs in the training
# set, so it gets the same code as 'Don'.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Dona": 9,
                 "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    # Replace each title string with its numeric code
    titles[titles == k] = v

# Add in the title column
titanic_test['Title'] = titles
print(pd.value_counts(titanic_test['Title']))

# Now add the family size column
titanic_test['FamilySize'] = titanic_test['SibSp'] + titanic_test['Parch']


# In[158]:

predictors = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1), predictors]
]

full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data
    alg.fit(titanic[predictors], titanic['Survived'])
    # Predict probabilities on the test set
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:, 1]
    full_predictions.append(predictions)

# The gradient boosting classifier generates better predictions,
# so we weight it higher
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
predictions
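# To turn these averaged probabilities into a Kaggle submission (not part of
# the original notebook -- a minimal sketch, with an illustrative file name):
# threshold at 0.5 and write the PassengerId/Survived pair the competition expects.
predictions[predictions <= 0.5] = 0
predictions[predictions > 0.5] = 1
submission = pd.DataFrame({
    'PassengerId': titanic_test['PassengerId'],
    'Survived': predictions.astype(int)
})
submission.to_csv('kaggle_submission.csv', index=False)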

 

