Modeling, Prediction, and Evaluation with scikit-learn: Titanic Survival Prediction


# coding: utf-8

# In[142]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[143]:

# Load the training data
titanic = pd.read_csv('train.csv')
titanic.head(5)
# print(titanic.describe())


# In[144]:

# Fill missing ages with the median age
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
print(titanic.describe())


# In[145]:

print(titanic['Sex'].unique())

# Replace all the occurrences of 'male' with the number 0 and 'female' with 1,
# converting the string attribute into a numeric one
titanic.loc[titanic['Sex'] == 'male', 'Sex'] = 0
titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1


# In[146]:

# Port of embarkation
print(titanic['Embarked'].unique())
titanic['Embarked'] = titanic['Embarked'].fillna('S')
titanic.loc[titanic['Embarked'] == 'S', 'Embarked'] = 0
titanic.loc[titanic['Embarked'] == 'C', 'Embarked'] = 1
titanic.loc[titanic['Embarked'] == 'Q', 'Embarked'] = 2


# In[147]:

# Import the linear regression class
from sklearn.linear_model import LinearRegression
# sklearn also has a helper that makes it easy to do cross-validation
from sklearn.cross_validation import KFold

# The columns we'll use to predict the target
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Initialize our algorithm class
alg = LinearRegression()
# Generate cross-validation folds for the titanic dataset.
# We set random_state to ensure we get the same splits every time we run this.
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    # The predictors we're using to train the algorithm.
    # Note how we only take the rows in the train folds.
    train_predictors = titanic[predictors].iloc[train, :]
    # The target we're using to train the algorithm
    train_target = titanic['Survived'].iloc[train]
    # Train the algorithm using the predictors and target
    alg.fit(train_predictors, train_target)
    # We can now make predictions on the test fold
    test_predictions = alg.predict(titanic[predictors].iloc[test, :])
    predictions.append(test_predictions)


# In[148]:

# The predictions are in three separate numpy arrays. Concatenate them into one.
# We concatenate them on axis 0, as they only have one axis.
predictions = np.concatenate(predictions, axis=0)

# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictions[predictions > 0.5] = 1
predictions[predictions <= 0.5] = 0

# Evaluate the model: the fraction of predictions that match the true labels
accuracy = sum(predictions == titanic['Survived']) / len(predictions)
print(accuracy)
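# A note for newer environments (not in the original notebook): the
# sklearn.cross_validation module used above was deprecated in scikit-learn
# 0.18 and removed in 0.20. A minimal sketch of the same fold loop for
# scikit-learn >= 0.20, where KFold takes n_splits and yields indices through
# .split(); the aliased import keeps the old KFold name used below intact:
from sklearn.model_selection import KFold as ModernKFold

modern_kf = ModernKFold(n_splits=3, shuffle=True, random_state=1)
modern_predictions = []
for train, test in modern_kf.split(titanic[predictors]):
    alg.fit(titanic[predictors].iloc[train, :], titanic['Survived'].iloc[train])
    modern_predictions.append(alg.predict(titanic[predictors].iloc[test, :]))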
# In[149]:

from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

# Initialize our algorithm
alg = LogisticRegression(random_state=1)
# Compute the accuracy score for all the cross-validation folds
# (much simpler than what we did before!)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=3)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


# ### Random forest

# In[150]:

# Apply the same preprocessing to the test set. Note that missing ages are
# filled with the *training* median, and missing fares with the test median.
titanic_test = pd.read_csv('test.csv')
titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_test['Fare'].median())
titanic_test.loc[titanic_test['Sex'] == 'male', 'Sex'] = 0
titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1
titanic_test['Embarked'] = titanic_test['Embarked'].fillna('S')
titanic_test.loc[titanic_test['Embarked'] == 'S', 'Embarked'] = 0
titanic_test.loc[titanic_test['Embarked'] == 'C', 'Embarked'] = 1
titanic_test.loc[titanic_test['Embarked'] == 'Q', 'Embarked'] = 2


# In[151]:

from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Select some features
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Initialize our algorithm with the default parameters:
# - random_state=1 makes repeated runs produce the same random behavior;
#   without it, two runs can give different results
# - n_estimators is the number of decision trees in the forest
# - min_samples_split is the minimum number of samples a node must hold
#   before it may be split further (a node with only 2 samples stops splitting)
# - min_samples_leaf is the minimum number of samples allowed in a leaf
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)

# Compute the accuracy score for all the cross-validation folds
# (much simpler than what we did before)
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


# In[152]:

# Use 100 trees, with stricter split and leaf sizes to limit overfitting
alg = RandomForestClassifier(random_state=1, n_estimators=100, min_samples_split=4, min_samples_leaf=2)
# Compute the accuracy score
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# Take the mean of the scores (because we have one for each fold)
print(scores.mean())


# ## On feature extraction (crucial)
# - Extract as many candidate features as possible
# - Compare how different features perform
# - Feature extraction is a very important part of data mining
# - The features used so far were already in the data; in real data mining we
#   often have no ready-made features and must build our own

# In[153]:

# Generate a FamilySize column: siblings/spouses plus parents/children
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch']

# The .apply method generates a new series: the length of each name
# (reportedly, wealthier families tended to use longer names)
titanic['NameLength'] = titanic['Name'].apply(lambda x: len(x))


# In[154]:

import re

# A function to get the title from a name
def get_title(name):
    # Use a regular expression to search for a title.
    # Titles always consist of capital and lowercase letters and end with a period.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Get all the titles and print how often each one occurs.
# Different social classes used different titles; titles with similar meaning
# or very low counts will share the same numeric code below.
titles = titanic['Name'].apply(get_title)
print(pd.value_counts(titles))
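# An aside (not in the original notebook): pandas can do the same extraction
# in one vectorized call with Series.str.extract. A sketch; unmatched names
# come back as NaN rather than "", though every name in this dataset carries
# a title, so the two results should agree:
titles_vectorized = titanic['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
print((titles_vectorized == titles).all())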
# Map each title to an integer. Some titles are very rare or equivalent,
# and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10,
                 "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    # Replace each title string with its numeric code
    titles[titles == k] = v

# Verify that we converted everything
print(pd.value_counts(titles))

# Add in the title column
titanic['Title'] = titles


# In[155]:

# Feature selection: measure how much each feature affects the final result.
# One way to gauge a feature's importance: record the error rate with the
# column intact (error1), then replace that column with noise while leaving
# every other column unchanged and record the error rate again (error2);
# the gap between the two error rates reflects how much the model relied on
# that feature (a runnable sketch of this noise-swap check follows this cell).
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pylab as plt

# Select some features
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
              'FamilySize', 'Title', 'NameLength']

# Perform feature selection with a univariate F-test
selector = SelectKBest(f_classif, k=5)
selector.fit(titanic[predictors], titanic['Survived'])

# Get the raw p-values for each feature, and transform them into scores
scores = -np.log10(selector.pvalues_)

# Plot the scores. See how 'Pclass', 'Sex', 'Title', and 'Fare' are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Based on the importance analysis above, pick only the four best features
# and rerun the random forest
predictors = ['Pclass', 'Sex', 'Fare', 'Title']

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=8, min_samples_leaf=4)

# Cross-validate
kf = cross_validation.KFold(titanic.shape[0], n_folds=3, random_state=1)
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic['Survived'], cv=kf)
# The score does not improve here; the point is to practice feature selection
# with random forests, which matters in real data mining
print(scores.mean())
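# A minimal sketch of the noise-swap (permutation) check described above,
# not in the original notebook. It assumes the alg and predictors defined in
# the previous cell; in practice the drop would be measured on held-out data
# rather than the training set:
alg.fit(titanic[predictors], titanic['Survived'])
base_score = alg.score(titanic[predictors], titanic['Survived'])
rng = np.random.RandomState(1)
for col in predictors:
    scrambled = titanic[predictors].copy()
    # Replace one column with noise by shuffling its values;
    # all other columns stay unchanged
    scrambled[col] = rng.permutation(scrambled[col].values)
    # A large drop in score means the model relied heavily on this feature
    print(col, base_score - alg.score(scrambled, titanic['Survived']))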
# ### Ensembling multiple algorithms (to reduce overfitting)

# In[156]:

# A trick commonly used in competitions: ensemble several algorithms and
# average their predictions to reduce overfitting.
from sklearn.ensemble import GradientBoostingClassifier

# GradientBoostingClassifier is also a tree-based ensemble: it combines many
# weak classifiers (shallow trees) into a strong one.
# The algorithms we want to ensemble.
# We're using the more linear predictors for the logistic regression,
# and everything with the gradient boosting classifier.
algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3),
     ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']],
    [LogisticRegression(random_state=1),
     ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']]
]

# Initialize the cross-validation folds
kf = KFold(titanic.shape[0], n_folds=3, random_state=1)

predictions = []
for train, test in kf:
    train_target = titanic['Survived'].iloc[train]
    full_test_predictions = []
    # Make predictions for each algorithm on each fold
    for alg, predictors in algorithms:
        # Fit the algorithm on the training data
        alg.fit(titanic[predictors].iloc[train, :], train_target)
        # Select and predict on the test fold.
        # The astype(float) is necessary to convert the dataframe
        # to all floats, which avoids a scikit-learn error.
        test_predictions = alg.predict_proba(titanic[predictors].iloc[test, :].astype(float))[:, 1]
        full_test_predictions.append(test_predictions)
    # Use a simple ensembling scheme: average the two algorithms' predictions
    # to get the final classification
    test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2
    # Any value over 0.5 is assumed to be a 1 prediction, and 0.5 or below a 0
    test_predictions[test_predictions <= 0.5] = 0
    test_predictions[test_predictions > 0.5] = 1
    predictions.append(test_predictions)

# Put all the predictions together into one array
predictions = np.concatenate(predictions, axis=0)

accuracy = sum(predictions == titanic['Survived']) / len(predictions)
print(accuracy)


# In[157]:

# Apply the same title feature to the test set
titles = titanic_test['Name'].apply(get_title)
print(pd.value_counts(titles))

# Map each title to an integer. Some titles are very rare or equivalent,
# and are compressed into the same codes as other titles. The test set also
# contains the Spanish honorific 'Dona', which never occurs in the training
# set, so it gets the same code as 'Don'.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6,
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Dona": 9,
                 "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
for k, v in title_mapping.items():
    # Replace each title string with its numeric code
    titles[titles == k] = v

# Add in the title column
titanic_test['Title'] = titles
print(pd.value_counts(titanic_test['Title']))

# Now add the family size column
titanic_test['FamilySize'] = titanic_test['SibSp'] + titanic_test['Parch']


# In[158]:

predictors = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize', 'Title']

algorithms = [
    [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],
    [LogisticRegression(random_state=1), predictors]
]

full_predictions = []
for alg, predictors in algorithms:
    # Fit the algorithm using the full training data
    alg.fit(titanic[predictors], titanic['Survived'])
    # Predict probabilities on the test set
    predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:, 1]
    full_predictions.append(predictions)

# The gradient boosting classifier generates better predictions,
# so we weight it higher
predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4
predictions
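# To turn these averaged probabilities into a Kaggle submission (not part of
# the original notebook -- a minimal sketch, with an illustrative file name):
# threshold at 0.5 and write the PassengerId/Survived pair the competition expects.
predictions[predictions <= 0.5] = 0
predictions[predictions > 0.5] = 1
submission = pd.DataFrame({
    'PassengerId': titanic_test['PassengerId'],
    'Survived': predictions.astype(int)
})
submission.to_csv('kaggle_submission.csv', index=False)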

 

