數據處理以及建模完整流程

本文轉載自查看原文 2020-05-16 11:09 625 數據挖掘

在數據挖掘工作中，數據預處理對於結果的影響是非常重要的，所以在這方面需要多花時間探索。

這裡，我介紹一些數據預處理的流程以及方法：

首先，拿到數據之後，我們先把數據讀進來：

### code ###

import numpy as np

import pandas as pd

import pandas_profiling

# Read the raw data; "yourdata" is a placeholder for the path of your CSV file.
data = pd.read_csv("yourdata")

# Print a summary of the DataFrame (column dtypes, non-null counts, memory
# usage). `info` is a method, so it has to be called to produce the summary.
data.info()

# Count the missing values in every column.
data.isnull().sum()

# Show the first few rows.
data.head()

# Number of rows and columns.
data.shape

# Per-column descriptive statistics (count, mean, std, ...).
data.describe()

# `drop_duplicates` returns a new DataFrame; assign the result back so the
# duplicate rows are actually removed from `data`.
data = data.drop_duplicates()

# The checks above still give only a limited picture of the overall
# distribution of the data, so generate a full profiling report to look
# further into whatever is of interest.
data.profile_report()

##################

使用Pandas Profiling可以在進行數據分析之前對數據進行快速預覽,一行代碼就生成豐富的交互式數據EDA報告。

除了之前我們需要的一些描述性統計數據，該報告還包含以下信息：

類型推斷：檢測數據幀中列的數據類型。
要點：類型，唯一值，缺失值
分位數統計信息，例如最小值，Q1，中位數，Q3，最大值，範圍，四分位數範圍
描述性統計數據，例如均值，眾數，標準偏差，總和，中位數絕對偏差，變異係數，峰度，偏度
最常使用的值
直方圖
相關性矩陣
缺失值矩陣，計數，熱圖和缺失值樹狀圖
文本分析：了解文本數據的類別（大寫，空格），文字系統（拉丁，西里爾字母）和塊（ASCII）

補全空值：

# Fill the missing ages with the median age. The method is `fillna` (the
# original `filllna` raised AttributeError). Assigning the result back avoids
# the chained in-place call, which is not guaranteed to modify `data` itself.
data['age'] = data['age'].fillna(data['age'].median())

這樣一通下來，對數據應該已經有了基本的了解，接下來要做的就是對數據進行預處理。

接下來我們使用sklearn中的preprocessing庫來進行數據預處理：

from sklearn import preprocessing
import numpy as np

from sklearn.preprocessing import StandardScaler

# Standard scaling: build the scaler first, then fit it on the training
# features and transform them in one step.
standard_scaler = preprocessing.StandardScaler()
st_scaled = standard_scaler.fit_transform(X_train)
st_scaled

from sklearn.preprocessing import MinMaxScaler

min-max標準化方法是將數據縮放至給定的最小值與最大值之間，通常是0與1之間，可用 MinMaxScaler實現。

# Min-max scaling of the training features with the scaler's default range.
minmax_scaler = preprocessing.MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(X_train)
minmax_scaled

或者將最大的絕對值縮放至單位大小，可用 MaxAbsScaler實現。

與上述標準化方法相似，但是它通過除以最大絕對值將訓練集縮放至[-1,1]。這種方法適用於已經以0為中心的數據，或者含有非常多0的稀疏數據。

# Max-abs scaling of the training features.
maxabs_scaler = preprocessing.MaxAbsScaler()
maxabs = maxabs_scaler.fit_transform(X_train)
maxabs

# If the data contains many outliers, the scalers above may not work well;
# robust_scale (or RobustScaler) can be used instead.
# The call goes through `preprocessing`, the name this file imports from
# sklearn; the bare `sklearn` package name is never imported, so the original
# `sklearn.preprocessing.robust_scale(...)` raised NameError.
preprocessing.robust_scale(X, axis=0, with_centering=True, with_scaling=True, quantile_range=(25.0, 75.0), copy=True)

# Normalisation
from sklearn.preprocessing import Normalizer

# Instantiate the normaliser with its defaults, then fit and transform X.
normalizer = preprocessing.Normalizer()
normalizer.fit_transform(X)

#preprocessing.OrdinalEncoder()
#preprocessing.OneHotEncoder()
#https://www.jianshu.com/p/4e19eb163e78
#https://blog.csdn.net/wuzhongqiang/article/details/104169480

from sklearn.preprocessing import Binarizer

# Feature binarisation with the default threshold.
X = [
    [1, -1, 2],
    [2, 0, 0],
    [0, 1, -2],
]
default_binarizer = preprocessing.Binarizer()
binarizer = default_binarizer.fit_transform(X)  # holds the binarised data
binarizer

# Binarisation with the threshold set to 1.1; the expression evaluates to the
# binarised data.
strict_binarizer = preprocessing.Binarizer(threshold=1.1)
strict_binarizer.fit_transform(X)

from sklearn.datasets import load_iris
from sklearn.preprocessing import OneHotEncoder

# `iris` is used by this example and by the feature-selection examples further
# down, but the file never loaded it; load the dataset here.
iris = load_iris()

# One-hot encode the iris target values. The target is reshaped into a single
# column first; the expression evaluates to the encoded data.
OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))

get_dummies方法：

# Convert the 'sex' column into dummy/indicator columns.
sex_column = data['sex']
pd.get_dummies(sex_column)

缺失值填充

# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn;
# its replacement is `sklearn.impute.SimpleImputer`, which this file already
# uses in the examples below.
from sklearn.impute import SimpleImputer

# Mean imputation of missing values with the replacement class:
#   SimpleImputer(missing_values=np.nan, strategy='mean')
# (the old `axis` argument is gone; imputation is always done per column).

# Imputation of missing values with the column mean.
import numpy as np
from sklearn.impute import SimpleImputer

# Rows the imputer learns the per-column means from.
training_rows = [[1, 2], [np.nan, 3], [7, 6]]
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(training_rows)

# Rows whose missing entries get filled in by the fitted imputer.
X = [[np.nan, 2], [np.nan, 3], [7, 6]]
filled = imp.transform(X)
print(filled)

import numpy as np
# IterativeImputer is experimental: this import must come first to enable it.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit the iterative imputer on rows that contain missing entries.
training_rows = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(training_rows)

# Impute the missing entries of unseen rows and print the rounded result.
X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
imputed = imp.transform(X_test)
print(np.round(imputed))

# Nearest-neighbours imputation
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

nan = np.nan  # short alias to keep the literal matrix readable
X = [
    [1, 2, nan],
    [3, 4, 3],
    [nan, 6, 5],
    [8, 8, 7],
]
# Impute using 2 neighbours with uniform weights.
imputer = KNNImputer(n_neighbors=2, weights='uniform')
imputer.fit_transform(X)

1、缺失值

2、處理文本和類別數據

3、特征縮放

特征選擇：

from sklearn.feature_selection import VarianceThreshold

# Variance-threshold selection: `threshold` is the variance cut-off, and the
# expression evaluates to the data that remains after feature selection.
variance_selector = VarianceThreshold(threshold=2)
variance_selector.fit_transform(iris.data)

from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr


def pearson_score(X, y):
    """Score each feature by its absolute Pearson correlation with the target.

    Takes the feature matrix ``X`` and the target vector ``y`` and returns the
    ``(scores, p_values)`` pair of arrays that SelectKBest expects from a score
    function, with one entry per column of ``X``.
    """
    results = [pearsonr(column, y) for column in X.T]
    # Rank by |r| so that strongly negative correlations count as informative
    # instead of being ranked as the worst features.
    scores = np.abs([result[0] for result in results])
    p_values = np.array([result[1] for result in results])
    return scores, p_values


# Select the k best features and return the reduced data.
# The first argument is the score function defined above; `k` is the number of
# features to keep. (The original lambda called an undefined `array`.)
SelectKBest(pearson_score, k=2).fit_transform(iris.data, iris.target)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the 2 best features according to the chi-squared score function; the
# expression evaluates to the data with only the selected features.
chi2_selector = SelectKBest(chi2, k=2)
chi2_selector.fit_transform(iris.data, iris.target)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination; the expression evaluates to the data with
# only the selected features.
# `estimator` is the base model, `n_features_to_select` the number of features
# to keep.
base_model = LogisticRegression()
rfe_selector = RFE(estimator=base_model, n_features_to_select=2)
rfe_selector.fit_transform(iris.data, iris.target)

from sklearn.decomposition import PCA

# Principal component analysis; `n_components` is the number of principal
# components to keep.
pca = PCA(n_components=2)

# Project the iris features onto the principal components ...
newData = pca.fit_transform(iris.data)
# ... and map the reduced data back into the original feature space.
d_new = pca.inverse_transform(newData)

newData

# The `sklearn.lda` module no longer exists in current scikit-learn; the class
# now lives in `sklearn.discriminant_analysis` as LinearDiscriminantAnalysis.
# It is aliased to LDA so the call below keeps its original form.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Supervised dimensionality reduction of the iris features to 2 components.
LDA(n_components=2).fit_transform(iris.data, iris.target)

模型選擇和訓練：

利用網格搜索對模型進行微調：

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Two grids are searched: the first with the forest's default bootstrap
# setting, the second with bootstrap explicitly turned off.
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
# Fit on the training features and the training targets (the original passed
# X_test as the target).
grid_search.fit(X_train, y_train)

# Best hyper-parameter combination found by the search.
grid_search.best_params_

# Cross-validated score (negative MSE) of that combination.
grid_search.best_score_

# The estimator configured with the best parameters.
grid_search.best_estimator_
用最好的模型去評估測試集數據

from sklearn.metrics import mean_squared_error

# Take the best estimator found by the grid search and predict the test set.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test)

# Evaluate the predictions against the held-out targets; the original imported
# mean_squared_error but never computed the score.
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

# k-fold cross-validation
from sklearn import datasets, svm
from sklearn.model_selection import KFold, cross_val_score

# The original line used cross_val_score, svc, X_digits, y_digits and k_fold
# without importing or defining any of them. The setup below follows the
# scikit-learn digits example and is an assumption about what was intended.
X_digits, y_digits = datasets.load_digits(return_X_y=True)
svc = svm.SVC(C=1, kernel='linear')
k_fold = KFold(n_splits=5)

# Evaluates `svc` once per fold and returns the array of per-fold scores.
cross_val_score(svc, X_digits, y_digits, cv=k_fold)

#Automatic parameter searches
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

# Load the California housing data and split it into train and test sets.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Parameter space the randomized search samples from.
param_distributions = dict(
    n_estimators=randint(1, 5),
    max_depth=randint(5, 10),
)

# Create the search object around a seeded random forest and fit it.
base_forest = RandomForestRegressor(random_state=0)
search = RandomizedSearchCV(
    estimator=base_forest,
    n_iter=5,
    param_distributions=param_distributions,
    random_state=0,
)
search.fit(X_train, y_train)
search.best_params_

# The fitted search object now acts like a normal random forest estimator
# (the article reports max_depth = 9 and n_estimators = 4 as the best values).
search.score(X_test, y_test)

參考資料：

1、https://www.jianshu.com/p/78c7be12d2a2?utm_source=oschina-app

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 POI數據處理流程使用solr的完整流程網絡 | HTTPS 完整流程 IOS App打包發布完整流程 presto中ldaps配置完整流程用戶訪問網站的完整流程一個模型建立的完整流程用戶訪問網站的完整流程微信app支付，完整流程，完整代碼（轉） P2P網絡數據處理流程