離散變量標籤處理
1. 類別變量映射為數值變量(以及逆映射還原為原始變量)
原始數據
import pandas as pd

# Sample data: three rows, each holding a colour, a size, a numeric value
# and a class label. The column names are assigned after construction.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ]
)
df.columns = ['color', 'size', 'prize', 'class label']

1.1 方法1:原始處理方法(將類別變量映射為數值變量)
# Automatic mapping: number the distinct class labels.
# The labels are sorted before enumeration because the iteration order of a
# set of strings is not guaranteed to be the same from one interpreter run to
# the next; without sorting, the label -> index assignment could change.
class_mapping = {
    label: idx for idx, label in enumerate(sorted(set(df['class label'])))
}
df['class label'] = df['class label'].map(class_mapping)

# Explicit mapping: the caller spells out the value for every category.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)

color_mapping = {
    'green': (0, 0, 1),
    'red': (0, 1, 0),
    'blue': (1, 0, 0),
}
df['color'] = df['color'].map(color_mapping)

# Inverse mapping: swap keys and values of each dict and map the columns
# back, which restores the original category values.
inv_color_mapping = {v: k for k, v in color_mapping.items()}
inv_size_mapping = {v: k for k, v in size_mapping.items()}
inv_class_mapping = {v: k for k, v in class_mapping.items()}
df['color'] = df['color'].map(inv_color_mapping)
df['size'] = df['size'].map(inv_size_mapping)
df['class label'] = df['class label'].map(inv_class_mapping)


原始方法2:
import numpy as np

# Approach 1: boolean-mask assignment with .loc.
# Map "male" / "female" in the Sex column to 0 / 1.
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
# Map "S" / "C" / "Q" in the Embarked column to 0 / 1 / 2.
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

# Approach 2: np.where. Every value equal to "male" becomes 0 and every other
# value becomes 1.
# NOTE: this is an alternative to approach 1, not a follow-up step. If it runs
# after approach 1, "male" has already been replaced by 0, the comparison is
# False for every row, and the whole column ends up as 1. Run only one of the
# two approaches on the raw column.
titanic["Sex"] = np.where(titanic["Sex"] == "male", 0, 1)

# Choose the value from one of two columns depending on a condition:
# rows where type == 0 take type1, all other rows take type2.
data['type'] = np.where(data['type'] == 0, data['type1'], data['type2'])
1.2 方法2:使用 scikit-learn LabelEncoder 處理標籤變量映射
from sklearn.preprocessing import LabelEncoder

# Label -> integer mapping: fit the encoder on the class-label column and
# replace the column with the resulting integer codes.
class_le = LabelEncoder()
df['class label'] = class_le.fit_transform(df['class label'])

# Inverse mapping: turn the integer codes back into the original labels.
# The result is not stored; in an interactive session it is simply displayed.
class_le.inverse_transform(df['class label'])

2. 類別變量熱編碼
2.1 方法1:OneHotEncoder(舊版 scikit-learn(0.20 之前)的 OneHotEncoder 必須使用整數作為輸入,所以得先預處理一下;0.20 及之後的版本可以直接處理字串類別)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Pre-processing: convert the colour strings to integer codes first.
color_le = LabelEncoder()
df['color'] = color_le.fit_transform(df['color'])

# One-hot encoding.
# The encoder is left at its default (sparse) output and the result is
# densified with .toarray(). The former `sparse=False` keyword was renamed to
# `sparse_output` in newer scikit-learn releases, so passing it breaks there;
# .toarray() gives the same dense array on old and new releases.
ohe = OneHotEncoder()
X = ohe.fit_transform(df[['color']].values).toarray()

2.2 方法2:get_dummies(只處理類別型變量)
# Data preparation
import pandas as pd

df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1'],
    ]
)
df.columns = ['color', 'size', 'prize', 'class label']

# Map the size strings to explicitly chosen integers.
size_mapping = {'XL': 3, 'L': 2, 'M': 1}
df['size'] = df['size'].map(size_mapping)

# Number the distinct class labels. Sorting makes the assignment reproducible:
# the iteration order of a set of strings may differ between interpreter runs.
class_mapping = {
    label: idx for idx, label in enumerate(sorted(set(df['class label'])))
}
df['class label'] = df['class label'].map(class_mapping)

# One-hot encoding: get_dummies expands the remaining string column (color)
# into indicator columns and leaves the numeric columns untouched. The result
# is not stored; in an interactive session it is simply displayed.
pd.get_dummies(df)

2.3 方法3:scikit-learn DictVectorizer 熱編碼(只處理類別型變量)
from sklearn.feature_extraction import DictVectorizer

# Data preparation: DictVectorizer consumes one dict per sample.
# Preview of the whole frame as a list of per-row dicts (display only).
df.to_dict(orient='records')
# Keep every column except the last one as the feature set.
feature = df.iloc[:, :-1]
feature

# One-hot encoding of the per-row feature dicts.
dvec = DictVectorizer(sparse=False)
X = dvec.fit_transform(feature.to_dict(orient='records'))
# Column names come from the fitted `feature_names_` attribute; the older
# get_feature_names() method is no longer available in current scikit-learn.
pd.DataFrame(X, columns=dvec.feature_names_)


