Notes on standardization, normalization, and related preprocessing with scikit-learn:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Imputer was removed from sklearn.preprocessing; SimpleImputer is its replacement
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
import numpy as np


def dictvec():
    # sparse defaults to True to save memory
    dict1 = DictVectorizer(sparse=False)
    data1 = dict1.fit_transform([{'city': '北京', 'temperature': 100},
                                 {'city': '上海', 'temperature': 60},
                                 {'city': '深圳', 'temperature': 30}])
    print(data1)
    print("===============================")
    print("With sparse=True (the default) the result is:")
    dict2 = DictVectorizer(sparse=True)
    data2 = dict2.fit_transform([{'city': '北京', 'temperature': 100},
                                 {'city': '上海', 'temperature': 60},
                                 {'city': '深圳', 'temperature': 30}])
    print(data2)
    print("===============================")
    print("Dict feature names:")
    # get_feature_names() was removed in newer scikit-learn; use get_feature_names_out()
    print(dict2.get_feature_names_out())
    return None


def countvec():
    cv = CountVectorizer()
    data = cv.fit_transform(["life is short,i like python life",
                             "life is too long,i dislike python"])
    print("===============================")
    print("fit_transform returns a sparse matrix by default:")
    print(data)
    print("===============================")
    print("CountVectorizer has no sparse parameter; convert with toarray():")
    print(data.toarray())
    print("===============================")
    print("Feature names:")
    print(cv.get_feature_names_out())
    return None


# Normalization (min-max scaling)
def mm():
    # feature_range changes the output range; the default is 0-1
    # mm = MinMaxScaler(feature_range=(2, 3))
    mm = MinMaxScaler()
    data = mm.fit_transform([[90, 2, 10, 40], [60, 4, 15, 45], [75, 3, 13, 46]])
    print("===============================")
    print("Min-max normalization result:")
    print(data)
    return None


# Standardization (z-score scaling)
def ss():
    std = StandardScaler()
    data = std.fit_transform([[1., -1., 3.], [2., 4., 2.], [4., 6., -1.]])
    print("===============================")
    print("Standardization result:")
    print(data)
    return None


# Missing values
def im():
    # imputes column-wise with the column mean
    im = SimpleImputer(missing_values=np.nan, strategy='mean')
    data = im.fit_transform([[1, 2], [np.nan, 3], [7, 6]])
    print("===============================")
    print("Missing-value imputation result:")
    print(data)
    return None


# Dimensionality reduction by feature selection
def var():
    """Feature selection: drop low-variance features."""
    var = VarianceThreshold(threshold=0)
    data = var.fit_transform([[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]])
    print("===============================")
    print("After dropping low-variance features:")
    print(data)
    return None


# Principal component analysis (PCA)
def pca():
    """Dimensionality reduction with PCA."""
    # keep 90% of the variance
    pca = PCA(n_components=0.9)
    data = pca.fit_transform([[2, 8, 4, 5], [6, 3, 0, 8], [5, 4, 9, 1]])
    print("===============================")
    print("PCA result:")
    print(data)
    return None


if __name__ == "__main__":
    # dictvec()
    countvec()
    # mm()
    # ss()
    # im()
    # var()
    # pca()
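TfidfVectorizer is imported above but never used. As a minimal sketch of how it fits alongside countvec() (the function name tfidfvec and the reuse of the same two sentences are my own additions, not part of the original script), TF-IDF works like CountVectorizer but down-weights words that appear in many documents:

from sklearn.feature_extraction.text import TfidfVectorizer  # already imported in the script above


def tfidfvec():
    # TF-IDF: word counts weighted by inverse document frequency,
    # so words shared by every document contribute less
    tf = TfidfVectorizer()
    data = tf.fit_transform(["life is short,i like python life",
                             "life is too long,i dislike python"])
    print("===============================")
    print("TF-IDF weighted matrix:")
    print(data.toarray())
    print("Feature names:")
    print(tf.get_feature_names_out())
    return None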