標準化_1:
# Rescale six course-grade columns of cj.csv, first with Z-score
# standardization and then with Min-Max scaling, and record a per-row total
# after each pass.
import pandas as pd
from sklearn.preprocessing import *

# Columns that both scaling passes overwrite in place.
subject_columns = ["英語", "體育", "軍訓", "數分", "高代", "解幾"]

# Load the data.
grade = pd.read_csv('cj.csv', encoding='gbk')

# Z-score standardization: every column is fitted and replaced on its own.
scaler_z = StandardScaler(copy=True)
for subject in subject_columns:
    standardized = scaler_z.fit_transform(grade[[subject]])
    grade[subject] = standardized
print(grade)

# Row-wise total over the six columns located just before the last column.
# NOTE(review): presumably these are the six subject columns -- confirm
# against the layout of cj.csv.
zscore_block = grade.iloc[:, -7:-1]
grade_zscore = zscore_block.sum(axis=1)

# Min-Max scaling. It runs on the columns already overwritten by the Z-score
# pass above, and overwrites them again.
scaler_m = MinMaxScaler(copy=False)
for subject in subject_columns:
    rescaled = scaler_m.fit_transform(grade[[subject]])
    grade[subject] = rescaled

minmax_block = grade.iloc[:, -7:-1]
grade_minmax = minmax_block.sum(axis=1)
離群值:
# Detect outliers in the Quantity column of car_sales.csv with the
# Local Outlier Factor (LOF) and print the rows whose factor exceeds 1.5.
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor

# Load the data.
car_sales = pd.read_csv('car_sales.csv', encoding='gbk')

# Take the Quantity column as an independent copy, so that adding the LOF
# column below writes to this frame instead of to a slice of car_sales
# (assigning to such a slice raises pandas' SettingWithCopyWarning).
data = car_sales[["Quantity"]].copy()

# Local outlier factor detection.
scaler = LocalOutlierFactor()
scaler.fit(data)

# negative_outlier_factor_ stores the negated LOF of each training sample;
# flip the sign back and keep it in a new LOF column.
data['LOF'] = -scaler.negative_outlier_factor_

# Select the samples whose local outlier factor is greater than 1.5.
quantity_lof = data[data.LOF > 1.5]
print(quantity_lof)
標準化_2:
# Z-score standardize the Quantity column of car_sales.csv and show the
# standardized values next to the original ones.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the data set.
std_df = pd.read_csv('car_sales.csv', encoding='gbk')
print(std_df)

scaler = StandardScaler(copy=True)

# DataFrame holding the Z-score standardized sales quantity.
std_df_zscore = pd.DataFrame(
    scaler.fit_transform(std_df[['Quantity']]),
    columns=['Quantity_StandardScaled'],
)

# Add the original 'Quantity' column from the source data set. The values are
# copied positionally because std_df_zscore was built from a bare array and
# therefore carries its own default index.
std_df_zscore['Quantity'] = std_df['Quantity'].values

# Exercise placeholders, left disabled until the column name is filled in:
# print the mean and the spread of the standardized sales quantity.
# NOTE: .std() is the standard deviation, not the variance the label mentions.
# print('均值:', std_df_zscore['____'].mean(axis = 0))
# print('方差:', std_df_zscore['____'].std(axis = 0))

out = std_df_zscore.head(5)
print(out)
缺失值:
# Fill the missing values of the Quantity column in car_sales.csv with the
# column mean, then report whether any missing value is left.
import pandas as pd

# Load the data set. The encoding matches the other readers of car_sales.csv.
data = pd.read_csv('car_sales.csv', encoding='gbk')

# Configure the imputer: replace NaN with the column mean.
# Imputer was removed from sklearn.preprocessing in scikit-learn 0.22, so
# prefer SimpleImputer and fall back to the legacy class only when the
# installed release predates sklearn.impute.
try:
    from sklearn.impute import SimpleImputer
    imp = SimpleImputer(strategy='mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Fit the imputation statistics.
temp = imp.fit(data[['Quantity']])
print(temp)

# Fill the missing values. transform() returns an (n, 1) array, so flatten it
# before storing it back as a single column.
data['Quantity'] = imp.transform(data[['Quantity']]).ravel()

# Verify that nothing is missing any more.
if not data['Quantity'].isnull().any():
    print("汽車銷量缺失值已填充!")
離散化:
# Binarize the Quantity column of car_sales.csv at a threshold of 800 and
# count how many rows exceed it.
import pandas as pd
import numpy as np
from sklearn.preprocessing import Binarizer

# Load the data set.
binary_df = pd.read_csv('car_sales.csv', encoding='gbk')

# Binarize with the threshold set to 800: values greater than 800 become 1,
# all other values become 0.
scaler = Binarizer(threshold=800)
quantity_binary = scaler.fit_transform(binary_df[['Quantity']])

# Number of rows with a sales quantity above 800. The column sum is computed
# by NumPy in one vectorized call rather than by iterating over the rows with
# the builtin sum(); the result is still a one-element array.
print(quantity_binary.sum(axis=0))
特徵編碼:
# Label-encode the Make column of car_sales.csv and list the encoder's classes.
import pandas as pd
import numpy as np
from sklearn.preprocessing import *

# Load the data set.
feature_df = pd.read_csv('car_sales.csv', encoding='gbk')

# Build the label encoder.
le = LabelEncoder()

# Fit the encoder on the Make column and get the encoded labels back as an
# ndarray in a single step.
make_column = feature_df['Make']
label_list = le.fit_transform(make_column)
print(label_list)

# Show the distinct car makes the encoder learned from this data set.
print(le.classes_)
