Python 實現 MED（最小歐氏距離）分類器

本文轉載自查看原文 2021-05-06 01:30 196
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import copy


# 特征白化，返回白化后的矩陣（numpy數組格式）
# 參數為numpy格式的數組，其格式為數學上的矩陣的轉置
def whitening(feature_x):
    """Whiten a sample matrix so that its sample covariance becomes identity.

    The transform is ``W = diag(eigenvalues) ** -0.5 @ eigenvectors.T`` where
    the eigen-decomposition is taken of the sample covariance of the input.
    The data is not mean-centered; only the covariance is normalized.

    Args:
        feature_x: Array-like of shape (n_samples, n_features); one sample
            per row.

    Returns:
        numpy.ndarray of shape (n_samples, n_features) holding the whitened
        samples, one per row.

    Raises:
        numpy.linalg.LinAlgError: If the covariance matrix has a zero or
            negative eigenvalue, so it cannot be inverted.
    """
    samples = np.atleast_2d(np.asarray(feature_x, dtype=float))
    variables = samples.T  # np.cov expects one variable per row
    # np.cov collapses the single-feature case to a 0-d array; restore 2-D.
    sigma_x = np.atleast_2d(np.cov(variables))
    # The covariance matrix is symmetric, so eigh is the appropriate solver:
    # it always yields real eigenvalues and orthonormal eigenvectors.
    eigenvalues, eigenvectors = np.linalg.eigh(sigma_x)
    if np.any(eigenvalues <= 0):
        raise np.linalg.LinAlgError(
            "Singular covariance matrix: features cannot be whitened")
    # Scaling column k of the eigenvector matrix by 1/sqrt(eigenvalue k) and
    # transposing gives diag(eigenvalues ** -0.5) @ eigenvectors.T.
    whitening_matrix = (eigenvectors / np.sqrt(eigenvalues)).T
    return np.dot(whitening_matrix, variables).T


# MED分類器
# 只能分辨訓練集中存在的類別
class MedClassifier:
    """Minimum Euclidean Distance (MED) classifier.

    Every class seen during training is represented by the mean of its
    training samples. A sample is assigned to the class whose center has the
    smallest squared Euclidean distance to it, so only labels present in the
    training set can ever be predicted.
    """

    def __init__(self):
        self.center_dict = {}  # label -> class center (1-D numpy array)
        self.feature_number = 0  # number of features per sample
        self.train_state = False  # True once train() has completed

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is 0."""
        return numerator / denominator if denominator else 0.0

    def train(self, feature_set, label_set):
        """Compute one center (mean vector) per class from labelled samples.

        Any centers from a previous call are discarded. The input arrays are
        not modified.

        Args:
            feature_set: Array-like of shape (n_samples, n_features); one
                sample per row.
            label_set: Sequence of n_samples hashable labels, aligned with
                the rows of ``feature_set``.

        Raises:
            ValueError: If ``feature_set`` is not a non-empty 2-D array or
                its length differs from that of ``label_set``.
        """
        features = np.asarray(feature_set, dtype=float)
        if features.ndim != 2 or len(features) == 0:
            raise ValueError("feature_set must be a non-empty 2-D array")
        if len(features) != len(label_set):
            raise ValueError("feature_set and label_set differ in length")

        sums = {}  # label -> running sum of that class's samples
        counts = {}  # label -> number of samples of that class
        for row, label in zip(features, label_set):
            if label in sums:
                sums[label] += row
                counts[label] += 1
            else:
                # Copy so the accumulation never writes into the caller's data.
                sums[label] = row.copy()
                counts[label] = 1

        # Rebuilding the dict (in first-seen label order) makes retraining
        # start from scratch instead of adding to stale centers.
        self.center_dict = {label: sums[label] / counts[label] for label in sums}
        self.feature_number = features.shape[1]
        self.train_state = True

    def predict(self, feature_set):
        """Classify every sample by its nearest class center.

        Args:
            feature_set: Array-like of shape (n_samples, n_features).

        Returns:
            dict mapping each sample index to its predicted label, or an
            empty dict if the classifier has not been trained. Ties go to the
            label that appeared first in the training data.
        """
        if not self.train_state:
            return {}
        result = {}
        for index, sample in enumerate(np.asarray(feature_set, dtype=float)):
            squared_distance = {}
            for label, center in self.center_dict.items():
                delta = sample - center
                squared_distance[label] = np.dot(delta, delta)
            result[index] = min(squared_distance, key=squared_distance.get)
        return result

    def accuracy(self, feature_set, label_set):
        """Return the fraction of samples whose prediction equals the label.

        Returns 0.0 if the classifier is untrained or ``label_set`` is empty.
        """
        if not self.train_state:
            return 0.0
        predicted = self.predict(feature_set)
        correct = sum(
            1 for index, actual in enumerate(label_set)
            if predicted[index] == actual)
        return self._safe_ratio(correct, len(label_set))

    def performance(self, feature_set, label_set, positive):
        """Compute metrics that treat ``positive`` as the positive class.

        A sample counts as predicted-positive when its prediction equals
        ``positive`` and as actually-positive when its label does; every
        other label is negative (one-vs-rest), so the counts stay correct
        with more than two classes.

        Args:
            feature_set: Array-like of shape (n_samples, n_features).
            label_set: Sequence of true labels aligned with ``feature_set``.
            positive: The label regarded as the positive class.

        Returns:
            dict with keys "accuracy" (exact-label matches / total),
            "precision", "recall", "specificity" and "F1_Score". A metric
            whose denominator is zero is reported as 0.0. An empty dict is
            returned if the classifier has not been trained.
        """
        if not self.train_state:
            return {}
        predicted = self.predict(feature_set)
        true_positive = false_positive = true_negative = false_negative = 0
        correct_num = 0
        for index, actual in enumerate(label_set):
            guess = predicted[index]
            if guess == actual:
                correct_num += 1
            predicted_positive = guess == positive
            actual_positive = actual == positive
            if predicted_positive and actual_positive:
                true_positive += 1
            elif predicted_positive:
                false_positive += 1
            elif actual_positive:
                false_negative += 1
            else:
                true_negative += 1

        ratio = self._safe_ratio
        accuracy = ratio(correct_num, len(label_set))
        precision = ratio(true_positive, true_positive + false_positive)
        recall = ratio(true_positive, true_positive + false_negative)
        specificity = ratio(true_negative, true_negative + false_positive)
        # F1 is the harmonic mean of precision and recall.
        f1_score = ratio(2 * precision * recall, precision + recall)
        return {"accuracy": accuracy, "precision": precision, "recall": recall,
                "specificity": specificity, "F1_Score": f1_score}

    def get_center(self, key):
        """Return the center of class ``key``, or an empty list if unknown."""
        if key in self.center_dict:
            return self.center_dict[key]
        return []

    def get_center_dict(self):
        """Return the label -> center dictionary itself (not a copy)."""
        return self.center_dict


# 將字典轉換為列表（只保留每個鍵值對的值）
def dict_values_to_list(_dict_):
    """Return the values of a dictionary as a list.

    Anything that is not a ``dict`` instance yields an empty list.
    """
    return list(_dict_.values()) if isinstance(_dict_, dict) else []


# feature表示樣本特征，label表示對應的標簽,m行n列共計m*n個子圖
def visualization_2d(feature, label, m, n):
    """Show an m-by-n grid of scatter plots comparing feature columns.

    The subplot in grid row ``r`` and grid column ``c`` plots feature column
    ``r`` on the x axis against feature column ``c`` on the y axis, colored
    by ``label``.

    Args:
        feature: 2-D array indexed as ``feature[:, k]`` for feature column k.
        label: Per-sample values passed to the scatter plot as colors.
        m: Number of subplot rows.
        n: Number of subplot columns.
    """
    plt.figure(figsize=(20, 20), dpi=80)
    for row in range(m):
        for col in range(n):
            plt.subplot(m, n, row * n + col + 1)
            plt.xlabel(f"x{row}")
            plt.ylabel(f"x{col}")
            plt.scatter(feature[:, row], feature[:, col], s=5, c=label, marker='.')
            plt.colorbar()  # color bar for the label values
            plt.grid(True)
    plt.show()


# 展示二維平面上，二分類問題的決策線（class_1和class_2）
# feature是樣本特征集合，label是對應的標簽集合，對每一維特征進行兩兩比較，n表示特征維數
def show_decision_line(feature, label, med_classifier, class_1=0, class_2=0, n=0):
    """Plot the pairwise 2-D decision lines between two class centers.

    An n-by-n grid of subplots is drawn. The subplot in row ``r``, column
    ``c`` shows feature ``r`` against feature ``c``: the samples, the two
    class centers projected onto those features, the line through both
    centers and its perpendicular bisector (the decision line).

    Args:
        feature: 2-D array indexed as ``feature[:, k]`` for feature column k.
        label: Per-sample values passed to the scatter plot as colors.
        med_classifier: Object whose ``get_center(label)`` returns the
            center of a class.
        class_1: Label of the first class.
        class_2: Label of the second class.
        n: Number of feature dimensions to compare pairwise.
    """
    plt.figure(figsize=(16, 12), dpi=80)  # overall canvas size and resolution
    center_a = med_classifier.get_center(class_1)
    center_b = med_classifier.get_center(class_2)
    for row in range(n):
        for col in range(n):
            plt.subplot(n, n, row * n + col + 1)
            # Project both class centers onto the (row, col) feature plane.
            point_a = [center_a[row], center_a[col]]
            point_b = [center_b[row], center_b[col]]
            midpoint = [(point_a[0] + point_b[0]) / 2,
                        (point_a[1] + point_b[1]) / 2]
            # Only the slope is needed; the line is anchored at the midpoint.
            bisector_slope, _ = calculate_vertical_line(point_a, point_b)
            plt.scatter(feature[:, row], feature[:, col], c=label, s=20, marker='.')
            plt.scatter(point_a[0], point_a[1], c='b', marker='x')  # class centers
            plt.scatter(point_b[0], point_b[1], c='b', marker='x')
            plt.colorbar()
            plt.grid(True)
            plt.axis('equal')  # same scale on both axes
            plt.axline(point_a, point_b, color='c', linestyle="--", label="connected line")
            plt.axline(midpoint, slope=bisector_slope, color='r', label="decision line")
            if row == col:
                plt.legend()  # legend only on the diagonal subplots
            plt.xlabel(f"feature {row}")
            plt.ylabel(f"feature {col}")
            plt.tight_layout()  # reduce overlap between subplots
    plt.show()


# 計算兩點連線，返回斜率和縱截距（假設是二維平面上的點，並且用列表表示）
def calculate_connected_line(point_1, point_2):
    """Return (slope, y-intercept) of the line through two 2-D points.

    Returns None unless both points have exactly two components. Points
    with equal x components lead to a division by zero.
    """
    if not (len(point_1) == 2 and len(point_2) == 2):
        return None
    x1, y1 = point_1[0], point_1[1]
    x2, y2 = point_2[0], point_2[1]
    dx = x1 - x2
    slope = (y1 - y2) / dx
    intercept = (x1 * y2 - x2 * y1) / dx
    return slope, intercept


# 計算兩點中垂線，返回斜率和縱截距（假設是二維平面上的點，並且用列表表示）
def calculate_vertical_line(point_1, point_2):
    """Return (slope, y-intercept) of the perpendicular bisector of two 2-D points.

    Returns None unless both points have exactly two components. Points
    with equal y components lead to a division by zero.
    """
    if not (len(point_1) == 2 and len(point_2) == 2):
        return None
    x1, y1 = point_1[0], point_1[1]
    x2, y2 = point_2[0], point_2[1]
    dx = x1 - x2
    dy = y1 - y2
    slope = -dx / dy
    intercept = (y1 + y2 + (x1 + x2) * dx / dy) / 2
    return slope, intercept


# 去除某個類別的樣本，返回兩個numpy數組
def remove_from_sample(feature, label, _class_):
    """Drop every sample whose label equals ``_class_``.

    Args:
        feature: Indexable collection of samples, aligned with ``label``.
        label: Indexable collection of labels.
        _class_: The label value to remove.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays holding the remaining
        samples and their labels in the original order.
    """
    kept = [position for position in range(len(label)) if label[position] != _class_]
    remaining_features = [feature[position] for position in kept]
    remaining_labels = [label[position] for position in kept]
    return np.asarray(remaining_features), np.asarray(remaining_labels)


if __name__ == '__main__':
    # Load the iris data set and replace its features by their whitened
    # version (whitening() returns one sample per row, like its input).
    dataset = datasets.load_iris()
    labels = dataset.target
    features = whitening(dataset.data)

    # Optional check of the covariance after whitening:
    # print(np.cov(features.T))

    # Optional scatter plots of all 4 x 4 feature pairs:
    # visualization_2d(features, labels, 4, 4)

    # Without class 0 (the linearly separable class) the remaining
    # classes 1 and 2 are not linearly separable.
    nonlinear_x, nonlinear_y = remove_from_sample(features, labels, 0)

    # Without class 1 the remaining classes 0 and 2 are linearly separable.
    linear_x, linear_y = remove_from_sample(features, labels, 1)

    # Optional scatter plots of the non-separable subset:
    # visualization_2d(nonlinear_x, nonlinear_y, 4, 4)

    x_train, x_test, y_train, y_test = train_test_split(
        linear_x, linear_y, test_size=0.3)
    med = MedClassifier()
    med.train(x_train, y_train)

    # Optional comparison of predictions with the true labels:
    # print(np.asarray(dict_values_to_list(med.predict(x_test))))
    # print(y_test)

    # Performance metrics with class 0 treated as the positive class.
    performance = med.performance(x_test, y_test, 0)
    print(performance)

    # Pairwise feature plots with the decision line between classes 0 and 2.
    show_decision_line(x_test, y_test, med, class_1=0, class_2=2, n=4)
在鳶尾花數據集上
- 去除線性可分的類（0類），結果如下：
- 去除線性不可分的類（1類），結果如下：
免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。
猜您在找 線性分類器及python實現朴素貝葉斯分類器及Python實現朴素貝葉斯分類器及Python實現基於SVM的分類器Python實現 python實現簡單的朴素貝葉斯分類器朴素貝葉斯分類器及Python、MATLAB實現 Fisher線性分類器通俗解釋及MATLAB、Python實現朴素貝葉斯分類器 (Naive Bayes Classifier) python實現朴素貝葉斯文本分類實現 python cherry分類器 SVM分類器實現實例