異常監測的要點:1. 適用於數據集符合某種分布,能夠轉換為某種分布也算,比如車的航行軌跡,就不能用這招。 2. 或者使用閾值設定,結合邏輯回歸設定異常,也可以。3. 在數據集中,異常數據點非常少,1%都算多。
在實戰中,需要結合實際情況調用包。
數據集
鏈接:https://pan.baidu.com/s/1IU6sG2LHrVxHE2e51I0SHw
提取碼:0ihl
代碼
import numpy as np import pandas as pd import matplotlib.pyplot as plt import scipy.io as spio def display_2d_data(X,marker): #plt.figure(figsize=(10,8)) plt.plot(X[:,0],X[:,1],marker) return plt def estimateGaussian(X): #求均值與方差 m,n = X.shape mu = np.zeros((n,1)) sigma2 = np.zeros((n,1)) mu = np.mean(X,axis = 0) sigma2 = np.var(X,axis = 0) * (m-1) / m # 在概率論中計算sigma2時,除以的是(1-m),機器學習中,除以(1-m)和除以m的差別不大 return mu,sigma2 def multivariateGaussian(X,mu,Sigma2): k = len(mu) if (Sigma2.shape[0]>1): Sigma2 = np.diag(Sigma2) '''多元高斯分布函數''' X = X-mu argu = (2*np.pi)**(-k/2)*np.linalg.det(Sigma2)**(-0.5) p = argu*np.exp(-0.5*np.sum(np.dot(X,np.linalg.inv(Sigma2))*X,axis=1)) # axis表示每行 return p def visualizeFit(X,mu,sigma2): x = np.arange(0, 36, 0.5) # 0-36,步長0.5 y = np.arange(0, 36, 0.5) X1,X2 = np.meshgrid(x,y) # 要畫等高線,所以meshgird Z = multivariateGaussian(np.hstack((X1.reshape(-1,1),X2.reshape(-1,1))), mu, sigma2) # 計算對應的高斯分布函數 Z = Z.reshape(X1.shape) # 調整形狀 plt.figure(figsize=(10,8)) plt.plot(X[:,0],X[:,1],'bx') if np.sum(np.isinf(Z).astype(float)) == 0: # 如果計算的為無窮,就不用畫了 #plt.contourf(X1,X2,Z,10.**np.arange(-20, 0, 3),linewidth=.5) CS = plt.contour(X1,X2,Z,10.**np.arange(-20, 0, 3),color='black',linewidth=.5) # 畫等高線,Z的值在10.**np.arange(-20, 0, 3) #plt.clabel(CS) plt.show() # 選擇最優的epsilon,即:使F1Score最大 def selectThreshold(yval,pval): '''初始化所需變量''' bestEpsilon = 0. bestF1 = 0. F1 = 0. step = (np.max(pval)-np.min(pval))/1000 '''計算''' for epsilon in np.arange(np.min(pval),np.max(pval),step): cvPrecision = pval<epsilon tp = np.sum((cvPrecision == 1) & (yval == 1).ravel()).astype(float) # sum求和是int型的,需要轉為float fp = np.sum((cvPrecision == 1) & (yval == 0).ravel()).astype(float) fn = np.sum((cvPrecision == 0) & (yval == 1).ravel()).astype(float) precision = tp/(tp+fp) # 精准度 recision = tp/(tp+fn) # 召回率 F1 = (2*precision*recision)/(precision+recision) # F1Score計算公式 if F1 > bestF1: # 修改最優的F1 Score bestF1 = F1 bestEpsilon = epsilon return bestEpsilon,bestF1 def AnomalyDetection2(): data = spio.loadmat("data2.mat") X = data['X'] Xval = data['Xval'] yval = data['yval'] #print(pd.DataFrame(X)) mu,sigma2 = estimateGaussian(X) #print(mu,sigma2) p = multivariateGaussian(X,mu,sigma2) #print(pd.DataFrame(p)) pval = multivariateGaussian(Xval,mu,sigma2) epsilon,F1 = selectThreshold(yval,pval) print("the best epsilon is ",epsilon) print("the best F1 is ",F1) print("Outliers found",np.sum(p < epsilon)) AnomalyDetection2() def AnomalyDetection(): data = spio.loadmat("data.mat") #print(data) X = data['X'] Xval = data['Xval'] yval = data['yval'] # y = 1 為異常 #plt.plot(X[:,0],X[:,1],'x') plt = display_2d_data(X,'x') plt.title("Origin data") plt.show() mu,sigma2 = estimateGaussian(X) #print(mu,sigma2) p = multivariateGaussian(X,mu,sigma2) #print(p) visualizeFit(X,mu,sigma2) # 顯示圖像 # 選擇異常點 pval = multivariateGaussian(Xval,mu,sigma2) epsilon,F1 = selectThreshold(yval,pval) print(u'在CV上得到的最好的epsilon是:%e'%epsilon) print(u'對應的F1Score值為:%f'%F1) outliers = np.where(p<epsilon) # 找到小於臨界值的異常點,並作圖 #plt.figure(figsize=(10,8)) plt.plot(X[outliers,0],X[outliers,1],'o',markeredgecolor='r',markerfacecolor='w',markersize=10.) plt = display_2d_data(X, 'bx') plt.show() if __name__ == "__main__": AnomalyDetection()