python機器學習-乳腺癌細胞挖掘(博主親自錄制視頻)
項目合作QQ:231469242
多重共線性測試需要改進
文件夾需要兩個包
python3.0 anaconda
normality_check.py 正態性檢驗
# -*- coding: utf-8 -*-
"""
Author:Toby QQ:231469242, all rights reserved, no commercial use

normality_check.py -- normality test helpers.

The test used depends on the sample size n:

* n <= 20        Shapiro-Wilk          (scipy.stats.shapiro)
* 20 < n < 50    D'Agostino-Pearson    (scipy.stats.normaltest)
* 50 <= n <= 300 Lilliefors            (statsmodels)
* n > 300        Kolmogorov-Smirnov    (scipy.stats.kstest)
"""
import scipy
from scipy.stats import f
import numpy as np
import scipy.stats as stats

# matplotlib is not used by the checks themselves; the name is kept in the
# module namespace but a missing installation must not break the tests.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None

# additional packages
# Recent statsmodels releases only ship the correctly spelled ``lilliefors``;
# older ones also exposed ``lillifors``.  Support both under the old name.
try:
    from statsmodels.stats.diagnostic import lilliefors as lillifors
except ImportError:
    try:
        from statsmodels.stats.diagnostic import lillifors
    except ImportError:
        lillifors = None

# Significance level shared by every test in this module.
_ALPHA = 0.05


def _kstest_p_value(data):
    """Return the KS p-value of *data* against a normal with the sample's own mean/std."""
    spread = data.std(ddof=1)
    if spread == 0:
        # A constant sample is degenerate, treat it as "not normal".
        return 0.0
    # Comparing against the default N(0, 1) would reject every normal sample
    # whose mean is not 0 or whose variance is not 1, so the reference
    # distribution is parameterised with the sample estimates.  Estimating the
    # parameters makes the test conservative, never over-eager to reject.
    return stats.kstest(data, 'norm', args=(data.mean(), spread))[1]


def _select_test(data, size):
    """Return ``(banner, p_value)`` for the test matching the sample *size*."""
    if 20 < size < 50:
        return "use normaltest", stats.normaltest(data)[1]
    if size < 50:
        # scipy raises ValueError for fewer than 3 observations.
        return "use shapiro:", stats.shapiro(data)[1]
    if size <= 300:
        if lillifors is None:
            raise ImportError(
                "statsmodels is required for the Lilliefors test "
                "(50 <= sample size <= 300)")
        return "use lillifors:", lillifors(data)[1]
    return "use kstest:", _kstest_p_value(data)


# normality test
def check_normality(testData):
    """Test whether *testData* looks normally distributed at the 5% level.

    Parameters:
        testData: one-dimensional sequence of numbers (list, numpy array or
            pandas Series).

    Returns:
        True when normality is not rejected (p >= 0.05), False otherwise.
        The name of the test used and the verdict are also printed.

    Raises:
        ValueError: propagated from scipy when the sample is too small for
            the selected test.
        ImportError: when 50 <= n <= 300 and statsmodels is not installed.
    """
    data = np.asarray(testData, dtype=float)
    banner, p_value = _select_test(data, len(testData))
    is_normal = not (p_value < _ALPHA)
    print(banner)
    if is_normal:
        print("data are normal distributed")
    else:
        print("data are not normal distributed")
    return is_normal


# run the normality test on every sample group
def NormalTest(list_groups):
    """Return True only if every group in *list_groups* passes check_normality.

    Evaluation stops at the first group that fails; an empty collection
    yields True.
    """
    return all(check_normality(group) for group in list_groups)
Rsquare_multimode.py 多種模型計算R平方
加入了線性顯著檢測和r相關系數顯著檢測,多重共線性,自相關,殘差正態性檢驗等等
# -*- coding: utf-8 -*-
"""
Rsquare_multimode.py -- pick a correlation / regression model and report R**2.

Depending on whether the two samples pass a normality test, the script uses
Pearson correlation with a detailed OLS report (parametric) or Spearman and
Kendall rank correlation (non-parametric).  The OLS report also covers the
F test, the significance of r, residual normality, skewness and kurtosis,
autocorrelation (Durbin-Watson) and multicollinearity (condition number).
"""
import math

import numpy as np
import pandas as pd
import pylab
import scipy
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from scipy.stats import t
from scipy.stats import f

# Recent statsmodels releases only ship the correctly spelled ``lilliefors``;
# older ones also exposed ``lillifors``.  Keep the old name importable.
try:
    from statsmodels.stats.diagnostic import lilliefors as lillifors
except ImportError:
    from statsmodels.stats.diagnostic import lillifors

import normality_check

x = [4.03, 3.76, 3.77, 3.34, 3.47, 2.92, 3.20, 2.71, 3.53, 4.51]
y = [6.47, 6.13, 6.19, 4.89, 5.63, 4.52, 5.89, 4.79, 5.27, 6.08]
list_group = [x, y]
sample = len(x)
# significance level
a = 0.05


# Spearman rank correlation, non-parametric test
def Spearmanr(x, y):
    """Print Spearman's r**2 and p-value for the paired samples x and y."""
    print("use spearmanr,Nonparametric tests")
    # warn when the two samples differ in length
    if len(x) != len(y):
        print("warming,the samples are not equal!")
    r, p = stats.spearmanr(x, y)
    print("spearman r**2:", r**2)
    print("spearman p:", p)
    # Use the size of the data actually passed in, not the module-level sample.
    if len(x) < 500 and p > 0.05:
        print("when sample < 500,p has no mean(>0.05)")
        print("when sample > 500,p has mean")


# Pearson correlation, parametric test
def Pearsonr(x, y):
    """Print Pearson's r**2 and p-value for the paired samples x and y."""
    print("use Pearson,parametric tests")
    r, p = stats.pearsonr(x, y)
    print("pearson r**2:", r**2)
    print("pearson p:", p)
    # Use the size of the data actually passed in, not the module-level sample.
    if len(x) < 30:
        print("when sample <30,pearson has no mean")


# residual plot
def Draw_residual(residual_list):
    """Plot the residuals against their 1-based position and show the figure."""
    positions = list(range(1, len(residual_list) + 1))
    pylab.plot(positions, residual_list, 'ro')
    pylab.title("draw residual to check wrong number")
    # Pad margins so that markers don't get clipped by the axes
    pylab.margins(0.3)
    # draw the grid
    pylab.grid(True)
    pylab.show()


# Pearson correlation, parametric test, with a detailed report
def Pearsonr_details(x, y, xLabel, yLabel, formula):
    """Fit an OLS model and print a set of diagnostics.

    Parameters:
        x, y: paired samples (independent / dependent variable).
        xLabel, yLabel: column names given to x and y; they must match the
            names used in *formula*.
        formula: patsy formula passed to statsmodels, e.g. 'Alcohol ~ Tobacco'.

    Prints the statsmodels summary, then the F test, the t test on r, the
    residual normality check, skewness and kurtosis, Durbin-Watson and the
    condition number, and finally draws the residual plot.
    """
    n = len(x)
    df = n - 2
    data = pd.DataFrame({yLabel: y, xLabel: x})
    result = sm.ols(formula, data).fit()
    print(result.summary())

    # significance of the model (F distribution)
    print('\n')
    print("linear relation Significant test:...................................")
    # If the p-value of the F test is < 0.05, H0 (no linear relation between
    # x and y) is rejected and H1 (a linear relation exists) is accepted.
    if result.f_pvalue < 0.05:
        print("P value of f test<0.05,the linear relation is right.")
    else:
        print("P value of f test>=0.05,the linear relation is not significant.")

    # significance test of R
    print('\n')
    print("R significant test:...................................")
    # Clamp against floating point noise just outside [0, 1].
    r_square = min(1.0, max(0.0, result.rsquared))
    r = math.sqrt(r_square)
    unexplained = 1.0 - r_square
    if unexplained == 0:
        # Perfect fit: the statistic diverges instead of dividing by zero.
        t_score = float("inf")
    else:
        t_score = r * math.sqrt(n - 2) / math.sqrt(unexplained)
    t_std = t.isf(a / 2, df)
    if t_score < -t_std or t_score > t_std:
        print("R is significant according to its sample size")
    else:
        print("R is not significant")

    # residual analysis
    print('\n')
    print("residual error analysis:...................................")
    if normality_check.check_normality(result.resid):
        print("the residual error are normal distributed")
    else:
        print("the residual error are not normal distributed")

    # skewness and kurtosis of the residuals
    Skew = stats.skew(result.resid, bias=True)
    Kurtosis = stats.kurtosis(result.resid, fisher=False, bias=True)
    rounded_skew = round(Skew, 1)
    if rounded_skew == 0:
        print("residual errors normality Skew:in middle,perfect match")
    elif rounded_skew > 0:
        print("residual errors normality Skew:close right")
    elif rounded_skew < 0:
        print("residual errors normality Skew:close left")

    # fisher=False, so a normal distribution has kurtosis 3
    rounded_kurtosis = round(Kurtosis, 1)
    if rounded_kurtosis == 3:
        print("residual errors normality Kurtosis:in middle,perfect match")
    elif rounded_kurtosis > 3:
        print("residual errors normality Kurtosis:more peak")
    elif rounded_kurtosis < 3:
        print("residual errors normality Kurtosis:more flat")

    # autocorrelation analysis
    print('\n')
    print("autocorrelation test:...................................")
    DW = np.sum(np.diff(result.resid.values) ** 2.0) / result.ssr
    if round(DW, 1) == 2:
        print("Durbin-Watson close to 2,there is no autocorrelation.OLS model works well")
    else:
        print("Durbin-Watson is not close to 2,autocorrelation may exist:", DW)

    # multicollinearity check
    print('\n')
    print("multicollinearity test:")
    conditionNumber = result.condition_number
    if conditionNumber > 30:
        print("conditionNumber>30,multicollinearity exists")
    else:
        print("conditionNumber<=30,multicollinearity not exists")

    # draw the residual plot, used to eyeball homogeneity of variance
    Draw_residual(list(result.resid))


# Example session:
#   result.rsquared
#   Out[28]: 0.61510660055413524


# Kendall tau, non-parametric test
def Kendalltau(x, y):
    """Print Kendall's tau**2 and p-value for the paired samples x and y."""
    print("use kendalltau,Nonparametric tests")
    r, p = stats.kendalltau(x, y)
    print("kendalltau r**2:", r**2)
    print("kendalltau p:", p)


# model selection
def R_mode(x, y, xLabel, yLabel, formula):
    """Choose and run the correlation analysis that fits the data.

    Both samples are tested for normality.  If either fails, the rank based
    Spearman and Kendall statistics are printed; otherwise the detailed
    Pearson / OLS report is produced.
    """
    # Test the samples that were passed in rather than the module-level
    # list_group, so the function also works for other data.
    Normal_result = normality_check.NormalTest([x, y])
    print("normality result:", Normal_result)
    # Kept from the original script: with more than two module-level groups
    # Kendall's tau is reported as well.
    if len(list_group) > 2:
        Kendalltau(x, y)
    if Normal_result:
        Pearsonr_details(x, y, xLabel, yLabel, formula)
    else:
        Spearmanr(x, y)
        Kendalltau(x, y)


# adjusted R squared
def Adjust_Rsquare(r_square, n, k):
    """Return the adjusted R**2 for n observations and k predictors.

    Raises ZeroDivisionError when n - k - 1 == 0.
    """
    adjust_rSquare = 1 - ((1 - r_square) * (n - 1) * 1.0 / (n - k - 1))
    return adjust_rSquare


# Example session:
#   n=len(x)
#   n=10
#   k=1
#   r_square=0.615
#   Adjust_Rsquare(r_square,n,k)
#   Out[11]: 0.566875


# plotting
def Plot(x, y, yLabel, xLabel, Title):
    """Show a scatter plot of y against x with the given labels and title."""
    plt.plot(x, y, 'ro')
    plt.ylabel(yLabel)
    plt.xlabel(xLabel)
    plt.title(Title)
    plt.show()


# data visualisation
plt.plot(x, y, 'ro')

# plot parameters
yLabel = 'Alcohol'
xLabel = 'Tobacco'
Title = 'Sales in Several UK Regions'
Plot(x, y, yLabel, xLabel, Title)
formula = 'Alcohol ~ Tobacco'

R_mode(x, y, xLabel, yLabel, formula)

# Example session:
#   result.fittedvalues is the array of predicted y values
#   result.fittedvalues
#   Out[42]:
#   0    6.094983
#   1    5.823391
#   2    5.833450
#   3    5.400915
#   4    5.531682
#   5    4.978439
#   6    5.260090
#   7    4.767201
#   8    5.592035
#   9    6.577813
#   dtype: float64
#
#   # skewness of the residuals
#   S = stats.skew(result.resid, bias=True)
#   Out[44]: -0.013678125910039975
#   K = stats.kurtosis(result.resid, fisher=False,bias=True)
#   K
#   Out[47]: 1.5271300905736027
result.params 得到兩個參數:x的系數和截距
截距
result.params[0]
x系數
result.params[1]
Durbin-Watson解讀
--殘差是否存在自相關
D.W統計量是用來檢驗殘差是否存在一階自相關的(而不是檢驗殘差是否服從正態分布)。用OLS進行回歸估計時假設殘差之間相互獨立;如果殘差存在自相關,系數估計雖然仍是無偏的,但標準誤的估計會失真,t檢驗和F檢驗的結論就不可靠。
D.W統計量的取值範圍為0到4:在2左右說明殘差不存在自相關;明顯小於2表示正自相關,明顯大於2表示負自相關。若偏離2太遠,那么你所構建的模型
的推斷結果就要受影響了。
jarque-bera解讀
----樣本是否符合正態分布
JB統計量全稱叫Jarque-Bera統計量,是用來
檢驗一組樣本是否能夠認為來自正態總體的一種方法,它依據OLS
殘差,對大樣本進行檢驗(或稱為漸近檢驗)。

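Jarque和Bera建立了如下
檢驗統計量——JB統計量:
$JB = \frac{n}{6}\left(S^{2} + \frac{(K-3)^{2}}{4}\right)$
其中n為樣本容量,S為偏度,K為峰度;在正態性原假設下,JB統計量漸近服從自由度為2的卡方分布。
若變量服從正態分布,則S為零,K為3,
因而JB統計量的值為零;如果變量不是正態變量,則JB統計量將為一個逐漸增大值。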
如果JB統計量值較大,比如為11,則可以計算出卡方值大於11的概率為0.004,這個概率過小,因此不能認為樣本來自正態分布。反之,成立。
Jarque-Bera的P值接近於0,表明應拒絕「數據服從正態分布」的原假設,即數據不服從正態分布;P值較大(例如大於0.05)時,才不能拒絕正態性。
Omnibus解讀
statsmodels回歸結果中的Omnibus統計量是對殘差的偏度和峰度做的聯合正態性檢驗:Prob(Omnibus)接近於0,說明殘差不服從正態分布;自變量的作用是否顯著,應查看F檢驗和各系數t檢驗的P值。
Omnibus tests are a kind of statistical test. They test whether the explained variance in a set of data is significantly greater than the unexplained variance, overall. One example is the F-test in the analysis of variance. There can be legitimate significant effects within a model even if the omnibus test is not significant. For instance, in a model with two independent variables, if only one variable exerts a significant effect on the dependent variable and the other does not, then the omnibus test may be non-significant. This fact does not affect the conclusions that may be drawn from the one significant variable. In order to test effects within an omnibus test, researchers often use contrasts.
https://en.wikipedia.org/wiki/Omnibus_test
python信用評分卡建模(附代碼,博主錄制)