python機器學習-乳腺癌細胞挖掘(博主親自錄制視頻)
項目合作QQ:231469242
多重共線性測試需要改進
文件夾需要兩個包


python3.0 anaconda
normality_check.py 正態檢驗
# -*- coding: utf-8 -*-
'''
Author:Toby
QQ:231469242,all rights reserved,no commercial use
normality_check.py
正態性檢驗腳本
'''
import scipy
from scipy.stats import f
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
# additional packages
from statsmodels.stats.diagnostic import lillifors
def check_normality(testData):
    """Decide whether a sample looks normally distributed at the 5% level.

    The test is picked from the sample size ``n = len(testData)``:

    * ``n <= 20``        -> Shapiro-Wilk (``stats.shapiro``)
    * ``20 < n < 50``    -> ``stats.normaltest``
    * ``50 <= n <= 300`` -> Lilliefors (``lillifors``)
    * ``n > 300``        -> Kolmogorov-Smirnov (``stats.kstest``)

    The name of the chosen test and the verdict are printed.

    Args:
        testData: one-dimensional sequence of numeric observations.

    Returns:
        False when the p-value is below 0.05 (normality rejected),
        True otherwise.
    """
    n = len(testData)
    if 20 < n < 50:
        banner = "use normaltest"
        p_value = stats.normaltest(testData)[1]
    elif n < 50:
        banner = "use shapiro:"
        p_value = stats.shapiro(testData)[1]
    elif n <= 300:
        banner = "use lillifors:"
        # NOTE(review): newer statsmodels releases spell this function
        # `lilliefors`; confirm the installed version still exports
        # `lillifors` under the name imported at the top of this file.
        p_value = lillifors(testData)[1]
    else:
        banner = "use kstest:"
        # kstest(data, 'norm') on its own compares against the *standard*
        # normal N(0, 1), so any normal sample with another mean or scale
        # was always rejected.  Compare against a normal with the sample's
        # own location and scale instead.
        values = np.asarray(testData, dtype=float)
        location = values.mean()
        scale = values.std(ddof=1)
        p_value = stats.kstest(values, 'norm', args=(location, scale))[1]

    # Same decision rule as before: only a p-value strictly below 0.05
    # rejects normality (a NaN p-value therefore counts as "normal").
    is_normal = not (p_value < 0.05)
    print(banner)
    if is_normal:
        print("data are normal distributed")
    else:
        print("data are not normal distributed")
    return is_normal
def NormalTest(list_groups):
    """Run ``check_normality`` on each group, stopping at the first failure.

    Args:
        list_groups: iterable of samples, each passed to ``check_normality``.

    Returns:
        False as soon as one group is reported as not normal, True when no
        group fails (including when ``list_groups`` is empty).
    """
    # any() short-circuits, so groups after the first failing one are not
    # tested (and produce no output), exactly like an early return.
    found_non_normal = any(
        check_normality(samples) == False  # noqa: E712
        for samples in list_groups
    )
    return not found_non_normal
Rsquare_multimode.py 多種模型計算R平方
加入了線性顯著檢測和r相關系數顯著檢測,多重共線性,自相關,殘差正態檢驗等等
# -*- coding: utf-8 -*-
#斯皮爾曼等級相關(Spearman’s correlation coefficient for ranked data)
import math,pylab,scipy
import numpy as np
import scipy.stats as stats
from scipy.stats import t
from scipy.stats import f
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats.diagnostic import lillifors
import normality_check
import statsmodels.formula.api as sm
# Paired sample observations analysed by this script; the plotting labels
# defined further down name them Tobacco (x) and Alcohol (y).
x=[4.03,3.76,3.77,3.34,3.47,2.92,3.20,2.71,3.53,4.51]
y=[6.47,6.13,6.19,4.89,5.63,4.52,5.89,4.79,5.27,6.08]
# Groups handed to the normality check in R_mode.
list_group=[x,y]
# Number of observations in x.
sample=len(x)
# Significance level (read by Pearsonr_details for the t critical value).
a=0.05
# Data visualisation: scatter the raw points as red circles.
plt.plot(x,y,'ro')
def Spearmanr(x,y):
    """Print Spearman's rank correlation (non-parametric) for x and y.

    Prints the squared coefficient and its p-value.  The sample size is
    taken from ``x`` itself rather than from a module-level variable, so
    the small-sample note is correct for whatever data is passed in.

    Args:
        x: first sequence of observations.
        y: second sequence of observations.

    Returns:
        None; all results are printed.
    """
    print("use spearmanr,Nonparametric tests")
    n = len(x)
    # Warn when the two samples differ in size.
    if n != len(y):
        print ("warming,the samples are not equal!")
    r, p = stats.spearmanr(x, y)
    print("spearman r**2:", r**2)
    print("spearman p:", p)
    if n < 500 and p > 0.05:
        print("when sample < 500,p has no mean(>0.05)")
        print("when sample > 500,p has mean")
def Pearsonr(x,y):
    """Print Pearson's correlation (parametric) for x and y.

    Prints the squared coefficient and its p-value, plus a note when the
    sample has fewer than 30 observations.  The sample size is taken from
    ``x`` itself rather than from a module-level variable.

    Args:
        x: first sequence of observations.
        y: second sequence of observations.

    Returns:
        None; all results are printed.
    """
    print("use Pearson,parametric tests")
    r, p = stats.pearsonr(x, y)
    print("pearson r**2:", r**2)
    print("pearson p:", p)
    if len(x) < 30:
        print("when sample <30,pearson has no mean")
def Pearsonr_details(x,y,xLabel,yLabel,formula):
    """Fit an OLS model and print a series of regression diagnostics.

    In order, the function prints the statsmodels summary, then reports on:
    the overall F test, the significance of the correlation coefficient,
    normality / skewness / kurtosis of the residuals, the Durbin-Watson
    statistic, and the condition number.  It finishes by plotting the
    residuals with ``Draw_residual``.

    Args:
        x: observations of the explanatory variable.
        y: observations of the response variable.
        xLabel: column name given to ``x`` in the model data frame.
        yLabel: column name given to ``y`` in the model data frame.
        formula: model formula referring to ``yLabel`` and ``xLabel``
            (for example ``'Alcohol ~ Tobacco'``).

    Returns:
        None; all results are printed or plotted.
    """
    n = len(x)
    df = n - 2
    data = pd.DataFrame({yLabel: y, xLabel: x})
    result = sm.ols(formula, data).fit()
    print(result.summary())

    # Overall F test of the model.  H0: no linear relation between x and y.
    # A p-value below 0.05 rejects H0 in favour of a significant relation.
    print('\n')
    print("linear relation Significant test:...................................")
    if result.f_pvalue < 0.05:
        print ("P value of f test<0.05,the linear relation is right.")

    # Significance of the correlation coefficient via a two-sided t test,
    # using the module-level significance level `a`.
    print('\n')
    print("R significant test:...................................")
    r_square = result.rsquared
    r = math.sqrt(r_square)
    unexplained = 1 - r**2
    if unexplained > 0:
        t_score = r * math.sqrt(n - 2) / math.sqrt(unexplained)
    else:
        # Perfect fit: the statistic diverges.  Treat it as infinitely
        # large instead of dividing by zero.
        t_score = math.inf
    t_std = t.isf(a / 2, df)
    # r is an unsigned square root, so only the magnitude is meaningful.
    if abs(t_score) > t_std:
        print ("R is significant according to its sample size")
    else:
        print ("R is not significant")

    # Residual analysis: normality of the residuals.
    print('\n')
    print("residual error analysis:...................................")
    states = normality_check.check_normality(result.resid)
    if states:
        print("the residual error are normal distributed")
    else:
        print("the residual error are not normal distributed")

    # Skewness and (non-excess) kurtosis of the residuals, compared with the
    # values 0 and 3 after rounding to one decimal place.
    Skew = stats.skew(result.resid, bias=True)
    Kurtosis = stats.kurtosis(result.resid, fisher=False, bias=True)
    if round(Skew, 1) == 0:
        print("residual errors normality Skew:in middle,perfect match")
    elif round(Skew, 1) > 0:
        print("residual errors normality Skew:close right")
    elif round(Skew, 1) < 0:
        print("residual errors normality Skew:close left")
    if round(Kurtosis, 1) == 3:
        print("residual errors normality Kurtosis:in middle,perfect match")
    elif round(Kurtosis, 1) > 3:
        print("residual errors normality Kurtosis:more peak")
    elif round(Kurtosis, 1) < 3:
        print("residual errors normality Kurtosis:more flat")

    # Autocorrelation: Durbin-Watson statistic, i.e. the sum of squared
    # successive residual differences divided by the residual sum of squares.
    print('\n')
    print("autocorrelation test:...................................")
    DW = np.sum(np.diff(result.resid.values)**2.0) / result.ssr
    if round(DW, 1) == 2:
        print("Durbin-Watson close to 2,there is no autocorrelation.OLS model works well")

    # Multicollinearity check based on the condition number.
    print('\n')
    print("multicollinearity test:")
    conditionNumber = result.condition_number
    if conditionNumber > 30:
        print("conditionNumber>30,multicollinearity exists")
    else:
        print("conditionNumber<=30,multicollinearity not exists")

    # Plot the residuals in observation order (used to eyeball whether the
    # variance is homogeneous).
    Draw_residual(list(result.resid))
'''
result.rsquared
Out[28]: 0.61510660055413524
'''
def Kendalltau(x, y):
    """Print Kendall's tau (non-parametric) for x and y.

    Args:
        x: first sequence of observations.
        y: second sequence of observations.

    Returns:
        None; the squared coefficient and the p-value are printed.
    """
    print("use kendalltau,Nonparametric tests")
    outcome = stats.kendalltau(x, y)
    tau = outcome[0]
    p_value = outcome[1]
    print("kendalltau r**2:", tau**2)
    print("kendalltau p:", p_value)
def R_mode(x, y, xLabel, yLabel, formula):
    """Pick and run the correlation analysis that suits the data.

    The module-level ``list_group`` is checked for normality first.  When
    it holds more than two groups Kendall's tau is printed.  Non-normal
    data then gets Spearman and Kendall; normal data gets the detailed
    Pearson / OLS report.

    Args:
        x: observations of the explanatory variable.
        y: observations of the response variable.
        xLabel: column name for ``x``, forwarded to ``Pearsonr_details``.
        yLabel: column name for ``y``, forwarded to ``Pearsonr_details``.
        formula: model formula, forwarded to ``Pearsonr_details``.

    Returns:
        None; all results are printed or plotted.
    """
    # Normality check across every group.
    is_normal = normality_check.NormalTest(list_group)
    print ("normality result:", is_normal)

    group_count = len(list_group)
    if group_count > 2:
        Kendalltau(x, y)

    if is_normal == False:  # noqa: E712
        Spearmanr(x, y)
        Kendalltau(x, y)
    elif is_normal == True:  # noqa: E712
        Pearsonr_details(x, y, xLabel, yLabel, formula)
def Adjust_Rsquare(r_square, n, k):
    """Return the adjusted R-squared.

    Computes ``1 - (1 - r_square) * (n - 1) / (n - k - 1)``.

    Args:
        r_square: plain R-squared of the model.
        n: number of observations.
        k: number of explanatory variables.

    Returns:
        The adjusted R-squared value.
    """
    residual_dof = n - k - 1
    # The multiplication by 1.0 is kept so the division is carried out in
    # floating point, as in the original expression.
    penalty = (1 - r_square) * (n - 1) * 1.0 / residual_dof
    return 1 - penalty
'''
n=len(x)
n=10
k=1
r_square=0.615
Adjust_Rsquare(r_square,n,k)
Out[11]: 0.566875
'''
def Plot(x, y, yLabel, xLabel, Title):
    """Show a scatter plot of y against x as red circles.

    Args:
        x: values for the horizontal axis.
        y: values for the vertical axis.
        yLabel: label for the vertical axis.
        xLabel: label for the horizontal axis.
        Title: title of the figure.

    Returns:
        None; the figure is displayed with ``plt.show()``.
    """
    plt.plot(x, y, 'ro')
    plt.title(Title)
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.show()
# Plot parameters: axis labels and figure title.
yLabel='Alcohol'
xLabel='Tobacco'
Title='Sales in Several UK Regions'
# Show the scatter plot of the raw data.
Plot(x,y,yLabel,xLabel,Title)
# Model formula (response ~ explanatory) later passed to R_mode.
formula='Alcohol ~ Tobacco'
def Draw_residual(residual_list):
    """Show the residuals as red circles against their 1-based position.

    Args:
        residual_list: sequence of residual values, in observation order.

    Returns:
        None; the figure is displayed with ``pylab.show()``.
    """
    positions = list(range(1, len(residual_list) + 1))
    pylab.plot(positions, residual_list, 'ro')
    pylab.title("draw residual to check wrong number")
    # Pad the margins so markers are not clipped by (or drawn on) the axes.
    pylab.margins(0.3)
    # Draw the grid.
    pylab.grid(True)
    pylab.show()
# Script entry point: run the model selection and analysis on the sample data.
R_mode(x,y,xLabel,yLabel,formula)
'''
result.fittedvalues表示預測的y值陣列
result.fittedvalues
Out[42]:
0 6.094983
1 5.823391
2 5.833450
3 5.400915
4 5.531682
5 4.978439
6 5.260090
7 4.767201
8 5.592035
9 6.577813
dtype: float64
#計算殘差的偏態
S = stats.skew(result.resid, bias=True)
Out[44]: -0.013678125910039975
K = stats.kurtosis(result.resid, fisher=False,bias=True)
K
Out[47]: 1.5271300905736027
'''
result.params 得到兩個參數:x的系數和截距
截距
result.params[0]
x系數
result.params[1]




Durbin-Watson解讀
--殘差是否存在自相關
D.W統計量是用來檢驗殘差是否存在一階自相關的(而不是檢驗殘差是否為正態分布),因為用OLS進行回歸估計是假設模型殘差相互獨立的,因此,如果殘差存在自相關,那么,系數的標準誤和顯著性檢驗將不可靠,也就是說模型的推斷結果是不可信的。
D.W統計量的取值範圍為0到4,在2左右說明殘差不存在自相關;接近0表示正自相關,接近4表示負自相關。若偏離2太遠,那么你所構建的模型
的解釋能力就要受影響了。
jarque-bera解讀
----樣本是否符合正態分布
JB統計量全稱叫Jarque-Bera統計量,是用來
檢驗一組樣本是否能夠認為來自正態總體的一種方法,它依據OLS
殘差,對大樣本進行檢驗(或稱為漸進檢驗)。
Jarque和Bera建立了如下
檢驗統計量——JB統計量:
$$JB = \frac{n}{6}\left(S^{2} + \frac{(K-3)^{2}}{4}\right)$$
其中 n 為樣本數,S 為偏度,K 為峰度。
若變量服從正態分布,則S為零,K為3,
因而JB統計量的值為零;如果變量不是正態變量,則JB統計量將為一個逐漸增大值。
如果JB統計量值較大,比如為11,則可以計算出卡方值大於11的概率為0.004,這個概率過小,因此不能認為樣本來自正態分布。反之,成立。
Jarque-Bera的P值接近於0,表明應拒絕正態性假設,數據不服從正態分布;P值較大(例如大於0.05)時,才不能拒絕數據服從正態分布。
Omnibus解讀
Omnibus統計量的P值都接近於0,自變量的作用顯著。
Omnibus tests are a kind of statistical test. They test whether the explained variance in a set of data is significantly greater than the unexplained variance, overall. One example is the F-test in the analysis of variance. There can be legitimate significant effects within a model even if the omnibus test is not significant. For instance, in a model with two independent variables, if only one variable exerts a significant effect on the dependent variable and the other does not, then the omnibus test may be non-significant. This fact does not affect the conclusions that may be drawn from the one significant variable. In order to test effects within an omnibus test, researchers often use contrasts.
https://en.wikipedia.org/wiki/Omnibus_test
python信用評分卡建模(附代碼,博主錄制)




