python機器學習-乳腺癌細胞挖掘(博主親自錄制視頻)
數據來源
https://github.com/thomas-haslwanter/statsintro_python/tree/master/ISP/Code_Quantlets/08_TestsMeanValues/anovaTwoway
# -*- coding: utf-8 -*- """ Created on Tue May 30 10:16:54 2017 @author: daxiong """ # Import standard packages import numpy as np import pandas as pd # additional packages from statsmodels.formula.api import ols from statsmodels.stats.anova import anova_lm # Get the data inFile = 'altman_12_6.txt' data = np.genfromtxt(inFile, delimiter=',') def anova_interaction(data): '''ANOVA with interaction: Measurement of fetal head circumference, by four observers in three fetuses, from a study investigating the reproducibility of ultrasonic fetal head circumference data. ''' # Bring them in DataFrame-format df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer']) # Determine the ANOVA with interaction #C(fetus):C(observer)表示兩者的交互 formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)' lm = ols(formula, df).fit() anovaResults = anova_lm(lm) # --- >>> STOP stats <<< --- print(anovaResults) return anovaResults['F'][0] anova_interaction(data) ''' df sum_sq mean_sq F PR(>F) C(fetus) 2 324.008889 162.004444 2113.101449 1.051039e-27 C(observer) 3 1.198611 0.399537 5.211353 6.497055e-03 C(fetus):C(observer) 6 0.562222 0.093704 1.222222 3.295509e-01 Residual 24 1.840000 0.076667 NaN NaN fetus和observer的交互沒有顯著影響 3.295509e-01>0.05 Out[5]: True 6.497055e-03<0.05 Out[6]: True '''
原數據
H0假設
平方和總計=因素1平方和+因素2平方和+因素1*因素2+組內誤差平方和
計算的F分數表
紅色區間就拒絕H0
根據兩個因素,把原始數據分為六個組,並計算六個組的均值,組成一個矩陣
計算性別因素的平方和
計算年齡因素平方和
計算組內誤差平方和
總平方和
兩個因素平方和=總平方和 - 第一個因素平方和 - 第二個因素平方和 - 組內誤差平方和
算出來為7
計算F分數,
F_第一因素=第一因素平方和/組內誤差平方和
F_第二因素=第二因素平方和/組內誤差平方和
F_第一第二因素交互=第一第二因素交互平方和/組內誤差平方和
spss應用
R**2=0.518,年齡和性別對分數影響只占了一半,還有其他因素造成分數的波動。
python代碼測試結果和spss一致
# -*- coding: utf-8 -*- ''' Name of QuantLet: ISP_anovaTwoway Published in: An Introduction to Statistics with Python Description: 'Two-way Analysis of Variance (ANOVA) The model is formulated using the <patsy> formula description. This is very similar to the way models are expressed in R.' Keywords: plot, fitting, two-way anova See also: 'ISP_anovaOneway, ISP_kruskalWallis, ISP_multipleTesting, ISP_oneGroup, ISP_twoGroups' Author: Thomas Haslwanter Submitted: October 31, 2015 Datafile: altman_12_6.txt Two-way Analysis of Variance (ANOVA) The model is formulated using the "patsy" formula description. This is very similar to the way models are expressed in R. ''' # Copyright(c) 2015, Thomas Haslwanter. All rights reserved, under the CC BY-SA 4.0 International License # Import standard packages import numpy as np import pandas as pd # additional packages from statsmodels.formula.api import ols from statsmodels.stats.anova import anova_lm excel = 'score_age_gender.xlsx' def anova_interaction(inFile): '''ANOVA with interaction: Measurement of fetal head circumference, by four observers in three fetuses, from a study investigating the reproducibility of ultrasonic fetal head circumference data. ''' # Get the data df=pd.read_excel(excel) # --- >>> START stats <<< --- # Determine the ANOVA with interaction formula = 'score ~ C(gender) + C(age) + C(gender):C(age)' lm = ols(formula, df).fit() anovaResults = anova_lm(lm) # --- >>> STOP stats <<< --- print(anovaResults) return anovaResults['F'][0] if __name__ == '__main__': anova_interaction(excel)
方差分析樣本量:
方差分析前提是樣本符合正態分布,樣本量越大,正態分布可能就越高。
if we suppose that you have k groups, N is the total sample size for all groups, then n-k should exceeds zero. Otherwise, there is no minimum size for each group except you need 2 elements for each to enable calculating the variance, but this is just a theoretical criteria.
However, to use ANOVA you need the check the Normal distribution for each group, so the higher size of your groups sizes, the more opportunity to have the Normal distribution.
Is there a minimum number per group neccessary for an ANOVA?. Available from: https://www.researchgate.net/post/Is_there_a_minimum_number_per_group_neccessary_for_an_ANOVA [accessed Jun 2, 2017].
由於分組的樣本量太小,單獨兩兩檢驗時,發現與雙因素方差檢驗結果不一致,年齡有顯著差異,性別無顯著差異
雙因素方差檢驗:年齡,性別都有顯著不同
# -*- coding: utf-8 -*-
# additional packages
from scipy.stats.mstats import kruskalwallis
import scipy.stats as stats
import numpy as np
import scipy as sp
#性別和成績是否有關
#性別分為:男孩+女孩
list_boys=[4,6,8,6,6,9,8,9,13]
list_girls=[4,8,9,7,10,13,12,14,16]
list_group=[list_boys,list_girls]
#三組非正太分布數據檢驗
def Kruskawallis_test(list_group):
# Perform the Kruskal-Wallis test,返回True表示有顯著差異,返回False表示無顯著差異
print"Use kruskawallis test:"
h, p = kruskalwallis(list_group)
print"H value:",h
print"p",p
# Print the results
if p<0.05:
print('There is a significant difference.')
return True
else:
print('No significant difference.')
return False
#兩組非正太分布數據檢驗
def Mannwhitneyu(group1, group2):
if np.int(sp.__version__.split('.')[1]) > 16:
u, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
else:
u, p_value = stats.mannwhitneyu(group1, group2, use_continuity=True)
p_value *= 2 # because the default was a one-sided p-value
print(("Mann-Whitney test", p_value))
if p_value<0.05:
print "there is significant difference"
else:
print "there is no significant difference"
'''
正態性符合
NormalTest(list_group)
use shapiro:
p of normal: 0.407877713442
data are normal distributed
use shapiro:
p of normal: 0.99165725708
data are normal distributed
Out[50]: True
'''
#獨立T檢驗
print(stats.ttest_ind(list_boys,list_girls))
'''
#性別和成績無顯著相關
Ttest_indResult(statistic=-1.7457431218879393, pvalue=0.10002508010755197)
'''
#Mannwhitneyu 檢驗
print(Mannwhitneyu(list_boys,list_girls))
'''
#性別和成績無顯著相關
('Mann-Whitney test', 0.10936655881962447)
there is no significant difference
None
'''
#年齡與得分
list_10=[4,6,8,4,8,9]
list_11=[6,6,9,7,10,13]
list_12=[8,9,13,12,14,16]
list_group=[list_10,list_11,list_12]
print(Kruskawallis_test(list_group))
'''
#年齡與得分有顯著相關
Use kruskawallis test:
H value: 7.37853403141
p 0.0249903128375
There is a significant difference.
True
'''
print(stats.f_oneway(list_10,list_11,list_12))
'''
#年齡與得分有顯著相關
F_onewayResult(statistic=6.5186915887850461, pvalue=0.0091759991604794412)
'''
三種廣告和兩種媒體的雙因素方差檢驗
數據
spss結果
python結果和spss結果一致
廣告方案 VS 銷量 有顯著差異
廣告媒體 VS銷量 無顯著差異
python使用了參數檢驗和非參數檢驗
# -*- coding: utf-8 -*- # additional packages from scipy.stats.mstats import kruskalwallis import scipy.stats as stats import numpy as np import scipy as sp #廣告媒體和銷量是否有關 #廣告媒體分為:報紙+電視 list_paper=[8,12,22,14,10,18] list_TV=[12,8,26,30,18,14] list_group=[list_paper,list_TV] #三組非正太分布數據檢驗 def Kruskawallis_test(list_group): # Perform the Kruskal-Wallis test,返回True表示有顯著差異,返回False表示無顯著差異 print"Use kruskawallis test:" h, p = kruskalwallis(list_group) print"H value:",h print"p",p # Print the results if p<0.05: print('There is a significant difference.') return True else: print('No significant difference.') return False #兩組非正太分布數據檢驗 def Mannwhitneyu(group1, group2): if np.int(sp.__version__.split('.')[1]) > 16: u, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided') else: u, p_value = stats.mannwhitneyu(group1, group2, use_continuity=True) p_value *= 2 # because the default was a one-sided p-value print(("Mann-Whitney test", p_value)) if p_value<0.05: print "there is significant difference" else: print "there is no significant difference" ''' 正態性符合 NormalTest(list_group) use shapiro: p of normal: 0.816518545151 data are normal distributed use shapiro: p of normal: 0.686249196529 data are normal distributed Out[44]: True ''' #獨立T檢驗 print(stats.ttest_ind(list_paper,list_TV)) ''' #廣告媒體和銷量無顯著相關 Out[45]: Ttest_indResult(statistic=-0.98373875367592956, pvalue=0.34844605001479834) ''' #Mannwhitneyu 檢驗 print(Mannwhitneyu(list_paper,list_TV)) ''' #廣告媒體和銷量無顯著相關 ('Mann-Whitney test', 0.46804160590041655) there is no significant difference None ''' #廣告方案與銷售量 list_adPlan1=[8,12,12,8] list_adPlan2=[22,14,26,30] list_adPlan3=[10,18,18,14] list_group=[list_adPlan1,list_adPlan2,list_adPlan3] print(Kruskawallis_test(list_group)) ''' #廣告方案與銷售量有顯著差異 H value: 7.38209219858 p 0.0249458925076 There is a significant difference. True ''' print(stats.f_oneway(list_adPlan1,list_adPlan2,list_adPlan3)) ''' #廣告方案與銷售量有顯著差異 F_onewayResult(statistic=7.7400000000000002, pvalue=0.011077453402650223) '''
超市位置 競爭者數量 銷售
數據
分析結果:超市位置,競爭者數量,兩者交互都具有顯著關系,R**2=0.78,三個因素占了方差差異的78%
python 與spss結果一致
# -*- coding: utf-8 -*-
# additional packages
from scipy.stats.mstats import kruskalwallis
import scipy.stats as stats
import numpy as np
import scipy as sp
import pandas as pd
import variance_check
excel="location_competition_sales.xlsx"
df=pd.read_excel(excel)
#超市位置和銷售是否有關
#超市位置分為:居民區+商業區+寫字樓
list_resident=list(df.sale[df.location=="resident"])
list_commercial=list(df.sale[df.location=="commercial"])
list_officialBuilding=list(df.sale[df.location=="officialBuilding"])
list_group=[list_resident,list_commercial,list_officialBuilding]
'''
正態分布符合
NormalTest(list_group)
use shapiro:
p of normal: 0.503288388252
data are normal distributed
use shapiro:
p of normal: 0.832764387131
data are normal distributed
use shapiro:
p of normal: 0.516751050949
data are normal distributed
Out[28]: True
方差不齊
scipy.stats.levene(list_resident,list_commercial,list_officialBuilding)
Out[4]: LeveneResult(statistic=4.4442331784870888, pvalue=0.019539858639246118)
'''
#三組非正太分布數據檢驗
def Kruskawallis_test(list_group):
# Perform the Kruskal-Wallis test,返回True表示有顯著差異,返回False表示無顯著差異
print"Use kruskawallis test:"
h, p = kruskalwallis(list_group)
print"H value:",h
print"p",p
# Print the results
if p<0.05:
print('There is a significant difference.')
return True
else:
print('No significant difference.')
return False
#兩組非正太分布數據檢驗
def Mannwhitneyu(group1, group2):
if np.int(sp.__version__.split('.')[1]) > 16:
u, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
else:
u, p_value = stats.mannwhitneyu(group1, group2, use_continuity=True)
p_value *= 2 # because the default was a one-sided p-value
print(("Mann-Whitney test", p_value))
if p_value<0.05:
print "there is significant difference"
else:
print "there is no significant difference"
#超市位置和銷售是否有關
print(Kruskawallis_test(list_group))
'''
#超市位置和銷售顯著相關
H value: 16.5992978614
p 0.000248604089056
There is a significant difference.
Out[5]: True
'''
print(stats.f_oneway(list_resident,list_commercial,list_officialBuilding))
'''
#超市位置和銷售顯著相關
F_onewayResult(statistic=13.356711543650388, pvalue=5.6272024104779695e-05)
'''
stats.ttest_ind(list_resident,list_commercial)
'''
居民和商業區不限制,LSD方法不准確
Out[13]: Ttest_indResult(statistic=-1.220218599266957, pvalue=0.23530091594463254)
'''
stats.ttest_ind(list_resident,list_officialBuilding)
'''
#居民區和寫字樓顯著
Out[14]: Ttest_indResult(statistic=3.6229866471434136, pvalue=0.0015056781096067899)
'''
stats.ttest_ind(list_commercial,list_officialBuilding)
'''
#商業區和寫字樓顯著
Out[15]: Ttest_indResult(statistic=5.9972670824689063, pvalue=4.9044253751230555e-06)
'''
#競爭對手和銷售是否有關
list0=list(df.sale[df.competition==0])
list1=list(df.sale[df.competition==1])
list2=list(df.sale[df.competition==2])
list3=list(df.sale[df.competition==3])
list_group1=[list0,list1,list2,list3]
print(Kruskawallis_test(list_group1))
'''
#競爭對手和銷售有顯著相關
H value: 8.07588250451
p 0.0444692008525
There is a significant difference.
True
'''
print(stats.f_oneway(list0,list1,list2,list3))
'''
#競爭對手和銷售有顯著相關
F_onewayResult(statistic=4.1350270513392635, pvalue=0.013834155024036648)
'''
stats.ttest_ind(list0,list1)
'''
#競爭對手0和1不顯著
Out[18]: Ttest_indResult(statistic=0.28073494601522792, pvalue=0.78251141069039509)
'''
variance_check.py
# -*- coding: utf-8 -*- ''' 用於方差齊性檢驗 正太性檢驗 配對相等檢驗 ''' import scipy,math from scipy.stats import f import numpy as np import matplotlib.pyplot as plt import scipy.stats as stats # additional packages from statsmodels.stats.diagnostic import lillifors #多重比較 from statsmodels.sandbox.stats.multicomp import multipletests #用於排列組合 import itertools ''' #測試數據 group1=[2,3,7,2,6] group2=[10,8,7,5,10] group3=[10,13,14,13,15] list_groups=[group1,group2,group3] list_total=group1+group2+group3 ''' a=0.05 #正態分布測試 def check_normality(testData): #20<樣本數<50用normal test算法檢驗正態分布性 if 20<len(testData) <50: p_value= stats.normaltest(testData)[1] if p_value<0.05: print"use normaltest" print"p of normal:",p_value print "data are not normal distributed" return False else: print"use normaltest" print"p of normal:",p_value print "data are normal distributed" return True #樣本數小於50用Shapiro-Wilk算法檢驗正態分布性 if len(testData) <50: p_value= stats.shapiro(testData)[1] if p_value<0.05: print "use shapiro:" print"p of normal:",p_value print "data are not normal distributed" return False else: print "use shapiro:" print"p of normal:",p_value print "data are normal distributed" return True if 300>=len(testData) >=50: p_value= lillifors(testData)[1] if p_value<0.05: print "use lillifors:" print"p of normal:",p_value print "data are not normal distributed" return False else: print "use lillifors:" print"p of normal:",p_value print "data are normal distributed" return True if len(testData) >300: p_value= stats.kstest(testData,'norm')[1] if p_value<0.05: print "use kstest:" print"p of normal:",p_value print "data are not normal distributed" return False else: print "use kstest:" print"p of normal:",p_value print "data are normal distributed" return True #對所有樣本組進行正態性檢驗 def NormalTest(list_groups): for group in list_groups: #正態性檢驗 status=check_normality(group) if status==False : return False return True #排列組合函數 def Combination(list_groups): combination= [] for i in range(1,len(list_groups)+1): iter = itertools.combinations(list_groups,i) combination.append(list(iter)) #需要排除第一個和最后一個 return combination[1:-1][0] ''' Out[57]: [[([2, 3, 7, 2, 6], [10, 8, 7, 5, 10]), ([2, 3, 7, 2, 6], [10, 13, 14, 13, 15]), ([10, 8, 7, 5, 10], [10, 13, 14, 13, 15])]] ''' #方差齊性檢測 def Levene_test(group1,group2,group3): leveneResult=scipy.stats.levene(group1,group2,group3) p=leveneResult[1] print"levene test:" if p<0.05: print"variances of groups are not equal" return False else: print"variances of groups are equal" return True ''' H0成立,三組數據方差無顯著差異 Out[9]: LeveneResult(statistic=0.24561403508771934, pvalue=0.7860617221429711) ''' #比較組內的樣本是否相等,如果不相等,不適合於tukey等方法 #此函數有問題,無法解決nan排除 def Equal_lenth(list_groups): list1=list_groups[0] list2=list_groups[1] list3=list_groups[2] list1_removeNan=[x for x in list1 if str(x) != 'nan' and str(x)!= '-inf'] list2_removeNan=[x for x in list2 if str(x) != 'nan' and str(x)!= '-inf'] list3_removeNan=[x for x in list3 if str(x) != 'nan' and str(x)!= '-inf'] len1=len(list1_removeNan) len2=len(list2_removeNan) len3=len(list3_removeNan) if len1==len2==len3: return True else: return False ''' #返回True or false normality=NormalTest(list_groups) leveneResult=Levene_test(list_groups[0],list_groups[1],list_groups[2]) '''
https://study.163.com/provider/400000000398149/index.htm?share=2&shareId=400000000398149( 歡迎關注博主主頁,學習python視頻資源,還有大量免費python經典文章)