總結下第七章的統計分析方法,里面涉及到了很多統計專業概念。 Summary 函數 > myvars<-c("mpg","hp","wt") > summary(mtcars[myvars]) mpg hp wt Min. :10.40 Min. : 52.0 Min. :1.513 1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 Median :19.20 Median :123.0 Median :3.325 Mean :20.09 Mean :146.7 Mean :3.217 3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 Max. :33.90 Max. :335.0 Max. :5.424 sapply 函數,針對某一數據源的某一數據列應用特定的統計函數 mystats<-function(x,na.omit=FALSE){ + if(na.omit){x<-x[!is.na(x)]} + m<-mean(x) + n<-length(x) + s<-sd(x) + skew<-sum((x-m)^3/s^3)/n + kurt<-sum((x-m)^4/s^4)/n-3 + return(c(n=n,mean=m,stdev=s,skew=skew,kurtosis=kurt))} > myvars<-c("mpg","hp","wt") > sapply(mtcars[myvars],mystats) mpg hp wt n 32.000000 32.0000000 32.00000000 mean 20.090625 146.6875000 3.21725000 stdev 6.026948 68.5628685 0.97845744 skew 0.610655 0.7260237 0.42314646 kurtosis -0.372766 -0.1355511 -0.02271075 #hmisc describe 包 - 生成詳細的統計概括 > library(Hmisc) > myvars<-c("mpg","hp","wt") > describe(mtcars[myvars]) mtcars[myvars] 3 Variables 32 Observations ------------------------------------------------------------------------------------- mpg n missing distinct Info Mean Gmd .05 .10 .25 32 0 25 0.999 20.09 6.796 12.00 14.34 15.43 .50 .75 .90 .95 19.20 22.80 30.09 31.30 lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9 ------------------------------------------------------------------------------------- hp n missing distinct Info Mean Gmd .05 .10 .25 32 0 22 0.997 146.7 77.04 63.65 66.00 96.50 .50 .75 .90 .95 123.00 180.00 243.50 253.55 lowest : 52 62 65 66 91, highest: 215 230 245 264 335 ------------------------------------------------------------------------------------- wt n missing distinct Info Mean Gmd .05 .10 .25 32 0 29 0.999 3.217 1.089 1.736 1.956 2.581 .50 .75 .90 .95 3.325 3.610 4.048 5.293 lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424 ------------------------------------------------------------------------------------- > #pastecs stat.desc - 和上面的describe函數功能類似 > 
install.packages("pastecs") > library(pastecs) > myvars<-c("mpg","hp","wt") > stat.desc(mtcars[myvars]) mpg hp wt nbr.val 32.0000000 32.0000000 32.0000000 nbr.null 0.0000000 0.0000000 0.0000000 nbr.na 0.0000000 0.0000000 0.0000000 min 10.4000000 52.0000000 1.5130000 max 33.9000000 335.0000000 5.4240000 range 23.5000000 283.0000000 3.9110000 sum 642.9000000 4694.0000000 102.9520000 median 19.2000000 123.0000000 3.3250000 mean 20.0906250 146.6875000 3.2172500 SE.mean 1.0654240 12.1203173 0.1729685 CI.mean.0.95 2.1729465 24.7195501 0.3527715 var 36.3241028 4700.8669355 0.9573790 std.dev 6.0269481 68.5628685 0.9784574 coef.var 0.2999881 0.4674077 0.3041285 > > > > #psych describe - 描述函數 > install.packages("psych") > library(psych) > describe(mtcars[myvars]) vars n mean sd median trimmed mad min max range skew kurtosis mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37 hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14 wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02 se mpg 1.07 hp 12.12 wt 0.17 > > #doBy summaryBy -分組統計函數,第一個輸入參數是一個公式:var1+var2+var3~group1+group2(~左邊的變量是待分析變量,右邊的是待分組變量) > install.packages("doBy") > library(doBy) > summaryBy(mpg+hp+wt~am,data=mtcars,FUN=mystats) am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev 1 0 19 17.14737 3.833966 0.01395038 -0.8031783 19 160.2632 53.90820 2 1 13 24.39231 6.166504 0.05256118 -1.4553520 13 126.8462 84.06232 hp.skew hp.kurtosis wt.n wt.mean wt.stdev wt.skew wt.kurtosis 1 -0.01422519 -1.2096973 19 3.768895 0.7774001 0.9759294 0.1415676 2 1.35988586 0.5634635 13 2.411000 0.6169816 0.2103128 -1.1737358 > #調用同名函數 - 和編程中的命名空間概念類似 > Hmisc::describe(mtcars[myvars]) mtcars[myvars] 3 Variables 32 Observations ------------------------------------------------------------------------------------- mpg n missing distinct Info Mean Gmd .05 .10 .25 32 0 25 0.999 20.09 6.796 12.00 14.34 15.43 .50 .75 .90 .95 19.20 22.80 30.09 31.30 lowest : 10.4 13.3 14.3 14.7 15.0, 
highest: 26.0 27.3 30.4 32.4 33.9 ------------------------------------------------------------------------------------- hp n missing distinct Info Mean Gmd .05 .10 .25 32 0 22 0.997 146.7 77.04 63.65 66.00 96.50 .50 .75 .90 .95 123.00 180.00 243.50 253.55 lowest : 52 62 65 66 91, highest: 215 230 245 264 335 ------------------------------------------------------------------------------------- wt n missing distinct Info Mean Gmd .05 .10 .25 32 0 29 0.999 3.217 1.089 1.736 1.956 2.581 .50 .75 .90 .95 3.325 3.610 4.048 5.293 lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424 ------------------------------------------------------------------------------------- > > > #分組 - 兩個分組函數:aggregate和by > mtcars$am [1] 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 > help("aggregate") > > > aggregate(mtcars[myvars],by=list(am=mtcars$am),mean) am mpg hp wt 1 0 17.14737 160.2632 3.768895 2 1 24.39231 126.8462 2.411000 > aggregate(mtcars[myvars],by=list(am=mtcars$am),sd) am mpg hp wt 1 0 3.833966 53.90820 0.7774001 2 1 6.166504 84.06232 0.6169816 > > > dstats<-function(x)sapply(x,mystats) > by(mtcars[myvars],mtcars$am,dstats) mtcars$am: 0 mpg hp wt n 19.00000000 19.00000000 19.0000000 mean 17.14736842 160.26315789 3.7688947 stdev 3.83396639 53.90819573 0.7774001 skew 0.01395038 -0.01422519 0.9759294 kurtosis -0.80317826 -1.20969733 0.1415676 --------------------------------------------------------------- mtcars$am: 1 mpg hp wt n 13.00000000 13.0000000 13.0000000 mean 24.39230769 126.8461538 2.4110000 stdev 6.16650381 84.0623243 0.6169816 skew 0.05256118 1.3598859 0.2103128 kurtosis -1.45535200 0.5634635 -1.1737358 > > #頻數表 - 這東西和Excel中的Pivot Table 類似 > #table, xtabs, prop.table, margin.table, addmargins, ftable > library(vcd) 載入需要的程輯包:grid Warning message: 程輯包‘vcd’是用R版本3.3.3 來建造的 > attach(Arthritis) > mytable<-table(Improved) > mytable Improved None Some Marked 42 14 28 > > #百分比形式 > prop.table(mytable) Improved None Some Marked 0.5000000 
0.1666667 0.3333333 > > #二維列聯表 > mytable<-xtabs(~Treatment+Improved, data=Arthritis) > mytable Improved Treatment None Some Marked Placebo 29 7 7 Treated 13 7 21 > > > > #cross table > install.packages("gmodels") > library(gmodels) > CrossTable(Arthritis$Treatment,Arthritis$Improved) Cell Contents |-------------------------| | N | | Chi-square contribution | | N / Row Total | | N / Col Total | | N / Table Total | |-------------------------| Total Observations in Table: 84 | Arthritis$Improved Arthritis$Treatment | None | Some | Marked | Row Total | --------------------|-----------|-----------|-----------|-----------| Placebo | 29 | 7 | 7 | 43 | | 2.616 | 0.004 | 3.752 | | | 0.674 | 0.163 | 0.163 | 0.512 | | 0.690 | 0.500 | 0.250 | | | 0.345 | 0.083 | 0.083 | | --------------------|-----------|-----------|-----------|-----------| Treated | 13 | 7 | 21 | 41 | | 2.744 | 0.004 | 3.935 | | | 0.317 | 0.171 | 0.512 | 0.488 | | 0.310 | 0.500 | 0.750 | | | 0.155 | 0.083 | 0.250 | | --------------------|-----------|-----------|-----------|-----------| Column Total | 42 | 14 | 28 | 84 | | 0.500 | 0.167 | 0.333 | | --------------------|-----------|-----------|-----------|-----------| > > > #多維列聯表 > mytable<-xtab(~Treatment+Sex+Improved,data=Arthritis) #這個公式和之前的類似,左邊的是需要分析的變量,右邊是需要分組的變量 Error: could not find function "xtab" > mytable<-xtabs(~Treatment+Sex+Improved,data=Arthritis) > mytable , , Improved = None Sex Treatment Female Male Placebo 19 10 Treated 6 7 , , Improved = Some Sex Treatment Female Male Placebo 7 0 Treated 5 2 , , Improved = Marked Sex Treatment Female Male Placebo 6 1 Treated 16 5 > ftable(mytable) Improved None Some Marked Treatment Sex Placebo Female 19 7 6 Male 10 0 1 Treated Female 6 5 16 Male 7 2 5 > > margin.table(mytable,1) Treatment Placebo Treated 43 41 > margin.table(mytable,2) Sex Female Male 59 25 > margin.table(mytable,3) Improved None Some Marked 42 14 28 > margin.table(mytable,c(1,3)) Improved Treatment None Some Marked Placebo 29 7 7 Treated 
13 7 21 > ftable(prop.table(mytable,c(1,2))) Improved None Some Marked Treatment Sex Placebo Female 0.59375000 0.21875000 0.18750000 Male 0.90909091 0.00000000 0.09090909 Treated Female 0.22222222 0.18518519 0.59259259 Male 0.50000000 0.14285714 0.35714286 > > > ftable(addmargins(prop.table(mytable,c(1,2)),3)) Improved None Some Marked Sum Treatment Sex Placebo Female 0.59375000 0.21875000 0.18750000 1.00000000 Male 0.90909091 0.00000000 0.09090909 1.00000000 Treated Female 0.22222222 0.18518519 0.59259259 1.00000000 Male 0.50000000 0.14285714 0.35714286 1.00000000 > help(addmargins) > #addmargins 求和 > ftable(addmargins(prop.table(mytable,c(1,2)),3))*100 Improved None Some Marked Sum Treatment Sex Placebo Female 59.375000 21.875000 18.750000 100.000000 Male 90.909091 0.000000 9.090909 100.000000 Treated Female 22.222222 18.518519 59.259259 100.000000 Male 50.000000 14.285714 35.714286 100.000000 #獨立性檢驗 - 檢測變量之間是否有關聯 > > > #卡方獨立性檢驗 > library(vcd) > mytable<-xtabs(~Treatment+Improved,data=Arthritis) > chisq.test(mytable) Pearson's Chi-squared test data: mytable X-squared = 13.055, df = 2, p-value = 0.001463 > > > mytable<-xtabs(~Improved+Sex,dat=Arthritis) > chisq.test(mytable) Pearson's Chi-squared test data: mytable X-squared = 4.8407, df = 2, p-value = 0.08889 Warning message: In chisq.test(mytable) : Chi-squared近似算法有可能不准 > > > > > > #fisher > mytable<-xtabs(~Treatment+Improved,data=Arthritis) > fisher.test(mytable) Fisher's Exact Test for Count Data data: mytable p-value = 0.001393 alternative hypothesis: two.sided > > > #mantelhaen > mytable<-xtabs(~Treatment+Improved+Sex,data=Arthritis) > mantelhaen.test(mytable) Cochran-Mantel-Haenszel test data: mytable Cochran-Mantel-Haenszel M^2 = 14.632, df = 2, p-value = 0.0006647 > #associate > library(vcd) > mytable<-xtabs(~Treatment+Improved,data=Arthritis) > associate(mytable) Error: could not find function "associate" > associates(mytable) Error: could not find function "associates" > assocstats(mytable) X^2 df P(> 
X^2) Likelihood Ratio 13.530 2 0.0011536 Pearson 13.055 2 0.0014626 Phi-Coefficient : NA Contingency Coeff.: 0.367 Cramer's V : 0.394 > > > #相關 批量檢測變量的相關性,比較有用,用來推導變量之間的依賴關系 > states<-state.x77[,1:6] > cov(states) Population Income Illiteracy Life Exp Murder Population 19931683.7588 571229.7796 292.8679592 -407.8424612 5663.523714 Income 571229.7796 377573.3061 -163.7020408 280.6631837 -521.894286 Illiteracy 292.8680 -163.7020 0.3715306 -0.4815122 1.581776 Life Exp -407.8425 280.6632 -0.4815122 1.8020204 -3.869480 Murder 5663.5237 -521.8943 1.5817755 -3.8694804 13.627465 HS Grad -3551.5096 3076.7690 -3.2354694 6.3126849 -14.549616 HS Grad Population -3551.509551 Income 3076.768980 Illiteracy -3.235469 Life Exp 6.312685 Murder -14.549616 HS Grad 65.237894 > > cor(states) Population Income Illiteracy Life Exp Murder HS Grad Population 1.00000000 0.2082276 0.1076224 -0.06805195 0.3436428 -0.09848975 Income 0.20822756 1.0000000 -0.4370752 0.34025534 -0.2300776 0.61993232 Illiteracy 0.10762237 -0.4370752 1.0000000 -0.58847793 0.7029752 -0.65718861 Life Exp -0.06805195 0.3402553 -0.5884779 1.00000000 -0.7808458 0.58221620 Murder 0.34364275 -0.2300776 0.7029752 -0.78084575 1.0000000 -0.48797102 HS Grad -0.09848975 0.6199323 -0.6571886 0.58221620 -0.4879710 1.00000000 > > cor(states,method="spearman") Population Income Illiteracy Life Exp Murder HS Grad Population 1.0000000 0.1246098 0.3130496 -0.1040171 0.3457401 -0.3833649 Income 0.1246098 1.0000000 -0.3145948 0.3241050 -0.2174623 0.5104809 Illiteracy 0.3130496 -0.3145948 1.0000000 -0.5553735 0.6723592 -0.6545396 Life Exp -0.1040171 0.3241050 -0.5553735 1.0000000 -0.7802406 0.5239410 Murder 0.3457401 -0.2174623 0.6723592 -0.7802406 1.0000000 -0.4367330 HS Grad -0.3833649 0.5104809 -0.6545396 0.5239410 -0.4367330 1.0000000 > > > x<- states[,c("Population","InCome","HS Grad")] Error in states[, c("Population", "InCome", "HS Grad")] : subscript out of bounds > x<- states[,c("Population","InCome","Illiteracy", "HS Grad")] 
Error in states[, c("Population", "InCome", "Illiteracy", "HS Grad")] : subscript out of bounds > head(states) Population Income Illiteracy Life Exp Murder HS Grad Alabama 3615 3624 2.1 69.05 15.1 41.3 Alaska 365 6315 1.5 69.31 11.3 66.7 Arizona 2212 4530 1.8 70.55 7.8 58.1 Arkansas 2110 3378 1.9 70.66 10.1 39.9 California 21198 5114 1.1 71.71 10.3 62.6 Colorado 2541 4884 0.7 72.06 6.8 63.9 > x<-states[,1] > x Alabama Alaska Arizona Arkansas California 3615 365 2212 2110 21198 Colorado Connecticut Delaware Florida Georgia 2541 3100 579 8277 4931 Hawaii Idaho Illinois Indiana Iowa 868 813 11197 5313 2861 Kansas Kentucky Louisiana Maine Maryland 2280 3387 3806 1058 4122 Massachusetts Michigan Minnesota Mississippi Missouri 5814 9111 3921 2341 4767 Montana Nebraska Nevada New Hampshire New Jersey 746 1544 590 812 7333 New Mexico New York North Carolina North Dakota Ohio 1144 18076 5441 637 10735 Oklahoma Oregon Pennsylvania Rhode Island South Carolina 2715 2284 11860 931 2816 South Dakota Tennessee Texas Utah Vermont 681 4173 12237 1203 472 Virginia Washington West Virginia Wisconsin Wyoming 4981 3559 1799 4589 376 > x<-states[,c("Population","Income","Illiteracy","HS Grad")] > y<-states[,c("Life Exp","Murder")] > cor(x,y) Life Exp Murder Population -0.06805195 0.3436428 Income 0.34025534 -0.2300776 Illiteracy -0.58847793 0.7029752 HS Grad 0.58221620 -0.4879710 > > > > #偏相關 > install.packages("ggm") 程輯包‘ggm’是用R版本3.3.3 來建造的 > colnames(states) [1] "Population" "Income" "Illiteracy" "Life Exp" "Murder" "HS Grad" > pcor(c(1,5,2,3,6),cov(states)) [1] 0.3462724 > > > #相關性的顯著性驗證 > cor.test(states[,3],states[,5]) Pearson's product-moment correlation data: states[, 3] and states[, 5] t = 6.8479, df = 48, p-value = 1.258e-08 alternative hypothesis: true correlation is not equal to 0 95 percent confidence interval: 0.5279280 0.8207295 sample estimates: cor 0.7029752 > > > #多變量驗證 > library(psych) > corr.test(states,use="complete") Call:corr.test(x = states, use = "complete") 
Correlation matrix Population Income Illiteracy Life Exp Murder HS Grad Population 1.00 0.21 0.11 -0.07 0.34 -0.10 Income 0.21 1.00 -0.44 0.34 -0.23 0.62 Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66 Life Exp -0.07 0.34 -0.59 1.00 -0.78 0.58 Murder 0.34 -0.23 0.70 -0.78 1.00 -0.49 HS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00 Sample Size [1] 50 Probability values (Entries above the diagonal are adjusted for multiple tests.) Population Income Illiteracy Life Exp Murder HS Grad Population 0.00 0.59 1.00 1.0 0.10 1 Income 0.15 0.00 0.01 0.1 0.54 0 Illiteracy 0.46 0.00 0.00 0.0 0.00 0 Life Exp 0.64 0.02 0.00 0.0 0.00 0 Murder 0.01 0.11 0.00 0.0 0.00 0 HS Grad 0.50 0.00 0.00 0.0 0.00 0 To see confidence intervals of the correlations, print with the short=FALSE option > > > > > #T檢驗 > #測試在美國南部和北部犯罪后被關進監獄的概率 > library(MASS) > t.test(Prob~So, data=UScrime) Welch Two Sample t-test data: Prob by So t = -3.8954, df = 24.925, p-value = 0.0006506 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -0.03852569 -0.01187439 sample estimates: mean in group 0 mean in group 1 0.03851265 0.06371269 > > library(MASS) > sapply(UCcrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))}) Error in lapply(X = X, FUN = FUN, ...) 
: object 'UCcrime' not found > sapply(UScrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))}) Show Traceback Rerun with Debug Error in is.data.frame(x) : object 'X' not found > attach(UScrime) The following object is masked _by_ .GlobalEnv: y > sapply(UScrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))}) Show Traceback Rerun with Debug Error in is.data.frame(x) : object 'X' not found > head(UScrime) M So Ed Po1 Po2 LF M.F Pop NW U1 U2 GDP Ineq Prob Time y 1 151 1 91 58 56 510 950 33 301 108 41 394 261 0.084602 26.2011 791 2 143 0 113 103 95 583 1012 13 102 96 36 557 194 0.029599 25.2999 1635 3 142 1 89 45 44 533 969 18 219 94 33 318 250 0.083401 24.3006 578 4 136 0 121 149 141 577 994 157 80 102 39 673 167 0.015801 29.9012 1969 5 141 0 121 109 101 591 985 18 30 91 20 578 174 0.041399 21.2998 1234 6 121 0 110 118 115 547 964 25 44 84 29 689 126 0.034201 20.9995 682 > sapply(UScrime[c("U1","U2")], function(x)(c(mean=mean(x),sd=sd(X))) + ) Show Traceback Rerun with Debug Error in is.data.frame(x) : object 'X' not found > > > library(MASS) > sapply(UScrime[c("U1","U2")],function(x)(c(mean=mean(x),sd=sd(x)))) U1 U2 mean 95.46809 33.97872 sd 18.02878 8.44545 > > > with(UScrime,t.test(U1,U2,paired=TRUE)) Paired t-test data: U1 and U2 t = 32.407, df = 46, p-value < 2.2e-16 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: 57.67003 65.30870 sample estimates: mean of the differences 61.48936 > > > > #兩組差異的非參數檢驗 > states<-data.frame(state.region,state.x77) > kruskal.test(Illiteracy~state.region,data=states) Kruskal-Wallis rank sum test data: Illiteracy by state.region Kruskal-Wallis chi-squared = 22.672, df = 3, p-value = 4.726e-05