R語言學習筆記(五)


總結下第七章的統計分析方法,里面涉及到了很多統計專業概念。

Summary 函數
> myvars<-c("mpg","hp","wt")
> summary(mtcars[myvars])
mpg hp wt 
Min. :10.40 Min. : 52.0 Min. :1.513 
1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 
Median :19.20 Median :123.0 Median :3.325 
Mean :20.09 Mean :146.7 Mean :3.217 
3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 
Max. :33.90 Max. :335.0 Max. :5.424

 

sapply 函數,針對某一數據源的某一數據列應用特定的統計函數
# Compute basic descriptive statistics for a numeric vector.
#
# Args:
#   x: numeric vector to summarize.
#   na.omit: drop NA values before computing? (default FALSE; with the
#     default, any NA in x propagates into every statistic)
#
# Returns a named numeric vector: n (count), mean, stdev (sample sd),
# skew and kurtosis (excess kurtosis, i.e. normal distribution -> 0),
# using the moment-based formulas with denominator n.
mystats <- function(x, na.omit = FALSE) {
  if (na.omit) {
    x <- x[!is.na(x)]
  }
  m <- mean(x)
  n <- length(x)
  s <- sd(x)
  skew <- sum((x - m)^3 / s^3) / n
  kurt <- sum((x - m)^4 / s^4) / n - 3
  c(n = n, mean = m, stdev = s, skew = skew, kurtosis = kurt)
}

> myvars<-c("mpg","hp","wt")
> sapply(mtcars[myvars],mystats)
mpg hp wt
n 32.000000 32.0000000 32.00000000
mean 20.090625 146.6875000 3.21725000
stdev 6.026948 68.5628685 0.97845744
skew 0.610655 0.7260237 0.42314646
kurtosis -0.372766 -0.1355511 -0.02271075

 

#Hmisc 包的 describe 函數 - 生成詳細的統計概括
> library(Hmisc)

> myvars<-c("mpg","hp","wt")
> describe(mtcars[myvars])
mtcars[myvars]


3 Variables 32 Observations
-------------------------------------------------------------------------------------
mpg 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 25 0.999 20.09 6.796 12.00 14.34 15.43 
.50 .75 .90 .95 
19.20 22.80 30.09 31.30


lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
-------------------------------------------------------------------------------------
hp 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 22 0.997 146.7 77.04 63.65 66.00 96.50 
.50 .75 .90 .95 
123.00 180.00 243.50 253.55


lowest : 52 62 65 66 91, highest: 215 230 245 264 335
-------------------------------------------------------------------------------------
wt 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 29 0.999 3.217 1.089 1.736 1.956 2.581 
.50 .75 .90 .95 
3.325 3.610 4.048 5.293


lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424
-------------------------------------------------------------------------------------
>

 


#pastecs 包的 stat.desc 函數 - 和上面的describe函數功能類似

> install.packages("pastecs")

> library(pastecs)

> myvars<-c("mpg","hp","wt")
> stat.desc(mtcars[myvars])
mpg hp wt
nbr.val 32.0000000 32.0000000 32.0000000
nbr.null 0.0000000 0.0000000 0.0000000
nbr.na 0.0000000 0.0000000 0.0000000
min 10.4000000 52.0000000 1.5130000
max 33.9000000 335.0000000 5.4240000
range 23.5000000 283.0000000 3.9110000
sum 642.9000000 4694.0000000 102.9520000
median 19.2000000 123.0000000 3.3250000
mean 20.0906250 146.6875000 3.2172500
SE.mean 1.0654240 12.1203173 0.1729685
CI.mean.0.95 2.1729465 24.7195501 0.3527715
var 36.3241028 4700.8669355 0.9573790
std.dev 6.0269481 68.5628685 0.9784574
coef.var 0.2999881 0.4674077 0.3041285
> 
> 
>


> #psych describe - 描述函數

> install.packages("psych")

> library(psych)

> describe(mtcars[myvars])
vars n mean sd median trimmed mad min max range skew kurtosis
mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61 -0.37
hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73 -0.14
wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42 -0.02
se
mpg 1.07
hp 12.12
wt 0.17
>

> #doBy summaryBy -分組統計函數,第一個輸入參數是一個公式:var1+var2+var3~group1+group2(~左邊的變量是待分析變量,右邊的是待分組變量)

> install.packages("doBy")
> library(doBy)
> summaryBy(mpg+hp+wt~am,data=mtcars,FUN=mystats)
am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean hp.stdev
1 0 19 17.14737 3.833966 0.01395038 -0.8031783 19 160.2632 53.90820
2 1 13 24.39231 6.166504 0.05256118 -1.4553520 13 126.8462 84.06232
hp.skew hp.kurtosis wt.n wt.mean wt.stdev wt.skew wt.kurtosis
1 -0.01422519 -1.2096973 19 3.768895 0.7774001 0.9759294 0.1415676
2 1.35988586 0.5634635 13 2.411000 0.6169816 0.2103128 -1.1737358
>

#調用同名函數 - 和編程中的命名空間概念類似
> Hmisc::describe(mtcars[myvars])
mtcars[myvars]


3 Variables 32 Observations
-------------------------------------------------------------------------------------
mpg 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 25 0.999 20.09 6.796 12.00 14.34 15.43 
.50 .75 .90 .95 
19.20 22.80 30.09 31.30


lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9
-------------------------------------------------------------------------------------
hp 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 22 0.997 146.7 77.04 63.65 66.00 96.50 
.50 .75 .90 .95 
123.00 180.00 243.50 253.55


lowest : 52 62 65 66 91, highest: 215 230 245 264 335
-------------------------------------------------------------------------------------
wt 
n missing distinct Info Mean Gmd .05 .10 .25 
32 0 29 0.999 3.217 1.089 1.736 1.956 2.581 
.50 .75 .90 .95 
3.325 3.610 4.048 5.293


lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424
-------------------------------------------------------------------------------------
> 
> 
>

#分組 - 兩個分組函數:aggregate和by
> mtcars$am
[1] 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1
> help("aggregate")
> 
> 
> aggregate(mtcars[myvars],by=list(am=mtcars$am),mean)
am mpg hp wt
1 0 17.14737 160.2632 3.768895
2 1 24.39231 126.8462 2.411000
> aggregate(mtcars[myvars],by=list(am=mtcars$am),sd)
am mpg hp wt
1 0 3.833966 53.90820 0.7774001
2 1 6.166504 84.06232 0.6169816
> 
> 
> dstats<-function(x)sapply(x,mystats)
> by(mtcars[myvars],mtcars$am,dstats)
mtcars$am: 0
mpg hp wt
n 19.00000000 19.00000000 19.0000000
mean 17.14736842 160.26315789 3.7688947
stdev 3.83396639 53.90819573 0.7774001
skew 0.01395038 -0.01422519 0.9759294
kurtosis -0.80317826 -1.20969733 0.1415676
--------------------------------------------------------------- 
mtcars$am: 1
mpg hp wt
n 13.00000000 13.0000000 13.0000000
mean 24.39230769 126.8461538 2.4110000
stdev 6.16650381 84.0623243 0.6169816
skew 0.05256118 1.3598859 0.2103128
kurtosis -1.45535200 0.5634635 -1.1737358
>

 

> #頻數表 - 這東西和Excel中的Pivot Table(數據透視表)類似
> #table, xtabs, prop.table, margin.table, addmargins, ftable
> library(vcd)
載入需要的程輯包:grid
Warning message:
程輯包‘vcd’是用R版本3.3.3 來建造的 
> attach(Arthritis)
> mytable<-table(Improved)
> mytable
Improved
None Some Marked 
42 14 28 
> 
> #百分比形式

> prop.table(mytable)
Improved
None Some Marked 
0.5000000 0.1666667 0.3333333 
> 
> #二維列聯表
> mytable<-xtabs(~Treatment+Improved, data=Arthritis)
> mytable
Improved
Treatment None Some Marked
Placebo 29 7 7
Treated 13 7 21
> 
> 
> 
> #cross table
> install.packages("gmodels")
> library(gmodels)

> CrossTable(Arthritis$Treatment,Arthritis$Improved)



Cell Contents
|-------------------------|
| N |
| Chi-square contribution |
| N / Row Total |
| N / Col Total |
| N / Table Total |
|-------------------------|



Total Observations in Table: 84



| Arthritis$Improved 
Arthritis$Treatment | None | Some | Marked | Row Total | 
--------------------|-----------|-----------|-----------|-----------|
Placebo | 29 | 7 | 7 | 43 | 
| 2.616 | 0.004 | 3.752 | | 
| 0.674 | 0.163 | 0.163 | 0.512 | 
| 0.690 | 0.500 | 0.250 | | 
| 0.345 | 0.083 | 0.083 | | 
--------------------|-----------|-----------|-----------|-----------|
Treated | 13 | 7 | 21 | 41 | 
| 2.744 | 0.004 | 3.935 | | 
| 0.317 | 0.171 | 0.512 | 0.488 | 
| 0.310 | 0.500 | 0.750 | | 
| 0.155 | 0.083 | 0.250 | | 
--------------------|-----------|-----------|-----------|-----------|
Column Total | 42 | 14 | 28 | 84 | 
| 0.500 | 0.167 | 0.333 | | 
--------------------|-----------|-----------|-----------|-----------|



> 
> 
> #多維列聯表
> mytable<-xtab(~Treatment+Sex+Improved,data=Arthritis) #這個公式和之前的類似,~左邊的是待分析的變量,右邊的是待分組的變量
Error: could not find function "xtab"
> mytable<-xtabs(~Treatment+Sex+Improved,data=Arthritis)
> mytable
, , Improved = None


Sex
Treatment Female Male
Placebo 19 10
Treated 6 7


, , Improved = Some


Sex
Treatment Female Male
Placebo 7 0
Treated 5 2


, , Improved = Marked


Sex
Treatment Female Male
Placebo 6 1
Treated 16 5


> ftable(mytable)
Improved None Some Marked
Treatment Sex 
Placebo Female 19 7 6
Male 10 0 1
Treated Female 6 5 16
Male 7 2 5
> 
> margin.table(mytable,1)
Treatment
Placebo Treated 
43 41 
> margin.table(mytable,2)
Sex
Female Male 
59 25 
> margin.table(mytable,3)
Improved
None Some Marked 
42 14 28 
> margin.table(mytable,c(1,3))
Improved
Treatment None Some Marked
Placebo 29 7 7
Treated 13 7 21
> ftable(prop.table(mytable,c(1,2)))
Improved None Some Marked
Treatment Sex 
Placebo Female 0.59375000 0.21875000 0.18750000
Male 0.90909091 0.00000000 0.09090909
Treated Female 0.22222222 0.18518519 0.59259259
Male 0.50000000 0.14285714 0.35714286
> 
> 
> ftable(addmargins(prop.table(mytable,c(1,2)),3))
Improved None Some Marked Sum
Treatment Sex 
Placebo Female 0.59375000 0.21875000 0.18750000 1.00000000
Male 0.90909091 0.00000000 0.09090909 1.00000000
Treated Female 0.22222222 0.18518519 0.59259259 1.00000000
Male 0.50000000 0.14285714 0.35714286 1.00000000
> help(addmargins)
> #addmargins 求和
> ftable(addmargins(prop.table(mytable,c(1,2)),3))*100
Improved None Some Marked Sum
Treatment Sex 
Placebo Female 59.375000 21.875000 18.750000 100.000000
Male 90.909091 0.000000 9.090909 100.000000
Treated Female 22.222222 18.518519 59.259259 100.000000
Male 50.000000 14.285714 35.714286 100.000000

#獨立性檢驗 - 檢測變量之間是否有關聯
> 
> 
> #卡方獨立性檢驗
> library(vcd)
> mytable<-xtabs(~Treatment+Improved,data=Arthritis)
> chisq.test(mytable)


Pearson's Chi-squared test


data: mytable
X-squared = 13.055, df = 2, p-value = 0.001463


> 
> 
> mytable<-xtabs(~Improved+Sex,dat=Arthritis)
> chisq.test(mytable)


Pearson's Chi-squared test


data: mytable
X-squared = 4.8407, df = 2, p-value = 0.08889


Warning message:
In chisq.test(mytable) : Chi-squared近似算法有可能不准
> 
> 
> 
> 
> 
> #fisher
> mytable<-xtabs(~Treatment+Improved,data=Arthritis)
> fisher.test(mytable)


Fisher's Exact Test for Count Data


data: mytable
p-value = 0.001393
alternative hypothesis: two.sided


> 
> 
> #mantelhaen
> mytable<-xtabs(~Treatment+Improved+Sex,data=Arthritis)
> mantelhaen.test(mytable)


Cochran-Mantel-Haenszel test


data: mytable
Cochran-Mantel-Haenszel M^2 = 14.632, df = 2, p-value = 0.0006647


> #associate
> library(vcd)
> mytable<-xtabs(~Treatment+Improved,data=Arthritis)
> associate(mytable)
Error: could not find function "associate"
> associates(mytable)
Error: could not find function "associates"
> assocstats(mytable)
X^2 df P(> X^2)
Likelihood Ratio 13.530 2 0.0011536
Pearson 13.055 2 0.0014626


Phi-Coefficient : NA 
Contingency Coeff.: 0.367 
Cramer's V : 0.394 
> 
> 
>

#相關 批量檢測變量的相關性,比較有用,用來推導變量之間的依賴關系
> states<-state.x77[,1:6]
> cov(states)
Population Income Illiteracy Life Exp Murder
Population 19931683.7588 571229.7796 292.8679592 -407.8424612 5663.523714
Income 571229.7796 377573.3061 -163.7020408 280.6631837 -521.894286
Illiteracy 292.8680 -163.7020 0.3715306 -0.4815122 1.581776
Life Exp -407.8425 280.6632 -0.4815122 1.8020204 -3.869480
Murder 5663.5237 -521.8943 1.5817755 -3.8694804 13.627465
HS Grad -3551.5096 3076.7690 -3.2354694 6.3126849 -14.549616
HS Grad
Population -3551.509551
Income 3076.768980
Illiteracy -3.235469
Life Exp 6.312685
Murder -14.549616
HS Grad 65.237894
> 
> cor(states)
Population Income Illiteracy Life Exp Murder HS Grad
Population 1.00000000 0.2082276 0.1076224 -0.06805195 0.3436428 -0.09848975
Income 0.20822756 1.0000000 -0.4370752 0.34025534 -0.2300776 0.61993232
Illiteracy 0.10762237 -0.4370752 1.0000000 -0.58847793 0.7029752 -0.65718861
Life Exp -0.06805195 0.3402553 -0.5884779 1.00000000 -0.7808458 0.58221620
Murder 0.34364275 -0.2300776 0.7029752 -0.78084575 1.0000000 -0.48797102
HS Grad -0.09848975 0.6199323 -0.6571886 0.58221620 -0.4879710 1.00000000
> 
> cor(states,method="spearman")
Population Income Illiteracy Life Exp Murder HS Grad
Population 1.0000000 0.1246098 0.3130496 -0.1040171 0.3457401 -0.3833649
Income 0.1246098 1.0000000 -0.3145948 0.3241050 -0.2174623 0.5104809
Illiteracy 0.3130496 -0.3145948 1.0000000 -0.5553735 0.6723592 -0.6545396
Life Exp -0.1040171 0.3241050 -0.5553735 1.0000000 -0.7802406 0.5239410
Murder 0.3457401 -0.2174623 0.6723592 -0.7802406 1.0000000 -0.4367330
HS Grad -0.3833649 0.5104809 -0.6545396 0.5239410 -0.4367330 1.0000000
> 
> 
> x<- states[,c("Population","InCome","HS Grad")]
Error in states[, c("Population", "InCome", "HS Grad")] : 
subscript out of bounds
> x<- states[,c("Population","InCome","Illiteracy", "HS Grad")]
Error in states[, c("Population", "InCome", "Illiteracy", "HS Grad")] : 
subscript out of bounds
> head(states)
Population Income Illiteracy Life Exp Murder HS Grad
Alabama 3615 3624 2.1 69.05 15.1 41.3
Alaska 365 6315 1.5 69.31 11.3 66.7
Arizona 2212 4530 1.8 70.55 7.8 58.1
Arkansas 2110 3378 1.9 70.66 10.1 39.9
California 21198 5114 1.1 71.71 10.3 62.6
Colorado 2541 4884 0.7 72.06 6.8 63.9
> x<-states[,1]
> x
Alabama Alaska Arizona Arkansas California 
3615 365 2212 2110 21198 
Colorado Connecticut Delaware Florida Georgia 
2541 3100 579 8277 4931 
Hawaii Idaho Illinois Indiana Iowa 
868 813 11197 5313 2861 
Kansas Kentucky Louisiana Maine Maryland 
2280 3387 3806 1058 4122 
Massachusetts Michigan Minnesota Mississippi Missouri 
5814 9111 3921 2341 4767 
Montana Nebraska Nevada New Hampshire New Jersey 
746 1544 590 812 7333 
New Mexico New York North Carolina North Dakota Ohio 
1144 18076 5441 637 10735 
Oklahoma Oregon Pennsylvania Rhode Island South Carolina 
2715 2284 11860 931 2816 
South Dakota Tennessee Texas Utah Vermont 
681 4173 12237 1203 472 
Virginia Washington West Virginia Wisconsin Wyoming 
4981 3559 1799 4589 376 
> x<-states[,c("Population","Income","Illiteracy","HS Grad")]
> y<-states[,c("Life Exp","Murder")]
> cor(x,y)
Life Exp Murder
Population -0.06805195 0.3436428
Income 0.34025534 -0.2300776
Illiteracy -0.58847793 0.7029752
HS Grad 0.58221620 -0.4879710
> 
>

 


> 
> #偏相關
> install.packages("ggm")


程輯包‘ggm’是用R版本3.3.3 來建造的 
> colnames(states)
[1] "Population" "Income" "Illiteracy" "Life Exp" "Murder" "HS Grad" 
> pcor(c(1,5,2,3,6),cov(states))
[1] 0.3462724
> 
> 
> #相關性的顯著性驗證
> cor.test(states[,3],states[,5])


Pearson's product-moment correlation


data: states[, 3] and states[, 5]
t = 6.8479, df = 48, p-value = 1.258e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5279280 0.8207295
sample estimates:
cor 
0.7029752


> 
> 
> #多變量驗證
> library(psych)
> corr.test(states,use="complete")
Call:corr.test(x = states, use = "complete")
Correlation matrix 
Population Income Illiteracy Life Exp Murder HS Grad
Population 1.00 0.21 0.11 -0.07 0.34 -0.10
Income 0.21 1.00 -0.44 0.34 -0.23 0.62
Illiteracy 0.11 -0.44 1.00 -0.59 0.70 -0.66
Life Exp -0.07 0.34 -0.59 1.00 -0.78 0.58
Murder 0.34 -0.23 0.70 -0.78 1.00 -0.49
HS Grad -0.10 0.62 -0.66 0.58 -0.49 1.00
Sample Size 
[1] 50
Probability values (Entries above the diagonal are adjusted for multiple tests.) 
Population Income Illiteracy Life Exp Murder HS Grad
Population 0.00 0.59 1.00 1.0 0.10 1
Income 0.15 0.00 0.01 0.1 0.54 0
Illiteracy 0.46 0.00 0.00 0.0 0.00 0
Life Exp 0.64 0.02 0.00 0.0 0.00 0
Murder 0.01 0.11 0.00 0.0 0.00 0
HS Grad 0.50 0.00 0.00 0.0 0.00 0


To see confidence intervals of the correlations, print with the short=FALSE option
> 
> 
> 
> 
> #T檢驗
> #測試美國南方各州和非南方各州(So=1/0)犯罪後被監禁概率是否有差異
> library(MASS)
> t.test(Prob~So, data=UScrime)


Welch Two Sample t-test


data: Prob by So
t = -3.8954, df = 24.925, p-value = 0.0006506
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.03852569 -0.01187439
sample estimates:
mean in group 0 mean in group 1 
0.03851265 0.06371269


> 
> library(MASS)
> sapply(UCcrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))})
Error in lapply(X = X, FUN = FUN, ...) : object 'UCcrime' not found
> sapply(UScrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))})
Show Traceback

Rerun with Debug
Error in is.data.frame(x) : object 'X' not found > attach(UScrime)
The following object is masked _by_ .GlobalEnv:


y


> sapply(UScrime[c("U1","U2")], function(x){c(mean=mean(x),sd=sd(X))})
Show Traceback

Rerun with Debug
Error in is.data.frame(x) : object 'X' not found > head(UScrime)
M So Ed Po1 Po2 LF M.F Pop NW U1 U2 GDP Ineq Prob Time y
1 151 1 91 58 56 510 950 33 301 108 41 394 261 0.084602 26.2011 791
2 143 0 113 103 95 583 1012 13 102 96 36 557 194 0.029599 25.2999 1635
3 142 1 89 45 44 533 969 18 219 94 33 318 250 0.083401 24.3006 578
4 136 0 121 149 141 577 994 157 80 102 39 673 167 0.015801 29.9012 1969
5 141 0 121 109 101 591 985 18 30 91 20 578 174 0.041399 21.2998 1234
6 121 0 110 118 115 547 964 25 44 84 29 689 126 0.034201 20.9995 682
> sapply(UScrime[c("U1","U2")], function(x)(c(mean=mean(x),sd=sd(X)))
+ )
Show Traceback

Rerun with Debug
Error in is.data.frame(x) : object 'X' not found > 
> 
> library(MASS)
> sapply(UScrime[c("U1","U2")],function(x)(c(mean=mean(x),sd=sd(x))))
U1 U2
mean 95.46809 33.97872
sd 18.02878 8.44545
> 
> 
> with(UScrime,t.test(U1,U2,paired=TRUE))


Paired t-test


data: U1 and U2
t = 32.407, df = 46, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
57.67003 65.30870
sample estimates:
mean of the differences 
61.48936


> 
> 
> 
> #兩組差異的非參數檢驗
> states<-data.frame(state.region,state.x77)
> kruskal.test(Illiteracy~state.region,data=states)


Kruskal-Wallis rank sum test


data: Illiteracy by state.region
Kruskal-Wallis chi-squared = 22.672, df = 3, p-value = 4.726e-05

  


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM