本文由digging4發表於:http://www.cnblogs.com/digging4/p/5031203.html
統計建模與R軟件-第三章
3.1某單位對100名女生測定血清總蛋白含量\((g/L)\),數據如下:
74.3 78.8 68.8 78.0 70.4 80.5 80.5 69.7 71.2 73.5
79.5 75.6 75.0 78.8 72.0 72.0 72.0 74.3 71.2 72.0
75.0 73.5 78.8 74.3 75.8 65.0 74.3 71.2 69.7 68.0
73.5 75.0 72.0 64.3 75.8 80.3 69.7 74.3 73.5 73.5
75.8 75.8 68.8 76.5 70.4 71.2 81.2 75.0 70.4 68.0
70.4 72.0 76.5 74.3 76.5 77.6 67.3 72.0 75.0 74.3
73.5 79.5 73.5 74.7 65.0 76.5 81.6 75.4 72.7 72.7
67.2 76.5 72.7 70.4 77.2 68.8 67.3 67.3 67.3 72.7
75.8 73.5 75.0 73.5 73.5 73.5 72.7 81.6 70.3 74.3
73.5 79.5 70.4 76.5 72.7 77.2 84.3 75.0 76.5 70.4
計算均值、方差、標准差、極差、標准誤、變異系數、偏度、峰度。
x <- c(74.3, 78.8, 68.8, 78, 70.4, 80.5, 80.5, 69.7, 71.2, 73.5, 79.5, 75.6,
75, 78.8, 72, 72, 72, 74.3, 71.2, 72, 75, 73.5, 78.8, 74.3, 75.8, 65, 74.3,
71.2, 69.7, 68, 73.5, 75, 72, 64.3, 75.8, 80.3, 69.7, 74.3, 73.5, 73.5,
75.8, 75.8, 68.8, 76.5, 70.4, 71.2, 81.2, 75, 70.4, 68, 70.4, 72, 76.5,
74.3, 76.5, 77.6, 67.3, 72, 75, 74.3, 73.5, 79.5, 73.5, 74.7, 65, 76.5,
81.6, 75.4, 72.7, 72.7, 67.2, 76.5, 72.7, 70.4, 77.2, 68.8, 67.3, 67.3,
67.3, 72.7, 75.8, 73.5, 75, 73.5, 73.5, 73.5, 72.7, 81.6, 70.3, 74.3, 73.5,
79.5, 70.4, 76.5, 72.7, 77.2, 84.3, 75, 76.5, 70.4)
n <- length(x)
mean(x) #均值
## [1] 73.67
var(x) #方差
## [1] 15.52
sd(x) #標准差
## [1] 3.939
max(x) - min(x) #極差
## [1] 20
sd(x)/100^(1/2) # 標准誤
## [1] 0.3939
100 * sd(x)/mean(x) #變異系數
## [1] 5.347
n/((n - 1) * (n - 2)) * sum(x - mean(x))^3/sd(x)^3 # 偏度
## [1] -4.411e-41
n * (n + 1)/((n - 1) * (n - 2) * (n - 3) * sd(x)^4) * sum(x - mean(x))^4 - 3 *
(n - 1)^2/(n - 2)/(n - 3) #峰度
## [1] -3.093
3.2 繪出習題3.1的直方圖、密度估計曲線、經驗分布圖和qq圖,並將密度估計曲線與正態密度曲線相比較,將經驗分布曲線與正態分布曲線相比較(其中正態曲線的均值和標准差取習題3.1計算出的值)
# 直方圖
hist(x)

# 密度估計曲線
plot(density(x), col = "blue")

# 經驗分布圖
plot(ecdf(x))

# qq圖
qqnorm(x)
qqline(x)

# 將密度估計曲線與正態密度曲線相比較
plot(density(x), col = "blue")
x1 <- 55:154
lines(x1, dnorm(x1, mean(x), sd(x)), col = "red")

# 將經驗分布曲線與正態分布曲線相比較 verticals=TRUE
# 表示畫垂直線;do.p=FALSE表示數據點上不畫點
plot(ecdf(x), verticals = TRUE, do.p = FALSE)
x1 <- 55:154
lines(x1, pnorm(x1, mean(x), sd(x)))

3.3 繪出習題3.1的莖葉圖、箱線圖,並計算五數總括。
# 莖葉圖
stem(x)
##
## The decimal point is at the |
##
## 64 | 300
## 66 | 23333
## 68 | 00888777
## 70 | 344444442222
## 72 | 0000000777777555555555555
## 74 | 33333333700000004688888
## 76 | 5555555226
## 78 | 0888555
## 80 | 355266
## 82 |
## 84 | 3
# 箱線圖
boxplot(x)

# 計算五數總括
fivenum(x)
## [1] 64.3 71.2 73.5 75.8 84.3
3.4 分別用W檢驗方法和Kolmogorov-Smirnov檢驗方法檢驗習題3.1的數據是否服從正態分布。
# 用W檢驗方法
# 提供w統計量和相應的p值,當p值小於某個顯著性水平(比如0.05)時,則認為樣本不是來自於正態分布的總體
# p-value=0.6708 大於0.05,可以認為樣本是來自於正態分布的總體
shapiro.test(x)
##
## Shapiro-Wilk normality test
##
## data: x
## W = 0.9901, p-value = 0.6708
# Kolmogorov-Smirnov檢驗方法檢驗 用於比較兩個分布的是否一致
# 第一個參數為待檢驗向量,第二個參數為用待檢驗向量的均值和標准差生成一個相同長度的正態分布向量
# p-value=0.6994,p值大於0.05,不能拒絕其分布一致的假設,即兩個向量分布一致。
ks.test(x, rnorm(length(x), mean(x), sd(x)))
##
## Two-sample Kolmogorov-Smirnov test
##
## data: x and rnorm(length(x), mean(x), sd(x))
## D = 0.12, p-value = 0.4676
## alternative hypothesis: two-sided
3.5 小白鼠在接種了3種不同菌型的傷寒桿菌后的存活天數如表3.8所示,試繪出數據的箱線圖(采用兩種方法,一種是plot語句,另一種是boxplot語句)來判斷小白鼠被注射三種菌型后的平均存活天數有無顯著差異?
菌型 存活天數
1 2 4 3 2 4 7 7 2 2 5 4
2 5 6 8 5 10 7 12 12 6 6
3 7 11 6 6 7 9 5 5 10 6 3 10
x1 <- c(2, 4, 3, 2, 4, 7, 7, 2, 2, 5, 4)
x2 <- c(5, 6, 8, 5, 10, 7, 12, 12, 6, 6)
x3 <- c(7, 11, 6, 6, 7, 9, 5, 5, 10, 6, 3, 10)
#
# 從箱線圖中可以看到,菌型2和3的平均存活天數無顯著差異,但是與菌型1的有顯著差異
boxplot(x1, x2, x3)

# 用plot畫箱線圖
x <- c(x1, x2, x3)
f <- factor(c(rep(1, 11), rep(2, 10), rep(3, 12)))
plot(f, x)

3.6 繪出例3.16關於三項指標的離散圖,從圖中分析例3.16的結論的合理性。
# 3.16的結論是x1、x2和x3兩兩均不相關 從散點圖看出,三個屬性兩兩均不相關
df <- data.frame(硬度 = c(65, 70, 70, 69, 66, 67, 68, 72, 66, 68), 變形 = c(45,
45, 48, 46, 50, 46, 47, 43, 47, 48), 彈性 = c(27.6, 30.7, 31.8, 32.6, 31,
31.3, 37, 33.6, 33.1, 34.2))
pairs(df)

3.7 某校測得19名學生的四項指標,性別、年齡、身高(cm)和 體重(磅),具體數據由表3.9所示,
(1)試繪出體重對於身高的散點圖,
(2)繪出不同性別情況下,體重與身高的散點圖
(3)繪出不同年齡段的體重和身高的散點圖
(4)分不同性別和不同年齡段的體重與身高的散點圖
學號 姓名 性別 年齡 身高 體重
01 Alice F 13 56.5 84.0
02 Becka F 13 65.3 98.0
03 Gail F 14 64.3 90.0
04 Karen F 12 56.3 77.0
05 Kathy F 12 59.8 84.5
06 Mary F 15 66.5 112.0
07 Sandy F 11 51.3 50.5
08 Sharon F 15 62.5 112.5
09 Tammy F 14 62.8 102.5
10 Alfred M 14 69.0 112.5
11 Duke M 14 63.5 102.5
12 Guido M 15 67.0 133.0
13 James M 12 57.3 83.0
14 Jeffrey M 13 62.5 84.0
15 John M 12 59.0 99.5
16 Philip M 16 72.0 150.0
17 Robert M 12 64.8 128.0
18 Thomas M 11 57.5 85.0
19 William M 15 66.5 112.0
df <- data.frame(no = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19), name = c("Alice", "Becka", "Gail", "Karen", "Kathy", "Mary",
"Sandy", "Sharon", "Tammy", "Alfred", "Duke", "Guido", "James", "Jeffrey",
"John", "Philip", "Robert", "Thomas", "William"), sex = c("F", "F", "F",
"F", "F", "F", "F", "F", "F", "M", "M", "M", "M", "M", "M", "M", "M", "M",
"M"), age = c(13, 13, 14, 12, 12, 15, 11, 15, 14, 14, 14, 15, 12, 13, 12,
16, 12, 11, 15), high = c(56.5, 65.3, 64.3, 56.3, 59.8, 66.5, 51.3, 62.5,
62.8, 69, 63.5, 67, 57.3, 62.5, 59, 72, 64.8, 57.5, 66.5), weight = c(84,
98, 90, 77, 84.5, 112, 50.5, 112.5, 102.5, 112.5, 102.5, 133, 83, 84, 99.5,
150, 128, 85, 112))
# 試繪出體重對於身高的散點圖
plot(df$high, df$weight)

# 繪出不同性別情況下,體重與身高的散點圖
coplot(df$high ~ df$weight | df$sex)

# 繪出不同年齡段的體重和身高的散點圖
coplot(df$high ~ df$weight | df$age)

# 分不同性別和不同年齡段的體重與身高的散點圖
coplot(df$high ~ df$weight | df$sex + df$age)

3.8 畫出函數\(z=x^4-2x^2y+x^2-2xy+2y^2+\frac{9}{2}x-4y+4\) 在區間\(-2\leqslant x \leqslant 3, -1\leqslant y\leqslant 7\)上的三維網格曲面和二維等值線,其中\(x\)與\(y\)各點之間的間隔為0.05,等值線的值分別為0, 1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 80, 100,共15條,(注,在三維圖形中選擇合適的角度)
# 畫二維等值線
x <- seq(-2, 3, 0.05)
y <- seq(-1, 7, 0.05)
f <- outer(x, y, function(x, y) x^4 - 2 * x^2 * y + x^2 - 2 * x * y + 2 * y^2 +
9/2 * x - 4 * y + 4)
contour(x, y, f, levels = c(0, 1, 2, 3, 4, 5, 10, 15, 20, 30, 40, 50, 60, 80,
100))

# 畫三維網格曲面
persp(x, y, f, theta = 90, phi = 25, expand = 0.5)

3.9 用\(pearson\)相關檢驗發檢驗習題3.7中的身高與體重是否相關
# 第一二為兩個長度相等的向量,p值小於0.05時拒絕原假設,認為兩個向量相關
cor.test(df$high, df$weight, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: df$high and df$weight
## t = 7.555, df = 17, p-value = 7.887e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7044 0.9523
## sample estimates:
## cor
## 0.8778
3.10 繪出例3.17中48名求職者數據的星圖
(1)以15項自變量FL,APP,... SUIT為星圖的軸
(2)以\(G_1,G_2, \cdots,G_5\)為星圖的軸,通過這些星圖,你能否說明應選哪6名應聘者。
為使星圖能夠充分反映應聘者的情況,在作圖中可適當調整各種參數。
rt <- data.frame(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48), FL = c(6, 9, 7,
5, 6, 7, 9, 9, 9, 4, 4, 4, 6, 8, 4, 6, 8, 6, 6, 4, 3, 9, 7, 9, 6, 7, 2,
6, 4, 4, 5, 3, 2, 3, 6, 9, 4, 4, 10, 10, 10, 10, 3, 7, 9, 9, 0, 0), APP = c(7,
10, 8, 6, 8, 7, 9, 9, 9, 7, 7, 7, 9, 9, 8, 9, 7, 8, 7, 8, 8, 8, 10, 8, 9,
8, 10, 3, 3, 6, 5, 3, 3, 4, 7, 8, 9, 9, 6, 6, 7, 3, 4, 7, 6, 8, 7, 6), AA = c(2,
5, 3, 8, 8, 7, 8, 9, 7, 10, 10, 10, 8, 8, 8, 6, 7, 8, 8, 7, 6, 7, 7, 7,
7, 7, 7, 5, 4, 5, 4, 5, 5, 6, 4, 5, 6, 6, 9, 9, 8, 8, 9, 7, 10, 10, 10,
10), LA = c(5, 8, 6, 5, 8, 6, 8, 8, 8, 2, 0, 4, 10, 9, 7, 7, 7, 4, 4, 8,
8, 8, 9, 10, 7, 8, 9, 3, 3, 6, 7, 7, 7, 4, 3, 5, 4, 6, 10, 10, 0, 0, 8,
6, 9, 10, 3, 1), SC = c(8, 10, 9, 6, 4, 8, 8, 9, 8, 10, 10, 10, 5, 6, 5,
8, 9, 8, 7, 8, 8, 9, 9, 8, 4, 5, 8, 5, 3, 9, 8, 7, 7, 3, 3, 6, 10, 9, 9,
9, 2, 1, 2, 9, 7, 7, 5, 5), LC = c(7, 9, 8, 5, 4, 7, 8, 9, 8, 10, 8, 10,
4, 3, 4, 9, 5, 8, 8, 9, 8, 10, 9, 10, 5, 4, 9, 3, 0, 4, 4, 9, 9, 3, 0, 6,
8, 9, 10, 10, 1, 1, 4, 8, 7, 9, 0, 0), HON = c(8, 9, 9, 9, 9, 10, 8, 8,
8, 7, 3, 7, 9, 8, 10, 8, 8, 6, 5, 10, 10, 10, 10, 10, 9, 8, 10, 5, 0, 10,
10, 10, 10, 8, 9, 8, 8, 7, 10, 10, 2, 0, 5, 8, 10, 10, 10, 10), SMS = c(8,
10, 7, 2, 5, 5, 8, 8, 5, 10, 9, 8, 4, 2, 2, 9, 6, 4, 4, 5, 5, 10, 10, 10,
3, 2, 5, 0, 0, 3, 3, 3, 3, 1, 0, 2, 9, 9, 10, 10, 0, 0, 3, 6, 2, 3, 0, 0),
EXP = c(3, 5, 4, 8, 8, 9, 10, 10, 9, 3, 5, 2, 4, 5, 7, 8, 6, 3, 4, 2, 3,
3, 3, 2, 2, 3, 3, 0, 0, 1, 2, 2, 2, 1, 1, 2, 1, 1, 10, 10, 10, 10, 6,
8, 1, 1, 0, 0), DRV = c(8, 9, 9, 4, 5, 6, 8, 9, 8, 10, 9, 8, 4, 2, 5,
8, 7, 3, 2, 6, 6, 10, 9, 9, 4, 4, 5, 3, 4, 3, 5, 5, 2, 3, 0, 2, 3, 2,
10, 10, 2, 0, 2, 8, 5, 5, 2, 2), AMB = c(9, 9, 9, 5, 5, 5, 10, 10, 9,
10, 10, 8, 5, 6, 3, 7, 8, 6, 6, 7, 7, 8, 9, 7, 4, 5, 6, 3, 4, 3, 5,
3, 3, 3, 2, 4, 9, 10, 8, 10, 0, 0, 1, 10, 5, 7, 2, 2), GSP = c(7, 8,
8, 8, 8, 8, 8, 9, 8, 10, 8, 10, 4, 6, 6, 6, 6, 7, 8, 9, 8, 10, 10, 9,
4, 6, 7, 0, 0, 2, 3, 7, 6, 3, 3, 5, 7, 8, 10, 10, 3, 0, 3, 8, 7, 9,
0, 0), POT = c(5, 8, 6, 7, 8, 6, 9, 9, 8, 9, 10, 10, 7, 7, 6, 8, 6,
2, 3, 8, 8, 8, 9, 9, 4, 5, 6, 0, 0, 2, 4, 5, 4, 2, 1, 6, 5, 5, 10, 10,
0, 0, 3, 8, 8, 9, 0, 0), KJ = c(7, 8, 8, 6, 7, 6, 8, 9, 8, 3, 2, 3,
6, 5, 4, 6, 7, 6, 5, 8, 5, 10, 10, 10, 5, 5, 4, 5, 5, 7, 8, 5, 5, 5,
5, 6, 3, 5, 10, 10, 0, 0, 3, 6, 4, 4, 0, 0), SUIT = c(10, 10, 10, 5,
7, 6, 10, 10, 10, 10, 5, 7, 8, 6, 6, 10, 8, 4, 4, 9, 8, 8, 8, 8, 4,
6, 5, 0, 0, 3, 3, 2, 2, 2, 3, 3, 2, 2, 10, 10, 10, 10, 8, 5, 5, 4, 0,
0))
# 以15項自變量FL,APP,... SUIT為星圖的軸
stars(rt[, -1]) #把id列去掉

#
# 以$G_1,G_2,\cdots,G_5$為星圖的軸,通過這些星圖,你能否說明應選哪6名應聘者。
rt2 <- data.frame(G1 = (rt$SC + rt$LC + rt$SMS + rt$DRV + rt$AMB + rt$GSP +
rt$POT)/7, G2 = (rt$FL + rt$EXP + rt$SUIT)/3, G3 = (rt$LA + rt$HON + rt$KJ)/3,
G4 = rt$AA, G5 = rt$APP)
stars(rt2)

3.11 繪出例3.17中48名求職者數據的調和曲線,以\(G_1,G_2, \cdots,G_5\)為自變量。
#
# 可以畫調和曲線的包有兩個,一個是謝老大編的MSG包,里面的andrews.curve()可以很輕松的畫出來,目前依然可用,另一個是Jaroslav
# Myslivec專門為調和曲線寫的andrews包,里面的andrews()函數可以畫,畫調和曲線這兩個足夠用了
unison <- function(x) {
# x is a matrix or data frame of data
if (is.data.frame(x) == TRUE)
x <- as.matrix(x)
t <- seq(-pi, pi, pi/30)
m <- nrow(x)
n <- ncol(x)
f <- array(0, c(m, length(t)))
for (i in 1:m) {
f[i, ] <- x[i, 1]/sqrt(2)
for (j in 2:n) {
if (j%%2 == 0)
f[i, ] <- f[i, ] + x[i, j] * sin(j/2 * t) else f[i, ] <- f[i, ] + x[i, j] * cos(j%/%2 * t)
}
}
plot(c(-pi, pi), c(min(f), max(f)), type = "n", main = "The Unison graph of Data",
xlab = "t", ylab = "f(t)")
for (i in 1:m) lines(t, f[i, ], col = i)
}
unison(rt2)

