薛毅书P557(电子版574),9.1,9.2,9.3,9.4(1)
options(width = 600)
# 整理数据:将电子书中的数据复制保存到9.1.txt文件中,置于工作目录下
tmp <- read.table("9.1.txt", header = F)
d1 <- tmp[, -1]
names(d1) <- paste("X", 1:8, sep = "")
d1
## X1 X2 X3 X4 X5 X6 X7 X8
## 1 90342 52455 101091 19272 82.0 16.1 197435 0.172
## 2 4903 1973 2035 10313 34.2 7.1 592077 0.003
## 3 6735 21139 3767 1780 36.1 8.2 726396 0.003
## 4 49454 36241 81557 22504 98.1 25.9 348226 0.985
## 5 139190 203505 215898 10609 93.2 12.6 139572 0.628
## 6 12215 16219 10351 6382 62.5 8.7 145818 0.066
## 7 2372 6572 8103 12329 184.4 22.2 20921 0.152
## 8 11062 23078 54935 23804 370.4 41.0 65486 0.263
## 9 17111 23907 52108 21796 221.5 21.5 63806 0.276
## 10 1206 3930 6126 15586 330.4 29.5 1840 0.437
## 11 2150 5704 6200 10870 184.2 12.0 8913 0.274
## 12 5251 6155 10383 16875 146.4 27.5 78796 0.151
## 13 14341 13203 19396 14691 94.6 17.8 6354 1.574
# 鉴于各个变量的值大小差别较远,采用相关系数矩阵计算主成分
pc <- princomp(d1, cor = T)
summary(pc, loadings = T)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 1.7621 1.7022 0.9645 0.80133 0.55144 0.29427 0.179400 0.0494143
## Proportion of Variance 0.3881 0.3622 0.1163 0.08027 0.03801 0.01082 0.004023 0.0003052
## Cumulative Proportion 0.3881 0.7503 0.8666 0.94684 0.98485 0.99567 0.999695 1.0000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## X1 -0.477 -0.296 -0.104 0.184 0.758 0.245
## X2 -0.473 -0.278 -0.163 -0.174 -0.305 -0.518 0.527
## X3 -0.424 -0.378 -0.156 -0.174 -0.781
## X4 0.213 -0.451 0.516 0.539 0.288 -0.249 0.220
## X5 0.388 -0.331 -0.321 -0.199 -0.450 0.582 0.233
## X6 0.352 -0.403 -0.145 0.279 -0.317 -0.714
## X7 -0.215 0.377 -0.140 0.758 -0.418 0.194
## X8 -0.273 0.891 -0.322 0.122
# 碎石图和载荷图
screeplot(pc, type = "lines", main = "Industries")
biplot(pc, choices = 1:2)
biplot(pc, choices = 1:2, pc.biplot = T)
可以看出,前3个主成分已经可以解释86.7%的方差,前4个主成分则可以解释94.7%的方差。
对13个行业分别按3个主成分排序:
# 计算13个行业的前3个主成分
pred <- predict(pc)[, 1:3]
# 分别按3个主成分排序
ord <- cbind(No = 1:13, pred)
ord[order(ord[, 2], decreasing = T), ]
## No Comp.1 Comp.2 Comp.3
## [1,] 8 2.2846 -2.33577 -1.14410
## [2,] 10 2.1148 -0.85885 -0.24049
## [3,] 12 1.2505 -0.03158 -0.29874
## [4,] 7 1.1475 0.33092 -0.29333
## [5,] 9 0.8755 -0.93223 -0.36728
## [6,] 11 0.7425 0.78646 0.12756
## [7,] 13 0.2737 -0.48327 2.92089
## [8,] 6 -0.3434 1.84604 -0.03241
## [9,] 4 -0.4786 -1.23197 1.03842
## [10,] 2 -0.5186 2.69747 -0.23763
## [11,] 3 -1.0996 3.35724 -0.42613
## [12,] 1 -1.5355 -0.78961 -0.56001
## [13,] 5 -4.7134 -2.35482 -0.48674
ord[order(ord[, 3], decreasing = T), ]
## No Comp.1 Comp.2 Comp.3
## [1,] 3 -1.0996 3.35724 -0.42613
## [2,] 2 -0.5186 2.69747 -0.23763
## [3,] 6 -0.3434 1.84604 -0.03241
## [4,] 11 0.7425 0.78646 0.12756
## [5,] 7 1.1475 0.33092 -0.29333
## [6,] 12 1.2505 -0.03158 -0.29874
## [7,] 13 0.2737 -0.48327 2.92089
## [8,] 1 -1.5355 -0.78961 -0.56001
## [9,] 10 2.1148 -0.85885 -0.24049
## [10,] 9 0.8755 -0.93223 -0.36728
## [11,] 4 -0.4786 -1.23197 1.03842
## [12,] 8 2.2846 -2.33577 -1.14410
## [13,] 5 -4.7134 -2.35482 -0.48674
ord[order(ord[, 4], decreasing = T), ]
## No Comp.1 Comp.2 Comp.3
## [1,] 13 0.2737 -0.48327 2.92089
## [2,] 4 -0.4786 -1.23197 1.03842
## [3,] 11 0.7425 0.78646 0.12756
## [4,] 6 -0.3434 1.84604 -0.03241
## [5,] 2 -0.5186 2.69747 -0.23763
## [6,] 10 2.1148 -0.85885 -0.24049
## [7,] 7 1.1475 0.33092 -0.29333
## [8,] 12 1.2505 -0.03158 -0.29874
## [9,] 9 0.8755 -0.93223 -0.36728
## [10,] 3 -1.0996 3.35724 -0.42613
## [11,] 5 -4.7134 -2.35482 -0.48674
## [12,] 1 -1.5355 -0.78961 -0.56001
## [13,] 8 2.2846 -2.33577 -1.14410
用聚类分析对行业进行分类:
# 计算标准化的欧氏距离
dis <- dist(scale(pred))
# 进行聚类分析
hc1 <- hclust(dis, "complete")
hc2 <- hclust(dis, "average")
hc3 <- hclust(dis, "centroid")
hc4 <- hclust(dis, "ward")
# 画出谱系图
opar <- par(mfrow = c(2, 2))
plot(hc1, hang = -1, main = "complete")
plot(hc2, hang = -1, main = "average")
plot(hc3, hang = -1, main = "centroid")
plclust(hc4, hang = -1, main = "ward")
re4 <- rect.hclust(hc4, k = 4, border = "red")
par(opar)
可以看出,用Ward法求出的决策树可以将行业分为4类
# 整理数据:将电子书中的数据复制保存到9.1.txt文件中,置于工作目录下
tmp <- read.table("9.2.txt", header = F)
d2 <- tmp[, -1]
names(d2) <- c(paste("X", 1:4, sep = ""), "Y")
d2
## X1 X2 X3 X4 Y
## 1 82.9 92 17.1 94 8.4
## 2 88.0 93 21.3 96 9.6
## 3 99.9 96 25.1 97 10.4
## 4 105.3 94 29.0 97 11.4
## 5 117.7 100 34.0 100 12.2
## 6 131.0 101 40.0 101 14.2
## 7 148.2 105 44.0 104 15.8
## 8 161.8 112 49.0 109 17.9
## 9 174.2 112 51.0 111 19.6
## 10 184.7 112 53.0 111 20.8
# 先进行普通的线性回归
lm.sol <- lm(Y ~ ., data = d2)
summary(lm.sol)
##
## Call:
## lm(formula = Y ~ ., data = d2)
##
## Residuals:
## 1 2 3 4 5 6 7 8 9 10
## 0.02480 0.07948 0.01238 -0.00703 -0.28835 0.21609 -0.14209 0.15836 -0.13596 0.08231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.6677 5.9436 -2.97 0.0311 *
## X1 0.0901 0.0210 4.30 0.0077 **
## X2 -0.2313 0.0713 -3.24 0.0229 *
## X3 0.0181 0.0391 0.46 0.6633
## X4 0.4207 0.1185 3.55 0.0164 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.204 on 5 degrees of freedom
## Multiple R-squared: 0.999, Adjusted R-squared: 0.998
## F-statistic: 1.02e+03 on 4 and 5 DF, p-value: 1.83e-07
# 鉴于各个变量的值大小差别较远,采用相关系数矩阵计算主成分
pc2 <- princomp(~X1 + X2 + X3 + X4, data = d2, cor = T)
summary(pc2, loadings = T)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.986 0.199907 0.112190 0.0603086
## Proportion of Variance 0.986 0.009991 0.003147 0.0009093
## Cumulative Proportion 0.986 0.995944 0.999091 1.0000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.502 -0.237 0.579 0.598
## X2 -0.500 0.493 -0.610 0.367
## X3 -0.498 -0.707 -0.368 -0.342
## X4 -0.501 0.449 0.396 -0.626
# 可见只需取第1个主成分即可,它是X1~4的综合指数
# 由于最小特征值(0.0603085506)^2≈0,所以存在多重共线性
# 进行主成分回归
d2$Z <- predict(pc2)[, 1]
d2
## X1 X2 X3 X4 Y Z
## 1 82.9 92 17.1 94 8.4 2.74300
## 2 88.0 93 21.3 96 9.6 2.26913
## 3 99.9 96 25.1 97 10.4 1.66537
## 4 105.3 94 29.0 97 11.4 1.55845
## 5 117.7 100 34.0 100 12.2 0.53961
## 6 131.0 101 40.0 101 14.2 -0.04403
## 7 148.2 105 44.0 104 15.8 -0.96231
## 8 161.8 112 49.0 109 17.9 -2.22802
## 9 174.2 112 51.0 111 19.6 -2.65381
## 10 184.7 112 53.0 111 20.8 -2.88739
lm.pc <- lm(Y ~ Z, data = d2)
summary(lm.pc)
##
## Call:
## lm(formula = Y ~ Z, data = d2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7224 -0.2095 0.0515 0.2103 0.8186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.0300 0.1662 84.4 4.3e-13 ***
## Z -2.0612 0.0837 -24.6 7.9e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.525 on 8 degrees of freedom
## Multiple R-squared: 0.987, Adjusted R-squared: 0.985
## F-statistic: 607 on 1 and 8 DF, p-value: 7.87e-09
回归的结果非常显著。由此,可求出主成分回归方程:
Y = 14.0300 + -2.0612 Z
= 14.03 + 1.0339 x1 + 1.0302 x2 + 1.0264 x3 + 1.0320 x4
= -23.7777 + 0.0299 X1 + 0.1337 X2 + 0.0836 X3 + 0.1697 X4
答:例8.7见483页(电子版500页)
# 输入相关矩阵
x <- c(1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382, 0.846,
1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.277, 0.805, 0.881,
1, 0.801, 0.38, 0.319, 0.237, 0.345, 0.859, 0.826, 0.801,
1, 0.436, 0.329, 0.327, 0.365, 0.473, 0.376, 0.38, 0.436,
1, 0.762, 0.73, 0.629, 0.398, 0.326, 0.319, 0.329, 0.762,
1, 0.583, 0.577, 0.301, 0.277, 0.237, 0.327, 0.73, 0.583,
1, 0.539, 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539,
1)
names <- unlist(strsplit("身高 手臂长 上肢长 下肢长 体重 颈围 胸围 胸宽",
" "))
(r <- matrix(x, nrow = 8, dimnames = list(names, names)))
## 身高 手臂长 上肢长 下肢长 体重 颈围 胸围 胸宽
## 身高 1.000 0.846 0.805 0.859 0.473 0.398 0.301 0.382
## 手臂长 0.846 1.000 0.881 0.826 0.376 0.326 0.277 0.415
## 上肢长 0.805 0.881 1.000 0.801 0.380 0.319 0.237 0.345
## 下肢长 0.859 0.826 0.801 1.000 0.436 0.329 0.327 0.365
## 体重 0.473 0.376 0.380 0.436 1.000 0.762 0.730 0.629
## 颈围 0.398 0.326 0.319 0.329 0.762 1.000 0.583 0.577
## 胸围 0.301 0.277 0.237 0.327 0.730 0.583 1.000 0.539
## 胸宽 0.382 0.277 0.345 0.365 0.629 0.577 0.539 1.000
# 进行主成分分析
# 由协方差和相关系数的定义可知,相关矩阵等于数据标准化后的协方差矩阵
pc3 <- princomp(covmat = r)
summary(pc3, loadings = T)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 2.154 1.3424 0.69491 0.64768 0.4858 0.43113 0.36895 0.31185
## Proportion of Variance 0.580 0.2252 0.06036 0.05244 0.0295 0.02323 0.01702 0.01216
## Cumulative Proportion 0.580 0.8053 0.86566 0.91809 0.9476 0.97083 0.98784 1.00000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## 身高 -0.401 -0.274 -0.437 -0.125 0.649 -0.363
## 手臂长 -0.383 -0.351 0.126 0.353 0.252 0.724
## 上肢长 -0.379 -0.337 0.550 0.324 -0.238 -0.520
## 下肢长 -0.391 -0.290 0.165 -0.478 -0.249 -0.659 0.102
## 体重 -0.353 0.387 0.193 -0.116 -0.292 0.751 0.154
## 颈围 -0.314 0.393 -0.728 0.144 -0.418 -0.121
## 胸围 -0.288 0.426 0.483 0.595 0.209 -0.271 -0.157
## 胸宽 -0.300 0.344 -0.837 0.263 0.109
# 进行因子分析
# 调用薛毅书中536页(电子版553页)factor.analy.R程序,为此需要将factor.analy1.R、factor.analy2.R、factor.analy3.R这三个程序都放在工作目录下
# 书中提供的三种方法中,只有主成分法返回了结果,另外两种方法报错,原因暂未弄清
source("factor.analy.R")
factor.analy(r, method = "princomp")
## $method
## [1] "Principal Component Method"
##
## $loadings
## Factor1 Factor2
## X1 -0.8625 -0.3785
## X2 -0.8444 -0.4447
## X3 -0.8162 -0.4632
## X4 -0.8427 -0.4012
## X5 -0.7580 0.5136
## X6 -0.6740 0.5229
## X7 -0.6169 0.5694
## X8 -0.6430 0.4554
##
## $var
## common spcific
## X1 0.8872 0.11284
## X2 0.9108 0.08917
## X3 0.8808 0.11921
## X4 0.8710 0.12900
## X5 0.8384 0.16160
## X6 0.7278 0.27218
## X7 0.7047 0.29526
## X8 0.6208 0.37919
##
## $B
## Factor1 Factor2
## SS loadings 4.656 1.7854
## Proportion Var 0.582 0.2232
## Cumulative Var 0.582 0.8052
# factor.analy(r,method='factor')
# factor.analy(r,method='likelihood')
# R语言的基础stats包也提供了因子分析函数factanal
factanal(covmat = r, factors = 2)
##
## Call:
## factanal(factors = 2, covmat = r)
##
## Uniquenesses:
## 身高 手臂长 上肢长 下肢长 体重 颈围 胸围 胸宽
## 0.166 0.110 0.166 0.196 0.099 0.360 0.414 0.538
##
## Loadings:
## Factor1 Factor2
## 身高 0.869 0.282
## 手臂长 0.929 0.164
## 上肢长 0.896 0.174
## 下肢长 0.862 0.247
## 体重 0.244 0.917
## 颈围 0.201 0.774
## 胸围 0.141 0.752
## 胸宽 0.222 0.643
##
## Factor1 Factor2
## SS loadings 3.333 2.618
## Proportion Var 0.417 0.327
## Cumulative Var 0.417 0.744
##
## The degrees of freedom for the model is 13 and the fit was 0.2495
和主成分分析的结果类似,两个因子依次为“身长因子”和“身宽因子”。
答:(1)例3.18见181页(电子版198页)
# 读取数据
course <- read.table("course.data")
# 因子分析
factor.analy(cor(course), method = "princomp")
## $method
## [1] "Principal Component Method"
##
## $loadings
## Factor1 Factor2
## X1 -0.7665 0.59539
## X2 -0.8721 0.38006
## X3 -0.7956 0.02803
## X4 -0.8502 -0.43386
## X5 -0.6993 -0.63101
##
## $var
## common spcific
## X1 0.9420 0.05795
## X2 0.9050 0.09503
## X3 0.6338 0.36621
## X4 0.9111 0.08892
## X5 0.8872 0.11284
##
## $B
## Factor1 Factor2
## SS loadings 3.1929 1.0861
## Proportion Var 0.6386 0.2172
## Cumulative Var 0.6386 0.8558
factor.analy(cor(course), method = "factor") #kmax=20改为100
## $method
## [1] "Principal Factor Method"
##
## $loadings
## Factor1 Factor2
## X1 -0.8261 0.65799
## X2 -0.8418 0.28764
## X3 -0.6809 -0.02151
## X4 -0.8974 -0.55375
## X5 -0.6183 -0.44331
##
## $var
## common spcific
## X1 1.1154 -0.1154
## X2 0.7914 0.2086
## X3 0.4641 0.5359
## X4 1.1120 -0.1120
## X5 0.5789 0.4211
##
## $B
## Factor1 Factor2
## SS loadings 3.0425 1.0193
## Proportion Var 0.6085 0.2039
## Cumulative Var 0.6085 0.8124
##
## $iterative
## [1] 88
factor.analy(cor(course), method = "likelihood") #kmax=20改为200
## $method
## [1] "Maximum Likelihood Method"
##
## $loadings
## Factor1 Factor2
## [1,] -0.7913 0.60026
## [2,] -0.8710 0.32549
## [3,] -0.7107 -0.03016
## [4,] -0.8600 -0.49847
## [5,] -0.6443 -0.49334
##
## $var
## common spcific
## X1 0.9864 0.01357
## X2 0.8645 0.13550
## X3 0.5060 0.49402
## X4 0.9881 0.01186
## X5 0.6585 0.34146
##
## $B
## Factor1 Factor2
## SS loadings 3.0446 0.9590
## Proportion Var 0.6089 0.1918
## Cumulative Var 0.6089 0.8007
##
## $iterative
## [1] 176
factanal(course, factors = 2)
##
## Call:
## factanal(x = course, factors = 2)
##
## Uniquenesses:
## X1 X2 X3 X4 X5
## 0.005 0.141 0.494 0.005 0.346
##
## Loadings:
## Factor1 Factor2
## X1 0.992 0.104
## X2 0.854 0.360
## X3 0.497 0.509
## X4 0.284 0.956
## X5 0.132 0.798
##
## Factor1 Factor2
## SS loadings 2.059 1.950
## Proportion Var 0.412 0.390
## Cumulative Var 0.412 0.802
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 0.28 on 1 degree of freedom.
## The p-value is 0.598
因子1可看做“综合成绩因子”,因子2可看做“文科成绩因子”(包括政治、语文)。