we91 <- data.frame(X1 = c(90342, 4903, 6735, 49454, 139190, 12215, 2372, 11062,
17111, 1206, 2150, 5251, 14341), X2 = c(52455, 1973, 21139, 36241, 203505,
16219, 6572, 23078, 23907, 3930, 5704, 6155, 13203), X3 = c(101091, 2035,
3767, 81557, 215898, 10351, 8103, 54935, 52108, 6126, 6200, 10383, 19396),
X4 = c(19272, 10313, 1780, 22504, 10609, 6382, 12329, 23804, 21796, 15586,
10870, 16875, 14691), X5 = c(82, 34.2, 36.1, 98.1, 93.2, 62.5, 184.4,
370.4, 221.5, 330.4, 184.2, 146.4, 94.6), X6 = c(16.1, 7.1, 8.2, 25.9,
12.6, 8.7, 22.2, 41, 21.5, 29.5, 12, 27.5, 17.8), X7 = c(197435, 592077,
726396, 348226, 139572, 145818, 20921, 65486, 63806, 1840, 8913, 78796,
6354), X8 = c(0.172, 0.003, 0.003, 0.985, 0.628, 0.066, 0.152, 0.263,
0.276, 0.437, 0.274, 0.151, 1.574))
we91.pr <- princomp(we91, cor = TRUE)
summary(we91.pr, loadings = TRUE)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## Standard deviation 1.7621 1.7022 0.9645 0.80133 0.55144 0.29427
## Proportion of Variance 0.3881 0.3622 0.1163 0.08027 0.03801 0.01082
## Cumulative Proportion 0.3881 0.7503 0.8666 0.94684 0.98485 0.99567
## Comp.7 Comp.8
## Standard deviation 0.179400 0.0494143
## Proportion of Variance 0.004023 0.0003052
## Cumulative Proportion 0.999695 1.0000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## X1 -0.477 -0.296 -0.104 0.184 0.758 0.245
## X2 -0.473 -0.278 -0.163 0.174 -0.305 -0.518 0.527
## X3 -0.424 -0.378 -0.156 -0.174 -0.781
## X4 0.213 -0.451 -0.516 0.539 0.288 -0.249 0.220
## X5 0.388 -0.331 -0.321 0.199 -0.450 0.582 0.233
## X6 0.352 -0.403 -0.145 -0.279 -0.317 -0.714
## X7 -0.215 0.377 -0.140 -0.758 -0.418 0.194
## X8 -0.273 0.891 -0.322 0.122
Since the cumulative contribution of the first four principal components already reaches 94.68%, these four components can be used in place of the eight original indicators, achieving the goal of dimensionality reduction.
# Compute the principal component scores of each sample
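The cumulative proportions can be re-derived from the component standard deviations as a quick check (a minimal sketch using the we91.pr object above):
# Eigenvalues are sdev^2; the cumulative share should read 0.9468 at Comp.4
round(cumsum(we91.pr$sdev^2) / sum(we91.pr$sdev^2), 4)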
we91.prdt <- predict(we91.pr); we91.prdt
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## [1,] -1.5355 -0.78961 -0.56001 -0.50982 1.10179 0.002675 0.410987 0.0045907
## [2,] -0.5186 2.69747 -0.23763 -0.88669 0.16713 0.302963 -0.132418 0.0696051
## [3,] -1.0996 3.35724 -0.42613 -0.60625 -0.96794 -0.061794 0.085556 -0.0249831
## [4,] -0.4786 -1.23197 1.03842 -1.66487 0.01184 -0.077609 -0.008986 -0.0540978
## [5,] -4.7134 -2.35482 -0.48674 0.78902 -0.51657 -0.019903 -0.126040 0.0235021
## [6,] -0.3434 1.84604 -0.03241 0.97630 0.38398 -0.214601 -0.028390 -0.0695329
## [7,] 1.1475 0.33092 -0.29333 0.71995 0.09516 -0.315671 -0.005296 -0.0364517
## [8,] 2.2846 -2.33577 -1.14410 -0.57948 -0.59525 -0.011743 -0.041535 -0.0545827
## [9,] 0.8755 -0.93223 -0.36728 -0.13377 0.54814 0.487868 -0.299949 -0.0009447
## [10,] 2.1148 -0.85885 -0.24049 0.53512 -0.67391 0.185932 0.290797 0.0756972
## [11,] 0.7425 0.78646 0.12756 1.15634 0.24384 0.397822 0.018545 -0.0307115
## [12,] 1.2505 -0.03158 -0.29874 -0.08509 0.38556 -0.668578 -0.176243 0.0818481
## [13,] 0.2737 -0.48327 2.92089 0.28923 -0.18378 -0.007362 0.012972 0.0160612
rownames(we91.prdt) <- c("冶金", "电力", "煤炭", "化学", "机械", "建材", "森工",
"食品", "纺织", "缝纫", "皮革", "造纸", "文教艺术用品")
# (industries, in order: metallurgy, electric power, coal, chemicals, machinery,
#  building materials, timber, food, textiles, garments, leather, papermaking,
#  stationery and art supplies)
# Rank the industries by their principal component scores
# (the earlier an industry appears in the ordering, the better its rank)
rownames(we91.prdt)[order(we91.prdt[,1])]
## [1] "机械" "冶金" "煤炭" "电力"
## [5] "化学" "建材" "文教艺术用品" "皮革"
## [9] "纺织" "森工" "造纸" "缝纫"
## [13] "食品"
rownames(we91.prdt)[order(we91.prdt[,2])]
## [1] "机械" "食品" "化学" "纺织"
## [5] "缝纫" "冶金" "文教艺术用品" "造纸"
## [9] "森工" "皮革" "建材" "电力"
## [13] "煤炭"
rownames(we91.prdt)[order(we91.prdt[,3])]
## [1] "食品" "冶金" "机械" "煤炭"
## [5] "纺织" "造纸" "森工" "缝纫"
## [9] "电力" "建材" "皮革" "化学"
## [13] "文教艺术用品"
rownames(we91.prdt)[order(we91.prdt[,4])]
## [1] "化学" "电力" "煤炭" "食品"
## [5] "冶金" "纺织" "造纸" "文教艺术用品"
## [9] "缝纫" "森工" "机械" "建材"
## [13] "皮革"
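For convenience, the four rankings can also be shown side by side (a small presentation sketch; it merely rearranges the orderings printed above):
# One column of industry names per principal component, in ranked order
sapply(1:4, function(j) rownames(we91.prdt)[order(we91.prdt[, j])])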
# Scree plot of the principal components
screeplot(we91.pr, npcs = 4, type = "lines", main = "主成分碎石图")
d <- dist(scale(we91.prdt[, 1:4]), method = "euclidean")
hc1 <- hclust(d, "complete")               # complete-linkage (furthest-neighbour) clustering
plot(hc1, hang = -1)                       # plclust() is defunct; plot() on an hclust object replaces it
rect.hclust(hc1, k = 6, border = "red")    # the number of clusters can be chosen by the analyst
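If the actual cluster memberships are wanted rather than just the boxes drawn by rect.hclust(), cutree() returns them (a small addition, not in the original code):
# Integer cluster label (1-6) for each industry at k = 6
cutree(hc1, k = 6)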
we92 = data.frame(X1 = c(82.9, 88, 99.9, 105.3, 117.7, 131, 148.2, 161.8, 174.2,
184.7), X2 = c(92, 93, 96, 94, 100, 101, 105, 112, 112, 112), X3 = c(17.1,
21.3, 25.1, 29, 34, 40, 44, 49, 51, 53), X4 = c(94, 96, 97, 97, 100, 101,
104, 109, 111, 111), Y = c(8.4, 9.6, 10.4, 11.4, 12.2, 14.2, 15.8, 17.9,
19.6, 20.8))
lm.sol = lm(Y ~ X1 + X2 + X3 + X4, data = we92)
summary(lm.sol)
##
## Call:
## lm(formula = Y ~ X1 + X2 + X3 + X4, data = we92)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## 0.02480 0.07948 0.01238 -0.00703 -0.28835 0.21609 -0.14209 0.15836
## 9 10
## -0.13596 0.08231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.6677 5.9436 -2.97 0.0311 *
## X1 0.0901 0.0210 4.30 0.0077 **
## X2 -0.2313 0.0713 -3.24 0.0229 *
## X3 0.0181 0.0391 0.46 0.6633
## X4 0.4207 0.1185 3.55 0.0164 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.204 on 5 degrees of freedom
## Multiple R-squared: 0.999, Adjusted R-squared: 0.998
## F-statistic: 1.02e+03 on 4 and 5 DF, p-value: 1.83e-07
Judging from these results the regression is not satisfactory: the coefficient of X3 fails its significance test (p = 0.66). The four predictors are strongly correlated, which motivates the principal component regression below.
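The collinearity can be checked directly: the variance inflation factors are the diagonal of the inverse correlation matrix of the predictors (a quick diagnostic sketch, not part of the original analysis):
# VIFs well above 10 indicate severe multicollinearity among X1-X4
diag(solve(cor(we92[, c("X1", "X2", "X3", "X4")])))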
we92.pr = princomp(~X1 + X2 + X3 + X4, data = we92, cor = TRUE)
summary(we92.pr, loadings = TRUE)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.986 0.199907 0.112190 0.0603086
## Proportion of Variance 0.986 0.009991 0.003147 0.0009093
## Cumulative Proportion 0.986 0.995944 0.999091 1.0000000
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.502 -0.237 0.579 0.598
## X2 -0.500 0.493 -0.610 0.367
## X3 -0.498 -0.707 -0.368 -0.342
## X4 -0.501 0.449 0.396 -0.626
Since the cumulative contribution of the first two principal components already reaches 99%, the remaining components are discarded, achieving dimensionality reduction.
# Compute the principal component scores of the samples and carry out the principal component regression
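A scree plot can be drawn for this example as well (an optional check, not in the original code); the eigenvalues confirm that the first component dominates:
we92.pr$sdev^2                                   # eigenvalues of the correlation matrix
screeplot(we92.pr, type = "lines", main = "Scree plot of we92.pr")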
pre = predict(we92.pr)
we92$z1 = pre[, 1]
we92$z2 = pre[, 2]
lm.sol = lm(Y ~ z1 + z2, data = we92)
summary(lm.sol)
##
## Call:
## lm(formula = Y ~ z1 + z2, data = we92)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7432 -0.2922 0.0175 0.3081 0.8085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.0300 0.1712 81.93 1.1e-11 ***
## z1 -2.0612 0.0862 -23.90 5.7e-08 ***
## z2 -0.6241 0.8566 -0.73 0.49
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.542 on 7 degrees of freedom
## Multiple R-squared: 0.988, Adjusted R-squared: 0.984
## F-statistic: 286 on 2 and 7 DF, p-value: 1.94e-07
The overall regression is highly significant (F = 286, p = 1.94e-07) and the coefficient of z1 passes its test, although the coefficient of z2 does not (p = 0.49). The fitted principal component regression is Y = 14.03000 - 2.06119 z1 - 0.62409 z2.
# Transform back to obtain the regression equation in terms of the original variables
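Because z2 is not significant, one could also try a fit on z1 alone (a variation for comparison only; lm.sol is left untouched because the back-transformation below uses both z1 and z2, and lm.z1 is just an illustrative name):
# Refit using only the first principal component
lm.z1 <- lm(Y ~ z1, data = we92)
summary(lm.z1)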
beta = coef(lm.sol)
A = loadings(we92.pr)
x.bar = we92.pr$center
x.sd = we92.pr$scale
coef = (beta[2] * A[, 1] + beta[3] * A[, 2])/x.sd
beta0 = beta[1] - sum(x.bar * coef)
print(c(beta0, coef))
## (Intercept) X1 X2 X3 X4
## -16.88461 0.03421 0.09376 0.11955 0.12360
The regression equation in the original variables is: Y = -16.8846 + 0.03421 X1 + 0.09376 X2 + 0.11955 X3 + 0.12360 X4
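As a sanity check (an assumed extra step, not in the original code), the equation in the original coordinates should reproduce the fitted values of the principal component regression exactly:
# Compare the back-transformed predictions with fitted(lm.sol); the two columns should agree
yhat <- beta0 + as.matrix(we92[, c("X1", "X2", "X3", "X4")]) %*% coef
cbind(fitted(lm.sol), yhat)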
x <- c(1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382, 0.846, 1, 0.881,
0.826, 0.376, 0.326, 0.277, 0.277, 0.805, 0.881, 1, 0.801, 0.38, 0.319,
0.237, 0.345, 0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365, 0.473,
0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629, 0.398, 0.326, 0.319, 0.329, 0.762,
1, 0.583, 0.577, 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539, 0.382,
0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1)
names = c("身高 x1", "手臂长 x2", "上肢长 x3", "下肢长 x4", "体重 x5",
"颈围 x6", "胸围 x7", "胸宽 x8")
r <- matrix(x, nrow = 8, dimnames = list(names, names))
source("../R-Book-Demo/ch9/factor.analy1.R")
fa <- factor.analy1(r, m = 2)
fa # 选取2个因子
## $method
## [1] "Principal Component Method"
##
## $loadings
## Factor1 Factor2
## X1 -0.8625 -0.3785
## X2 -0.8444 -0.4447
## X3 -0.8162 -0.4632
## X4 -0.8427 -0.4012
## X5 -0.7580 0.5136
## X6 -0.6740 0.5229
## X7 -0.6169 0.5694
## X8 -0.6430 0.4554
##
## $var
## common spcific
## X1 0.8872 0.11284
## X2 0.9108 0.08917
## X3 0.8808 0.11921
## X4 0.8710 0.12900
## X5 0.8384 0.16160
## X6 0.7278 0.27218
## X7 0.7047 0.29526
## X8 0.6208 0.37919
##
## $B
## Factor1 Factor2
## SS loadings 4.656 1.7854
## Proportion Var 0.582 0.2232
## Cumulative Var 0.582 0.8052
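For the principal component method, the loadings are simply the leading eigenvectors of r scaled by the square roots of the corresponding eigenvalues; the following sketch (with illustrative names e and A2, not the book's function) reproduces fa$loadings up to column signs:
e  <- eigen(r)
A2 <- sweep(e$vectors[, 1:2], 2, sqrt(e$values[1:2]), "*")
round(A2, 4)   # matches fa$loadings apart from possible sign flips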
vm1 <- varimax(fa$loadings, normalize = F)
vm1
## $loadings
##
## Loadings:
## Factor1 Factor2
## X1 -0.913 0.232
## X2 -0.939 0.168
## X3 -0.929 0.136
## X4 -0.911 0.201
## X5 -0.282 0.871
## X6 -0.210 0.827
## X7 -0.137 0.828
## X8 -0.227 0.754
##
## Factor1 Factor2
## SS loadings 3.603 2.839
## Proportion Var 0.450 0.355
## Cumulative Var 0.450 0.805
##
## $rotmat
## [,1] [,2]
## [1,] 0.7888 -0.6146
## [2,] 0.6146 0.7888
Conclusion: after rotation, the first group of variables (X1, X2, X3, X4) has large loadings on Factor1, so Factor1 can be interpreted as a length factor; the second group (X5, X6, X7, X8) has large loadings on Factor2, so Factor2 can be interpreted as a width (girth) factor.
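Note that the orthogonal varimax rotation leaves the communalities unchanged, which can be verified directly (a quick check, not in the original code):
# Row sums of squared loadings before and after rotation should coincide
cbind(before = rowSums(fa$loadings^2), after = rowSums(vm1$loadings^2))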
we94 = data.frame(
x1 = c(99, 99, 100, 93, 100, 90, 75, 93, 87, 95, 76, 85),    # politics
x2 = c(94, 88, 98, 88, 91, 78, 73, 84, 73, 82, 72, 75),      # Chinese
x3 = c(93, 96, 81, 88, 72, 82, 88, 83, 60, 90, 43, 50),      # foreign language
x4 = c(100, 99, 96, 99, 96, 75, 97, 68, 76, 62, 67, 34),     # mathematics
x5 = c(100, 97, 100, 96, 78, 97, 89, 88, 84, 39, 78, 37))    # physics
r94 <- cor(we94)
fa94 <- factor.analy1(r94, m = 3); fa94   # extract 3 factors
## $method
## [1] "Principal Component Method"
##
## $loadings
## Factor1 Factor2 Factor3
## X1 -0.7665 0.59539 0.13511
## X2 -0.8721 0.38006 0.22906
## X3 -0.7956 0.02803 -0.59902
## X4 -0.8502 -0.43386 0.01694
## X5 -0.6993 -0.63101 0.22719
##
## $var
## common spcific
## X1 0.9603 0.039698
## X2 0.9574 0.042561
## X3 0.9926 0.007391
## X4 0.9114 0.088634
## X5 0.9388 0.061221
##
## $B
## Factor1 Factor2 Factor3
## SS loadings 3.1929 1.0861 0.48145
## Proportion Var 0.6386 0.2172 0.09629
## Cumulative Var 0.6386 0.8558 0.95210
vm94 <- varimax(fa94$loadings, normalize = F); vm94
## $loadings
##
## Loadings:
## Factor1 Factor2 Factor3
## X1 -0.945 -0.253
## X2 -0.903 -0.307 -0.218
## X3 -0.319 -0.265 -0.906
## X4 -0.267 -0.832 -0.384
## X5 -0.112 -0.954 -0.125
##
## Factor1 Factor2 Factor3
## SS loadings 1.895 1.771 1.095
## Proportion Var 0.379 0.354 0.219
## Cumulative Var 0.379 0.733 0.952
##
## $rotmat
## [,1] [,2] [,3]
## [1,] 0.6488 0.5939 0.47574
## [2,] -0.6704 0.7419 -0.01191
## [3,] -0.3600 -0.3112 0.87950
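The rotation returned by varimax() is orthogonal, so its crossproduct is (up to rounding) the 3 x 3 identity matrix (a quick check, not in the original code):
round(t(vm94$rotmat) %*% vm94$rotmat, 10)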