Machine Learning Exercise 04

主成分分析和因子分析

薛毅书P557（电子版574），9.2，9.3

9.2

答：

options(width = 600)

# 整理数据：将电子书中的数据复制保存到9.2.txt文件中，置于工作目录下
tmp <- read.table("9.2.txt", header = F)
d2 <- tmp[, -1]
names(d2) <- c(paste("X", 1:4, sep = ""), "Y")
d2

##       X1  X2   X3  X4    Y
## 1   82.9  92 17.1  94  8.4
## 2   88.0  93 21.3  96  9.6
## 3   99.9  96 25.1  97 10.4
## 4  105.3  94 29.0  97 11.4
## 5  117.7 100 34.0 100 12.2
## 6  131.0 101 40.0 101 14.2
## 7  148.2 105 44.0 104 15.8
## 8  161.8 112 49.0 109 17.9
## 9  174.2 112 51.0 111 19.6
## 10 184.7 112 53.0 111 20.8


# 先进行普通的线性回归

lm.sol <- lm(Y ~ ., data = d2)
summary(lm.sol)

## 
## Call:
## lm(formula = Y ~ ., data = d2)
## 
## Residuals:
##        1        2        3        4        5        6        7        8        9       10 
##  0.02480  0.07948  0.01238 -0.00703 -0.28835  0.21609 -0.14209  0.15836 -0.13596  0.08231 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -17.6677     5.9436   -2.97   0.0311 * 
## X1            0.0901     0.0210    4.30   0.0077 **
## X2           -0.2313     0.0713   -3.24   0.0229 * 
## X3            0.0181     0.0391    0.46   0.6633   
## X4            0.4207     0.1185    3.55   0.0164 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.204 on 5 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.998 
## F-statistic: 1.02e+03 on 4 and 5 DF,  p-value: 1.83e-07


# 鉴于各个变量的值大小差别较远，采用相关系数矩阵计算主成分

pc2 <- princomp(~X1 + X2 + X3 + X4, data = d2, cor = T)
summary(pc2, loadings = T)

## Importance of components:
##                        Comp.1   Comp.2   Comp.3    Comp.4
## Standard deviation      1.986 0.199907 0.112190 0.0603086
## Proportion of Variance  0.986 0.009991 0.003147 0.0009093
## Cumulative Proportion   0.986 0.995944 0.999091 1.0000000
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.502 -0.237  0.579  0.598
## X2 -0.500  0.493 -0.610  0.367
## X3 -0.498 -0.707 -0.368 -0.342
## X4 -0.501  0.449  0.396 -0.626


# 可见只需取第1个主成分即可，它是X1~4的综合指数

# 由于最小特征值(0.0603085506)^2≈0，所以存在多重共线性

# 进行主成分回归

d2$Z <- predict(pc2)[, 1]
d2

##       X1  X2   X3  X4    Y        Z
## 1   82.9  92 17.1  94  8.4  2.74300
## 2   88.0  93 21.3  96  9.6  2.26913
## 3   99.9  96 25.1  97 10.4  1.66537
## 4  105.3  94 29.0  97 11.4  1.55845
## 5  117.7 100 34.0 100 12.2  0.53961
## 6  131.0 101 40.0 101 14.2 -0.04403
## 7  148.2 105 44.0 104 15.8 -0.96231
## 8  161.8 112 49.0 109 17.9 -2.22802
## 9  174.2 112 51.0 111 19.6 -2.65381
## 10 184.7 112 53.0 111 20.8 -2.88739


lm.pc <- lm(Y ~ Z, data = d2)
summary(lm.pc)

## 
## Call:
## lm(formula = Y ~ Z, data = d2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7224 -0.2095  0.0515  0.2103  0.8186 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0300     0.1662    84.4  4.3e-13 ***
## Z            -2.0612     0.0837   -24.6  7.9e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.525 on 8 degrees of freedom
## Multiple R-squared:  0.987,  Adjusted R-squared:  0.985 
## F-statistic:  607 on 1 and 8 DF,  p-value: 7.87e-09

回归的结果非常显著。由此，可求出主成分回归方程：

Y = 14.0300 + -2.0612 Z

= 14.0300 + 1.0339 x1 + 1.0302 x2 + 1.0264 x3 + 1.0320 x4

= -23.7777 + 0.0299 X1 + 0.1337 X2 + 0.0836 X3 + 0.1697 X4

9.3

答：例8.7见483页（电子版500页）

# 输入相关矩阵

x <- c(1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382, 0.846, 
    1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.277, 0.805, 0.881, 
    1, 0.801, 0.38, 0.319, 0.237, 0.345, 0.859, 0.826, 0.801, 
    1, 0.436, 0.329, 0.327, 0.365, 0.473, 0.376, 0.38, 0.436, 
    1, 0.762, 0.73, 0.629, 0.398, 0.326, 0.319, 0.329, 0.762, 
    1, 0.583, 0.577, 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 
    1, 0.539, 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 
    1)
names <- unlist(strsplit("身高 手臂长 上肢长 下肢长 体重 颈围 胸围 胸宽", 
    " "))
(r <- matrix(x, nrow = 8, dimnames = list(names, names)))

##         身高 手臂长 上肢长 下肢长  体重  颈围  胸围  胸宽
## 身高   1.000  0.846  0.805  0.859 0.473 0.398 0.301 0.382
## 手臂长 0.846  1.000  0.881  0.826 0.376 0.326 0.277 0.415
## 上肢长 0.805  0.881  1.000  0.801 0.380 0.319 0.237 0.345
## 下肢长 0.859  0.826  0.801  1.000 0.436 0.329 0.327 0.365
## 体重   0.473  0.376  0.380  0.436 1.000 0.762 0.730 0.629
## 颈围   0.398  0.326  0.319  0.329 0.762 1.000 0.583 0.577
## 胸围   0.301  0.277  0.237  0.327 0.730 0.583 1.000 0.539
## 胸宽   0.382  0.277  0.345  0.365 0.629 0.577 0.539 1.000


# 进行主成分分析

# 由协方差和相关系数的定义可知，相关矩阵等于数据标准化后的协方差矩阵

pc3 <- princomp(covmat = r)
summary(pc3, loadings = T)

## Importance of components:
##                        Comp.1 Comp.2  Comp.3  Comp.4 Comp.5  Comp.6  Comp.7  Comp.8
## Standard deviation      2.154 1.3424 0.69491 0.64768 0.4858 0.43113 0.36895 0.31185
## Proportion of Variance  0.580 0.2252 0.06036 0.05244 0.0295 0.02323 0.01702 0.01216
## Cumulative Proportion   0.580 0.8053 0.86566 0.91809 0.9476 0.97083 0.98784 1.00000
## 
## Loadings:
##        Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## 身高   -0.401 -0.274               -0.437 -0.125  0.649 -0.363
## 手臂长 -0.383 -0.351  0.126         0.353         0.252  0.724
## 上肢长 -0.379 -0.337                0.550  0.324 -0.238 -0.520
## 下肢长 -0.391 -0.290         0.165 -0.478 -0.249 -0.659  0.102
## 体重   -0.353  0.387  0.193 -0.116 -0.292  0.751         0.154
## 颈围   -0.314  0.393        -0.728  0.144 -0.418 -0.121       
## 胸围   -0.288  0.426  0.483  0.595  0.209 -0.271        -0.157
## 胸宽   -0.300  0.344 -0.837  0.263                       0.109


# 进行因子分析

# 调用薛毅书中536页（电子版553页）factor.analy.R程序，为此需要将factor.analy1.R、factor.analy2.R、factor.analy3.R这三个程序都放在工作目录下

# 书中提供的三种方法中，只有主成分法返回了结果，另外两种方法报错，原因暂未弄清

source("factor.analy.R")
factor.analy(r, method = "princomp")

## $method
## [1] "Principal Component Method"
## 
## $loadings
##    Factor1 Factor2
## X1 -0.8625 -0.3785
## X2 -0.8444 -0.4447
## X3 -0.8162 -0.4632
## X4 -0.8427 -0.4012
## X5 -0.7580  0.5136
## X6 -0.6740  0.5229
## X7 -0.6169  0.5694
## X8 -0.6430  0.4554
## 
## $var
##    common spcific
## X1 0.8872 0.11284
## X2 0.9108 0.08917
## X3 0.8808 0.11921
## X4 0.8710 0.12900
## X5 0.8384 0.16160
## X6 0.7278 0.27218
## X7 0.7047 0.29526
## X8 0.6208 0.37919
## 
## $B
##                Factor1 Factor2
## SS loadings      4.656  1.7854
## Proportion Var   0.582  0.2232
## Cumulative Var   0.582  0.8052


# factor.analy(r,method='factor')

# factor.analy(r,method='likelihood')

# R语言的基础stats包也提供了因子分析函数factanal

factanal(covmat = r, factors = 2)

## 
## Call:
## factanal(factors = 2, covmat = r)
## 
## Uniquenesses:
##   身高 手臂长 上肢长 下肢长   体重   颈围   胸围   胸宽 
##  0.166  0.110  0.166  0.196  0.099  0.360  0.414  0.538 
## 
## Loadings:
##        Factor1 Factor2
## 身高   0.869   0.282  
## 手臂长 0.929   0.164  
## 上肢长 0.896   0.174  
## 下肢长 0.862   0.247  
## 体重   0.244   0.917  
## 颈围   0.201   0.774  
## 胸围   0.141   0.752  
## 胸宽   0.222   0.643  
## 
##                Factor1 Factor2
## SS loadings      3.333   2.618
## Proportion Var   0.417   0.327
## Cumulative Var   0.417   0.744
## 
## The degrees of freedom for the model is 13 and the fit was 0.2495

和主成分分析的结果类似，两个因子依次为“身长因子”和“身宽因子”。