The full datasets are attached at the end of this document.
For the detailed code, see github.com.

## Principal components analysis of X1..X4 in the student data.
## cor = TRUE bases the decomposition on the correlation matrix.
student.pr <- princomp(
  ~ X1 + X2 + X3 + X4,
  data = student,
  cor = TRUE
)
## Component importance table, printed together with the loadings
summary(student.pr, loadings = TRUE)
## Importance of components:
##                           Comp.1     Comp.2     Comp.3     Comp.4
## Standard deviation     1.8817805 0.55980636 0.28179594 0.25711844
## Proportion of Variance 0.8852745 0.07834579 0.01985224 0.01652747
## Cumulative Proportion  0.8852745 0.96362029 0.98347253 1.00000000
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4
## X1 -0.497  0.543 -0.450  0.506
## X2 -0.515 -0.210 -0.462 -0.691
## X3 -0.481 -0.725  0.175  0.461
## X4 -0.507  0.368  0.744 -0.232
## Component scores: called without new data, so the scores are for the
## observations used to fit student.pr (one row per observation, one
## column per component)
predict(student.pr)
##         Comp.1      Comp.2      Comp.3       Comp.4
## 1   0.06990950 -0.23813701 -0.35509248 -0.266120139
## 2   1.59526340 -0.71847399  0.32813232 -0.118056646
## 3  -2.84793151  0.38956679 -0.09731731 -0.279482487
## 4   0.75996988  0.80604335 -0.04945722 -0.162949298
## 5  -2.73966777  0.01718087  0.36012615  0.358653044
## 6   2.10583168  0.32284393  0.18600422 -0.036456084
## 7  -1.42105591 -0.06053165  0.21093321 -0.044223092
## 8  -0.82583977 -0.78102576 -0.27557798  0.057288572
## 9  -0.93464402 -0.58469242 -0.08814136  0.181037746
## 10  2.36463820 -0.36532199  0.08840476  0.045520127
## 11  2.83741916  0.34875841  0.03310423 -0.031146930
## 12 -2.60851224  0.21278728 -0.33398037  0.210157574
## 13 -2.44253342 -0.16769496 -0.46918095 -0.162987830
## 14  1.86630669  0.05021384  0.37720280 -0.358821916
## 15  2.81347421 -0.31790107 -0.03291329 -0.222035112
## 16  0.06392983  0.20718448  0.04334340  0.703533624
## 17 -1.55561022 -1.70439674 -0.33126406  0.007551879
## 18  1.07392251 -0.06763418  0.02283648  0.048606680
## 19 -2.52174212  0.97274301  0.12164633 -0.390667991
## 20 -2.14072377  0.02217881  0.37410972  0.129548960
## 21 -0.79624422  0.16307887  0.12781270 -0.294140762
## 22  0.28708321 -0.35744666 -0.03962116  0.080991989
## 23 -0.25151075  1.25555188 -0.55617325  0.109068939
## 24  2.05706032  0.78894494 -0.26552109  0.388088643
## 25 -3.08596855 -0.05775318  0.62110421 -0.218939612
## 26 -0.16367555  0.04317932  0.24481850  0.560248997
## 27  1.37265053  0.02220972 -0.23378320 -0.257399715
## 28  2.16097778  0.13733233  0.35589739  0.093123683
## 29  2.40434827 -0.48613137 -0.16154441 -0.007914021
## 30  0.50287468  0.14734317 -0.20590831 -0.122078819
## Scree plot of the fitted PCA, drawn as a line plot
screeplot(student.pr, type = "lines")

## Biplot of the fitted PCA
biplot(student.pr)

Principal component regression analysis using the psych package: a brief example

For more details on parallel analysis, see "Factor Retention Decisions in Exploratory Factor Analysis: A Tutorial on Parallel Analysis".

Linear model: VIF值很大 (x1、x2 的 VIF 均远大于 10), 说明存在严重多重共线性

## Least-squares fit of y on all remaining columns of fertilization
lmo.sol <- lm(y ~ ., data = fertilization)
## Coefficient table and overall fit statistics
summary(lmo.sol)
## 
## Call:
## lm(formula = y ~ ., data = fertilization)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8202 -0.6534 -0.0676  0.7305  2.7770 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 11.158668   1.699788   6.565 3.62e-06 ***
## x1           1.702875   0.361459   4.711 0.000174 ***
## x2          -2.188405   0.527028  -4.152 0.000598 ***
## x3           0.007649   0.001137   6.728 2.63e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.146 on 18 degrees of freedom
## Multiple R-squared:  0.9761, Adjusted R-squared:  0.9721 
## F-statistic: 245.2 on 3 and 18 DF,  p-value: 8.824e-15
## Load car, then compute the variance inflation factor of each predictor
## in the full linear model lmo.sol (presumably car::vif -- no other
## package providing vif is loaded in the visible code)
library(car)
vif(lmo.sol)
##         x1         x2         x3 
## 196.842222 209.058254   9.841936

Principal components analysis

判断主成分个数的方法:

  • 先验经验和理论知识
  • 事先确定的累计贡献率阈值
  • 根据Kaiser-Harris准则: 保留特征值大于1的主成分
  • 进行平行分析: 基于多次模拟数据矩阵的特征值均值来选取主成分
## Parallel analysis for principal components (fa = "pc") with 100
## simulated iterations; the legend is suppressed.
## NOTE(review): fertilization1 is not defined in the visible code --
## presumably the predictor columns of fertilization; confirm.
library(psych)
fa.parallel(
  fertilization1,
  fa = "pc",
  n.iter = 100,
  show.legend = FALSE,
  main = "Scree plot with parallel analysis"
)

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  1

按照parallel analysis的结果, 选取一个主成分, 并作出主成分与原变量的关系图

## Extract one unrotated principal component from fertilization1
fertilization.pr <- principal(
  fertilization1,
  nfactors = 1,
  rotate = "none"
)
## Print the fitted object
fertilization.pr
## Principal Components Analysis
## Call: principal(r = fertilization1, nfactors = 1, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##     PC1   h2    u2 com
## x1 0.99 0.99 0.014   1
## x2 0.99 0.99 0.012   1
## x3 0.98 0.95 0.048   1
## 
##                 PC1
## SS loadings    2.93
## Proportion Var 0.98
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.02 
##  with the empirical chi square  0.05  with prob <  NA 
## 
## Fit based upon off diagonal values = 1
## Diagram of the fitted one-component solution (component vs. x1, x2, x3)
fa.diagram(fertilization.pr)

建立主成分回归模型并计算原方程系数: 先以第一主成分得分 F1 对 y 作回归, 再将 F1 表示为标准化后 x1, x2, x3 的线性组合并代回回归方程, 即得到关于原变量的系数 (见下方输出的最后两行)

## Principal component regression: regress y on the component score F1.
## NOTE(review): F1 is not among the fertilization columns listed in the
## appendix; presumably it is the first principal component score, added
## to the data (or defined in the workspace) before this call -- confirm.
lm.sol <- lm(y ~ F1, data = fertilization)
summary(lm.sol)
## 
## Call:
## lm(formula = y ~ F1, data = fertilization)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7389 -0.7798 -0.0406  0.8496  4.6182 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.7273     0.3819   62.13  < 2e-16 ***
## F1           -3.7928     0.2232  -16.99 2.37e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.791 on 20 degrees of freedom
## Multiple R-squared:  0.9352, Adjusted R-squared:  0.932 
## F-statistic: 288.7 on 1 and 20 DF,  p-value: 2.369e-13
## (Intercept)          x1          x2          x3 
## 5.412508163 0.232064543 0.328696909 0.003207758

Appendix: datasets

student
X1 X2 X3 X4
148 41 72 78
139 34 71 76
160 49 77 86
149 36 67 79
159 45 80 86
142 31 66 76
153 43 76 83
150 43 77 79
151 42 77 80
139 31 68 74
140 29 64 74
161 47 78 84
158 49 78 83
140 33 67 77
137 31 66 73
152 35 73 79
149 47 82 79
145 35 70 77
160 47 74 87
156 44 78 85
151 42 73 82
147 38 73 78
157 39 68 80
147 30 65 75
157 48 80 88
151 36 74 80
144 36 68 76
141 30 67 76
139 32 68 73
148 38 70 78
fertilization
x1 x2 x3 y
13.0 9.2 50 13
18.7 13.2 102 14
21.0 14.8 150 15
19.0 13.3 110 16
22.8 16.0 200 17
26.0 18.2 330 18
28.0 19.7 450 19
31.4 22.5 450 20
30.3 21.0 550 21
29.2 20.5 640 22
36.2 25.2 800 23
37.0 26.1 1090 24
37.9 27.2 1140 25
41.6 30.0 1500 26
38.2 27.1 1180 27
39.4 27.4 1320 28
39.2 27.6 1400 29
42.0 29.4 1600 30
43.0 30.0 1600 31
41.1 27.2 1400 33
43.0 31.0 2050 35
49.0 34.8 2500 36