Question 1 Last two predictors suggest multicollinearity

data(longley)
head(longley)
##      GNP.deflator     GNP Unemployed Armed.Forces Population Year Employed
## 1947         83.0 234.289      235.6        159.0    107.608 1947   60.323
## 1948         88.5 259.426      232.5        145.6    108.632 1948   61.122
## 1949         88.2 258.054      368.2        161.6    109.773 1949   60.171
## 1950         89.5 284.599      335.1        165.0    110.929 1950   61.187
## 1951         96.2 328.975      209.9        309.9    112.075 1951   63.221
## 1952         98.1 346.999      193.2        359.4    113.270 1952   63.639
longley.lm <- lm(Employed ~ ., data = longley)
summary(longley.lm)
## 
## Call:
## lm(formula = Employed ~ ., data = longley)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.41011 -0.15767 -0.02816  0.10155  0.45539 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.482e+03  8.904e+02  -3.911 0.003560 ** 
## GNP.deflator  1.506e-02  8.492e-02   0.177 0.863141    
## GNP          -3.582e-02  3.349e-02  -1.070 0.312681    
## Unemployed   -2.020e-02  4.884e-03  -4.136 0.002535 ** 
## Armed.Forces -1.033e-02  2.143e-03  -4.822 0.000944 ***
## Population   -5.110e-02  2.261e-01  -0.226 0.826212    
## Year          1.829e+00  4.555e-01   4.016 0.003037 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3049 on 9 degrees of freedom
## Multiple R-squared:  0.9955, Adjusted R-squared:  0.9925 
## F-statistic: 330.3 on 6 and 9 DF,  p-value: 4.984e-10
X = model.matrix(longley.lm)[,-1]
R = cor(X)
ev = eigen(R)$val
sqrt(ev[1]*(ev)^(-1))
## [1]   1.000000   1.979048   4.757028  17.560372  42.470986 110.544153

Question 2 All predictors except armed forces suggest multicollinearity. Since Year and unemployed are significant, they may not be significant due to high VIF scores

library(faraway)
## Warning: package 'faraway' was built under R version 4.5.2
vif(longley.lm)
## GNP.deflator          GNP   Unemployed Armed.Forces   Population         Year 
##    135.53244   1788.51348     33.61889      3.58893    399.15102    758.98060

Question 3 The adjusted r^2 is the same for both models. The VIF and condition numbers are reasonable now.

longley.lm.removed <- lm(Employed ~ Unemployed + Armed.Forces + Year, data = longley)
summary(longley.lm.removed)
## 
## Call:
## lm(formula = Employed ~ Unemployed + Armed.Forces + Year, data = longley)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57285 -0.11989  0.04087  0.13979  0.75303 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.797e+03  6.864e+01 -26.183 5.89e-12 ***
## Unemployed   -1.470e-02  1.671e-03  -8.793 1.41e-06 ***
## Armed.Forces -7.723e-03  1.837e-03  -4.204  0.00122 ** 
## Year          9.564e-01  3.553e-02  26.921 4.24e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3321 on 12 degrees of freedom
## Multiple R-squared:  0.9928, Adjusted R-squared:  0.9911 
## F-statistic: 555.2 on 3 and 12 DF,  p-value: 3.916e-13
X.rm = model.matrix(longley.lm.removed)[,-1]
r.rm = cor(X.rm)
ev.rm = eigen(r.rm)$val
sqrt(ev.rm[1]*(ev.rm)^(-1))
## [1] 1.000000 1.217831 3.703007
vif(longley.lm.removed)
##   Unemployed Armed.Forces         Year 
##     3.317929     2.223317     3.890861

Question 4 The same predictors are significant

longley.scale = data.frame(scale(longley))
apply(longley.scale, 2, mean) # verify that each column has mean 0
##  GNP.deflator           GNP    Unemployed  Armed.Forces    Population 
## -4.774826e-16 -1.118897e-16 -5.724587e-17  5.767956e-17  8.949411e-16 
##          Year      Employed 
##  0.000000e+00 -1.879573e-15
apply(longley.scale, 2, sd) # verify that each column has SD 1
## GNP.deflator          GNP   Unemployed Armed.Forces   Population         Year 
##            1            1            1            1            1            1 
##     Employed 
##            1
longley.scale.lm <- lm(Employed ~ ., data = longley.scale)
summary(lm(Employed ~ ., data = longley.scale))
## 
## Call:
## lm(formula = Employed ~ ., data = longley.scale)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.116776 -0.044896 -0.008019  0.028916  0.129669 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.752e-15  2.170e-02   0.000 1.000000    
## GNP.deflator  4.628e-02  2.609e-01   0.177 0.863141    
## GNP          -1.014e+00  9.479e-01  -1.070 0.312681    
## Unemployed   -5.375e-01  1.300e-01  -4.136 0.002535 ** 
## Armed.Forces -2.047e-01  4.246e-02  -4.822 0.000944 ***
## Population   -1.012e-01  4.478e-01  -0.226 0.826212    
## Year          2.480e+00  6.175e-01   4.016 0.003037 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0868 on 9 degrees of freedom
## Multiple R-squared:  0.9955, Adjusted R-squared:  0.9925 
## F-statistic: 330.3 on 6 and 9 DF,  p-value: 4.984e-10

Question 5 Many variables are strongly correlated with each other

longley.cor <- cor(longley.scale)
longley.cor
##              GNP.deflator       GNP Unemployed Armed.Forces Population
## GNP.deflator    1.0000000 0.9915892  0.6206334    0.4647442  0.9791634
## GNP             0.9915892 1.0000000  0.6042609    0.4464368  0.9910901
## Unemployed      0.6206334 0.6042609  1.0000000   -0.1774206  0.6865515
## Armed.Forces    0.4647442 0.4464368 -0.1774206    1.0000000  0.3644163
## Population      0.9791634 0.9910901  0.6865515    0.3644163  1.0000000
## Year            0.9911492 0.9952735  0.6682566    0.4172451  0.9939528
## Employed        0.9708985 0.9835516  0.5024981    0.4573074  0.9603906
##                   Year  Employed
## GNP.deflator 0.9911492 0.9708985
## GNP          0.9952735 0.9835516
## Unemployed   0.6682566 0.5024981
## Armed.Forces 0.4172451 0.4573074
## Population   0.9939528 0.9603906
## Year         1.0000000 0.9713295
## Employed     0.9713295 1.0000000

Question 6

PCA = princomp(model.frame(longley.lm)[,2:7],cor = TRUE)
PCA$loadings #loading matrix
## 
## Loadings:
##              Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## GNP.deflator  0.462         0.149  0.793  0.338  0.135
## GNP           0.462         0.278 -0.122 -0.150 -0.818
## Unemployed    0.321 -0.596 -0.728               -0.107
## Armed.Forces  0.202  0.798 -0.562                     
## Population    0.462         0.196 -0.590  0.549  0.312
## Year          0.465         0.128        -0.750  0.450
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.167  0.167  0.167  0.167  0.167  0.167
## Cumulative Var  0.167  0.333  0.500  0.667  0.833  1.000
summary(PCA) # variance
## Importance of components:
##                           Comp.1    Comp.2     Comp.3      Comp.4       Comp.5
## Standard deviation     2.1455482 1.0841312 0.45102702 0.122181253 0.0505179747
## Proportion of Variance 0.7672295 0.1958901 0.03390423 0.002488043 0.0004253443
## Cumulative Proportion  0.7672295 0.9631196 0.99702383 0.999511871 0.9999372153
##                              Comp.6
## Standard deviation     1.940897e-02
## Proportion of Variance 6.278469e-05
## Cumulative Proportion  1.000000e+00
r = eigen(cor(model.frame(longley.lm)[,2:7]))$vectors # loading
ev = eigen(r)$val # variance
sqrt(ev[1]*(ev)^(-1)) ## condition indices
## [1] 1.0000000+0.0000000i 0.0000000-1.0000000i 0.2898226-0.9570804i
## [4] 0.2898226+0.9570804i 0.7667560-0.6419387i 0.7667560+0.6419387i

Question 7

X.tilde <- as.matrix(longley.scale[ , colnames(longley.scale) != "Employed"])

pca <- prcomp(X.tilde, scale = FALSE, center = FALSE)

V <- pca$rotation
PC <- X.tilde %*% V
colnames(PC) <- paste("PC", 1:ncol(PC), sep = "")

PC <- data.frame(PC)
PC$Employed <- longley.scale[, "Employed"]

PCR.lm <- lm(Employed ~ . + 0, data = PC)

summary(PCR.lm)
## 
## Call:
## lm(formula = Employed ~ . + 0, data = PC)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.116776 -0.044896 -0.008019  0.028916  0.129669 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## PC1  0.44565    0.00991  44.969 7.11e-13 ***
## PC2  0.11157    0.01961   5.689 0.000201 ***
## PC3  0.52973    0.04714  11.237 5.41e-07 ***
## PC4 -0.10174    0.17403  -0.585 0.571760    
## PC5  1.75680    0.42089   4.174 0.001906 ** 
## PC6  1.98275    1.09550   1.810 0.100419    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08235 on 10 degrees of freedom
## Multiple R-squared:  0.9955, Adjusted R-squared:  0.9928 
## F-statistic:   367 on 6 and 10 DF,  p-value: 3.937e-11

Question 8

PCR.lm.noPC4 <- lm(Employed ~ . - PC4 + 0, data = PC)

alpha <- c(PCR.lm.noPC4$coefficients, PC4 = 0)
alpha <- alpha[order(names(alpha))]

beta_PCR <- V %*% alpha
beta_PCR
##                     [,1]
## GNP.deflator -0.03438311
## GNP          -1.00137289
## Unemployed   -0.53832044
## Armed.Forces -0.19688095
## Population   -0.04122182
## Year          2.48498390

Question 9 PCr coeffieicents are generally larger in magnitude

coef_comp <- data.frame(
  OLS = longley.lm$coefficients[-1],
  PCR = beta_PCR
)

coef_comp
##                      OLS         PCR
## GNP.deflator  0.01506187 -0.03438311
## GNP          -0.03581918 -1.00137289
## Unemployed   -0.02020230 -0.53832044
## Armed.Forces -0.01033227 -0.19688095
## Population   -0.05110411 -0.04122182
## Year          1.82915146  2.48498390