Question 1 Last two predictors suggest multicollinearity
data(longley)
head(longley)
## GNP.deflator GNP Unemployed Armed.Forces Population Year Employed
## 1947 83.0 234.289 235.6 159.0 107.608 1947 60.323
## 1948 88.5 259.426 232.5 145.6 108.632 1948 61.122
## 1949 88.2 258.054 368.2 161.6 109.773 1949 60.171
## 1950 89.5 284.599 335.1 165.0 110.929 1950 61.187
## 1951 96.2 328.975 209.9 309.9 112.075 1951 63.221
## 1952 98.1 346.999 193.2 359.4 113.270 1952 63.639
longley.lm <- lm(Employed ~ ., data = longley)
summary(longley.lm)
##
## Call:
## lm(formula = Employed ~ ., data = longley)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.41011 -0.15767 -0.02816 0.10155 0.45539
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.482e+03 8.904e+02 -3.911 0.003560 **
## GNP.deflator 1.506e-02 8.492e-02 0.177 0.863141
## GNP -3.582e-02 3.349e-02 -1.070 0.312681
## Unemployed -2.020e-02 4.884e-03 -4.136 0.002535 **
## Armed.Forces -1.033e-02 2.143e-03 -4.822 0.000944 ***
## Population -5.110e-02 2.261e-01 -0.226 0.826212
## Year 1.829e+00 4.555e-01 4.016 0.003037 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3049 on 9 degrees of freedom
## Multiple R-squared: 0.9955, Adjusted R-squared: 0.9925
## F-statistic: 330.3 on 6 and 9 DF, p-value: 4.984e-10
X = model.matrix(longley.lm)[,-1]
R = cor(X)
ev = eigen(R)$val
sqrt(ev[1]*(ev)^(-1))
## [1] 1.000000 1.979048 4.757028 17.560372 42.470986 110.544153
Question 2 All predictors except armed forces suggest multicollinearity. Since Year and unemployed are significant, they may not be significant due to high VIF scores
library(faraway)
## Warning: package 'faraway' was built under R version 4.5.2
vif(longley.lm)
## GNP.deflator GNP Unemployed Armed.Forces Population Year
## 135.53244 1788.51348 33.61889 3.58893 399.15102 758.98060
Question 3 The adjusted r^2 is the same for both models. The VIF and condition numbers are reasonable now.
longley.lm.removed <- lm(Employed ~ Unemployed + Armed.Forces + Year, data = longley)
summary(longley.lm.removed)
##
## Call:
## lm(formula = Employed ~ Unemployed + Armed.Forces + Year, data = longley)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57285 -0.11989 0.04087 0.13979 0.75303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.797e+03 6.864e+01 -26.183 5.89e-12 ***
## Unemployed -1.470e-02 1.671e-03 -8.793 1.41e-06 ***
## Armed.Forces -7.723e-03 1.837e-03 -4.204 0.00122 **
## Year 9.564e-01 3.553e-02 26.921 4.24e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3321 on 12 degrees of freedom
## Multiple R-squared: 0.9928, Adjusted R-squared: 0.9911
## F-statistic: 555.2 on 3 and 12 DF, p-value: 3.916e-13
X.rm = model.matrix(longley.lm.removed)[,-1]
r.rm = cor(X.rm)
ev.rm = eigen(r.rm)$val
sqrt(ev.rm[1]*(ev.rm)^(-1))
## [1] 1.000000 1.217831 3.703007
vif(longley.lm.removed)
## Unemployed Armed.Forces Year
## 3.317929 2.223317 3.890861
Question 4 The same predictors are significant
longley.scale = data.frame(scale(longley))
apply(longley.scale, 2, mean) # verify that each column has mean 0
## GNP.deflator GNP Unemployed Armed.Forces Population
## -4.774826e-16 -1.118897e-16 -5.724587e-17 5.767956e-17 8.949411e-16
## Year Employed
## 0.000000e+00 -1.879573e-15
apply(longley.scale, 2, sd) # verify that each column has SD 1
## GNP.deflator GNP Unemployed Armed.Forces Population Year
## 1 1 1 1 1 1
## Employed
## 1
longley.scale.lm <- lm(Employed ~ ., data = longley.scale)
summary(lm(Employed ~ ., data = longley.scale))
##
## Call:
## lm(formula = Employed ~ ., data = longley.scale)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.116776 -0.044896 -0.008019 0.028916 0.129669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.752e-15 2.170e-02 0.000 1.000000
## GNP.deflator 4.628e-02 2.609e-01 0.177 0.863141
## GNP -1.014e+00 9.479e-01 -1.070 0.312681
## Unemployed -5.375e-01 1.300e-01 -4.136 0.002535 **
## Armed.Forces -2.047e-01 4.246e-02 -4.822 0.000944 ***
## Population -1.012e-01 4.478e-01 -0.226 0.826212
## Year 2.480e+00 6.175e-01 4.016 0.003037 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0868 on 9 degrees of freedom
## Multiple R-squared: 0.9955, Adjusted R-squared: 0.9925
## F-statistic: 330.3 on 6 and 9 DF, p-value: 4.984e-10
Question 5 Many variables are strongly correlated with each other
longley.cor <- cor(longley.scale)
longley.cor
## GNP.deflator GNP Unemployed Armed.Forces Population
## GNP.deflator 1.0000000 0.9915892 0.6206334 0.4647442 0.9791634
## GNP 0.9915892 1.0000000 0.6042609 0.4464368 0.9910901
## Unemployed 0.6206334 0.6042609 1.0000000 -0.1774206 0.6865515
## Armed.Forces 0.4647442 0.4464368 -0.1774206 1.0000000 0.3644163
## Population 0.9791634 0.9910901 0.6865515 0.3644163 1.0000000
## Year 0.9911492 0.9952735 0.6682566 0.4172451 0.9939528
## Employed 0.9708985 0.9835516 0.5024981 0.4573074 0.9603906
## Year Employed
## GNP.deflator 0.9911492 0.9708985
## GNP 0.9952735 0.9835516
## Unemployed 0.6682566 0.5024981
## Armed.Forces 0.4172451 0.4573074
## Population 0.9939528 0.9603906
## Year 1.0000000 0.9713295
## Employed 0.9713295 1.0000000
Question 6
PCA = princomp(model.frame(longley.lm)[,2:7],cor = TRUE)
PCA$loadings #loading matrix
##
## Loadings:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## GNP.deflator 0.462 0.149 0.793 0.338 0.135
## GNP 0.462 0.278 -0.122 -0.150 -0.818
## Unemployed 0.321 -0.596 -0.728 -0.107
## Armed.Forces 0.202 0.798 -0.562
## Population 0.462 0.196 -0.590 0.549 0.312
## Year 0.465 0.128 -0.750 0.450
##
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## SS loadings 1.000 1.000 1.000 1.000 1.000 1.000
## Proportion Var 0.167 0.167 0.167 0.167 0.167 0.167
## Cumulative Var 0.167 0.333 0.500 0.667 0.833 1.000
summary(PCA) # variance
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 2.1455482 1.0841312 0.45102702 0.122181253 0.0505179747
## Proportion of Variance 0.7672295 0.1958901 0.03390423 0.002488043 0.0004253443
## Cumulative Proportion 0.7672295 0.9631196 0.99702383 0.999511871 0.9999372153
## Comp.6
## Standard deviation 1.940897e-02
## Proportion of Variance 6.278469e-05
## Cumulative Proportion 1.000000e+00
r = eigen(cor(model.frame(longley.lm)[,2:7]))$vectors # loading
ev = eigen(r)$val # variance
sqrt(ev[1]*(ev)^(-1)) ## condition indices
## [1] 1.0000000+0.0000000i 0.0000000-1.0000000i 0.2898226-0.9570804i
## [4] 0.2898226+0.9570804i 0.7667560-0.6419387i 0.7667560+0.6419387i
Question 7
X.tilde <- as.matrix(longley.scale[ , colnames(longley.scale) != "Employed"])
pca <- prcomp(X.tilde, scale = FALSE, center = FALSE)
V <- pca$rotation
PC <- X.tilde %*% V
colnames(PC) <- paste("PC", 1:ncol(PC), sep = "")
PC <- data.frame(PC)
PC$Employed <- longley.scale[, "Employed"]
PCR.lm <- lm(Employed ~ . + 0, data = PC)
summary(PCR.lm)
##
## Call:
## lm(formula = Employed ~ . + 0, data = PC)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.116776 -0.044896 -0.008019 0.028916 0.129669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## PC1 0.44565 0.00991 44.969 7.11e-13 ***
## PC2 0.11157 0.01961 5.689 0.000201 ***
## PC3 0.52973 0.04714 11.237 5.41e-07 ***
## PC4 -0.10174 0.17403 -0.585 0.571760
## PC5 1.75680 0.42089 4.174 0.001906 **
## PC6 1.98275 1.09550 1.810 0.100419
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08235 on 10 degrees of freedom
## Multiple R-squared: 0.9955, Adjusted R-squared: 0.9928
## F-statistic: 367 on 6 and 10 DF, p-value: 3.937e-11
Question 8
PCR.lm.noPC4 <- lm(Employed ~ . - PC4 + 0, data = PC)
alpha <- c(PCR.lm.noPC4$coefficients, PC4 = 0)
alpha <- alpha[order(names(alpha))]
beta_PCR <- V %*% alpha
beta_PCR
## [,1]
## GNP.deflator -0.03438311
## GNP -1.00137289
## Unemployed -0.53832044
## Armed.Forces -0.19688095
## Population -0.04122182
## Year 2.48498390
Question 9 PCr coeffieicents are generally larger in magnitude
coef_comp <- data.frame(
OLS = longley.lm$coefficients[-1],
PCR = beta_PCR
)
coef_comp
## OLS PCR
## GNP.deflator 0.01506187 -0.03438311
## GNP -0.03581918 -1.00137289
## Unemployed -0.02020230 -0.53832044
## Armed.Forces -0.01033227 -0.19688095
## Population -0.05110411 -0.04122182
## Year 1.82915146 2.48498390