Introduction
#=========================================================
# Data Sourse: http://www.jstor.org/stable/1911029
#=========================================================
#--------------------------------------------------------
# Potential consequences of serious multicollinearity
# For more details: Gujarati
#--------------------------------------------------------
# An example about Multicollinearity:
rm(list = ls())
df_multi <- read.csv("F:/usth/data/multi.csv")
library(magrittr)
library(tidyverse)
multi_ols <- df_multi %>% lm(Y ~ ., data = .)
multi_ols %>% summary()
##
## Call:
## lm(formula = Y ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1120 -4.4767 0.9206 4.1744 7.5844
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.77473 6.75250 3.669 0.00798 **
## X2 0.94154 0.82290 1.144 0.29016
## X3 -0.04243 0.08066 -0.526 0.61509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.808 on 7 degrees of freedom
## Multiple R-squared: 0.9635, Adjusted R-squared: 0.9531
## F-statistic: 92.4 on 2 and 7 DF, p-value: 9.286e-06
ols1 <- df_multi %>%
select(-X2) %>%
lm(Y ~ ., data = .)
ols1 %>% summary()
##
## Call:
## lm(formula = Y ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6227 -5.5867 0.9576 5.0414 9.4142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.411045 6.874097 3.551 0.0075 **
## X3 0.049764 0.003744 13.292 9.8e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.938 on 8 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9513
## F-statistic: 176.7 on 1 and 8 DF, p-value: 9.802e-07
ols2 <- df_multi %>%
select(-X3) %>%
lm(Y ~ ., data = .)
ols2 %>% summary()
##
## Call:
## lm(formula = Y ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.364 -4.977 1.409 4.364 8.364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.45455 6.41382 3.813 0.00514 **
## X2 0.50909 0.03574 14.243 5.75e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.493 on 8 degrees of freedom
## Multiple R-squared: 0.9621, Adjusted R-squared: 0.9573
## F-statistic: 202.9 on 1 and 8 DF, p-value: 5.753e-07
cor(df_multi)
## Y X2 X3
## Y 1.0000000 0.9808474 0.9780997
## X2 0.9808474 1.0000000 0.9989624
## X3 0.9780997 0.9989624 1.0000000
tuong_quan <- cor(df_multi$X2, df_multi$X3)
1 / (1 - tuong_quan^2)
## [1] 482.1275
library(car)
vif(multi_ols)
## X2 X3
## 482.1275 482.1275
#------------------------------------------------------------------------
# An example from "The Sensitivity of an Empirical Model of
# Married Women's Hours of Work to Economic and Statistical Assumptions"
# and remedies for handling multicollinearity
#-------------------------------------------------------------------------
library(readstata13)
mroz <- read.dta13("F:/usth/data/Table4_0.dta")
mroz <- mroz %>%
filter(hours != 0) %>%
select(-lfp) %>%
select(hours, everything())
ols1 <- mroz %>% lm(hours ~., data = .)
ols1 %>% summary()
##
## Call:
## lm(formula = hours ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1913.8 -431.3 7.4 366.1 3179.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.509e+03 1.033e+03 7.272 1.85e-12 ***
## taxableinc -2.141e-03 1.039e-02 -0.206 0.836791
## federaltax 6.686e-03 3.503e-02 0.191 0.848748
## hsiblings 1.831e+00 1.342e+01 0.136 0.891521
## hfathereduc 2.914e+01 1.142e+01 2.552 0.011071 *
## hmothereduc -7.638e+00 1.135e+01 -0.673 0.501505
## siblings -3.444e+01 1.366e+01 -2.521 0.012099 *
## kidsl6 -1.776e+02 8.531e+01 -2.082 0.037987 *
## kids618 -2.821e+01 2.738e+01 -1.030 0.303551
## age -8.428e+00 9.528e+00 -0.885 0.376901
## educ -2.034e+01 1.889e+01 -1.077 0.282309
## wage -6.357e+01 1.071e+01 -5.938 6.23e-09 ***
## wage76 6.610e+01 1.499e+01 4.411 1.32e-05 ***
## hhours -4.229e-01 7.276e-02 -5.812 1.25e-08 ***
## hage -7.738e+00 8.737e+00 -0.886 0.376320
## heduc -3.322e+00 1.368e+01 -0.243 0.808310
## hwage -1.326e+02 1.650e+01 -8.033 1.05e-14 ***
## faminc 1.422e-02 5.918e-03 2.402 0.016754 *
## mtr -5.360e+03 1.066e+03 -5.027 7.51e-07 ***
## mothereduc 4.399e+00 1.162e+01 0.378 0.705278
## fathereduc -1.300e+01 1.110e+01 -1.171 0.242171
## unemployment -1.428e+01 1.043e+01 -1.369 0.171834
## largecity -5.928e+01 7.108e+01 -0.834 0.404765
## exper 1.847e+01 4.748e+00 3.890 0.000117 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 621.3 on 404 degrees of freedom
## Multiple R-squared: 0.394, Adjusted R-squared: 0.3595
## F-statistic: 11.42 on 23 and 404 DF, p-value: < 2.2e-16
vif(ols1)
## taxableinc federaltax hsiblings hfathereduc hmothereduc
## 18.044295 17.594736 1.203526 1.514042 1.528453
## siblings kidsl6 kids618 age educ
## 1.166174 1.236764 1.436228 5.986613 2.061726
## wage wage76 hhours hage heduc
## 1.389653 1.478860 1.990072 5.337685 1.907884
## hwage faminc mtr mothereduc fathereduc
## 3.843313 5.277000 7.445930 1.635153 1.691722
## unemployment largecity exper
## 1.107239 1.290454 1.618132
# Method 1: Mallows Method
x <- mroz %>% select(-hours)
y <- mroz$hours
library(leaps)
out <- summary(regsubsets(x, y, nbest = 1, nvmax = ncol(x)))
kq <- cbind(out$which,
r_squared = out$rsq,
cp = out$cp,
bic = out$bic,
rss = out$rss) %>%
as.data.frame()
kq %>% head()
## (Intercept) taxableinc federaltax hsiblings hfathereduc hmothereduc
## 1 1 0 0 0 0 0
## 2 1 0 0 0 0 0
## 3 1 0 0 0 0 0
## 4 1 0 0 0 0 0
## 5 1 0 0 0 0 0
## 6 1 0 0 0 0 0
## siblings kidsl6 kids618 age educ wage wage76 hhours hage heduc hwage
## 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 1 0 0 1
## 4 0 0 0 0 0 1 0 1 0 0 1
## 5 0 0 0 0 0 1 1 1 0 0 1
## 6 0 0 0 0 0 1 1 1 0 0 1
## faminc mtr mothereduc fathereduc unemployment largecity exper r_squared
## 1 0 0 0 0 0 0 1 0.08953864
## 2 0 1 0 0 0 0 0 0.13923672
## 3 0 1 0 0 0 0 0 0.22821590
## 4 0 1 0 0 0 0 0 0.26885052
## 5 0 1 0 0 0 0 0 0.31386240
## 6 0 1 0 0 0 0 1 0.33094294
## cp bic rss
## 1 182.93608 -28.02979 234271740
## 2 151.80611 -45.99513 221483876
## 3 94.49039 -86.63709 198588554
## 4 69.40234 -103.72717 188132818
## 5 41.39631 -124.86305 176550767
## 6 32.01000 -129.59332 172155754
kq <- kq[, -1]
min(kq$cp)
## [1] 8.654335
which.min(kq$cp)
## [1] 13
kq %<>% mutate(n = 1:nrow(.))
theme_set(theme_minimal())
kq %>%
select(r_squared, cp, bic, rss, n) %>%
gather(a, b, -n) %>%
ggplot(aes(n, b)) +
geom_line() +
geom_point() +
facet_wrap(~ a, scales = "free") +
scale_x_continuous(breaks = seq(1, 23, by = 1))

kq[which.min(kq$cp), 1:23] %>% as.vector() -> u
k <- u == 1
small_df <- mroz[, c(TRUE, k)]
min_cp <- small_df %>%
lm(hours ~., data = .)
min_cp %>% summary()
##
## Call:
## lm(formula = hours ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1909.4 -434.3 -3.1 363.9 3221.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.448e+03 9.873e+02 7.544 2.92e-13 ***
## hfathereduc 2.401e+01 9.595e+00 2.502 0.01274 *
## siblings -3.064e+01 1.299e+01 -2.358 0.01883 *
## kidsl6 -1.734e+02 8.377e+01 -2.070 0.03910 *
## educ -2.780e+01 1.532e+01 -1.815 0.07022 .
## wage -6.245e+01 1.054e+01 -5.926 6.54e-09 ***
## wage76 6.530e+01 1.451e+01 4.499 8.89e-06 ***
## hhours -4.356e-01 6.911e-02 -6.303 7.48e-10 ***
## hage -1.349e+01 4.462e+00 -3.023 0.00266 **
## hwage -1.372e+02 1.552e+01 -8.843 < 2e-16 ***
## faminc 1.347e-02 5.696e-03 2.364 0.01852 *
## mtr -5.506e+03 1.007e+03 -5.470 7.82e-08 ***
## unemployment -1.715e+01 1.018e+01 -1.685 0.09268 .
## exper 1.909e+01 4.389e+00 4.350 1.71e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 617.3 on 414 degrees of freedom
## Multiple R-squared: 0.387, Adjusted R-squared: 0.3677
## F-statistic: 20.1 on 13 and 414 DF, p-value: < 2.2e-16
m <- ols1 %>% summary()
n <- min_cp %>% summary()
n$r.squared / m$r.squared
## [1] 0.9822776
(ncol(small_df) - 1) / (ncol(mroz) - 1)
## [1] 0.5652174
# Method 2: Remove Highly Correlated Predictors
mroz %>%
select(-hours) %>%
cor() -> v
library(corrplot)
corrplot(v, method = "color", order = "hclust")

library(caret)
tuongquancao <- findCorrelation(v, cutoff = .8)
ten <- mroz %>% names()
ten[tuongquancao + 1]
## [1] "mtr" "age" "taxableinc"
c2 <- mroz[, -(tuongquancao + 1)]
c2 %>%
lm(hours ~ ., data = .) %>%
summary()
##
## Call:
## lm(formula = hours ~ ., data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1863.91 -460.60 14.77 376.22 3117.65
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.554e+03 3.755e+02 6.802 3.69e-11 ***
## federaltax 1.389e-03 9.054e-03 0.153 0.87817
## hsiblings 2.719e+00 1.375e+01 0.198 0.84332
## hfathereduc 3.510e+01 1.165e+01 3.012 0.00275 **
## hmothereduc -7.297e+00 1.162e+01 -0.628 0.53048
## siblings -3.556e+01 1.392e+01 -2.554 0.01101 *
## kidsl6 -2.449e+02 8.633e+01 -2.836 0.00480 **
## kids618 -5.959e+01 2.691e+01 -2.214 0.02735 *
## educ -1.698e+01 1.942e+01 -0.874 0.38245
## wage -5.970e+01 1.099e+01 -5.431 9.66e-08 ***
## wage76 7.419e+01 1.504e+01 4.933 1.18e-06 ***
## hhours -2.789e-01 6.864e-02 -4.063 5.83e-05 ***
## hage -1.543e+01 4.862e+00 -3.174 0.00162 **
## heduc 2.687e+00 1.398e+01 0.192 0.84764
## hwage -9.660e+01 1.529e+01 -6.319 6.93e-10 ***
## faminc 3.467e-02 4.354e-03 7.964 1.68e-14 ***
## mothereduc 8.667e+00 1.191e+01 0.727 0.46735
## fathereduc -1.142e+01 1.138e+01 -1.004 0.31603
## unemployment -1.320e+01 1.066e+01 -1.238 0.21627
## largecity -5.184e+01 7.255e+01 -0.715 0.47526
## exper 2.070e+01 4.674e+00 4.430 1.21e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 639.5 on 407 degrees of freedom
## Multiple R-squared: 0.3531, Adjusted R-squared: 0.3213
## F-statistic: 11.11 on 20 and 407 DF, p-value: < 2.2e-16