Data Analysis Course at USTH (Section: Statistics, Part 3)

Biostatistics

Nguyen Chi Dung

Introduction

#=========================================================
#   Data Sourse: http://www.jstor.org/stable/1911029
#=========================================================

#--------------------------------------------------------
#  Potential consequences of serious multicollinearity
#  For more details: Gujarati 
#--------------------------------------------------------

# An example about Multicollinearity: 

rm(list = ls())
df_multi <- read.csv("F:/usth/data/multi.csv")
library(magrittr)
library(tidyverse)
multi_ols <- df_multi %>% lm(Y ~ ., data = .)
multi_ols %>% summary()
## 
## Call:
## lm(formula = Y ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.1120  -4.4767   0.9206   4.1744   7.5844 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 24.77473    6.75250   3.669  0.00798 **
## X2           0.94154    0.82290   1.144  0.29016   
## X3          -0.04243    0.08066  -0.526  0.61509   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.808 on 7 degrees of freedom
## Multiple R-squared:  0.9635, Adjusted R-squared:  0.9531 
## F-statistic:  92.4 on 2 and 7 DF,  p-value: 9.286e-06
ols1 <- df_multi %>% 
  select(-X2) %>% 
  lm(Y ~ ., data = .)

ols1 %>% summary()
## 
## Call:
## lm(formula = Y ~ ., data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6227 -5.5867  0.9576  5.0414  9.4142 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.411045   6.874097   3.551   0.0075 ** 
## X3           0.049764   0.003744  13.292  9.8e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.938 on 8 degrees of freedom
## Multiple R-squared:  0.9567, Adjusted R-squared:  0.9513 
## F-statistic: 176.7 on 1 and 8 DF,  p-value: 9.802e-07
ols2 <- df_multi %>% 
  select(-X3) %>% 
  lm(Y ~ ., data = .)
  
ols2 %>% summary()
## 
## Call:
## lm(formula = Y ~ ., data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.364  -4.977   1.409   4.364   8.364 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 24.45455    6.41382   3.813  0.00514 ** 
## X2           0.50909    0.03574  14.243 5.75e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.493 on 8 degrees of freedom
## Multiple R-squared:  0.9621, Adjusted R-squared:  0.9573 
## F-statistic: 202.9 on 1 and 8 DF,  p-value: 5.753e-07
cor(df_multi)
##            Y        X2        X3
## Y  1.0000000 0.9808474 0.9780997
## X2 0.9808474 1.0000000 0.9989624
## X3 0.9780997 0.9989624 1.0000000
tuong_quan <- cor(df_multi$X2, df_multi$X3)
1 / (1 - tuong_quan^2)
## [1] 482.1275
library(car)
vif(multi_ols)
##       X2       X3 
## 482.1275 482.1275
#------------------------------------------------------------------------
# An example from "The Sensitivity of an Empirical Model of 
# Married Women's Hours of Work to Economic and Statistical Assumptions"
# and remedies for handling multicollinearity
#-------------------------------------------------------------------------

library(readstata13)
mroz <- read.dta13("F:/usth/data/Table4_0.dta")
mroz <- mroz %>% 
  filter(hours != 0) %>% 
  select(-lfp) %>% 
  select(hours, everything())

ols1 <- mroz %>% lm(hours ~., data = .)
ols1 %>% summary()
## 
## Call:
## lm(formula = hours ~ ., data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1913.8  -431.3     7.4   366.1  3179.2 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.509e+03  1.033e+03   7.272 1.85e-12 ***
## taxableinc   -2.141e-03  1.039e-02  -0.206 0.836791    
## federaltax    6.686e-03  3.503e-02   0.191 0.848748    
## hsiblings     1.831e+00  1.342e+01   0.136 0.891521    
## hfathereduc   2.914e+01  1.142e+01   2.552 0.011071 *  
## hmothereduc  -7.638e+00  1.135e+01  -0.673 0.501505    
## siblings     -3.444e+01  1.366e+01  -2.521 0.012099 *  
## kidsl6       -1.776e+02  8.531e+01  -2.082 0.037987 *  
## kids618      -2.821e+01  2.738e+01  -1.030 0.303551    
## age          -8.428e+00  9.528e+00  -0.885 0.376901    
## educ         -2.034e+01  1.889e+01  -1.077 0.282309    
## wage         -6.357e+01  1.071e+01  -5.938 6.23e-09 ***
## wage76        6.610e+01  1.499e+01   4.411 1.32e-05 ***
## hhours       -4.229e-01  7.276e-02  -5.812 1.25e-08 ***
## hage         -7.738e+00  8.737e+00  -0.886 0.376320    
## heduc        -3.322e+00  1.368e+01  -0.243 0.808310    
## hwage        -1.326e+02  1.650e+01  -8.033 1.05e-14 ***
## faminc        1.422e-02  5.918e-03   2.402 0.016754 *  
## mtr          -5.360e+03  1.066e+03  -5.027 7.51e-07 ***
## mothereduc    4.399e+00  1.162e+01   0.378 0.705278    
## fathereduc   -1.300e+01  1.110e+01  -1.171 0.242171    
## unemployment -1.428e+01  1.043e+01  -1.369 0.171834    
## largecity    -5.928e+01  7.108e+01  -0.834 0.404765    
## exper         1.847e+01  4.748e+00   3.890 0.000117 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 621.3 on 404 degrees of freedom
## Multiple R-squared:  0.394,  Adjusted R-squared:  0.3595 
## F-statistic: 11.42 on 23 and 404 DF,  p-value: < 2.2e-16
vif(ols1)
##   taxableinc   federaltax    hsiblings  hfathereduc  hmothereduc 
##    18.044295    17.594736     1.203526     1.514042     1.528453 
##     siblings       kidsl6      kids618          age         educ 
##     1.166174     1.236764     1.436228     5.986613     2.061726 
##         wage       wage76       hhours         hage        heduc 
##     1.389653     1.478860     1.990072     5.337685     1.907884 
##        hwage       faminc          mtr   mothereduc   fathereduc 
##     3.843313     5.277000     7.445930     1.635153     1.691722 
## unemployment    largecity        exper 
##     1.107239     1.290454     1.618132
# Method 1: Mallows Method
x <- mroz %>% select(-hours)
y <- mroz$hours

library(leaps)
out <- summary(regsubsets(x, y, nbest = 1, nvmax = ncol(x)))
kq <- cbind(out$which, 
            r_squared = out$rsq, 
            cp = out$cp, 
            bic = out$bic, 
            rss = out$rss) %>% 
  as.data.frame()

kq %>% head()
##   (Intercept) taxableinc federaltax hsiblings hfathereduc hmothereduc
## 1           1          0          0         0           0           0
## 2           1          0          0         0           0           0
## 3           1          0          0         0           0           0
## 4           1          0          0         0           0           0
## 5           1          0          0         0           0           0
## 6           1          0          0         0           0           0
##   siblings kidsl6 kids618 age educ wage wage76 hhours hage heduc hwage
## 1        0      0       0   0    0    0      0      0    0     0     0
## 2        0      0       0   0    0    0      0      0    0     0     1
## 3        0      0       0   0    0    0      0      1    0     0     1
## 4        0      0       0   0    0    1      0      1    0     0     1
## 5        0      0       0   0    0    1      1      1    0     0     1
## 6        0      0       0   0    0    1      1      1    0     0     1
##   faminc mtr mothereduc fathereduc unemployment largecity exper  r_squared
## 1      0   0          0          0            0         0     1 0.08953864
## 2      0   1          0          0            0         0     0 0.13923672
## 3      0   1          0          0            0         0     0 0.22821590
## 4      0   1          0          0            0         0     0 0.26885052
## 5      0   1          0          0            0         0     0 0.31386240
## 6      0   1          0          0            0         0     1 0.33094294
##          cp        bic       rss
## 1 182.93608  -28.02979 234271740
## 2 151.80611  -45.99513 221483876
## 3  94.49039  -86.63709 198588554
## 4  69.40234 -103.72717 188132818
## 5  41.39631 -124.86305 176550767
## 6  32.01000 -129.59332 172155754
kq <- kq[, -1]


min(kq$cp)
## [1] 8.654335
which.min(kq$cp)
## [1] 13
kq %<>% mutate(n = 1:nrow(.))

theme_set(theme_minimal())

kq %>% 
  select(r_squared, cp, bic, rss, n) %>% 
  gather(a, b, -n) %>% 
  ggplot(aes(n, b)) + 
  geom_line() + 
  geom_point() + 
  facet_wrap(~ a, scales = "free") + 
  scale_x_continuous(breaks = seq(1, 23, by = 1))

kq[which.min(kq$cp), 1:23] %>% as.vector() -> u
k <- u == 1

small_df <- mroz[, c(TRUE, k)]

min_cp <- small_df %>% 
  lm(hours ~., data = .)

min_cp %>% summary()
## 
## Call:
## lm(formula = hours ~ ., data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1909.4  -434.3    -3.1   363.9  3221.6 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.448e+03  9.873e+02   7.544 2.92e-13 ***
## hfathereduc   2.401e+01  9.595e+00   2.502  0.01274 *  
## siblings     -3.064e+01  1.299e+01  -2.358  0.01883 *  
## kidsl6       -1.734e+02  8.377e+01  -2.070  0.03910 *  
## educ         -2.780e+01  1.532e+01  -1.815  0.07022 .  
## wage         -6.245e+01  1.054e+01  -5.926 6.54e-09 ***
## wage76        6.530e+01  1.451e+01   4.499 8.89e-06 ***
## hhours       -4.356e-01  6.911e-02  -6.303 7.48e-10 ***
## hage         -1.349e+01  4.462e+00  -3.023  0.00266 ** 
## hwage        -1.372e+02  1.552e+01  -8.843  < 2e-16 ***
## faminc        1.347e-02  5.696e-03   2.364  0.01852 *  
## mtr          -5.506e+03  1.007e+03  -5.470 7.82e-08 ***
## unemployment -1.715e+01  1.018e+01  -1.685  0.09268 .  
## exper         1.909e+01  4.389e+00   4.350 1.71e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 617.3 on 414 degrees of freedom
## Multiple R-squared:  0.387,  Adjusted R-squared:  0.3677 
## F-statistic:  20.1 on 13 and 414 DF,  p-value: < 2.2e-16
m <- ols1 %>% summary()
n <- min_cp %>% summary()

n$r.squared / m$r.squared
## [1] 0.9822776
(ncol(small_df)  - 1) / (ncol(mroz) - 1)
## [1] 0.5652174
# Method 2: Remove Highly Correlated Predictors
mroz %>% 
  select(-hours) %>% 
  cor() -> v


library(corrplot)
corrplot(v, method = "color", order = "hclust")

library(caret)
tuongquancao <- findCorrelation(v, cutoff = .8) 
ten <- mroz %>% names()
ten[tuongquancao + 1]
## [1] "mtr"        "age"        "taxableinc"
c2 <- mroz[, -(tuongquancao + 1)]
c2 %>% 
  lm(hours ~ ., data = .) %>% 
  summary()
## 
## Call:
## lm(formula = hours ~ ., data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1863.91  -460.60    14.77   376.22  3117.65 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.554e+03  3.755e+02   6.802 3.69e-11 ***
## federaltax    1.389e-03  9.054e-03   0.153  0.87817    
## hsiblings     2.719e+00  1.375e+01   0.198  0.84332    
## hfathereduc   3.510e+01  1.165e+01   3.012  0.00275 ** 
## hmothereduc  -7.297e+00  1.162e+01  -0.628  0.53048    
## siblings     -3.556e+01  1.392e+01  -2.554  0.01101 *  
## kidsl6       -2.449e+02  8.633e+01  -2.836  0.00480 ** 
## kids618      -5.959e+01  2.691e+01  -2.214  0.02735 *  
## educ         -1.698e+01  1.942e+01  -0.874  0.38245    
## wage         -5.970e+01  1.099e+01  -5.431 9.66e-08 ***
## wage76        7.419e+01  1.504e+01   4.933 1.18e-06 ***
## hhours       -2.789e-01  6.864e-02  -4.063 5.83e-05 ***
## hage         -1.543e+01  4.862e+00  -3.174  0.00162 ** 
## heduc         2.687e+00  1.398e+01   0.192  0.84764    
## hwage        -9.660e+01  1.529e+01  -6.319 6.93e-10 ***
## faminc        3.467e-02  4.354e-03   7.964 1.68e-14 ***
## mothereduc    8.667e+00  1.191e+01   0.727  0.46735    
## fathereduc   -1.142e+01  1.138e+01  -1.004  0.31603    
## unemployment -1.320e+01  1.066e+01  -1.238  0.21627    
## largecity    -5.184e+01  7.255e+01  -0.715  0.47526    
## exper         2.070e+01  4.674e+00   4.430 1.21e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 639.5 on 407 degrees of freedom
## Multiple R-squared:  0.3531, Adjusted R-squared:  0.3213 
## F-statistic: 11.11 on 20 and 407 DF,  p-value: < 2.2e-16