# Cvičenie 10 – Multikolinearita v regresných modeloch
# 0. Balíky -------------------------------------------------------------
library(car)
library(dplyr)
rm(list = ls())
# 3. Východiskový model a údaje ----------------------------------------
# Používam svoju WDI databázu "wdi_data.csv" z podpriečinka "data".
# Chcem mať model v tvare:
# Life.expectancy ~ BMI + GDP + Schooling.
wdi_data <- read.csv(
"data/wdi_data.csv",
dec = ".",
sep = ",",
header = TRUE,
check.names = FALSE
)
# posledný dostupný rok v databáze
posledny_rok <- max(wdi_data$Time, na.rm = TRUE)
posledny_rok
## [1] 2021
# názvy stĺpcov z WDI, ktoré použijem:
col_life <- "Survival to age 65, female (% of cohort) [SP.DYN.TO65.FE.ZS]"
col_bmi <- "Unemployment, total (% of total labor force) (modeled ILO estimate) [SL.UEM.TOTL.ZS]"
col_gdp <- "Current health expenditure (% of GDP) [SH.XPD.CHEX.GD.ZS]"
col_sch <- "Educational attainment, at least completed upper secondary, population 25+, total (%) (cumulative) [SE.SEC.CUAT.UP.ZS]"
# prierez za posledný rok – len vybrané premenné
udaje_raw <- subset(
wdi_data,
Time == posledny_rok,
select = c(col_life, col_bmi, col_gdp, col_sch)
)
# premenujem stĺpce
names(udaje_raw) <- c("Life.expectancy", "BMI", "GDP", "Schooling")
# imputácia chýbajúcich hodnôt mediánom -------------------------------
udaje <- udaje_raw
column_medians <- sapply(udaje, median, na.rm = TRUE)
for (col in names(udaje)) {
udaje[[col]][is.na(udaje[[col]])] <- column_medians[col]
}
summary(udaje)
## Life.expectancy BMI GDP Schooling
## Min. :84.59 Min. : 2.828 Min. : 5.307 Min. :52.44
## 1st Qu.:92.05 1st Qu.: 4.284 1st Qu.:10.397 1st Qu.:74.76
## Median :93.08 Median : 5.116 Median :11.043 Median :79.90
## Mean :92.69 Mean : 6.095 Mean :11.161 Mean :77.24
## 3rd Qu.:93.90 3rd Qu.: 7.572 3rd Qu.:12.204 3rd Qu.:81.29
## Max. :95.30 Max. :14.781 Max. :17.363 Max. :91.31
head(udaje)
## Life.expectancy BMI GDP Schooling
## 191 93.33162 5.116 10.54364 81.13000
## 192 92.96035 6.459 12.10000 81.45425
## 193 92.22327 6.248 11.04291 74.24768
## 194 91.67388 7.527 12.33411 86.52000
## 195 92.24182 5.043 10.82000 77.07693
## 196 93.08298 7.617 10.25000 80.73451
# 4. Odhad základného regresného modelu --------------------------------
# Life.expectancy_i = β0 + β1 BMI_i + β2 GDP_i + β3 Schooling_i + u_i
model <- lm(Life.expectancy ~ BMI + GDP + Schooling,
data = udaje)
summary(model)
##
## Call:
## lm(formula = Life.expectancy ~ BMI + GDP + Schooling, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.83185 -0.55016 0.09509 0.81274 2.14624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.4332 4.0941 24.287 1.86e-13 ***
## BMI 0.1430 0.1644 0.870 0.39810
## GDP -0.9072 0.1968 -4.610 0.00034 ***
## Schooling 0.0325 0.0565 0.575 0.57362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.449 on 15 degrees of freedom
## Multiple R-squared: 0.67, Adjusted R-squared: 0.604
## F-statistic: 10.15 on 3 and 15 DF, p-value: 0.0006686
# 5. Korelačná matica a scatterplotová matica --------------------------
xvars <- udaje[, c("BMI", "GDP", "Schooling")]
round(cor(xvars), 3)
## BMI GDP Schooling
## BMI 1.000 0.066 -0.563
## GDP 0.066 1.000 0.484
## Schooling -0.563 0.484 1.000
pairs(
xvars,
main = "Scatterplotová matica – premenné BMI, GDP, Schooling"
)

# 6. VIF – Variance Inflation Factor -----------------------------------
vif(model)
## BMI GDP Schooling
## 1.873730 1.672847 2.436849
# 7. Condition Number ---------------------------------------------------
X <- model.matrix(model)[, -1]
XtX <- t(X) %*% X
eig <- eigen(XtX)
condition_number <- sqrt(max(eig$values) / min(eig$values))
condition_number
## [1] 47.43062
# 8. Riešenia multikolinearity -----------------------------------------
## 8.1 Vynechanie premennej
model_noBMI <- lm(Life.expectancy ~ GDP + Schooling, data = udaje)
summary(model_noBMI)
##
## Call:
## lm(formula = Life.expectancy ~ GDP + Schooling, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9620 -0.6488 0.1348 0.9095 2.1854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.020e+02 2.821e+00 36.157 < 2e-16 ***
## GDP -8.271e-01 1.726e-01 -4.793 0.000199 ***
## Schooling -9.705e-04 4.105e-02 -0.024 0.981430
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.438 on 16 degrees of freedom
## Multiple R-squared: 0.6533, Adjusted R-squared: 0.61
## F-statistic: 15.08 on 2 and 16 DF, p-value: 0.0002087
model_noSchooling <- lm(Life.expectancy ~ GDP + BMI, data = udaje)
summary(model_noSchooling)
##
## Call:
## lm(formula = Life.expectancy ~ GDP + BMI, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8639 -0.6275 0.1035 0.9450 2.2687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.53768 1.79954 56.424 <2e-16 ***
## GDP -0.83564 0.14926 -5.599 4e-05 ***
## BMI 0.07858 0.11782 0.667 0.514
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.418 on 16 degrees of freedom
## Multiple R-squared: 0.6627, Adjusted R-squared: 0.6205
## F-statistic: 15.72 on 2 and 16 DF, p-value: 0.0001676
## 8.2 Škálovanie premenných (centrovanie + štandardizácia)
udaje$BMI_c <- scale(udaje$BMI, center = TRUE, scale = TRUE)
udaje$GDP_c <- scale(udaje$GDP, center = TRUE, scale = TRUE)
udaje$Schooling_c <- scale(udaje$Schooling, center = TRUE, scale = TRUE)
model_centered <- lm(Life.expectancy ~ BMI_c + GDP_c + Schooling_c,
data = udaje)
summary(model_centered)
##
## Call:
## lm(formula = Life.expectancy ~ BMI_c + GDP_c + Schooling_c, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.83185 -0.55016 0.09509 0.81274 2.14624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 92.6900 0.3323 278.905 < 2e-16 ***
## BMI_c 0.4065 0.4674 0.870 0.39810
## GDP_c -2.0359 0.4416 -4.610 0.00034 ***
## Schooling_c 0.3066 0.5330 0.575 0.57362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.449 on 15 degrees of freedom
## Multiple R-squared: 0.67, Adjusted R-squared: 0.604
## F-statistic: 10.15 on 3 and 15 DF, p-value: 0.0006686
vif(model_centered)
## BMI_c GDP_c Schooling_c
## 1.873730 1.672847 2.436849
Xc <- model.matrix(model_centered)[, -1]
XtXc <- t(Xc) %*% Xc
eig_c <- eigen(XtXc)
condition_number_centered <- sqrt(max(eig_c$values) / min(eig_c$values))
condition_number_centered
## [1] 2.761964
## 8.3 Úprava GDP na tisíce (GDP1000) – lepšia mierka -------------------
udaje$GDP1000 <- udaje$GDP / 1000
head(udaje)
## Life.expectancy BMI GDP Schooling BMI_c GDP_c
## 191 93.33162 5.116 10.54364 81.13000 -0.34437867 -0.27510769
## 192 92.96035 6.459 12.10000 81.45425 0.12801734 0.41840964
## 193 92.22327 6.248 11.04291 74.24768 0.05379875 -0.05263269
## 194 91.67388 7.527 12.33411 86.52000 0.50368297 0.52272834
## 195 92.24182 5.043 10.82000 77.07693 -0.37005619 -0.15196079
## 196 93.08298 7.617 10.25000 80.73451 0.53534018 -0.40595387
## Schooling_c GDP1000
## 191 0.41239400 0.01054364
## 192 0.44676449 0.01210000
## 193 -0.31713286 0.01104291
## 194 0.98373496 0.01233411
## 195 -0.01723219 0.01082000
## 196 0.37047253 0.01025000
model_GDP_1000 <- lm(Life.expectancy ~ BMI + GDP1000 + Schooling,
data = udaje)
summary(model_GDP_1000)
##
## Call:
## lm(formula = Life.expectancy ~ BMI + GDP1000 + Schooling, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.83185 -0.55016 0.09509 0.81274 2.14624
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 99.4332 4.0941 24.287 1.86e-13 ***
## BMI 0.1430 0.1644 0.870 0.39810
## GDP1000 -907.2055 196.7848 -4.610 0.00034 ***
## Schooling 0.0325 0.0565 0.575 0.57362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.449 on 15 degrees of freedom
## Multiple R-squared: 0.67, Adjusted R-squared: 0.604
## F-statistic: 10.15 on 3 and 15 DF, p-value: 0.0006686
vif(model_GDP_1000)
## BMI GDP1000 Schooling
## 1.873730 1.672847 2.436849
Xg <- model.matrix(model_GDP_1000)[, -1]
XtXg <- t(Xg) %*% Xg
eig_g <- eigen(XtXg)
condition_number_GDP1000 <- sqrt(max(eig_g$values) / min(eig_g$values))
condition_number_GDP1000
## [1] 44699.37
## 8.4 Očistenie databázy od pracovných stĺpcov ------------------------
udaje <- udaje %>%
dplyr::select(-BMI_c, -GDP_c, -Schooling_c, GDP1000)
udaje
## Life.expectancy BMI GDP Schooling GDP1000
## 191 93.33162 5.116 10.543639 81.13000 0.010543639
## 192 92.96035 6.459 12.100000 81.45425 0.012100000
## 193 92.22327 6.248 11.042908 74.24768 0.011042908
## 194 91.67388 7.527 12.334107 86.52000 0.012334107
## 195 92.24182 5.043 10.820000 77.07693 0.010820000
## 196 93.08298 7.617 10.250000 80.73451 0.010250000
## 197 92.48386 7.874 12.307873 76.73968 0.012307873
## 198 91.87535 3.594 12.934130 80.94005 0.012934130
## 199 93.70033 9.497 9.380000 52.43790 0.009380000
## 200 94.56724 2.828 10.820000 79.89889 0.010820000
## 201 95.30061 3.639 9.331455 79.33778 0.009331455
## 202 91.82949 4.209 11.290000 73.85083 0.011290000
## 203 93.93712 4.359 10.080000 75.26434 0.010080000
## 204 93.82779 14.781 10.740039 54.98794 0.010740039
## 205 93.86815 8.722 11.250000 83.11355 0.011250000
## 206 94.33886 5.013 11.801080 84.56273 0.011801080
## 207 94.61921 3.105 5.306952 73.48000 0.005306952
## 208 90.66261 4.826 12.364699 80.46000 0.012364699
## 209 84.58518 5.349 17.362568 91.31336 0.017362568