# import the dataset and create a data.frame udaje
udaje_svet <- read.csv("udaje/Life-Expectancy-Data-Updated.csv",header=TRUE,sep=",",dec=".",check.names = TRUE)
head(udaje_svet)
udaje_svet <- udaje_svet[-992,]
# z databázy udaje_svet si vyberieme len tie pozorovania, ktoré sa týkajú Abánska
udaje <- subset(udaje_svet, Country == "Albania")
# vyrovnanie priebehu očakávanej dĺžky dožitia v čase
model <- lm(Life_expectancy ~ Alcohol_consumption+Adult_mortality+Incidents_HIV,data = udaje)
library(broom)
library(knitr)
library(kableExtra)
# koeficienty regresie
tidy(model) %>%
kable(digits = 3, caption = "Odhadnuté koeficienty regresie") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 105.254 | 3.678 | 28.614 | 0.000 |
| Alcohol_consumption | -0.018 | 0.202 | -0.091 | 0.929 |
| Adult_mortality | -0.339 | 0.042 | -8.006 | 0.000 |
| Incidents_HIV | -44.297 | 31.242 | -1.418 | 0.184 |
# kvalita vyrovnania
glance(model) %>%
kable(digits = 3, caption = "Ukazovatele kvality vyrovnania") %>%
kable_styling(bootstrap_options = "striped", full_width = FALSE)
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.953 | 0.941 | 0.299 | 74.885 | 0 | 3 | -0.847 | 11.694 | 15.234 | 0.983 | 11 | 15 |
NA
# výber premenných
X <- udaje[, c("Alcohol_consumption", "Adult_mortality", "Incidents_HIV")]
# výpočet korelačnej matice
cor_matrix <- cor(X, use = "complete.obs")
# zaokrúhlenie
round(cor_matrix, 4)
Alcohol_consumption Adult_mortality Incidents_HIV
Alcohol_consumption 1.0000 0.0106 0.4130
Adult_mortality 0.0106 1.0000 -0.8066
Incidents_HIV 0.4130 -0.8066 1.0000
library(knitr)
round(cor_matrix, 4) %>%
kable(caption = "Korelačná matica")
| Alcohol_consumption | Adult_mortality | Incidents_HIV | |
|---|---|---|---|
| Alcohol_consumption | 1.0000 | 0.0106 | 0.4130 |
| Adult_mortality | 0.0106 | 1.0000 | -0.8066 |
| Incidents_HIV | 0.4130 | -0.8066 | 1.0000 |
cor.test (udaje$Adult_mortality, udaje$Alcohol_consumption)
Pearson's product-moment correlation
data: udaje$Adult_mortality and udaje$Alcohol_consumption
t = 0.038131, df = 13, p-value = 0.9702
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5044203 0.5200209
sample estimates:
cor
0.0105751
cor.test (udaje$Alcohol_consumption, udaje$Incidents_HIV)
Pearson's product-moment correlation
data: udaje$Alcohol_consumption and udaje$Incidents_HIV
t = 1.6351, df = 13, p-value = 0.126
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1258822 0.7636995
sample estimates:
cor
0.4130138
cor.test (udaje$Adult_mortality, udaje$Incidents_HIV )
Pearson's product-moment correlation
data: udaje$Adult_mortality and udaje$Incidents_HIV
t = -4.9197, df = 13, p-value = 0.0002801
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.9332439 -0.5015452
sample estimates:
cor
-0.8065793
if (!requireNamespace("corrplot", quietly = TRUE)) {
install.packages("corrplot")}
library(corrplot)
corrplot(cor_matrix, method = "number", type = "upper")
Pre premennú \(x_j\) definujeme
\[ VIF_j = \frac{1}{1-R_j^2}, \]
kde \(R_j^2\) je koeficient determinácie z pomocnej regresie, v ktorej je \(x_j\) vysvetľovaná ostatnými regresormi.
Ak je \(R_j^2\) blízko jednej, potom je \(VIF_j\) veľký a premenná \(x_j\) je silno lineárne vysvetliteľná ostatnými premennými.
library(car)
# Variance Inflation Factors
vif_values <- vif(model)
vif_values
Alcohol_consumption Adult_mortality Incidents_HIV
2.035214 4.830860 5.823734
Interpretation:
Číslo podmienenosti je založené na vlastných číslach matice \(X'X\). Ak sú vlastné čísla veľmi rozdielne, matica je zle podmienená.
\[ \kappa = \sqrt{\frac{\lambda_{\max}}{\lambda_{\min}}}. \]
X_scaled <- scale(X)
eigen_values <- eigen(cor(X_scaled))$values
condition_number <- sqrt(max(eigen_values) / min(eigen_values))
eigen_values
[1] 1.90191483 1.00857992 0.08950525
condition_number
[1] 4.609685
Interpretácia čísla podmienenosti: - \(\kappa \approx 1\): žiadna multikolinearita, - \(\kappa > 10\): mierna multikolinearita, - \(\kappa > 30\): silná multikolinearita.
Malá p-hodnota znamená, že zamietame hypotézu \(R=I_k\), teda medzi vysvetľujúcimi premennými existuje štatisticky významná korelačná štruktúra.
Multikolinearita nie je porušením exogenity. Nie je teda automaticky dôvodom na zamietnutie OLS modelu. Problém je hlavne inferenčný: veľké štandardné chyby a nestabilné individuálne koeficienty.
Možné riešenia:
Ak sú dve premenné takmer rovnaké a ekonomická teória nevyžaduje obe, môžeme jednu z nich vynechať.
model_reduced <- lm(Life_expectancy ~ Alcohol_consumption + Incidents_HIV, data = udaje)
summary(model)
Call:
lm(formula = Life_expectancy ~ Alcohol_consumption + Adult_mortality +
Incidents_HIV, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-0.36499 -0.18121 -0.05457 0.22523 0.48082
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 105.25422 3.67841 28.614 1.11e-11 ***
Alcohol_consumption -0.01843 0.20206 -0.091 0.929
Adult_mortality -0.33881 0.04232 -8.006 6.49e-06 ***
Incidents_HIV -44.29739 31.24154 -1.418 0.184
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.299 on 11 degrees of freedom
Multiple R-squared: 0.9533, Adjusted R-squared: 0.9406
F-statistic: 74.89 on 3 and 11 DF, p-value: 1.324e-07
summary(model_reduced)
Call:
lm(formula = Life_expectancy ~ Alcohol_consumption + Incidents_HIV,
data = udaje)
Residuals:
Min 1Q Median 3Q Max
-1.23159 -0.42878 0.07813 0.48214 0.94301
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 76.3137 1.7037 44.793 9.99e-15 ***
Alcohol_consumption -1.0512 0.3891 -2.702 0.0192 *
Incidents_HIV 178.4343 35.5604 5.018 0.0003 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7479 on 12 degrees of freedom
Multiple R-squared: 0.6813, Adjusted R-squared: 0.6282
F-statistic: 12.83 on 2 and 12 DF, p-value: 0.001047