# Read the life-expectancy dataset into the data frame `udaje_svet`.
# read.csv() already defaults to header = TRUE, sep = ",", dec = "." and
# check.names = TRUE, so those arguments are not repeated.
udaje_svet <- read.csv("udaje/Life-Expectancy-Data-Updated.csv")
head(udaje_svet)
# Drop the row at position 992.
# NOTE(review): the reason for excluding this row is not visible here - confirm.
udaje_svet <- udaje_svet[-992, ]
# Keep only the observations whose Country is "Moldova"; which() skips rows
# where the comparison is NA.
udaje <- udaje_svet[which(udaje_svet$Country == "Moldova"), , drop = FALSE]
# Linear regression of life expectancy on alcohol consumption, adult mortality
# and HIV incidence (the formula contains no time variable).
model <- lm(
  Life_expectancy ~ Alcohol_consumption + Adult_mortality + Incidents_HIV,
  data = udaje
)
library(broom)
library(knitr)
library(kableExtra)
# Regression coefficients of `model` as a tidy table, rendered with kable and
# styled as a striped, non-full-width table.
tidy(model) %>%
  kable(
    digits = 3,
    caption = "Odhadnuté koeficienty regresie"
  ) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE
  )
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 80.406 | 0.629 | 127.796 | 0.000 |
| Alcohol_consumption | -0.009 | 0.039 | -0.242 | 0.813 |
| Adult_mortality | -0.056 | 0.002 | -27.784 | 0.000 |
| Incidents_HIV | 1.342 | 0.900 | 1.492 | 0.162 |
# One-row summary of the fit of `model` (R-squared, sigma, AIC, ...), rendered
# with kable and styled as a striped, non-full-width table.
glance(model) %>%
  kable(
    digits = 3,
    caption = "Ukazovatele kvality vyrovnania"
  ) %>%
  kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE
  )
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.988 | 0.985 | 0.194 | 318.592 | 0 | 3 | 5.853 | -1.706 | 2.157 | 0.451 | 12 | 16 |
NA
# Select the three explanatory variables used in the regression.
X <- udaje[c("Alcohol_consumption", "Adult_mortality", "Incidents_HIV")]
# Pairwise Pearson correlations, computed on rows with no missing values.
cor_matrix <- cor(X, use = "complete.obs")
# Print the matrix rounded to four decimal places.
round(cor_matrix, digits = 4)
Alcohol_consumption Adult_mortality Incidents_HIV
Alcohol_consumption 1.0000 -0.0964 0.3940
Adult_mortality -0.0964 1.0000 -0.3885
Incidents_HIV 0.3940 -0.3885 1.0000
library(knitr)
# Render the rounded correlation matrix as a captioned kable table.
kable(round(cor_matrix, digits = 4), caption = "Korelačná matica")
| Alcohol_consumption | Adult_mortality | Incidents_HIV | |
|---|---|---|---|
| Alcohol_consumption | 1.0000 | -0.0964 | 0.3940 |
| Adult_mortality | -0.0964 | 1.0000 | -0.3885 |
| Incidents_HIV | 0.3940 | -0.3885 | 1.0000 |
# Test whether the correlation between adult mortality and alcohol consumption
# differs from zero (cor.test() defaults: Pearson, two-sided).
cor.test(udaje$Adult_mortality, udaje$Alcohol_consumption)
Pearson's product-moment correlation
data: udaje$Adult_mortality and udaje$Alcohol_consumption
t = -0.36249, df = 14, p-value = 0.7224
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5651204 0.4193211
sample estimates:
cor
-0.09642769
# Test whether the correlation between alcohol consumption and HIV incidence
# differs from zero (cor.test() defaults: Pearson, two-sided).
cor.test(udaje$Alcohol_consumption, udaje$Incidents_HIV)
Pearson's product-moment correlation
data: udaje$Alcohol_consumption and udaje$Incidents_HIV
t = 1.6041, df = 14, p-value = 0.131
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1263452 0.7443520
sample estimates:
cor
0.3940387
# Test whether the correlation between adult mortality and HIV incidence
# differs from zero (cor.test() defaults: Pearson, two-sided).
cor.test(udaje$Adult_mortality, udaje$Incidents_HIV)
Pearson's product-moment correlation
data: udaje$Adult_mortality and udaje$Incidents_HIV
t = -1.5777, df = 14, p-value = 0.137
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.7414342 0.1327472
sample estimates:
cor
-0.3885244
# Install corrplot when it is not available yet, then attach it.
# NOTE(review): install.packages() relies on a CRAN mirror being configured in
# the session - confirm this holds wherever the document is rendered.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)
# Plot the correlation matrix: numeric display, upper triangle only.
corrplot(cor_matrix, method = "number", type = "upper")
Pre premennú \(x_j\) definujeme
\[ VIF_j = \frac{1}{1-R_j^2}, \]
kde \(R_j^2\) je koeficient determinácie z pomocnej regresie, v ktorej je \(x_j\) vysvetľovaná ostatnými regresormi.
Ak je \(R_j^2\) blízko jednej, potom je \(VIF_j\) veľký a premenná \(x_j\) je silno lineárne vysvetliteľná ostatnými premennými.
library(car)
# Variance inflation factors of the regressors in `model`, stored and printed.
vif_values <- car::vif(model)
vif_values
Alcohol_consumption Adult_mortality Incidents_HIV
1.189129 1.183085 1.387520
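Interpretácia: všetky hodnoty VIF sú blízko 1 (najvyššia je približne 1,39 pre Incidents_HIV), teda výrazne pod bežne používanými hranicami 5, resp. 10. Rozptyly odhadov koeficientov preto nie sú multikolinearitou výrazne nadhodnotené.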
Číslo podmienenosti je založené na vlastných číslach matice \(X'X\). Ak sú vlastné čísla veľmi rozdielne, matica je zle podmienená.
\[ \kappa = \sqrt{\frac{\lambda_{\max}}{\lambda_{\min}}}. \]
# Condition number of the regressors, based on the eigenvalues of their
# correlation matrix: kappa = sqrt(lambda_max / lambda_min).

# Standardised copy of the regressors. It is not needed for the eigenvalues
# (correlations are invariant to centring and scaling), but the assignment is
# kept in case `X_scaled` is used elsewhere.
X_scaled <- scale(X)

# Use the same missing-value handling as for `cor_matrix`
# (use = "complete.obs"); without it a single NA in X would turn the whole
# correlation matrix into NA and make eigen() fail.
# A correlation matrix is symmetric, and only the eigenvalues are needed.
eigen_values <- eigen(
  cor(X, use = "complete.obs"),
  symmetric = TRUE,
  only.values = TRUE
)$values
condition_number <- sqrt(max(eigen_values) / min(eigen_values))
eigen_values
[1] 1.6036759 0.9035825 0.4927416
# Print the condition number computed from the eigenvalues.
condition_number
[1] 1.804051
Interpretácia čísla podmienenosti:

- \(\kappa \approx 1\): žiadna multikolinearita,
- \(\kappa > 10\): mierna multikolinearita,
- \(\kappa > 30\): silná multikolinearita.
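Pri teste nulovej hypotézy \(H_0: R = I_k\), teda že korelačná matica vysvetľujúcich premenných je jednotková (napr. Bartlettov test sféricity), malá p-hodnota znamená, že zamietame hypotézu \(R=I_k\), teda medzi vysvetľujúcimi premennými existuje štatisticky významná korelačná štruktúra.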
Multikolinearita nie je porušením exogenity. Nie je teda automaticky dôvodom na zamietnutie OLS modelu. Problém je hlavne inferenčný: veľké štandardné chyby a nestabilné individuálne koeficienty.
Možné riešenia:
Ak sú dve premenné takmer rovnaké a ekonomická teória nevyžaduje obe, môžeme jednu z nich vynechať.
# Reduced specification: the same regression without Adult_mortality.
# NOTE(review): Adult_mortality is the only regressor with a significant
# coefficient in the full model, so this looks like an illustration of the
# "drop a variable" remedy rather than a recommended specification - confirm.
model_reduced <- lm(
  Life_expectancy ~ Alcohol_consumption + Incidents_HIV,
  data = udaje
)
# Summary of the full model, printed for comparison with the reduced one.
summary(model)
Call:
lm(formula = Life_expectancy ~ Alcohol_consumption + Adult_mortality +
Incidents_HIV, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-0.30738 -0.07191 0.00501 0.06394 0.33087
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 80.406412 0.629177 127.796 < 2e-16 ***
Alcohol_consumption -0.009374 0.038803 -0.242 0.813
Adult_mortality -0.056383 0.002029 -27.784 2.92e-12 ***
Incidents_HIV 1.342110 0.899627 1.492 0.162
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1938 on 12 degrees of freedom
Multiple R-squared: 0.9876, Adjusted R-squared: 0.9845
F-statistic: 318.6 on 3 and 12 DF, p-value: 1.06e-11
# Summary of the reduced model (without Adult_mortality).
summary(model_reduced)
Call:
lm(formula = Life_expectancy ~ Alcohol_consumption + Incidents_HIV,
data = udaje)
Residuals:
Min 1Q Median 3Q Max
-1.4850 -1.0324 -0.5959 0.9277 2.5740
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 65.54601 2.57299 25.475 1.76e-12 ***
Alcohol_consumption -0.08151 0.30065 -0.271 0.791
Incidents_HIV 10.91938 6.45281 1.692 0.114
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.505 on 13 degrees of freedom
Multiple R-squared: 0.19, Adjusted R-squared: 0.06535
F-statistic: 1.524 on 2 and 13 DF, p-value: 0.2542