# Echo the source code of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)
# Packages attached for the analysis. Of the functions called in this file,
# resettest() comes from lmtest, crPlots() from car and boxcox() from MASS.
library(zoo)
library(tseries)
library(lmtest)
library(sandwich)
library(car)
library(MASS)
# Remove every object from the current environment (attached packages stay
# loaded). NOTE(review): wiping the workspace is generally discouraged in
# scripts, but several lm() calls further down omit `data =` and resolve their
# variables through attach(); same-named objects left in the workspace would
# mask the attached columns, so keep this until those calls pass `data =`.
rm(list=ls())
Ako závislú premennú som zvolila pce (osobná spotreba) a za vysvetľujúce premenné pop (počet obyvateľov), psavert (miera osobných úspor) a uempmed (mediánová dĺžka nezamestnanosti).
# Load the data set (first row is the header, "." is the decimal separator).
udaje <- read.csv("economics.csv", header = TRUE, dec = ".")

# Keep only the columns used by the models.
model_columns <- c("pce", "pop", "psavert", "uempmed")
udaje <- udaje[, model_columns]

# Median of every column, ignoring missing values. vapply() guarantees a
# numeric vector with one entry per column (sapply() could silently return a
# list or matrix for unexpected input).
column_medians <- vapply(udaje, median, numeric(1), na.rm = TRUE)

# Median imputation: replace the missing values of each column by that
# column's median. Map() walks the columns and their medians in parallel;
# assigning into `[]` keeps the data-frame structure and column names.
udaje_imputed <- udaje
udaje_imputed[] <- Map(
  function(values, fill) replace(values, is.na(values), fill),
  udaje,
  column_medians
)
udaje <- udaje_imputed

# Quick overview of the cleaned data.
summary(udaje)
pce pop psavert uempmed
Min. : 506.7 Min. :198712 Min. : 2.200 Min. : 4.000
1st Qu.: 1578.3 1st Qu.:224896 1st Qu.: 6.400 1st Qu.: 6.000
Median : 3936.8 Median :253060 Median : 8.400 Median : 7.500
Mean : 4820.1 Mean :257160 Mean : 8.567 Mean : 8.609
3rd Qu.: 7626.3 3rd Qu.:290291 3rd Qu.:11.100 3rd Qu.: 9.100
Max. :12193.8 Max. :320402 Max. :17.300 Max. :25.200
# Put the columns of `udaje` on the search path. The later lm() calls that omit
# `data =` depend on this to find pce, pop, psavert and uempmed.
# NOTE(review): attach() exposes a copy taken at this point; columns added to
# `udaje` afterwards are not visible through it, and same-named workspace
# objects take precedence over the attached columns.
attach(udaje)
# Baseline linear model: pce regressed on pop, psavert and uempmed
# (`+1` spells out the intercept explicitly).
model <- lm(pce ~ +1 + pop + psavert + uempmed, data = udaje)
# Coefficient table, residual summary and overall fit statistics.
summary(model)
Call:
lm(formula = pce ~ +1 + pop + psavert + uempmed, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-809.7 -392.7 -112.8 260.6 1296.0
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.162e+04 5.557e+02 -38.913 < 2e-16 ***
pop 9.869e-02 1.867e-03 52.857 < 2e-16 ***
psavert 8.682e+01 1.757e+01 4.942 1.02e-06 ***
uempmed 3.734e+01 9.669e+00 3.862 0.000125 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 519.5 on 570 degrees of freedom
Multiple R-squared: 0.9788, Adjusted R-squared: 0.9787
F-statistic: 8764 on 3 and 570 DF, p-value: < 2.2e-16
# Ramsey RESET test (lmtest) of the functional form of the baseline model.
resettest(model)
RESET test
data: model
RESET = 2997.9, df1 = 2, df2 = 568, p-value < 2.2e-16
# Residuals-versus-fitted plot of the baseline model (diagnostic plot no. 1).
plot(model, which = 1)
# Component + residual (partial residual) plots, one per predictor, to spot
# which regressors enter non-linearly.
car::crPlots(model)
# Quadratic extension of the baseline model: adds the squares of psavert and
# pop. `data = udaje` is passed explicitly (as in the baseline fit) so the
# variables are taken from the data frame rather than from the search path or
# from same-named objects in the workspace.
model_quad <- lm(
  pce ~ +1 + pop + psavert + uempmed + I(psavert^2) + I(pop^2),
  data = udaje
)
# Coefficient table and fit statistics of the quadratic model.
summary(model_quad)
Call:
lm(formula = pce ~ +1 + pop + psavert + uempmed + I(psavert^2) +
I(pop^2))
Residuals:
Min 1Q Median 3Q Max
-290.97 -99.08 -7.56 112.23 327.74
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.479e+04 4.871e+02 30.358 < 2e-16 ***
pop -1.750e-01 3.648e-03 -47.970 < 2e-16 ***
psavert 3.173e+00 1.414e+01 0.224 0.823
uempmed -2.267e+01 2.969e+00 -7.636 9.52e-14 ***
I(psavert^2) -1.009e+00 7.577e-01 -1.332 0.184
I(pop^2) 5.230e-07 6.956e-09 75.177 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 145.7 on 568 degrees of freedom
Multiple R-squared: 0.9983, Adjusted R-squared: 0.9983
F-statistic: 6.815e+04 on 5 and 568 DF, p-value: < 2.2e-16
# F-test of the nested models: do the two added quadratic terms jointly
# improve on the baseline model?
anova(model, model_quad)
Analysis of Variance Table
Model 1: pce ~ +1 + pop + psavert + uempmed
Model 2: pce ~ +1 + pop + psavert + uempmed + I(psavert^2) + I(pop^2)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 570 153826789
2 568 12063268 2 141763522 3337.5 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Ramsey RESET test of the quadratic model's functional form.
resettest(model_quad)
RESET test
data: model_quad
RESET = 75.198, df1 = 2, df2 = 566, p-value < 2.2e-16
Ak sa ukáže, že niektoré kvadratické členy sú nevýznamné, odstránime ich a ponecháme len signifikantné.
# If, for example, I(pop^2) does not come out significant, only psavert^2 is
# kept.
# NOTE(review): in the model_quad summary it is I(pop^2) that is significant
# and I(psavert^2) that is not, so this variant looks illustrative only -
# confirm which term was meant to be dropped.
# `data = udaje` is passed explicitly so the fit does not depend on attach().
model_quad2 <- lm(
  pce ~ +1 + pop + psavert + uempmed + I(psavert^2),
  data = udaje
)
# Coefficient table and fit statistics of the reduced quadratic model.
summary(model_quad2)
Call:
lm(formula = pce ~ +1 + pop + psavert + uempmed + I(psavert^2))
Residuals:
Min 1Q Median 3Q Max
-1890.50 -368.18 -74.83 160.65 1426.09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.957e+04 5.576e+02 -35.091 < 2e-16 ***
pop 9.636e-02 1.748e-03 55.117 < 2e-16 ***
psavert -3.151e+02 4.462e+01 -7.061 4.83e-12 ***
uempmed 5.636e+01 9.180e+00 6.139 1.56e-09 ***
I(psavert^2) 2.214e+01 2.289e+00 9.676 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 481.8 on 569 degrees of freedom
Multiple R-squared: 0.9818, Adjusted R-squared: 0.9816
F-statistic: 7664 on 4 and 569 DF, p-value: < 2.2e-16
# Extended model: squares of all three regressors (psavert, pop, uempmed).
# `data = udaje` is passed explicitly so the variables come from the data
# frame rather than from the search path or same-named workspace objects.
model_rozsireny <- lm(
  pce ~ pop + psavert + uempmed + I(psavert^2) + I(pop^2) + I(uempmed^2),
  data = udaje
)
# Coefficient table and fit statistics of the extended model.
summary(model_rozsireny)
Call:
lm(formula = pce ~ pop + psavert + uempmed + I(psavert^2) + I(pop^2) +
I(uempmed^2))
Residuals:
Min 1Q Median 3Q Max
-292.33 -99.21 -7.21 111.25 328.57
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.475e+04 5.058e+02 29.159 < 2e-16 ***
pop -1.747e-01 3.858e-03 -45.267 < 2e-16 ***
psavert 2.935e+00 1.418e+01 0.207 0.83611
uempmed -2.523e+01 9.651e+00 -2.614 0.00918 **
I(psavert^2) -9.791e-01 7.658e-01 -1.279 0.20155
I(pop^2) 5.224e-07 7.262e-09 71.939 < 2e-16 ***
I(uempmed^2) 9.517e-02 3.417e-01 0.279 0.78072
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 145.9 on 567 degrees of freedom
Multiple R-squared: 0.9983, Adjusted R-squared: 0.9983
F-statistic: 5.67e+04 on 6 and 567 DF, p-value: < 2.2e-16
# F-test of the nested models: joint significance of the three quadratic terms
# relative to the baseline model.
anova(model, model_rozsireny)
Analysis of Variance Table
Model 1: pce ~ +1 + pop + psavert + uempmed
Model 2: pce ~ pop + psavert + uempmed + I(psavert^2) + I(pop^2) + I(uempmed^2)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 570 153826789
2 567 12061617 3 141765172 2221.4 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Ramsey RESET test of the extended model's functional form.
resettest(model_rozsireny)
RESET test
data: model_rozsireny
RESET = 83.466, df1 = 2, df2 = 565, p-value < 2.2e-16
# Regime indicator: 1 when psavert is at least 12, 0 when it is below 12
# (missing values stay missing).
udaje[["DUM"]] <- as.numeric(udaje[["psavert"]] >= 12)

# Break in the intercept: DUM shifts the constant term between the two regimes.
modelD_auto <- lm(
  pce ~ +1 + DUM + pop + psavert + uempmed,
  data = udaje
)
summary(modelD_auto)
Call:
lm(formula = pce ~ +1 + DUM + pop + psavert + uempmed, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-865.2 -333.8 -111.6 193.1 1431.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.077e+04 5.681e+02 -36.556 < 2e-16 ***
DUM 4.106e+02 7.913e+01 5.188 2.95e-07 ***
pop 9.647e-02 1.875e-03 51.443 < 2e-16 ***
psavert 3.553e+01 1.982e+01 1.792 0.0736 .
uempmed 4.750e+01 9.656e+00 4.919 1.14e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 508.1 on 569 degrees of freedom
Multiple R-squared: 0.9797, Adjusted R-squared: 0.9796
F-statistic: 6878 on 4 and 569 DF, p-value: < 2.2e-16
# Break in the slope (DUM * psavert interaction): the psavert coefficient is
# allowed to differ between the DUM = 0 and DUM = 1 observations, while the
# intercept stays common.
modelD_sklon <- lm(pce ~ +1 + pop + psavert + I(DUM*psavert) + uempmed, data = udaje)
# Coefficient table and fit statistics of the slope-break model.
summary(modelD_sklon)
Call:
lm(formula = pce ~ +1 + pop + psavert + I(DUM * psavert) + uempmed,
data = udaje)
Residuals:
Min 1Q Median 3Q Max
-841.1 -330.5 -113.2 197.6 1432.0
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.071e+04 5.711e+02 -36.271 < 2e-16 ***
pop 9.631e-02 1.883e-03 51.157 < 2e-16 ***
psavert 3.359e+01 2.002e+01 1.678 0.0939 .
I(DUM * psavert) 3.168e+01 6.113e+00 5.182 3.05e-07 ***
uempmed 4.813e+01 9.683e+00 4.971 8.83e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 508.1 on 569 degrees of freedom
Multiple R-squared: 0.9797, Adjusted R-squared: 0.9796
F-statistic: 6878 on 4 and 569 DF, p-value: < 2.2e-16
# F-test of the nested models: does the slope-break term improve on the
# baseline model?
anova(model, modelD_sklon)
Analysis of Variance Table
Model 1: pce ~ +1 + pop + psavert + uempmed
Model 2: pce ~ +1 + pop + psavert + I(DUM * psavert) + uempmed
Res.Df RSS Df Sum of Sq F Pr(>F)
1 570 153826789
2 569 146894306 1 6932483 26.853 3.053e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# Ramsey RESET test of the slope-break model's functional form.
resettest(modelD_sklon)
RESET test
data: modelD_sklon
RESET = 3172.2, df1 = 2, df2 = 567, p-value < 2.2e-16
# Box-Cox profile log-likelihood plot (MASS) for the baseline model, used to
# choose a power transformation of the response.
boxcox(model)
# Example for lambda = 0, i.e. the log transformation of the response.
model_bc <- lm(I(log(pce)) ~ +1 + pop + psavert + uempmed, data = udaje)
# Coefficient table and fit statistics of the log-response model.
summary(model_bc)
Call:
lm(formula = I(log(pce)) ~ +1 + pop + psavert + uempmed, data = udaje)
Residuals:
Min 1Q Median 3Q Max
-0.4294 -0.1419 -0.0129 0.1854 0.3044
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.249e+00 2.007e-01 11.209 < 2e-16 ***
pop 2.440e-05 6.742e-07 36.183 < 2e-16 ***
psavert -3.058e-02 6.344e-03 -4.820 1.84e-06 ***
uempmed -1.707e-02 3.491e-03 -4.888 1.33e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1876 on 570 degrees of freedom
Multiple R-squared: 0.9608, Adjusted R-squared: 0.9606
F-statistic: 4658 on 3 and 570 DF, p-value: < 2.2e-16
# Ramsey RESET test of the log-response model's functional form.
resettest(model_bc)
RESET test
data: model_bc
RESET = 6935.7, df1 = 2, df2 = 568, p-value < 2.2e-16
Model: \[ pce_i = \beta_0 + \beta_1 \, pop_i + \beta_2 \, psavert_i + \beta_3 \, uempmed_i + \varepsilon_i \]
Pridali sa členy \(pop^2\) a \(psavert^2\).
To znamená, že populácia má nelineárny vplyv na
spotrebu – rast obyvateľstva ju zvyšuje, a to s rastúcou
intenzitou (koeficient pri \(pop^2\) je kladný a hraničný vplyv populácie je v celom rozsahu údajov kladný).
Pri úsporách sa nelinearita v tomto modeli nepotvrdila: kvadratický člen \(psavert^2\) nie je štatisticky významný (p ≈ 0,18).
Model s jedným kvadratickým členom \(psavert^2\):
Model ukazuje mierne nelineárny vzťah medzi úsporami a spotrebou.
Model doplnený o \(I(uempmed^2)\).
Aj keď model vysvetľuje takmer všetku variabilitu, RESET test stále zamieta správnosť špecifikácie – existujú nezachytené vzťahy alebo interakcie.
Dummy premenná \(DUM = 1\), ak \(psavert \geq 12\), inak \(DUM = 0\).
Testovaná transformácia: \[ \log(pce_i) = \beta_0 + \beta_1 \, pop_i + \beta_2 \, psavert_i + \beta_3 \, uempmed_i + \varepsilon_i \]
Transformácia pomáha pri interpretácii koeficientov (semielasticity, keďže logaritmovaná je len závislá premenná), ale špecifikáciu modelu nezlepšuje – RESET test ju naďalej zamieta.
Výsledky ukazujú, že:
Záver: Osobná spotreba je silne závislá od veľkosti populácie, pričom úspory a nezamestnanosť majú doplnkový vplyv. Model možno ešte vylepšiť nelineárnymi väzbami alebo interakciami medzi premennými.