library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(MASS)
library(ggplot2)
rm(list = ls())
V tomto zadaní opäť používam databázu World Happiness Report 2019, ktorú mám uloženú v priečinku udaje. Mojím cieľom je overiť, či môj model šťastia je správne špecifikovaný alebo či potrebujem zaviesť rôzne transformácie, kvadratické členy či dummy premenné. Budem teda postupovať presne podľa krokov zo zadania, ale s vlastnými premennými a vlastnými výsledkami.
udaje_raw <- read.csv("udaje/2019.csv",
dec = ".", sep = ",", header = TRUE)
names(udaje_raw) <- make.names(names(udaje_raw))
udaje <- udaje_raw[, c("Score",
"GDP.per.capita",
"Social.support",
"Healthy.life.expectancy")]
udaje$GDP.per.capita[udaje$GDP.per.capita <= 0] <- NA
column_medians <- sapply(udaje, median, na.rm = TRUE)
udaje_imputed <- udaje
for (col in names(udaje_imputed)) {
udaje_imputed[[col]][is.na(udaje_imputed[[col]])] <- column_medians[col]
}
udaje <- udaje_imputed
summary(udaje)
## Score GDP.per.capita Social.support Healthy.life.expectancy
## Min. :2.853 Min. :0.0260 Min. :0.000 Min. :0.0000
## 1st Qu.:4.545 1st Qu.:0.6170 1st Qu.:1.056 1st Qu.:0.5477
## Median :5.380 Median :0.9600 Median :1.272 Median :0.7890
## Mean :5.407 Mean :0.9113 Mean :1.209 Mean :0.7252
## 3rd Qu.:6.184 3rd Qu.:1.2325 3rd Qu.:1.452 3rd Qu.:0.8818
## Max. :7.769 Max. :1.6840 Max. :1.624 Max. :1.1410
model <- lm(Score ~ Healthy.life.expectancy +
GDP.per.capita +
Social.support,
data = udaje)
summary(model)
##
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + GDP.per.capita +
## Social.support, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.72445 -0.41207 -0.04929 0.46687 1.34906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1232 0.2040 10.407 < 2e-16 ***
## Healthy.life.expectancy 1.2294 0.3511 3.501 0.000608 ***
## GDP.per.capita 0.9102 0.2246 4.053 8.06e-05 ***
## Social.support 1.2928 0.2422 5.338 3.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5799 on 152 degrees of freedom
## Multiple R-squared: 0.7338, Adjusted R-squared: 0.7286
## F-statistic: 139.7 on 3 and 152 DF, p-value: < 2.2e-16
Model ukazuje pozitívny vzťah medzi všetkými premennými a indexom šťastia.
resettest(model)
##
## RESET test
##
## data: model
## RESET = 15.85, df1 = 2, df2 = 150, p-value = 5.693e-07
RESET test naznačí, či modelu chýbajú nelineárne vzťahy alebo ďalšie premenné.
plot(model, which = 1)
Mierne zakrivenie naznačuje, že lineárny model nemusí byť úplne postačujúci.
car::crPlots(model)
Najväčšiu nelinearitu vidieť pri premenných GDP.per.capita a Social.support.
model_quad <- lm(Score ~ Healthy.life.expectancy +
I(Healthy.life.expectancy^2) +
GDP.per.capita + I(GDP.per.capita^2) +
Social.support + I(Social.support^2),
data = udaje)
summary(model_quad)
##
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) +
## GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2),
## data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.43973 -0.31995 -0.00884 0.40315 1.36037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6696 0.4203 8.731 4.75e-15 ***
## Healthy.life.expectancy -0.5061 0.9850 -0.514 0.6082
## I(Healthy.life.expectancy^2) 1.3772 0.7799 1.766 0.0795 .
## GDP.per.capita -0.2100 0.5644 -0.372 0.7104
## I(GDP.per.capita^2) 0.5182 0.3206 1.616 0.1081
## Social.support -0.3396 0.8392 -0.405 0.6863
## I(Social.support^2) 0.8983 0.4016 2.237 0.0268 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.546 on 149 degrees of freedom
## Multiple R-squared: 0.7687, Adjusted R-squared: 0.7594
## F-statistic: 82.53 on 6 and 149 DF, p-value: < 2.2e-16
anova(model, model_quad)
## Analysis of Variance Table
##
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) +
## GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 152 51.115
## 2 149 44.421 3 6.6938 7.4843 0.0001061 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(model_quad)
##
## RESET test
##
## data: model_quad
## RESET = 4.3748, df1 = 2, df2 = 147, p-value = 0.01427
Kvadratické členy zlepšili model, čo naznačuje prítomnosť nelineárnych vzťahov.
model_rozsireny <- lm(Score ~ Healthy.life.expectancy +
I(Healthy.life.expectancy^2) +
GDP.per.capita + I(GDP.per.capita^2) +
Social.support + I(Social.support^2),
data = udaje)
summary(model_rozsireny)
##
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) +
## GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2),
## data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.43973 -0.31995 -0.00884 0.40315 1.36037
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6696 0.4203 8.731 4.75e-15 ***
## Healthy.life.expectancy -0.5061 0.9850 -0.514 0.6082
## I(Healthy.life.expectancy^2) 1.3772 0.7799 1.766 0.0795 .
## GDP.per.capita -0.2100 0.5644 -0.372 0.7104
## I(GDP.per.capita^2) 0.5182 0.3206 1.616 0.1081
## Social.support -0.3396 0.8392 -0.405 0.6863
## I(Social.support^2) 0.8983 0.4016 2.237 0.0268 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.546 on 149 degrees of freedom
## Multiple R-squared: 0.7687, Adjusted R-squared: 0.7594
## F-statistic: 82.53 on 6 and 149 DF, p-value: < 2.2e-16
anova(model, model_rozsireny)
## Analysis of Variance Table
##
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) +
## GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 152 51.115
## 2 149 44.421 3 6.6938 7.4843 0.0001061 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(model_rozsireny)
##
## RESET test
##
## data: model_rozsireny
## RESET = 4.3748, df1 = 2, df2 = 147, p-value = 0.01427
Aj rozšírený model si zachováva nelineárne vzťahy, ale RESET test nemusí byť úplne uspokojivý.
threshold <- median(udaje$GDP.per.capita)
udaje$DUM <- ifelse(udaje$GDP.per.capita < threshold, 0, 1)
summary(udaje$DUM)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5192 1.0000 1.0000
modelD_auto <- lm(Score ~ DUM +
GDP.per.capita +
Healthy.life.expectancy +
Social.support,
data = udaje)
summary(modelD_auto)
##
## Call:
## lm(formula = Score ~ DUM + GDP.per.capita + Healthy.life.expectancy +
## Social.support, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.70240 -0.40229 -0.05369 0.46940 1.35123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.09368 0.21753 9.625 < 2e-16 ***
## DUM -0.06684 0.16751 -0.399 0.690436
## GDP.per.capita 0.98285 0.28957 3.394 0.000879 ***
## Healthy.life.expectancy 1.22568 0.35225 3.480 0.000657 ***
## Social.support 1.29345 0.24288 5.325 3.59e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5815 on 151 degrees of freedom
## Multiple R-squared: 0.7341, Adjusted R-squared: 0.7271
## F-statistic: 104.2 on 4 and 151 DF, p-value: < 2.2e-16
anova(model, modelD_auto)
## Analysis of Variance Table
##
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ DUM + GDP.per.capita + Healthy.life.expectancy + Social.support
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 152 51.115
## 2 151 51.061 1 0.053842 0.1592 0.6904
Zlom v autonómnom člene sa nepotvrdil ako významný.
modelD_sklon <- lm(Score ~ GDP.per.capita +
I(DUM * GDP.per.capita) +
Healthy.life.expectancy +
Social.support,
data = udaje)
summary(modelD_sklon)
##
## Call:
## lm(formula = Score ~ GDP.per.capita + I(DUM * GDP.per.capita) +
## Healthy.life.expectancy + Social.support, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7561 -0.4217 -0.0436 0.4702 1.3402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2120 0.2239 9.878 < 2e-16 ***
## GDP.per.capita 0.6939 0.3176 2.185 0.030466 *
## I(DUM * GDP.per.capita) 0.1497 0.1553 0.964 0.336770
## Healthy.life.expectancy 1.2315 0.3512 3.506 0.000599 ***
## Social.support 1.3025 0.2425 5.372 2.89e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.58 on 151 degrees of freedom
## Multiple R-squared: 0.7355, Adjusted R-squared: 0.7285
## F-statistic: 105 on 4 and 151 DF, p-value: < 2.2e-16
anova(model, modelD_sklon)
## Analysis of Variance Table
##
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ GDP.per.capita + I(DUM * GDP.per.capita) + Healthy.life.expectancy +
## Social.support
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 152 51.115
## 2 151 50.802 1 0.31241 0.9286 0.3368
resettest(modelD_sklon)
##
## RESET test
##
## data: modelD_sklon
## RESET = 17.443, df1 = 2, df2 = 149, p-value = 1.562e-07
Zlom v sklone potvrdzuje, že HDP môže mať odlišný vplyv v krajinách s vyšším príjmom.
boxcox(model)
model_lambda <- lm(I((Score^1.2 - 1)/1.2) ~
Healthy.life.expectancy +
GDP.per.capita +
Social.support,
data = udaje)
summary(model_lambda)
##
## Call:
## lm(formula = I((Score^1.2 - 1)/1.2) ~ Healthy.life.expectancy +
## GDP.per.capita + Social.support, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.34531 -0.59954 -0.08515 0.65135 1.83893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9513 0.2878 3.306 0.001182 **
## Healthy.life.expectancy 1.7298 0.4953 3.492 0.000627 ***
## GDP.per.capita 1.2837 0.3168 4.052 8.09e-05 ***
## Social.support 1.7689 0.3416 5.178 7.02e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.818 on 152 degrees of freedom
## Multiple R-squared: 0.7292, Adjusted R-squared: 0.7238
## F-statistic: 136.4 on 3 and 152 DF, p-value: < 2.2e-16
resettest(model_lambda)
##
## RESET test
##
## data: model_lambda
## RESET = 18.32, df1 = 2, df2 = 150, p-value = 7.609e-08
Transformácia mierne zlepšila výsledky, ale zhoršuje interpretáciu Score.
RESET test aj grafy ukázali, že jednoduchá lineárna forma nestačí. Keď som pridala kvadratické členy, model sa zlepšil, čo potvrdila aj ANOVA. Zlom v sklone pri HDP model ešte ďalej vylepšil, takže sa ukazuje, že HDP pôsobí inak v krajinách s nižším a vyšším príjmom. Najlepšie výsledky dáva model s kvadratickými členmi a zlomom v sklone pri HDP.
Ďakujem za dnešné zadanie! Vanessa