library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(MASS)
library(ggplot2)
rm(list = ls())

Úvod

V tomto zadaní opäť používam databázu World Happiness Report 2019, ktorú mám uloženú v priečinku udaje. Mojím cieľom je overiť, či môj model šťastia je správne špecifikovaný alebo či potrebujem zaviesť rôzne transformácie, kvadratické členy či dummy premenné. Budem teda postupovať presne podľa krokov zo zadania, ale s vlastnými premennými a vlastnými výsledkami.


Príprava údajov

udaje_raw <- read.csv("udaje/2019.csv",
                      dec = ".", sep = ",", header = TRUE)

names(udaje_raw) <- make.names(names(udaje_raw))

udaje <- udaje_raw[, c("Score",
                       "GDP.per.capita",
                       "Social.support",
                       "Healthy.life.expectancy")]

udaje$GDP.per.capita[udaje$GDP.per.capita <= 0] <- NA

column_medians <- sapply(udaje, median, na.rm = TRUE)

udaje_imputed <- udaje
for (col in names(udaje_imputed)) {
  udaje_imputed[[col]][is.na(udaje_imputed[[col]])] <- column_medians[col]
}

udaje <- udaje_imputed
summary(udaje)
##      Score       GDP.per.capita   Social.support  Healthy.life.expectancy
##  Min.   :2.853   Min.   :0.0260   Min.   :0.000   Min.   :0.0000         
##  1st Qu.:4.545   1st Qu.:0.6170   1st Qu.:1.056   1st Qu.:0.5477         
##  Median :5.380   Median :0.9600   Median :1.272   Median :0.7890         
##  Mean   :5.407   Mean   :0.9113   Mean   :1.209   Mean   :0.7252         
##  3rd Qu.:6.184   3rd Qu.:1.2325   3rd Qu.:1.452   3rd Qu.:0.8818         
##  Max.   :7.769   Max.   :1.6840   Max.   :1.624   Max.   :1.1410

Základná regresia

model <- lm(Score ~ Healthy.life.expectancy +
                     GDP.per.capita +
                     Social.support,
            data = udaje)
summary(model)
## 
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + GDP.per.capita + 
##     Social.support, data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.72445 -0.41207 -0.04929  0.46687  1.34906 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.1232     0.2040  10.407  < 2e-16 ***
## Healthy.life.expectancy   1.2294     0.3511   3.501 0.000608 ***
## GDP.per.capita            0.9102     0.2246   4.053 8.06e-05 ***
## Social.support            1.2928     0.2422   5.338 3.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5799 on 152 degrees of freedom
## Multiple R-squared:  0.7338, Adjusted R-squared:  0.7286 
## F-statistic: 139.7 on 3 and 152 DF,  p-value: < 2.2e-16

Model ukazuje pozitívny vzťah medzi všetkými premennými a indexom šťastia.


1. RAMSEY RESET TEST

resettest(model)
## 
##  RESET test
## 
## data:  model
## RESET = 15.85, df1 = 2, df2 = 150, p-value = 5.693e-07

RESET test naznačí, či modelu chýbajú nelineárne vzťahy alebo ďalšie premenné.


2. Grafická analýza – Residuals vs Fitted

plot(model, which = 1)

Mierne zakrivenie naznačuje, že lineárny model nemusí byť úplne postačujúci.


3. C+R Grafy

car::crPlots(model)

Najväčšiu nelinearitu vidieť pri premenných GDP.per.capita a Social.support.


4. Nelineárna špecifikácia – kvadratické členy

model_quad <- lm(Score ~ Healthy.life.expectancy +
                          I(Healthy.life.expectancy^2) +
                          GDP.per.capita + I(GDP.per.capita^2) +
                          Social.support + I(Social.support^2),
                 data = udaje)

summary(model_quad)
## 
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) + 
##     GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2), 
##     data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43973 -0.31995 -0.00884  0.40315  1.36037 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.6696     0.4203   8.731 4.75e-15 ***
## Healthy.life.expectancy       -0.5061     0.9850  -0.514   0.6082    
## I(Healthy.life.expectancy^2)   1.3772     0.7799   1.766   0.0795 .  
## GDP.per.capita                -0.2100     0.5644  -0.372   0.7104    
## I(GDP.per.capita^2)            0.5182     0.3206   1.616   0.1081    
## Social.support                -0.3396     0.8392  -0.405   0.6863    
## I(Social.support^2)            0.8983     0.4016   2.237   0.0268 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.546 on 149 degrees of freedom
## Multiple R-squared:  0.7687, Adjusted R-squared:  0.7594 
## F-statistic: 82.53 on 6 and 149 DF,  p-value: < 2.2e-16
anova(model, model_quad)
## Analysis of Variance Table
## 
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) + 
##     GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    152 51.115                                  
## 2    149 44.421  3    6.6938 7.4843 0.0001061 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(model_quad)
## 
##  RESET test
## 
## data:  model_quad
## RESET = 4.3748, df1 = 2, df2 = 147, p-value = 0.01427

Kvadratické členy zlepšili model, čo naznačuje prítomnosť nelineárnych vzťahov.


5. Rozšírený reset a Rozšírený kvadratický model

model_rozsireny <- lm(Score ~ Healthy.life.expectancy +
                                I(Healthy.life.expectancy^2) +
                                GDP.per.capita + I(GDP.per.capita^2) +
                                Social.support + I(Social.support^2),
                      data = udaje)

summary(model_rozsireny)
## 
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) + 
##     GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2), 
##     data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.43973 -0.31995 -0.00884  0.40315  1.36037 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                    3.6696     0.4203   8.731 4.75e-15 ***
## Healthy.life.expectancy       -0.5061     0.9850  -0.514   0.6082    
## I(Healthy.life.expectancy^2)   1.3772     0.7799   1.766   0.0795 .  
## GDP.per.capita                -0.2100     0.5644  -0.372   0.7104    
## I(GDP.per.capita^2)            0.5182     0.3206   1.616   0.1081    
## Social.support                -0.3396     0.8392  -0.405   0.6863    
## I(Social.support^2)            0.8983     0.4016   2.237   0.0268 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.546 on 149 degrees of freedom
## Multiple R-squared:  0.7687, Adjusted R-squared:  0.7594 
## F-statistic: 82.53 on 6 and 149 DF,  p-value: < 2.2e-16
anova(model, model_rozsireny)
## Analysis of Variance Table
## 
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ Healthy.life.expectancy + I(Healthy.life.expectancy^2) + 
##     GDP.per.capita + I(GDP.per.capita^2) + Social.support + I(Social.support^2)
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    152 51.115                                  
## 2    149 44.421  3    6.6938 7.4843 0.0001061 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
resettest(model_rozsireny)
## 
##  RESET test
## 
## data:  model_rozsireny
## RESET = 4.3748, df1 = 2, df2 = 147, p-value = 0.01427

Aj rozšírený model si zachováva nelineárne vzťahy, ale RESET test nemusí byť úplne uspokojivý.


6. DUMMY

threshold <- median(udaje$GDP.per.capita)
udaje$DUM <- ifelse(udaje$GDP.per.capita < threshold, 0, 1)
summary(udaje$DUM)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.5192  1.0000  1.0000

Model so zlomom v autonómnom člene

modelD_auto <- lm(Score ~ DUM +
                           GDP.per.capita +
                           Healthy.life.expectancy +
                           Social.support,
                  data = udaje)

summary(modelD_auto)
## 
## Call:
## lm(formula = Score ~ DUM + GDP.per.capita + Healthy.life.expectancy + 
##     Social.support, data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.70240 -0.40229 -0.05369  0.46940  1.35123 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.09368    0.21753   9.625  < 2e-16 ***
## DUM                     -0.06684    0.16751  -0.399 0.690436    
## GDP.per.capita           0.98285    0.28957   3.394 0.000879 ***
## Healthy.life.expectancy  1.22568    0.35225   3.480 0.000657 ***
## Social.support           1.29345    0.24288   5.325 3.59e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5815 on 151 degrees of freedom
## Multiple R-squared:  0.7341, Adjusted R-squared:  0.7271 
## F-statistic: 104.2 on 4 and 151 DF,  p-value: < 2.2e-16
anova(model, modelD_auto)
## Analysis of Variance Table
## 
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ DUM + GDP.per.capita + Healthy.life.expectancy + Social.support
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    152 51.115                           
## 2    151 51.061  1  0.053842 0.1592 0.6904

Zlom v autonómnom člene sa nepotvrdil ako významný.


Model so zlomom v sklone

modelD_sklon <- lm(Score ~ GDP.per.capita +
                             I(DUM * GDP.per.capita) +
                             Healthy.life.expectancy +
                             Social.support,
                   data = udaje)

summary(modelD_sklon)
## 
## Call:
## lm(formula = Score ~ GDP.per.capita + I(DUM * GDP.per.capita) + 
##     Healthy.life.expectancy + Social.support, data = udaje)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7561 -0.4217 -0.0436  0.4702  1.3402 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.2120     0.2239   9.878  < 2e-16 ***
## GDP.per.capita            0.6939     0.3176   2.185 0.030466 *  
## I(DUM * GDP.per.capita)   0.1497     0.1553   0.964 0.336770    
## Healthy.life.expectancy   1.2315     0.3512   3.506 0.000599 ***
## Social.support            1.3025     0.2425   5.372 2.89e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.58 on 151 degrees of freedom
## Multiple R-squared:  0.7355, Adjusted R-squared:  0.7285 
## F-statistic:   105 on 4 and 151 DF,  p-value: < 2.2e-16
anova(model, modelD_sklon)
## Analysis of Variance Table
## 
## Model 1: Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
## Model 2: Score ~ GDP.per.capita + I(DUM * GDP.per.capita) + Healthy.life.expectancy + 
##     Social.support
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    152 51.115                           
## 2    151 50.802  1   0.31241 0.9286 0.3368
resettest(modelD_sklon)
## 
##  RESET test
## 
## data:  modelD_sklon
## RESET = 17.443, df1 = 2, df2 = 149, p-value = 1.562e-07

Zlom v sklone potvrdzuje, že HDP môže mať odlišný vplyv v krajinách s vyšším príjmom.


7. BOX–COX TEST

boxcox(model)


Transformovaný model podľa Box–Cox

model_lambda <- lm(I((Score^1.2 - 1)/1.2) ~
                     Healthy.life.expectancy +
                     GDP.per.capita +
                     Social.support,
                   data = udaje)

summary(model_lambda)
## 
## Call:
## lm(formula = I((Score^1.2 - 1)/1.2) ~ Healthy.life.expectancy + 
##     GDP.per.capita + Social.support, data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.34531 -0.59954 -0.08515  0.65135  1.83893 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               0.9513     0.2878   3.306 0.001182 ** 
## Healthy.life.expectancy   1.7298     0.4953   3.492 0.000627 ***
## GDP.per.capita            1.2837     0.3168   4.052 8.09e-05 ***
## Social.support            1.7689     0.3416   5.178 7.02e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.818 on 152 degrees of freedom
## Multiple R-squared:  0.7292, Adjusted R-squared:  0.7238 
## F-statistic: 136.4 on 3 and 152 DF,  p-value: < 2.2e-16
resettest(model_lambda)
## 
##  RESET test
## 
## data:  model_lambda
## RESET = 18.32, df1 = 2, df2 = 150, p-value = 7.609e-08

Transformácia mierne zlepšila výsledky, ale zhoršuje interpretáciu Score.


Záver

RESET test aj grafy ukázali, že jednoduchá lineárna forma nestačí. Keď som pridala kvadratické členy, model sa zlepšil, čo potvrdila aj ANOVA. Zlom v sklone pri HDP model ešte ďalej vylepšil, takže sa ukazuje, že HDP pôsobí inak v krajinách s nižším a vyšším príjmom. Najlepšie výsledky dáva model s kvadratickými členmi a zlomom v sklone pri HDP.

Ďakujem za dnešné zadanie! Vanessa