Úvod

V tomto cvičení už používam inú databázu ako minule, a to:

World Happiness Report 2019
(https://www.kaggle.com/datasets/unsdsn/world-happiness)

Dataset som si stiahla vo formáte CSV a vložila do zložky udaje pod názvom:

udaje/2019.csv

Dataset obsahuje údaje o indexe šťastia a rôznych socio-ekonomických ukazovateľoch
(HDP na obyvateľa, sociálna opora, zdravá dĺžka života a pod.).

Budem analyzovať premenné:

Moja hypotéza: vyšší HDP a sociálna opora zvyšujú hodnotu indexu šťastia.


Balíčky

library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(ggplot2)
library(patchwork)
rm(list = ls())

Načítanie údajov

udaje_raw <- read.csv("udaje/2019.csv",
                      dec = ".", sep = ",", header = TRUE)

names(udaje_raw) <- make.names(names(udaje_raw))

udaje <- udaje_raw[,
  c("Score", "GDP.per.capita", "Social.support", "Healthy.life.expectancy")
]

summary(udaje)
##      Score       GDP.per.capita   Social.support  Healthy.life.expectancy
##  Min.   :2.853   Min.   :0.0000   Min.   :0.000   Min.   :0.0000         
##  1st Qu.:4.545   1st Qu.:0.6028   1st Qu.:1.056   1st Qu.:0.5477         
##  Median :5.380   Median :0.9600   Median :1.272   Median :0.7890         
##  Mean   :5.407   Mean   :0.9051   Mean   :1.209   Mean   :0.7252         
##  3rd Qu.:6.184   3rd Qu.:1.2325   3rd Qu.:1.452   3rd Qu.:0.8818         
##  Max.   :7.769   Max.   :1.6840   Max.   :1.624   Max.   :1.1410

Imputácia chýbajúcich hodnôt

udaje$GDP.per.capita[udaje$GDP.per.capita <= 0] <- NA

column_medians <- sapply(udaje, median, na.rm = TRUE)

udaje_imputed <- udaje
for(col in names(udaje_imputed)){
  udaje_imputed[[col]][is.na(udaje_imputed[[col]])] <- column_medians[col]
}

udaje <- udaje_imputed
summary(udaje)
##      Score       GDP.per.capita   Social.support  Healthy.life.expectancy
##  Min.   :2.853   Min.   :0.0260   Min.   :0.000   Min.   :0.0000         
##  1st Qu.:4.545   1st Qu.:0.6170   1st Qu.:1.056   1st Qu.:0.5477         
##  Median :5.380   Median :0.9600   Median :1.272   Median :0.7890         
##  Mean   :5.407   Mean   :0.9113   Mean   :1.209   Mean   :0.7252         
##  3rd Qu.:6.184   3rd Qu.:1.2325   3rd Qu.:1.452   3rd Qu.:0.8818         
##  Max.   :7.769   Max.   :1.6840   Max.   :1.624   Max.   :1.1410

Boxploty premenných

par(mfrow = c(2,2))
par(mar = c(4,4,2,1))

for(col in names(udaje)){
  boxplot(udaje[[col]], main = col, col = "lightblue", xlab = "Hodnota")
}

mtext("Boxploty premenných (World Happiness 2019)", outer = TRUE, cex = 1.2, font = 2)

par(mfrow = c(1,1))

Lineárny model 1

Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support

model <- lm(Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support,
            data = udaje)
summary(model)
## 
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + GDP.per.capita + 
##     Social.support, data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.72445 -0.41207 -0.04929  0.46687  1.34906 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               2.1232     0.2040  10.407  < 2e-16 ***
## Healthy.life.expectancy   1.2294     0.3511   3.501 0.000608 ***
## GDP.per.capita            0.9102     0.2246   4.053 8.06e-05 ***
## Social.support            1.2928     0.2422   5.338 3.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5799 on 152 degrees of freedom
## Multiple R-squared:  0.7338, Adjusted R-squared:  0.7286 
## F-statistic: 139.7 on 3 and 152 DF,  p-value: < 2.2e-16

Diagnostické grafy modelu 1

par(mfrow = c(2,2))
plot(model)

par(mfrow = c(1,1))

Test normality a outlierov

residuals_m1 <- residuals(model)

jarque.bera.test(residuals_m1)
## 
##  Jarque Bera Test
## 
## data:  residuals_m1
## X-squared = 0.96354, df = 2, p-value = 0.6177
outlierTest(model)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferroni p
## 148 -3.101834          0.0022958      0.35814

Lineárny model 2

Score ~ log(GDP.per.capita) + Social.support

model2 <- lm(Score ~ I(log(GDP.per.capita)) + Social.support,
             data = udaje)
summary(model2)
## 
## Call:
## lm(formula = Score ~ I(log(GDP.per.capita)) + Social.support, 
##     data = udaje)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.91457 -0.40283 -0.00706  0.42891  1.81033 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.9413     0.3480   8.452 2.10e-14 ***
## I(log(GDP.per.capita))   0.4572     0.1180   3.875 0.000158 ***
## Social.support           2.1336     0.2662   8.015 2.63e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.673 on 153 degrees of freedom
## Multiple R-squared:  0.6392, Adjusted R-squared:  0.6345 
## F-statistic: 135.5 on 2 and 153 DF,  p-value: < 2.2e-16

Diagnostické grafy modelu 2

par(mfrow = c(2,2))
plot(model2)

par(mfrow = c(1,1))

Normalita a odľahlé hodnoty – model 2

residuals_m2 <- residuals(model2)

jarque.bera.test(residuals_m2)
## 
##  Jarque Bera Test
## 
## data:  residuals_m2
## X-squared = 0.11815, df = 2, p-value = 0.9426
outlierTest(model2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
##     rstudent unadjusted p-value Bonferroni p
## 155 3.034503          0.0028348      0.44223

Heteroskedasticita – vizuálne hodnotenie

udaje$resid_m1 <- residuals_m1
udaje$resid_m2 <- residuals_m2

Model 1

p1 <- ggplot(udaje, aes(x = GDP.per.capita, y = resid_m1^2)) +
  geom_point(alpha=.6) + 
  geom_smooth(method="loess", se=FALSE, color="red")

p2 <- ggplot(udaje, aes(x = Social.support, y = resid_m1^2)) +
  geom_point(alpha=.6) + 
  geom_smooth(method="loess", se=FALSE, color="red")

p1 + p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Model 2

p3 <- ggplot(udaje, aes(x = log(GDP.per.capita), y = resid_m2^2)) +
  geom_point(alpha=.6) + 
  geom_smooth(method="loess", se=FALSE, color="red")

p4 <- ggplot(udaje, aes(x = Social.support, y = resid_m2^2)) +
  geom_point(alpha=.6) + 
  geom_smooth(method="loess", se=FALSE, color="red")

p3 + p4
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'


Breusch–Pagan test

bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.5443, df = 3, p-value = 0.6721
bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 5.6484, df = 2, p-value = 0.05935

White robustné štandardné chyby

coeftest(model, vcov = vcovHC(model))
## 
## t test of coefficients:
## 
##                         Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)              2.12317    0.24599  8.6311 7.615e-15 ***
## Healthy.life.expectancy  1.22942    0.35163  3.4963 0.0006186 ***
## GDP.per.capita           0.91023    0.21025  4.3292 2.703e-05 ***
## Social.support           1.29284    0.25518  5.0663 1.161e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Záver

Z výsledkov vyplýva, že najväčší vplyv na úroveň šťastia majú premenné GDP per capita a Social support, ktoré sú v oboch modeloch štatisticky významné.
Premenná Healthy life expectancy nemala jednoznačný efekt, preto som ju v druhom modeli vynechala.

Logaritmická transformácia HDP zlepšila diagnostiku modelu a znížila vplyv extrémnych hodnôt.
Druhý model sa preto ukázal ako vhodnejší a stabilnejší pri vysvetľovaní rozdielov v indexe šťastia v roku 2019.

Dnešné zadanie ma trocha potrápilo, ale som rada, že sa učím nové veci.
Ďakujem a teším sa na ďalšie.
Vanessa