V tomto cvičení už používam inú databázu ako minule, a to:
World Happiness Report 2019
(https://www.kaggle.com/datasets/unsdsn/world-happiness)
Dataset som si stiahla vo formáte CSV a vložila do zložky udaje pod názvom:
udaje/2019.csv
Dataset obsahuje údaje o indexe šťastia a rôznych socio-ekonomických
ukazovateľoch
(HDP na obyvateľa, sociálna opora, zdravá dĺžka života a pod.).
Budem analyzovať premenné:
Moja hypotéza: vyšší HDP a sociálna opora zvyšujú hodnotu indexu šťastia.
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lmtest)
library(sandwich)
library(car)
## Loading required package: carData
library(ggplot2)
library(patchwork)
rm(list = ls())
udaje_raw <- read.csv("udaje/2019.csv",
dec = ".", sep = ",", header = TRUE)
names(udaje_raw) <- make.names(names(udaje_raw))
udaje <- udaje_raw[,
c("Score", "GDP.per.capita", "Social.support", "Healthy.life.expectancy")
]
summary(udaje)
## Score GDP.per.capita Social.support Healthy.life.expectancy
## Min. :2.853 Min. :0.0000 Min. :0.000 Min. :0.0000
## 1st Qu.:4.545 1st Qu.:0.6028 1st Qu.:1.056 1st Qu.:0.5477
## Median :5.380 Median :0.9600 Median :1.272 Median :0.7890
## Mean :5.407 Mean :0.9051 Mean :1.209 Mean :0.7252
## 3rd Qu.:6.184 3rd Qu.:1.2325 3rd Qu.:1.452 3rd Qu.:0.8818
## Max. :7.769 Max. :1.6840 Max. :1.624 Max. :1.1410
udaje$GDP.per.capita[udaje$GDP.per.capita <= 0] <- NA
column_medians <- sapply(udaje, median, na.rm = TRUE)
udaje_imputed <- udaje
for(col in names(udaje_imputed)){
udaje_imputed[[col]][is.na(udaje_imputed[[col]])] <- column_medians[col]
}
udaje <- udaje_imputed
summary(udaje)
## Score GDP.per.capita Social.support Healthy.life.expectancy
## Min. :2.853 Min. :0.0260 Min. :0.000 Min. :0.0000
## 1st Qu.:4.545 1st Qu.:0.6170 1st Qu.:1.056 1st Qu.:0.5477
## Median :5.380 Median :0.9600 Median :1.272 Median :0.7890
## Mean :5.407 Mean :0.9113 Mean :1.209 Mean :0.7252
## 3rd Qu.:6.184 3rd Qu.:1.2325 3rd Qu.:1.452 3rd Qu.:0.8818
## Max. :7.769 Max. :1.6840 Max. :1.624 Max. :1.1410
par(mfrow = c(2,2))
par(mar = c(4,4,2,1))
for(col in names(udaje)){
boxplot(udaje[[col]], main = col, col = "lightblue", xlab = "Hodnota")
}
mtext("Boxploty premenných (World Happiness 2019)", outer = TRUE, cex = 1.2, font = 2)
par(mfrow = c(1,1))
Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support
model <- lm(Score ~ Healthy.life.expectancy + GDP.per.capita + Social.support,
data = udaje)
summary(model)
##
## Call:
## lm(formula = Score ~ Healthy.life.expectancy + GDP.per.capita +
## Social.support, data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.72445 -0.41207 -0.04929 0.46687 1.34906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1232 0.2040 10.407 < 2e-16 ***
## Healthy.life.expectancy 1.2294 0.3511 3.501 0.000608 ***
## GDP.per.capita 0.9102 0.2246 4.053 8.06e-05 ***
## Social.support 1.2928 0.2422 5.338 3.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5799 on 152 degrees of freedom
## Multiple R-squared: 0.7338, Adjusted R-squared: 0.7286
## F-statistic: 139.7 on 3 and 152 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(model)
par(mfrow = c(1,1))
residuals_m1 <- residuals(model)
jarque.bera.test(residuals_m1)
##
## Jarque Bera Test
##
## data: residuals_m1
## X-squared = 0.96354, df = 2, p-value = 0.6177
outlierTest(model)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 148 -3.101834 0.0022958 0.35814
Score ~ log(GDP.per.capita) + Social.support
model2 <- lm(Score ~ I(log(GDP.per.capita)) + Social.support,
data = udaje)
summary(model2)
##
## Call:
## lm(formula = Score ~ I(log(GDP.per.capita)) + Social.support,
## data = udaje)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.91457 -0.40283 -0.00706 0.42891 1.81033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.9413 0.3480 8.452 2.10e-14 ***
## I(log(GDP.per.capita)) 0.4572 0.1180 3.875 0.000158 ***
## Social.support 2.1336 0.2662 8.015 2.63e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.673 on 153 degrees of freedom
## Multiple R-squared: 0.6392, Adjusted R-squared: 0.6345
## F-statistic: 135.5 on 2 and 153 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(model2)
par(mfrow = c(1,1))
residuals_m2 <- residuals(model2)
jarque.bera.test(residuals_m2)
##
## Jarque Bera Test
##
## data: residuals_m2
## X-squared = 0.11815, df = 2, p-value = 0.9426
outlierTest(model2)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 155 3.034503 0.0028348 0.44223
udaje$resid_m1 <- residuals_m1
udaje$resid_m2 <- residuals_m2
p1 <- ggplot(udaje, aes(x = GDP.per.capita, y = resid_m1^2)) +
geom_point(alpha=.6) +
geom_smooth(method="loess", se=FALSE, color="red")
p2 <- ggplot(udaje, aes(x = Social.support, y = resid_m1^2)) +
geom_point(alpha=.6) +
geom_smooth(method="loess", se=FALSE, color="red")
p1 + p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
p3 <- ggplot(udaje, aes(x = log(GDP.per.capita), y = resid_m2^2)) +
geom_point(alpha=.6) +
geom_smooth(method="loess", se=FALSE, color="red")
p4 <- ggplot(udaje, aes(x = Social.support, y = resid_m2^2)) +
geom_point(alpha=.6) +
geom_smooth(method="loess", se=FALSE, color="red")
p3 + p4
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1.5443, df = 3, p-value = 0.6721
bptest(model2)
##
## studentized Breusch-Pagan test
##
## data: model2
## BP = 5.6484, df = 2, p-value = 0.05935
coeftest(model, vcov = vcovHC(model))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.12317 0.24599 8.6311 7.615e-15 ***
## Healthy.life.expectancy 1.22942 0.35163 3.4963 0.0006186 ***
## GDP.per.capita 0.91023 0.21025 4.3292 2.703e-05 ***
## Social.support 1.29284 0.25518 5.0663 1.161e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Z výsledkov vyplýva, že najväčší vplyv na úroveň šťastia majú
premenné GDP per capita a Social
support, ktoré sú v oboch modeloch štatisticky významné.
Premenná Healthy life expectancy nemala jednoznačný
efekt, preto som ju v druhom modeli vynechala.
Logaritmická transformácia HDP zlepšila diagnostiku modelu a znížila
vplyv extrémnych hodnôt.
Druhý model sa preto ukázal ako vhodnejší a stabilnejší
pri vysvetľovaní rozdielov v indexe šťastia v roku 2019.
Dnešné zadanie ma trocha potrápilo, ale som rada, že sa učím nové
veci.
Ďakujem a teším sa na ďalšie.
Vanessa