library(tidyverse)
library(GGally)
library(MLmetrics)
library(lmtest)
library(car)
options(scipen = 999)
happiness <- read.csv("world-happiness-report-2021--.csv")
glimpse(happiness)
#> Rows: 149
#> Columns: 20
#> $ Country.name <chr> "Finland", "Denmark", "Swit~
#> $ Regional.indicator <chr> "Western Europe", "Western ~
#> $ Ladder.score <dbl> 7.842, 7.620, 7.571, 7.554,~
#> $ Standard.error.of.ladder.score <dbl> 0.032, 0.035, 0.036, 0.059,~
#> $ upperwhisker <dbl> 7.904, 7.687, 7.643, 7.670,~
#> $ lowerwhisker <dbl> 7.780, 7.552, 7.500, 7.438,~
#> $ Logged.GDP.per.capita <dbl> 10.775, 10.933, 11.117, 10.~
#> $ Social.support <dbl> 0.954, 0.954, 0.942, 0.983,~
#> $ Healthy.life.expectancy <dbl> 72.000, 72.700, 74.400, 73.~
#> $ Freedom.to.make.life.choices <dbl> 0.949, 0.946, 0.919, 0.955,~
#> $ Generosity <dbl> -0.098, 0.030, 0.025, 0.160~
#> $ Perceptions.of.corruption <dbl> 0.186, 0.179, 0.292, 0.673,~
#> $ Ladder.score.in.Dystopia <dbl> 2.43, 2.43, 2.43, 2.43, 2.4~
#> $ Explained.by..Log.GDP.per.capita <dbl> 1.446, 1.502, 1.566, 1.482,~
#> $ Explained.by..Social.support <dbl> 1.106, 1.108, 1.079, 1.172,~
#> $ Explained.by..Healthy.life.expectancy <dbl> 0.741, 0.763, 0.816, 0.772,~
#> $ Explained.by..Freedom.to.make.life.choices <dbl> 0.691, 0.686, 0.653, 0.698,~
#> $ Explained.by..Generosity <dbl> 0.124, 0.208, 0.204, 0.293,~
#> $ Explained.by..Perceptions.of.corruption <dbl> 0.481, 0.485, 0.413, 0.170,~
#> $ Dystopia...residual <dbl> 3.253, 2.868, 2.839, 2.967,~
Check if there is missing value.
anyNA(happiness)
#> [1] FALSE
Select columns we want to use.
happy <- happiness %>%
select(Ladder.score, Logged.GDP.per.capita, Social.support, Healthy.life.expectancy, Freedom.to.make.life.choices, Generosity, Perceptions.of.corruption)
Visualize data in a boxplot to see if there is any outlier.
boxplot(happy)
Note:
There is some outlier in the data, but we keep going for now.
Let’s look at the correlation between variables.
ggcorr(happy, label=TRUE, hjust = 1,layout.exp = 1)
model_some <- lm(Ladder.score ~ Logged.GDP.per.capita + Social.support + Healthy.life.expectancy, happy)
summary(model_some)
#>
#> Call:
#> lm(formula = Ladder.score ~ Logged.GDP.per.capita + Social.support +
#> Healthy.life.expectancy, data = happy)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.81242 -0.40736 0.01624 0.47390 1.31845
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.40583 0.47808 -5.032 0.00000141 ***
#> Logged.GDP.per.capita 0.27335 0.09419 2.902 0.00429 **
#> Social.support 3.00051 0.70315 4.267 0.00003555 ***
#> Healthy.life.expectancy 0.04486 0.01447 3.101 0.00232 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.6013 on 145 degrees of freedom
#> Multiple R-squared: 0.6928, Adjusted R-squared: 0.6865
#> F-statistic: 109 on 3 and 145 DF, p-value: < 0.00000000000000022
Model Interpretation:
Making regression model with all variable predictor and without variable predictor as preparation for stepwise regression.
model_none <- lm(Ladder.score ~ 1, happy)
model_all <- lm(Ladder.score ~., happy)
model_backward <- step(model_all, direction = "backward")
#> Start: AIC=-175.83
#> Ladder.score ~ Logged.GDP.per.capita + Social.support + Healthy.life.expectancy +
#> Freedom.to.make.life.choices + Generosity + Perceptions.of.corruption
#>
#> Df Sum of Sq RSS AIC
#> - Generosity 1 0.3777 42.052 -176.49
#> <none> 41.674 -175.84
#> - Perceptions.of.corruption 1 1.2732 42.948 -173.35
#> - Healthy.life.expectancy 1 1.5170 43.191 -172.51
#> - Logged.GDP.per.capita 1 3.0409 44.715 -167.34
#> - Social.support 1 4.0301 45.705 -164.08
#> - Freedom.to.make.life.choices 1 4.8452 46.520 -161.45
#>
#> Step: AIC=-176.49
#> Ladder.score ~ Logged.GDP.per.capita + Social.support + Healthy.life.expectancy +
#> Freedom.to.make.life.choices + Perceptions.of.corruption
#>
#> Df Sum of Sq RSS AIC
#> <none> 42.052 -176.49
#> - Healthy.life.expectancy 1 1.4290 43.481 -173.51
#> - Perceptions.of.corruption 1 1.6089 43.661 -172.90
#> - Logged.GDP.per.capita 1 2.7815 44.834 -168.95
#> - Social.support 1 4.1366 46.189 -164.51
#> - Freedom.to.make.life.choices 1 5.7233 47.775 -159.48
summary(model_backward)
#>
#> Call:
#> lm(formula = Ladder.score ~ Logged.GDP.per.capita + Social.support +
#> Healthy.life.expectancy + Freedom.to.make.life.choices +
#> Perceptions.of.corruption, data = happy)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.93303 -0.29768 0.06863 0.33924 1.02304
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.11039 0.62112 -3.398 0.000880 ***
#> Logged.GDP.per.capita 0.26400 0.08584 3.075 0.002518 **
#> Social.support 2.50670 0.66835 3.751 0.000256 ***
#> Healthy.life.expectancy 0.02936 0.01332 2.204 0.029095 *
#> Freedom.to.make.life.choices 2.13266 0.48342 4.412 0.0000201 ***
#> Perceptions.of.corruption -0.66778 0.28549 -2.339 0.020718 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.5423 on 143 degrees of freedom
#> Multiple R-squared: 0.7536, Adjusted R-squared: 0.745
#> F-statistic: 87.49 on 5 and 143 DF, p-value: < 0.00000000000000022
Model Interpretation:
model_forward <- step(model_none,
scope = list(upper = model_all, lower = model_none),
direction = "forward")
#> Start: AIC=22.25
#> Ladder.score ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + Logged.GDP.per.capita 1 106.463 64.227 -121.386
#> + Healthy.life.expectancy 1 100.703 69.987 -108.590
#> + Social.support 1 97.785 72.905 -102.503
#> + Freedom.to.make.life.choices 1 63.047 107.643 -44.443
#> + Perceptions.of.corruption 1 30.273 140.417 -4.840
#> <none> 170.690 22.250
#> + Generosity 1 0.054 170.636 24.202
#>
#> Step: AIC=-121.39
#> Ladder.score ~ Logged.GDP.per.capita
#>
#> Df Sum of Sq RSS AIC
#> + Freedom.to.make.life.choices 1 14.8894 49.338 -158.68
#> + Social.support 1 8.3203 55.907 -140.06
#> + Healthy.life.expectancy 1 5.2124 59.015 -132.00
#> + Perceptions.of.corruption 1 4.3955 59.832 -129.95
#> + Generosity 1 3.4635 60.764 -127.65
#> <none> 64.227 -121.39
#>
#> Step: AIC=-158.68
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices
#>
#> Df Sum of Sq RSS AIC
#> + Social.support 1 3.8373 45.500 -168.75
#> + Healthy.life.expectancy 1 2.4344 46.903 -164.22
#> + Perceptions.of.corruption 1 0.9758 48.362 -159.66
#> <none> 49.338 -158.68
#> + Generosity 1 0.6055 48.732 -158.52
#>
#> Step: AIC=-168.75
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support
#>
#> Df Sum of Sq RSS AIC
#> + Perceptions.of.corruption 1 2.0194 43.481 -173.51
#> + Healthy.life.expectancy 1 1.8395 43.661 -172.90
#> + Generosity 1 0.6249 44.876 -168.81
#> <none> 45.500 -168.75
#>
#> Step: AIC=-173.51
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support + Perceptions.of.corruption
#>
#> Df Sum of Sq RSS AIC
#> + Healthy.life.expectancy 1 1.42899 42.052 -176.49
#> <none> 43.481 -173.51
#> + Generosity 1 0.28967 43.191 -172.51
#>
#> Step: AIC=-176.49
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support + Perceptions.of.corruption + Healthy.life.expectancy
#>
#> Df Sum of Sq RSS AIC
#> <none> 42.052 -176.49
#> + Generosity 1 0.37767 41.674 -175.84
summary(model_forward)
#>
#> Call:
#> lm(formula = Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support + Perceptions.of.corruption + Healthy.life.expectancy,
#> data = happy)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.93303 -0.29768 0.06863 0.33924 1.02304
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.11039 0.62112 -3.398 0.000880 ***
#> Logged.GDP.per.capita 0.26400 0.08584 3.075 0.002518 **
#> Freedom.to.make.life.choices 2.13266 0.48342 4.412 0.0000201 ***
#> Social.support 2.50670 0.66835 3.751 0.000256 ***
#> Perceptions.of.corruption -0.66778 0.28549 -2.339 0.020718 *
#> Healthy.life.expectancy 0.02936 0.01332 2.204 0.029095 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.5423 on 143 degrees of freedom
#> Multiple R-squared: 0.7536, Adjusted R-squared: 0.745
#> F-statistic: 87.49 on 5 and 143 DF, p-value: < 0.00000000000000022
Model Interpretation:
model_both <- step(model_none,
scope = list(upper = model_all), data = happy,
direction="both")
#> Start: AIC=22.25
#> Ladder.score ~ 1
#>
#> Df Sum of Sq RSS AIC
#> + Logged.GDP.per.capita 1 106.463 64.227 -121.386
#> + Healthy.life.expectancy 1 100.703 69.987 -108.590
#> + Social.support 1 97.785 72.905 -102.503
#> + Freedom.to.make.life.choices 1 63.047 107.643 -44.443
#> + Perceptions.of.corruption 1 30.273 140.417 -4.840
#> <none> 170.690 22.250
#> + Generosity 1 0.054 170.636 24.202
#>
#> Step: AIC=-121.39
#> Ladder.score ~ Logged.GDP.per.capita
#>
#> Df Sum of Sq RSS AIC
#> + Freedom.to.make.life.choices 1 14.889 49.338 -158.68
#> + Social.support 1 8.320 55.907 -140.06
#> + Healthy.life.expectancy 1 5.212 59.015 -132.00
#> + Perceptions.of.corruption 1 4.395 59.832 -129.95
#> + Generosity 1 3.463 60.764 -127.65
#> <none> 64.227 -121.39
#> - Logged.GDP.per.capita 1 106.463 170.690 22.25
#>
#> Step: AIC=-158.68
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices
#>
#> Df Sum of Sq RSS AIC
#> + Social.support 1 3.837 45.500 -168.747
#> + Healthy.life.expectancy 1 2.434 46.903 -164.223
#> + Perceptions.of.corruption 1 0.976 48.362 -159.660
#> <none> 49.338 -158.683
#> + Generosity 1 0.606 48.732 -158.523
#> - Freedom.to.make.life.choices 1 14.889 64.227 -121.386
#> - Logged.GDP.per.capita 1 58.306 107.643 -44.443
#>
#> Step: AIC=-168.75
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support
#>
#> Df Sum of Sq RSS AIC
#> + Perceptions.of.corruption 1 2.0193 43.481 -173.51
#> + Healthy.life.expectancy 1 1.8395 43.661 -172.90
#> + Generosity 1 0.6249 44.876 -168.81
#> <none> 45.500 -168.75
#> - Social.support 1 3.8373 49.338 -158.68
#> - Freedom.to.make.life.choices 1 10.4064 55.907 -140.06
#> - Logged.GDP.per.capita 1 14.3434 59.844 -129.92
#>
#> Step: AIC=-173.51
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support + Perceptions.of.corruption
#>
#> Df Sum of Sq RSS AIC
#> + Healthy.life.expectancy 1 1.4290 42.052 -176.49
#> <none> 43.481 -173.51
#> + Generosity 1 0.2897 43.191 -172.51
#> - Perceptions.of.corruption 1 2.0193 45.500 -168.75
#> - Social.support 1 4.8808 48.362 -159.66
#> - Freedom.to.make.life.choices 1 6.4842 49.965 -154.80
#> - Logged.GDP.per.capita 1 10.3744 53.856 -143.63
#>
#> Step: AIC=-176.49
#> Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support + Perceptions.of.corruption + Healthy.life.expectancy
#>
#> Df Sum of Sq RSS AIC
#> <none> 42.052 -176.49
#> + Generosity 1 0.3777 41.674 -175.84
#> - Healthy.life.expectancy 1 1.4290 43.481 -173.51
#> - Perceptions.of.corruption 1 1.6089 43.661 -172.90
#> - Logged.GDP.per.capita 1 2.7815 44.834 -168.95
#> - Social.support 1 4.1366 46.189 -164.51
#> - Freedom.to.make.life.choices 1 5.7233 47.775 -159.48
summary(model_both)
#>
#> Call:
#> lm(formula = Ladder.score ~ Logged.GDP.per.capita + Freedom.to.make.life.choices +
#> Social.support + Perceptions.of.corruption + Healthy.life.expectancy,
#> data = happy)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -1.93303 -0.29768 0.06863 0.33924 1.02304
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -2.11039 0.62112 -3.398 0.000880 ***
#> Logged.GDP.per.capita 0.26400 0.08584 3.075 0.002518 **
#> Freedom.to.make.life.choices 2.13266 0.48342 4.412 0.0000201 ***
#> Social.support 2.50670 0.66835 3.751 0.000256 ***
#> Perceptions.of.corruption -0.66778 0.28549 -2.339 0.020718 *
#> Healthy.life.expectancy 0.02936 0.01332 2.204 0.029095 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.5423 on 143 degrees of freedom
#> Multiple R-squared: 0.7536, Adjusted R-squared: 0.745
#> F-statistic: 87.49 on 5 and 143 DF, p-value: < 0.00000000000000022
Model Interpretation:
Freedom to make life choices, since it has the smallest P-value of 0.0000201.RMSE(model_some$fitted.values, happy$Ladder.score)
#> [1] 0.5931968
RMSE(model_both$fitted.values, happy$Ladder.score)
#> [1] 0.5312525
RMSE(model_all$fitted.values, happy$Ladder.score)
#> [1] 0.5288616
prediction <- predict(object = model_both, newdata = happy)
head(predict(model_all, newdata = happy,
level = 0.95, interval = "prediction"))
#> fit lwr upr
#> 1 7.079317 5.964311 8.194323
#> 2 7.189548 6.079186 8.299910
#> 3 7.138321 6.040998 8.235645
#> 4 7.021626 5.930484 8.112769
#> 5 7.040740 5.942746 8.138735
#> 6 7.237320 6.136644 8.337995
model_bothhist(model_both$residuals)
shapiro.test(model_both$residuals)
#>
#> Shapiro-Wilk normality test
#>
#> data: model_both$residuals
#> W = 0.97079, p-value = 0.002893
Shapiro-Wilk hypothesis test:
Insight:
model_some to check whether that model is accepted by statistical rules.model_somehist(model_some$residuals)
shapiro.test(model_some$residuals)
#>
#> Shapiro-Wilk normality test
#>
#> data: model_some$residuals
#> W = 0.9924, p-value = 0.6143
Insight:
model_some is distributed normally because P-value < 0.05Breusch-Pagan hypothesis test:
bptest(model_some)
#>
#> studentized Breusch-Pagan test
#>
#> data: model_some
#> BP = 6.0644, df = 3, p-value = 0.1085
Homoscedasticity Plot:
plot(model_some$fitted.values, model_some$residuals)
abline(h = 0, col = "red")
Insight:
vif(model_some)
#> Logged.GDP.per.capita Social.support Healthy.life.expectancy
#> 4.874626 2.671151 3.917951
Insight:
model_all which using all variable predictor, because this model have the smaller error than other model, which is 0.5288616.model_all is the best model in terms of smallest error, but model_some is the one that fulfill statistical asumptions. This model only have Adjusted R-square of 0.6865.model_some are variables that have strong correlation with target variable (> 0.7), which is Logged GDP per capita, Social support, and Healthy life expectancy.