# STATR 520 Final Project Proposal: Analyzing Factors Influencing Happiness Using the World Happiness Report Dataset ### Garrett Jackson

Research Question + Data Description

What are the most significant factors influencing happiness across different countries

The World Happiness Report dataset from Kaggle contains information about the happiness levels of countries worldwide, based on a variety of indicators. This data was collected in 2024, and key variables in the dataset include:

The dataset also includes country names, regions, and other demographic details, providing a rich source for understanding happiness on a global scale.

# import happiness data
happiness_data <- read_csv("World-happiness-report-2024.csv")
## Rows: 143 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Country name, Regional indicator
## dbl (10): Ladder_score, upperwhisker, lowerwhisker, Log_GDP_per_capita, Soci...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# see data variables and types
str(happiness_data)
## spc_tbl_ [143 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Country name                : chr [1:143] "Finland" "Denmark" "Iceland" "Sweden" ...
##  $ Regional indicator          : chr [1:143] "Western Europe" "Western Europe" "Western Europe" "Western Europe" ...
##  $ Ladder_score                : num [1:143] 7.74 7.58 7.53 7.34 7.34 ...
##  $ upperwhisker                : num [1:143] 7.82 7.67 7.62 7.42 7.41 ...
##  $ lowerwhisker                : num [1:143] 7.67 7.5 7.43 7.27 7.28 ...
##  $ Log_GDP_per_capita          : num [1:143] 1.84 1.91 1.88 1.88 1.8 ...
##  $ Social_support              : num [1:143] 1.57 1.52 1.62 1.5 1.51 ...
##  $ Healthy_life_expectancy     : num [1:143] 0.695 0.699 0.718 0.724 0.74 0.706 0.704 0.708 0.747 0.692 ...
##  $ Freedom_to_make_life_choices: num [1:143] 0.859 0.823 0.819 0.838 0.641 0.725 0.835 0.801 0.759 0.756 ...
##  $ Generosity                  : num [1:143] 0.142 0.204 0.258 0.221 0.153 0.247 0.224 0.146 0.173 0.225 ...
##  $ Perceptions_of_corruption   : num [1:143] 0.546 0.548 0.182 0.524 0.193 0.372 0.484 0.432 0.498 0.323 ...
##  $ Dystopia + residual         : num [1:143] 2.08 1.88 2.05 1.66 2.3 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Country name` = col_character(),
##   ..   `Regional indicator` = col_character(),
##   ..   Ladder_score = col_double(),
##   ..   upperwhisker = col_double(),
##   ..   lowerwhisker = col_double(),
##   ..   Log_GDP_per_capita = col_double(),
##   ..   Social_support = col_double(),
##   ..   Healthy_life_expectancy = col_double(),
##   ..   Freedom_to_make_life_choices = col_double(),
##   ..   Generosity = col_double(),
##   ..   Perceptions_of_corruption = col_double(),
##   ..   `Dystopia + residual` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Data Preparation

# summary table of data
describe(happiness_data)
##                              vars   n  mean    sd median trimmed   mad   min
## Country name*                   1 143 72.00 41.42  72.00   72.00 53.37  1.00
## Regional indicator*             2 143  6.08  3.15   6.00    6.22  4.45  1.00
## Ladder_score                    3 143  5.53  1.17   5.78    5.59  1.21  1.72
## upperwhisker                    4 143  5.64  1.16   5.89    5.71  1.19  1.77
## lowerwhisker                    5 143  5.41  1.19   5.67    5.48  1.24  1.67
## Log_GDP_per_capita              6 140  1.38  0.43   1.43    1.40  0.50  0.00
## Social_support                  7 140  1.13  0.33   1.24    1.17  0.30  0.00
## Healthy_life_expectancy         8 140  0.52  0.16   0.55    0.53  0.17  0.00
## Freedom_to_make_life_choices    9 140  0.62  0.16   0.64    0.64  0.15  0.00
## Generosity                     10 140  0.15  0.07   0.14    0.14  0.07  0.00
## Perceptions_of_corruption      11 140  0.15  0.13   0.12    0.13  0.09  0.00
## Dystopia + residual            12 140  1.58  0.54   1.64    1.60  0.39 -0.07
##                                 max  range  skew kurtosis   se
## Country name*                143.00 142.00  0.00    -1.23 3.46
## Regional indicator*           10.00   9.00 -0.25    -1.41 0.26
## Ladder_score                   7.74   6.02 -0.51    -0.26 0.10
## upperwhisker                   7.82   6.04 -0.54    -0.18 0.10
## lowerwhisker                   7.67   6.00 -0.49    -0.32 0.10
## Log_GDP_per_capita             2.14   2.14 -0.50    -0.42 0.04
## Social_support                 1.62   1.62 -0.97     0.40 0.03
## Healthy_life_expectancy        0.86   0.86 -0.53    -0.44 0.01
## Freedom_to_make_life_choices   0.86   0.86 -1.00     1.16 0.01
## Generosity                     0.40   0.40  0.65     0.72 0.01
## Perceptions_of_corruption      0.58   0.58  1.49     1.83 0.01
## Dystopia + residual            3.00   3.07 -0.59     0.68 0.05
# check for NA values
colSums(is.na(happiness_data))
##                 Country name           Regional indicator 
##                            0                            0 
##                 Ladder_score                 upperwhisker 
##                            0                            0 
##                 lowerwhisker           Log_GDP_per_capita 
##                            0                            3 
##               Social_support      Healthy_life_expectancy 
##                            3                            3 
## Freedom_to_make_life_choices                   Generosity 
##                            3                            3 
##    Perceptions_of_corruption          Dystopia + residual 
##                            3                            3
# remove NA values
happiness_data <- happiness_data %>%
  filter(!is.na(Social_support))

# distribution of happiness score
ggplot(happiness_data, aes(x = Ladder_score)) + geom_histogram(binwidth = .3, fill = "navyblue") +
  labs(title = "Happiness Score by Country",
       x = "Happiness Score",
       y = "Count") +
  theme_minimal()

The happiness score data seems to be left skewed. To confirm this, we can look at the normality of the data using a shapiro.test

# test normality of happiness data
shapiro.test(happiness_data$Ladder_score)
## 
##  Shapiro-Wilk normality test
## 
## data:  happiness_data$Ladder_score
## W = 0.9702, p-value = 0.003735

Here we see that the p-value is < 0.05, confirming that the distribution is NOT normal. In order to normalize it, I will perform a square transformation

# perform square transformation on happiness
happiness_data <- happiness_data %>%
  mutate(sq_ladder = Ladder_score^2)

# rerun shapiro test
shapiro.test(happiness_data$sq_ladder)
## 
##  Shapiro-Wilk normality test
## 
## data:  happiness_data$sq_ladder
## W = 0.98423, p-value = 0.1079

We see a much better p-value now, suggesting our data is now normally distributed.

Model Selection

Since I want to examine factors that influence happiness across different countries, I will first create a base model that includes all of the predictive variables. Depending on the summary, I will the create a second, refined model – both times using a multiple linear regression model.

# convert `Regional indicator` to a factor
# happiness_data$Region <- as.factor(happiness_data$`Regional indicator`)

# create a base multiple linear regression model
base_model <- lm(sq_ladder ~ Log_GDP_per_capita + Social_support +
                          Healthy_life_expectancy + Freedom_to_make_life_choices + Generosity + 
                          Perceptions_of_corruption, data = happiness_data)

# check model output
summary(base_model)
## 
## Call:
## lm(formula = sq_ladder ~ Log_GDP_per_capita + Social_support + 
##     Healthy_life_expectancy + Freedom_to_make_life_choices + 
##     Generosity + Perceptions_of_corruption, data = happiness_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.4720  -2.5959   0.6752   3.4706  10.3074 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -13.311      2.152  -6.186 7.09e-09 ***
## Log_GDP_per_capita              5.571      2.171   2.567 0.011378 *  
## Social_support                 14.403      2.200   6.546 1.17e-09 ***
## Healthy_life_expectancy        12.977      5.140   2.524 0.012762 *  
## Freedom_to_make_life_choices   17.901      3.359   5.329 4.11e-07 ***
## Generosity                      6.021      6.598   0.913 0.363089    
## Perceptions_of_corruption      16.340      4.257   3.839 0.000191 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.336 on 133 degrees of freedom
## Multiple R-squared:  0.8245, Adjusted R-squared:  0.8165 
## F-statistic: 104.1 on 6 and 133 DF,  p-value: < 2.2e-16
# calculate VIF for each predictor
vif_values <- vif(base_model)
print(vif_values)
##           Log_GDP_per_capita               Social_support 
##                     4.157293                     2.625614 
##      Healthy_life_expectancy Freedom_to_make_life_choices 
##                     3.508979                     1.454670 
##                   Generosity    Perceptions_of_corruption 
##                     1.146337                     1.409850

The base model has an adjusted R^2 of .82, which is very strong, and there does not appear to be any collinearity among predictive variables. However, we do see the Generosity does not seem to impact the model. What if we remove it?

# create an interaction model to assess regional effects
refined_model <- lm(sq_ladder ~ Log_GDP_per_capita + Social_support +
                          Healthy_life_expectancy + Freedom_to_make_life_choices  + 
                          Perceptions_of_corruption, data = happiness_data)

# check model output
summary(refined_model)
## 
## Call:
## lm(formula = sq_ladder ~ Log_GDP_per_capita + Social_support + 
##     Healthy_life_expectancy + Freedom_to_make_life_choices + 
##     Perceptions_of_corruption, data = happiness_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.3980  -2.3984   0.7247   3.7554  10.2711 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   -12.657      2.028  -6.242 5.29e-09 ***
## Log_GDP_per_capita              5.117      2.112   2.423   0.0167 *  
## Social_support                 14.669      2.179   6.731 4.50e-10 ***
## Healthy_life_expectancy        13.125      5.135   2.556   0.0117 *  
## Freedom_to_make_life_choices   18.472      3.298   5.600 1.17e-07 ***
## Perceptions_of_corruption      17.118      4.168   4.107 6.93e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.332 on 134 degrees of freedom
## Multiple R-squared:  0.8234, Adjusted R-squared:  0.8168 
## F-statistic: 124.9 on 5 and 134 DF,  p-value: < 2.2e-16

The refined model has the same R^2 value, and all predictive values are statistically significant.

Model Comparison Approach

Because the two models have identical R^2 values, I will use a Likelihood Ratio Test and AIC/BIC to determine the best model.

# perform likelihood ratio test
lrt <- lrtest(base_model, refined_model)
print(lrt)
## Likelihood ratio test
## 
## Model 1: sq_ladder ~ Log_GDP_per_capita + Social_support + Healthy_life_expectancy + 
##     Freedom_to_make_life_choices + Generosity + Perceptions_of_corruption
## Model 2: sq_ladder ~ Log_GDP_per_capita + Social_support + Healthy_life_expectancy + 
##     Freedom_to_make_life_choices + Perceptions_of_corruption
##   #Df  LogLik Df Chisq Pr(>Chisq)
## 1   8 -429.48                    
## 2   7 -429.92 -1 0.874     0.3499

The likelihood ratio test indicates that removing Generosity does not improves the model (p-value > 0.05).

# compute AIC values for each model
AIC_model_1 <- AIC(base_model)
AIC_model_2 <- AIC(refined_model)

# print AIC outputs
cat("AIC for Model 1:", AIC_model_1, "\n")
## AIC for Model 1: 874.964
cat("AIC for Model 2:", AIC_model_2, "\n")
## AIC for Model 2: 873.838

The AIC scores are virtually identical.

# compute BIC values for each model
BIC_model_1 <- BIC(base_model)
BIC_model_2 <- BIC(refined_model)

# print BIC outputs
cat("BIC for Model 1:", AIC_model_1, "\n")
## BIC for Model 1: 874.964
cat("BIC for Model 2:", AIC_model_2, "\n")
## BIC for Model 2: 873.838

Again, we see similar scores for BIC. Because these are similar, we will rely on the LRT to determine that the best model is the base model.

Statistical Test

Now that we’ve identifed the base model as the best MLR, I want to conduct an F-test to validate that predictive factors have varying levels of influence on overall happiness.

Null hypothesis: All predictive factors have the same influence on overall happiness Alt. hypothesis: All predictive factors do not have the same influence on overall happiness (i.e., some are stronger indicators than are others)

In order to conduct this F-test, I will use linearHypothesis() to test equal influence.

# use a test of equal influence
linearHypothesis(base_model, c("Log_GDP_per_capita = Social_support",
                                       "Social_support = Healthy_life_expectancy",
                                       "Healthy_life_expectancy = Freedom_to_make_life_choices",
                                       "Freedom_to_make_life_choices = Generosity",
                                       "Generosity = Perceptions_of_corruption"))
## 
## Linear hypothesis test:
## Log_GDP_per_capita - Social_support = 0
## Social_support - Healthy_life_expectancy = 0
## Healthy_life_expectancy - Freedom_to_make_life_choices = 0
## Freedom_to_make_life_choices - Generosity = 0
## Generosity - Perceptions_of_corruption = 0
## 
## Model 1: restricted model
## Model 2: sq_ladder ~ Log_GDP_per_capita + Social_support + Healthy_life_expectancy + 
##     Freedom_to_make_life_choices + Generosity + Perceptions_of_corruption
## 
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    138 4154.5                              
## 2    133 3786.6  5    367.89 2.5843 0.02889 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since the p-value < 0.05, we have enough evidence to reject the null hypothesis and conclude that at least one predictor has a significantly different influence on happiness.

Shiny App

# create a shiny app

# ensure Region is a factor
happiness_data$Region <- as.factor(happiness_data$`Regional indicator`)

# define UI
ui <- fluidPage(
  titlePanel("World Happiness Report: Regional Analysis"),
  
  sidebarLayout(
    sidebarPanel(
      selectInput("selected_region", "Choose a Region:", 
                  choices = unique(happiness_data$Region), 
                  selected = unique(happiness_data$Region)[1])
    ),
    
    mainPanel(
      plotOutput("regressionPlot"),
      verbatimTextOutput("r2Value"),  # Add R² value below the plot
      plotOutput("barPlot")
    )
  )
)

# define server logic
server <- function(input, output) {
  
  # filter data based on selected region
  filtered_data <- reactive({
    happiness_data %>% filter(Region == input$selected_region)
  })
  
  # regression Plot: Social_support vs Happiness Score
  output$regressionPlot <- renderPlot({
    data <- filtered_data()
    
    # fit linear model
    model <- lm(Ladder_score ~ Social_support, data = data)
    r_squared <- summary(model)$r.squared  # Extract R² value
    
    # create plot
    ggplot(data, aes(x = Social_support, y = Ladder_score)) +
      geom_point() +
      geom_smooth(method = "lm", se = FALSE, color = "red") +
      annotate("text", x = min(data$Social_support, na.rm = TRUE), 
               y = min(data$Ladder_score, na.rm = TRUE), 
               label = paste("R² =", round(r_squared, 3)), 
               hjust = 0, vjust = -1, size = 5, color = "blue") +
      labs(title = paste("Social Support vs. Happiness Score in", input$selected_region),
           x = "Social Support", y = "Happiness Score") +
      theme_minimal()
  })
  
  # display R² value 
  output$r2Value <- renderText({
    data <- filtered_data()
    model <- lm(Ladder_score ~ Social_support, data = data)
    r_squared <- summary(model)$r.squared
    paste("R² Value:", round(r_squared, 3))
  })
}


# run app
shinyApp(ui = ui, server = server)
Shiny applications not supported in static R Markdown documents