Initial set-up

# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "MASS", "kableExtra", "flextable", "equatiomatic") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## 
## Loading required package: grid
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: survival
## 
## 
## Attaching package: 'survey'
## 
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## 
## Attaching package: 'MASS'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "MASS"         "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "kableExtra"   "MASS"         "survey"       "survival"     "Matrix"      
##  [6] "grid"         "effects"      "carData"      "modelsummary" "fst"         
## [11] "infer"        "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "flextable"    "kableExtra"   "MASS"         "survey"       "survival"    
##  [6] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [11] "fst"          "infer"        "lubridate"    "forcats"      "stringr"     
## [16] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [21] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "equatiomatic" "flextable"    "kableExtra"   "MASS"         "survey"      
##  [6] "survival"     "Matrix"       "grid"         "effects"      "carData"     
## [11] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"
tinytex::install_tinytex(force = TRUE)
## The directory /usr/local/bin is not writable. I recommend that you make it writable. See https://github.com/rstudio/tinytex/issues/24 for more info.
## tlmgr install multirow

Loading the ESS dataset

ess <- read_fst("~/Desktop/SOC202 Final Project/All-ESS-Data.fst")

Filtering and recoding

Variable Info:

how happy are you (outcome): https://ess.sikt.no/en/datafile/b2b0bf39-176b-4eca-8d26-3c05ea83d2cb/266?tab=1&elements=[%22a5c95af9-6037-4e52-9156-2ceac708eb3f/4%22]

highest level of education (main predictor): https://ess.sikt.no/en/datafile/2dc0634d-eb82-47c1-9e7a-83c9ec2e9130/6?tab=1&elements=[%22649fc6a7-858f-4743-955d-20a7568f3afb/1%22] https://ess.sikt.no/en/datafile/f37d014a-6958-42d4-b03b-17c29e481d3d/256?tab=1&elements=[%22a7895188-56aa-46c1-93b6-2af6d966b2b5/8%22]

satisfaction with democracy (secondary predictor): https://ess.sikt.no/en/datafile/a93fed5b-3858-4e86-bdae-dfcf5bbc9bf9/23?tab=1&elements=[%22eb1d3374-0cd8-4324-98ad-a36141dc9587/2%22]

#filtering to our country of interest 
finland_data <- ess %>%
  filter(cntry == "FI") %>%
  mutate(
    #cleaning our outcome happy
    happy = ifelse(happy %in% c(77, 88, 99), NA, happy),
    #cleaning our main predictor education level 
     edulvla = edulvla %>%
    na_if(55) %>%
    na_if(77) %>%
    na_if(88) %>%
    na_if(99),

edulvlb = edulvlb %>%
    na_if(5555) %>%
    na_if(7777) %>%
    na_if(8888) %>%
    na_if(9999), 

#changed BA and No BA to a numeric for later coding 
educ.ba = case_when(
    essround < 5 & edulvla == 5 ~ 1,
    essround >= 5 & edulvlb > 600 ~ 1,
    TRUE ~ 0
),
    #cleaning our secondary predictor satisfaction with democracy 
     stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )

# not a bad practice to create a duplicate in case you need to backtrack or use for different purposes
df <- finland_data
#making sure all variables are properly cleaned 
finland_data <- finland_data %>% filter(!is.na(happy))
finland_data <- finland_data %>% filter(!is.na(educ.ba))
finland_data <- finland_data %>% filter(!is.na(stfdem))

table(finland_data$happy)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   18   24   79  158  218  595  759 2577 6716 6036 1831
table(finland_data$educ.ba)
## 
##     0     1 
## 13591  5420
table(finland_data$stfdem)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##  159  202  419  853 1261 2224 2495 4512 4675 1864  347

#Graphs for outcome over time

#putting our data year range from 2002-2020
df$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  df$year[df$essround == i] <- replacements[i]
}
happy_avg <- df %>%
  group_by(year) %>%
  summarize(happy_avg = mean(happy, na.rm = TRUE))

plot_zoom_happy <- ggplot(happy_avg, aes(x = year, y = happy_avg)) +
  geom_point(aes(size = happy_avg, color = happy_avg), alpha = 0.7) + 
  geom_line(aes(group = 1), color = "blue", linetype = "dashed") +  
  labs(title = "Happy Average by Year (Finland)",
       x = "Survey Year",
       y = "Happy Average") +
  theme_minimal() +
  theme(legend.position = "none")

print(plot_zoom_happy)

plot_full_scale_happy <- ggplot(happy_avg, aes(x = year, y = happy_avg)) +
  geom_point(aes(size = happy_avg, color = happy_avg), alpha = 0.7) + 
  geom_line(aes(group = 1), color = "blue", linetype = "dashed") +  
  labs(title = "Happy Average by Year (Finland)",
       x = "Survey Year",
       y = "Happy Average") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_y_continuous(limits = c(0, 10))

print(plot_full_scale_happy)

The two graphs show that the Finnish are consistently highly happy over the near two decade time span.

#need to delete some data to prevent vector memory exhauseted error
rm(finland_data)
rm(plot_full_scale_happy)
rm(plot_zoom_happy)
rm(happy_avg)
rm(df)

#Finland average happiness score compared to other countries

ess$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  ess$year[ess$essround == i] <- replacements[i]
}

ess <- ess %>%
  mutate(
    #cleaning our outcome happy
    happy = ifelse(happy %in% c(77, 88, 99), NA, happy),
    #cleaning our main predictor education level 
    edulvla = case_when(
      essround < 5 & edulvla == 55 ~ NA_real_,
      TRUE ~ edulvla
    ),
    edulvlb = case_when(
      essround >= 5 & edulvlb == 5555 ~ NA_real_,
      TRUE ~ edulvlb
    ),
    educ_level = case_when(
      essround < 5 & edulvla == 5 ~ "BA",
      essround >= 5 & edulvlb > 600 ~ "BA",
      TRUE ~ "No BA"
    ),
    #cleaning our secondary predictor satisfaction with democracy 
     stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )
ess_cleaned <- ess %>%
  mutate(
    happy = happy,
    year = as.factor(year)  # Ensure year is a factor
  ) %>%
  filter(!is.na(happy))

# Calculating yearly averages for each country
yearly_happy_avg <- ess_cleaned %>%
  group_by(cntry, year) %>%
  summarize(avg_happy = mean(happy), .groups = 'drop')

# Subset for specific countries and overall ESS baseline
specific_countries_happy <- yearly_happy_avg %>%
  filter(cntry %in% c("HU", "FI", "ES", "DE", "SE"))

ess_baseline_happy <- ess_cleaned %>%
  group_by(year) %>%
  summarize(avg_happy = mean(happy), .groups = 'drop') %>%
  mutate(cntry = "ESS Baseline")

# Combining specific countries with ESS baseline
combined_data_happy <- rbind(specific_countries_happy, ess_baseline_happy)

# Plotting the spaghetti plot for happy
ggplot(combined_data_happy, aes(x = year, y = avg_happy, group = cntry, color = cntry)) +
  geom_line() +
  scale_color_manual(values = c("HU" = "darkgreen", "FI" = "red", "ES" = "yellow", "DE" = "orange", "SE" = "blue", "ESS Baseline" = "black")) +
  labs(title = "Average Happy Score by Year for Selected Countries and ESS Baseline",
       x = "Year",
       y = "Happy Score") +
  theme_minimal() +
  theme(legend.position = "right")

Here we can note that Finland is noticeably higher than the ESS baseline. It also shows that Finland has the highest average score of all countries regardless of the year and is the most consistent as well.

#need to delete some data to prevent vector memory exhauseted error
rm(combined_data_happy)
rm(ess_baseline_happy)
rm(ess_cleaned)
rm(specific_countries_happy)
rm(yearly_happy_avg)

#Adding back previosuly deleted data from vector memory exhausted error

#filtering to our country of interest 
finland_data <- ess %>%
  filter(cntry == "FI") %>%
  mutate(
    #cleaning our outcome happy
    happy = ifelse(happy %in% c(77, 88, 99), NA, happy),
    #cleaning our main predictor education level 
     edulvla = edulvla %>%
    na_if(55) %>%
    na_if(77) %>%
    na_if(88) %>%
    na_if(99),

edulvlb = edulvlb %>%
    na_if(5555) %>%
    na_if(7777) %>%
    na_if(8888) %>%
    na_if(9999), 

#changed BA and No BA to a numeric for later coding 
educ.ba = case_when(
    essround < 5 & edulvla == 5 ~ 1,
    essround >= 5 & edulvlb > 600 ~ 1,
    TRUE ~ 0
),
    #cleaning our secondary predictor satisfaction with democracy 
     stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )

# not a bad practice to create a duplicate in case you need to backtrack or use for different purposes
df <- finland_data
#making sure all variables are properly cleaned 
finland_data <- finland_data %>% filter(!is.na(happy))
finland_data <- finland_data %>% filter(!is.na(educ.ba))
finland_data <- finland_data %>% filter(!is.na(stfdem))

table(finland_data$happy)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##   18   24   79  158  218  595  759 2577 6716 6036 1831
table(finland_data$educ.ba)
## 
##     0     1 
## 13591  5420
table(finland_data$stfdem)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##  159  202  419  853 1261 2224 2495 4512 4675 1864  347
rm(ess)

#Creating a data summary table

d <- df %>%
  # Convert factors/characters to numeric
  mutate(
    happy = as.numeric(happy),
    stfdem = as.numeric(stfdem),
    educ_ba_num = as.numeric(educ.ba)
  )
table1 <- datasummary_skim(d  %>% dplyr::select(happy, stfdem, educ_ba_num), output = "flextable")
## Warning: The histogram argument is only supported for (a) output types "default",
##   "html", "kableExtra", or "gt"; (b) writing to file paths with extensions
##   ".html", ".jpg", or ".png"; and (c) Rmarkdown, knitr or Quarto documents
##   compiled to PDF (via kableExtra)  or HTML (via kableExtra or gt). Use
##   `histogram=FALSE` to silence this warning.
table1

Unique (#)

Missing (%)

Mean

SD

Min

Median

Max

happy

12

0

8.1

1.4

0.0

8.0

10.0

stfdem

12

3

6.5

2.0

0.0

7.0

10.0

educ_ba_num

2

0

0.3

0.4

0.0

0.0

1.0

#relabeling and titling our data summary table 
d <- d %>%
  rename(
    `Happiness` = happy,
    `Satisfaction with Democracy` = stfdem,
    `Educational Attainment` = educ_ba_num
  )
  
  table1_v2 <- datasummary_skim(d %>% dplyr::select(`Happiness`,`Satisfaction with Democracy`, `Educational Attainment`), title = "Descriptive Statistics of Variables", output = "flextable")
## Warning: The histogram argument is only supported for (a) output types "default",
##   "html", "kableExtra", or "gt"; (b) writing to file paths with extensions
##   ".html", ".jpg", or ".png"; and (c) Rmarkdown, knitr or Quarto documents
##   compiled to PDF (via kableExtra)  or HTML (via kableExtra or gt). Use
##   `histogram=FALSE` to silence this warning.
  table1_v2
Descriptive Statistics of Variables

Unique (#)

Missing (%)

Mean

SD

Min

Median

Max

Happiness

12

0

8.1

1.4

0.0

8.0

10.0

Satisfaction with Democracy

12

3

6.5

2.0

0.0

7.0

10.0

Educational Attainment

2

0

0.3

0.4

0.0

0.0

1.0

#saving table 1 as a pdf
flextable::save_as_docx(table1_v2, path = "summary_table1_v2.docx", # change name to whatever you would like
                       width = 7.0, height = 7.0) # play around with dimensions

set_flextable_defaults(fonts_ignore=TRUE)
print(table1_v2, preview = "pdf")
## a flextable object.
## col_keys: ` `, `Unique (#)`, `Missing (%)`, `Mean`, `SD`, `Min`, `Median`, `Max` 
## header has 1 row(s) 
## body has 3 row(s) 
## original dataset sample: 
##                               Unique (#) Missing (%) Mean  SD Min Median  Max
## 1                   Happiness         12           0  8.1 1.4 0.0    8.0 10.0
## 2 Satisfaction with Democracy         12           3  6.5 2.0 0.0    7.0 10.0
## 3      Educational Attainment          2           0  0.3 0.4 0.0    0.0  1.0

#Hypothesis Testing with Infer

#subsetting to our country of interest 
df <- finland_data %>%
  filter(cntry == "FI") %>%
  mutate(
    #cleaning our outcome happy
    happy = ifelse(happy %in% c(77, 88, 99), NA, happy),
    #cleaning our main predictor education level 
     edulvla = edulvla %>%
    na_if(55) %>%
    na_if(77) %>%
    na_if(88) %>%
    na_if(99),

edulvlb = edulvlb %>%
    na_if(5555) %>%
    na_if(7777) %>%
    na_if(8888) %>%
    na_if(9999), 
#changing educ.ba back to a categorical variable in order to run t test
educ.ba = case_when(
    essround < 5 & edulvla == 5 ~ 'BA',
    essround >= 5 & edulvlb > 600 ~ 'BA',
    TRUE ~ 'No BA'
),
    #cleaning our secondary filter satisfaction with democracy 
     stfdem = ifelse(stfdem %in% c(77, 88, 99), NA, stfdem)
  )
test_stat <- df %>%
  specify(explanatory = educ.ba, # change variable name for explanatory variable
          response = happy) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "t") # replace in between quotation marks appropriate test statistic
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "BA" - "No BA", or divided in the order "BA" / "No BA" for ratio-based
## statistics. To specify this order yourself, supply `order = c("BA", "No BA")`
## to the calculate() function.
print(test_stat$stat)
##        t 
## 11.45481

A t-statistic of 11.45481 is quite large, indicating a significant difference in happiness between those with a BA and those without a BA. It suggests that the mean happiness is different for the two groups.

null_dist <- df %>%
  specify(explanatory = educ.ba,
          response = happy) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard
  calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "BA" - "No BA", or divided in the order "BA" / "No BA" for ratio-based
## statistics. To specify this order yourself, supply `order = c("BA", "No BA")`
## to the calculate() function.
null_dist
## Response: happy (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.143 
##  2         2  0.537 
##  3         3 -0.845 
##  4         4  0.402 
##  5         5 -2.13  
##  6         6 -0.0297
##  7         7 -0.366 
##  8         8  0.371 
##  9         9 -0.420 
## 10        10 -0.624 
## # ℹ 990 more rows
p_val <- null_dist %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided") 
## Warning: Please be cautious in reporting a p-value of 0. This result is an
## approximation based on the number of `reps` chosen in the `generate()` step.
## See `?get_p_value()` for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

A p-value of 0 means there is strong evidence to reject the null hypothesis.

null_dist %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_dist
## Response: happy (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.143 
##  2         2  0.537 
##  3         3 -0.845 
##  4         4  0.402 
##  5         5 -2.13  
##  6         6 -0.0297
##  7         7 -0.366 
##  8         8  0.371 
##  9         9 -0.420 
## 10        10 -0.624 
## # ℹ 990 more rows

Visualizes the null distribution alongside the observsed test statistic (the red line), indicating here clear evidence, under assumptions of frequentist hypothesis testing, to reject the null.

conf_int <- null_dist %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_dist %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)+
  labs(title = "Figure 1. Simulation-Based Null Distribution",
       x = "T statistic",
       y = "Count")

null_dist
## Response: happy (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.143 
##  2         2  0.537 
##  3         3 -0.845 
##  4         4  0.402 
##  5         5 -2.13  
##  6         6 -0.0297
##  7         7 -0.366 
##  8         8  0.371 
##  9         9 -0.420 
## 10        10 -0.624 
## # ℹ 990 more rows

With a 95% confidence interval we can see strong evidence to reject the null hypothesis as the test statistic falls way outside the simulated T Null Distribution.

#Regression Modelling

We need to use modelsummary here and specify three models (single, multivariate, interaction)

df$weight <- df$dweight * df$pweight
model1 <- lm(happy ~ educ.ba, data = df, weights = weight)
model2 <- lm(happy ~ educ.ba + stfdem, data = df, weights = weight)
model3 <- lm(happy ~ educ.ba + stfdem*educ.ba, data = df, weights = weight)
modelsummary(
  list(model1, model2, model3),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL,
  coef_omit = "Intercept")
 (1)   (2)   (3)
educ.baNo BA −0.2 (0.0)*** −0.1 (0.0)*** −0.5 (0.1)***
stfdem 0.2 (0.0)*** 0.1 (0.0)***
educ.baNo BA × stfdem 0.0 (0.0)***
Num.Obs. 19011 19011 19011
R2 0.006 0.055 0.055
R2 Adj. 0.006 0.055 0.055
AIC 66550.2 65601.6 65587.8
BIC 66573.7 65633.0 65627.1
Log.Lik. −33272.089 −32796.802 −32788.901
RMSE 1.39 1.35 1.35

#Creating a boxplot

df <- finland_data %>%
  mutate( 
    happy = ifelse(happy %in% c(77, 88, 99), NA, happy),
    #cleaning our main predictor education level 
     edulvla = edulvla %>%
    na_if(55) %>%
    na_if(77) %>%
    na_if(88) %>%
    na_if(99),

edulvlb = edulvlb %>%
    na_if(5555) %>%
    na_if(7777) %>%
    na_if(8888) %>%
    na_if(9999), 

educ.ba = case_when(
    essround < 5 & edulvla == 5 ~ 'BA',
    essround >= 5 & edulvlb > 600 ~ 'BA',
    TRUE ~ 'No BA'
)
)

figure2plot <- ggplot(df, aes(x = reorder(educ.ba, -happy, FUN=median), y = happy, fill = educ.ba)) +
  geom_boxplot() +
  theme_minimal() + 
  theme(legend.position = "none") + 
  labs(title = "Figure 2. Boxplot Comparison for Happiness and Educational Attainment (Finland)", 
       x = "Educational Attainment", 
       y = " Happiness Scale (0-10)")

figure2plot

ggsave(filename = "figure2plot.pdf", plot = figure2plot, device = "pdf", width = 8, height = 6)