#Installing & Applying Packages
packages <-c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "MASS", "aod", "interactions", "kableExtra", "flextable", "scales")
new_packages <-packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Version 2.0.0 of `modelsummary`, to be released soon, will introduce a
##   breaking change: The default table-drawing package will be `tinytable`
##   instead of `kableExtra`. All currently supported table-drawing packages
##   will continue to be supported for the foreseeable future, including
##   `kableExtra`, `gt`, `huxtable`, `flextable, and `DT`.
##   
##   You can always call the `config_modelsummary()` function to change the
##   default table-drawing package in persistent fashion. To try `tinytable`
##   now:
##   
##   config_modelsummary(factory_default = 'tinytable')
##   
##   To set the default back to `kableExtra`:
##   
##   config_modelsummary(factory_default = 'kableExtra')
## 
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## 
## Loading required package: grid
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: survival
## 
## 
## Attaching package: 'survey'
## 
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## 
## Attaching package: 'MASS'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## 
## Attaching package: 'aod'
## 
## 
## The following object is masked from 'package:survival':
## 
##     rats
## 
## 
## 
## Attaching package: 'kableExtra'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following objects are masked from 'package:kableExtra':
## 
##     as_image, footnote
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "MASS"         "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "aod"          "MASS"         "survey"       "survival"     "Matrix"      
##  [6] "grid"         "effects"      "carData"      "modelsummary" "fst"         
## [11] "infer"        "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "interactions" "aod"          "MASS"         "survey"       "survival"    
##  [6] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [11] "fst"          "infer"        "lubridate"    "forcats"      "stringr"     
## [16] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [21] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "kableExtra"   "interactions" "aod"          "MASS"         "survey"      
##  [6] "survival"     "Matrix"       "grid"         "effects"      "carData"     
## [11] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "flextable"    "kableExtra"   "interactions" "aod"          "MASS"        
##  [6] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [11] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"        
## 
## [[12]]
##  [1] "scales"       "flextable"    "kableExtra"   "interactions" "aod"         
##  [6] "MASS"         "survey"       "survival"     "Matrix"       "grid"        
## [11] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [16] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [21] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [26] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [31] "methods"      "base"
#Import dataset
rm(list=ls()); gc()
##           used  (Mb) gc trigger  (Mb) limit (Mb) max used  (Mb)
## Ncells 2642479 141.2    4110187 219.6         NA  4110187 219.6
## Vcells 4519327  34.5   10146329  77.5      16384  6891803  52.6
ireland_data <-read.fst("ireland_data.fst")
#Cleaning and recoding variables 

ireland_data <-ireland_data %>%
  mutate(educ.ba = case_when(
    essround < 5 & edulvla == 5 ~ "BA or more",
    essround >= 5 & edulvlb > 600 ~ "BA or more",
    TRUE ~ "No BA"),
    edulvla = ifelse(edulvla %in% c(77, 88), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = as.character(educ.ba))

ireland_data <-ireland_data %>%
  mutate(gender = case_when(
    gndr == 1 ~ "Male",
    gndr == 2 ~ "Female", 
    TRUE ~ NA_character_))

ireland_data <- ireland_data %>%
  mutate(happy = case_when(
    happy %in% 77:99 ~ NA_integer_,
    TRUE ~ happy))
    
ireland_data <- ireland_data %>% filter(!is.na(educ.ba), !is.na(gender), !is.na(happy))

#Check
table(ireland_data$educ.ba) 
## 
## BA or more      No BA 
##       5971      16139
table(ireland_data$gender)
## 
## Female   Male 
##  11814  10296
table(ireland_data$happy)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##  110   85  211  497  708 1717 1844 4092 6304 3854 2688
## make new variables _numeric for each variable of interest and recode to numeric entries (need numeric variables to make datasummary table)

ireland_data <- ireland_data %>%
  mutate(educ.ba_numeric = case_when(
    educ.ba == "BA or more" ~ 1,
    educ.ba == "No BA" ~ 0,
    TRUE ~ NA_integer_
  ))

# Convert gender to numeric
ireland_data <- ireland_data %>%
  mutate(gender_numeric = case_when(
    gender == "Male" ~ 1,
    gender == "Female" ~ 2,
    TRUE ~ NA_integer_
  ))

# Convert happy to numeric
ireland_data <- ireland_data %>%
  mutate(happy_numeric = as.numeric(happy))

# Check the counts match the original numbers recoded
table(ireland_data$educ.ba_numeric) 
## 
##     0     1 
## 16139  5971
table(ireland_data$gender_numeric)
## 
##     1     2 
## 10296 11814
table(ireland_data$happy_numeric)
## 
##    0    1    2    3    4    5    6    7    8    9   10 
##  110   85  211  497  708 1717 1844 4092 6304 3854 2688
#Descriptive statistics with datasummary_skim
table_summary <- datasummary_skim(ireland_data[c("gender_numeric", "happy_numeric", "educ.ba_numeric")], title="Table 1: Descriptive Statistics for Gender, Educ.ba, and Happy")
table_summary
Table 1: Descriptive Statistics for Gender, Educ.ba, and Happy
Unique Missing Pct. Mean SD Min Median Max
gender_numeric 2 0 1.5 0.5 1.0 2.0 2.0
happy_numeric 11 0 7.5 1.9 0.0 8.0 10.0
educ.ba_numeric 2 0 0.3 0.4 0.0 0.0 1.0
# For educ.ba
#Checking variable types to determine test statistic
is.numeric(ireland_data$educ.ba)
## [1] FALSE
is.numeric(ireland_data$happy)
## [1] TRUE
# Step 1: Calculate the test statistic of your sample
test_stat <- ireland_data %>%
  specify(explanatory = educ.ba, 
          response = happy) %>%
  hypothesize(null = "independence") %>%
  calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "BA or more" - "No BA", or divided in the order "BA or more" / "No BA"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("BA or more", "No BA")` to the calculate() function.
print(test_stat$stat)
##        t 
## 13.02869
#Step 2: Simulate the null distribution
null_distribution <- ireland_data %>%
  specify(explanatory = educ.ba,
          response = happy) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "BA or more" - "No BA", or divided in the order "BA or more" / "No BA"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("BA or more", "No BA")` to the calculate() function.
#Step 3: Calculate the p-value of your sample
p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided") 
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0
#Step 4: Visualize
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?

null_distribution
## Response: happy (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1  0.562 
##  2         2  0.0617
##  3         3  1.27  
##  4         4 -0.381 
##  5         5  0.393 
##  6         6 -1.16  
##  7         7 -0.734 
##  8         8 -0.0112
##  9         9 -1.46  
## 10        10 -0.577 
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?

#Regression Modelling 
#Model 1: educ.ba main single predictor
#Model 2: multivariate linear regression educ.ba + gender 
#Model 3: interaction model 

#Making 'No BA' the reference category
ireland_data<-ireland_data %>%
  mutate(
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")))

# Running Regression Models
model1 <-lm(happy ~ educ.ba, data = ireland_data)
model2 <-lm(happy ~ educ.ba + gender, data = ireland_data, weights=weight)
model3 <-lm(happy ~ educ.ba + gender + educ.ba*gender, data = ireland_data, weights=weight)

modelsummary(
  list(model1, model2, model3),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL, 
  coef_rename = c("educ.baBA or more" = "BA or more", "genderMale" = "Male"),
  title = 'Regression model for level of education and gender on happiness for Ireland')
Regression model for level of education and gender on happiness for Ireland
 (1)   (2)   (3)
(Intercept) 7.4 (0.0)*** 7.0 (0.1)*** 7.0 (0.1)***
BA or more 0.3 (0.0)*** 0.3 (0.1)** 0.3 (0.1)*
Male −0.3 (0.1)*** −0.3 (0.1)**
BA or more:Male 0.0 (0.2)
Num.Obs. 22110 2386 2386
R2 0.006 0.009 0.009
R2 Adj. 0.006 0.008 0.008
AIC 90438.2 11229.8 11231.7
BIC 90462.2 11252.9 11260.6
Log.Lik. −45216.111 −5610.878 −5610.873
RMSE 1.87 1.95 1.95
# 3rd visual: Interaction plot for model 3
interaction_plot <- effect("educ.ba*gender", model3, na.rm=TRUE)

plot(interaction_plot,
     main="Interaction Effect: Education Level & Gender on Happiness in Ireland",
     xlab="Level of Education",
     ylab="Happiness Levels")

interaction_plot
## 
##  educ.ba*gender effect
##             gender
## educ.ba        Female     Male
##   No BA      6.965602 6.662492
##   BA or more 7.249267 6.926717