### Setting up the Environment
# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "MASS", "aod", "interactions", "kableExtra", "flextable", "scales", "marginaleffects","performance") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
invisible(lapply(packages, library, character.only = TRUE)) # invisible() suppresses lapply's returned list of attached packages
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
##
##
## Attaching package: 'MASS'
##
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
##
## Attaching package: 'aod'
##
##
## The following object is masked from 'package:survival':
##
## rats
##
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
### Load the Italy Dataset
# Load the Italy dataset and copy it into df, so the original data stays unmodified in italy_data:
italy_data <- read_fst("italy_data.fst")
df <- italy_data
# How many observations are in the Italy dataset:
nrow(df)
## [1] 10178
We are interested in predicting life satisfaction. This is our outcome variable: what we want to predict or explain. We can start by coding this variable of interest.
# Combine the design and post-stratification weights into a single analysis weight
df$weight <- df$dweight * df$pweight
# Simple design object (no clusters or strata) using the combined weight
survey_design <- svydesign(ids = ~1, data = df, weights = ~weight)
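Before relying on the combined weight, a quick sanity check of its distribution can be useful; a minimal sketch (output omitted):
summary(df$weight) # the combined analysis weight should be positive and non-missing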
# Recode satisfaction: stflife values of 0-5 become 0 (not satisfied with life) and 6-10 become 1 (satisfied with life); 77/88/99 are ESS missing codes
df <- df %>%
  mutate(
    # Recode the missing codes first, so they are not counted as "satisfied" below
    stflife = ifelse(stflife %in% c(77, 88, 99), NA_integer_, stflife),
    satisfaction = case_when(
      stflife < 6 ~ 0,  # 0 = not satisfied with life
      stflife >= 6 ~ 1, # 1 = satisfied with life
      TRUE ~ NA_real_
    )
  )
df <- df %>% filter(!is.na(satisfaction))
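As a quick check that the recode behaved as intended, we can tabulate the new outcome (a small sketch; output omitted here):
table(df$satisfaction, useNA = "ifany") # 0/1 counts plus any remaining NAs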
### Cleaning Variables and Subset

The variables we need as predictors are educational attainment, which we define as educ.ba by recoding the edulvla and edulvlb variables, and the feeling about the household's income, hincfel.
df <- df %>%
  mutate(
    # Recode the missing-value codes for education first, so that codes such as
    # edulvlb 5555/7777/8888 are not misclassified as "BA or more" below
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    # Binary education variable: bachelor's degree or above vs. not
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    # Explicitly make 'No BA' the reference category
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),
    # Recode the feeling about household's income
    feelincome = case_when(
      hincfel == 1 ~ "comfortable",   # "Living comfortably"
      hincfel %in% 2:4 ~ "difficult", # all other substantive categories
      TRUE ~ NA_character_            # everything else to NA
    )
  )
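Since a design object was created earlier, a design-based alternative to the weighted glm() calls used later would be survey::svyglm(). A minimal sketch, assuming the design is rebuilt on the recoded data (quasibinomial() avoids the non-integer-successes warning that binomial() raises with weights):
# Rebuild the design so it carries the recoded variables
design2 <- svydesign(ids = ~1, data = df, weights = ~weight)
svy_fit <- svyglm(satisfaction ~ educ.ba + feelincome, design = design2, family = quasibinomial())
summary(svy_fit)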
# italy_data is the full Italy extract from the European Social Survey; df is the cleaned copy
# Information about the data source
survey_name <- "European Social Survey"
rounds <- c("Round 9", "Round 10", "Round 11") # Specify the rounds of the survey
year_range <- "2018 - 2022" # Specify the year range of the survey rounds
# Number of observations in the cleaned sample
num_obs <- nrow(df)
# Print the information
cat("Data Source:\n")
## Data Source:
cat("Survey Name:", survey_name, "\n")
## Survey Name: European Social Survey
cat("Rounds:", paste(rounds, collapse = ", "), "\n")
## Rounds: Round 9, Round 10, Round 11
cat("Year Range:", year_range, "\n\n")
## Year Range: 2018 - 2022
In this study, we are going to look at life satisfaction, educational attainment, and income.
df <- df %>%
dplyr::select(educ.ba, feelincome, satisfaction, weight)
# Check the first few rows to verify the selection
df <- na.omit(df)
head(df)
## educ.ba feelincome satisfaction weight
## 1 No BA difficult 1 4.008902
## 2 No BA difficult 1 4.008902
## 3 No BA difficult 1 4.008902
## 4 No BA difficult 1 6.013556
## 5 No BA difficult 1 5.328524
## 6 No BA comfortable 1 5.328524
dim(df)
## [1] 9903 4
There are 9903 observations and 4 variables. We can now run some models with our variables.
model1 <- glm(satisfaction ~ educ.ba, data = df, family = binomial, weights = round(weight))
model2 <- glm(satisfaction ~ educ.ba + feelincome, data = df, family = binomial, weights = round(weight))
#model3 <- glm(satisfaction ~ educ.ba * feelincome, data = df, family = binomial, weights = round(weight)) # the interaction model is fit in the Modelling section below
modelsummary(list(model1, model2), fmt = 1, estimate = c("{estimate} ({std.error}){stars}","{estimate}({std.error}){stars}"), statistic = NULL, coef_omit = "Intercept")
|                     | (1)          | (2)           |
|---------------------|--------------|---------------|
| educ.baBA or more   | 0.5 (0.0)*** | 0.3 (0.1)***  |
| feelincomedifficult |              | −1.0 (0.0)*** |
| Num.Obs.            | 9903         | 9903          |
| AIC                 | 25535.2      | 24976.1       |
| BIC                 | 25549.6      | 24997.7       |
| Log.Lik.            | −12765.616   | −12485.051    |
| RMSE                | 0.40         | 0.39          |
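The estimates above are on the log-odds scale; to read them as odds ratios we can exponentiate them (a quick sketch):
exp(coef(model2))    # odds ratios
exp(confint(model2)) # profile-likelihood confidence intervals on the odds-ratio scale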
We need to make a summary table, so we first recode the outcome back to "No"/"Yes" labels for readability.
df <- df %>%
mutate(satisfaction = case_when(
satisfaction == 0 ~ "No",
satisfaction == 1 ~"Yes",
TRUE ~ NA_character_
))
dft1 <- df %>%
mutate(
Satisfaction = satisfaction,
Education = educ.ba,
Income = feelincome
)
table1 <- datasummary_skim(dft1 %>% dplyr::select(Satisfaction, Education, Income), type = "categorical", title = "Table 1: Descriptive statistics for main variables", output = "flextable")
table1
|              |             | N    | %    |
|--------------|-------------|------|------|
| Satisfaction | No          | 1954 | 19.7 |
|              | Yes         | 7949 | 80.3 |
| Education    | No BA       | 8448 | 85.3 |
|              | BA or more  | 1455 | 14.7 |
| Income       | comfortable | 2665 | 26.9 |
|              | difficult   | 7238 | 73.1 |
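If this table needs to go into a Word report, flextable can export it directly; a sketch with a hypothetical file name:
save_as_docx(table1, path = "table1_descriptives.docx") # hypothetical output path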
As numeric, it would be:
#italy_data <- italy_data %>%
#  mutate(
#    Satisfaction = stflife,
#    Education = edulvla,
#    Income = hincfel
#  )
#df_num <- italy_data
#df_num <- df_num %>% filter(!is.na(Education))
#df_num <- df_num %>% filter(!is.na(Income))
#table1b <- datasummary_skim(df_num %>% dplyr::select(Satisfaction, Education, Income), title = "Table 1. Descriptive statistics for main variables", output = "flextable")
The next step is to test the main relationship made explicit in our research question. With Education as the main predictor and Life Satisfaction as the outcome, we need hypotheses. Based on the literature, we have the following:
H0: There is no significant difference in life satisfaction levels between individuals with different levels of educational attainment.
H1: The higher the individual’s level of educational attainment, the higher the life satisfaction.
We may decide to use income as our interaction variable.
We can look at the tables of our variables.
table(df$satisfaction)
##
## No Yes
## 1954 7949
table(df$educ.ba)
##
## No BA BA or more
## 8448 1455
table(df$feelincome)
##
## comfortable difficult
## 2665 7238
We can now do our hypothesis testing with infer.
Step 1: Our outcome is binary categorical and our main predictor has 2 categories. We use the chi-squared statistic when comparing two categorical variables.
df <- df %>%
  mutate(
    # satisfaction was recoded to "No"/"Yes" labels above, so match on those labels
    Satisfaction = case_when(
      satisfaction == "No" ~ "No",
      satisfaction == "Yes" ~ "Yes",
      TRUE ~ NA_character_),
    Education = case_when(
      educ.ba == "BA or more" ~ "BA or more",
      educ.ba == "No BA" ~ "No BA"
    ),
    Income = case_when(
      feelincome == "comfortable" ~ "comfortable",
      feelincome == "difficult" ~ "difficult",
      TRUE ~ NA_character_ # everything else to NA
    )
  )
table(df$Satisfaction)
##
##   No  Yes
## 1954 7949
test_stat <- df %>%
specify(explanatory = educ.ba,
response = satisfaction, success = "Yes") %>%
hypothesize(null = "independence") %>%
calculate(stat = "Chisq")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
print(test_stat$stat)
## X-squared
## 41.74688
Our chi-squared value is 41.75.
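As a cross-check, the classical test on the same contingency table should give essentially the same statistic (a sketch; correct = FALSE turns off the Yates continuity correction):
chisq.test(table(df$educ.ba, df$satisfaction), correct = FALSE)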
Step 2: Simulate the null distribution
null_dist <- df %>%
specify(response = satisfaction, explanatory = educ.ba, success = "Yes") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
null_dist
## Response: satisfaction (factor)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 0.551
## 2 2 0.915
## 3 3 0.107
## 4 4 0.450
## 5 5 4.16
## 6 6 1.24
## 7 7 0.209
## 8 8 1.72
## 9 9 0.915
## 10 10 0.376
## # ℹ 990 more rows
Step 3: Calculate the p-value of the sample
p_val <- null_dist %>%
get_p_value(obs_stat = test_stat, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
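With 1 degree of freedom, the theoretical p-value for a statistic this large is vanishingly small, consistent with the simulated zero (a sketch):
pchisq(41.75, df = 1, lower.tail = FALSE) # on the order of 1e-10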
Step 4: This is what I will include in my report
conf_int <- null_dist %>%
  get_confidence_interval(level = 0.95, type = "percentile")
null_dist %>%
  visualize(bins = 10, method = "simulation", dens_color = "black") +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
The test statistic is very extreme, so it falls far outside the plotted null distribution. We can manually adjust the plot:
# Store the observed test statistic as a plain numeric value for plotting
test_stat <- 41.75
conf_int <- null_dist %>%
get_confidence_interval(level = 0.95, type = "percentile")
# Visualize the null distribution with manual x-axis limits
null_dist %>%
visualize(bins = 15, method = "simulation", dens_color = "black") +
geom_vline(aes(xintercept = test_stat), color = "red", linetype = "dashed", linewidth = 1) +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int) +
coord_cartesian(xlim = c(0, max(null_dist$stat)*1.1)) +
labs(title = "Simulation-Based Null Distribution",
x = "Chi-Square",
y = "Count")
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
### Modelling

We already tested our first hypothesis, but we can add Income as a second predictor.
Suppose we also hypothesize that the relationship between life satisfaction and education depends on whether someone feels comfortable or finds it difficult on their household income. We may expect educated people to be more satisfied with their lives, but those with an education who find their income difficult may be less satisfied than those with an education who feel comfortable about it. In other words, the relationship between the predictor (education) and the response (life satisfaction) depends on the level of a second predictor, the feeling about income. We can test this with an interaction hypothesis.
# We need the outcome as 0/1, but the table shows the "No"/"Yes" labels
table(df$satisfaction)
##
## No Yes
## 1954 7949
df <- df %>%
  mutate(
    satisfaction = case_when(
      satisfaction == "Yes" ~ 1,
      satisfaction == "No" ~ 0,
      TRUE ~ NA_real_ # double NA to match the numeric 1/0 values
    )
  )
table(df$satisfaction)
##
## 0 1
## 1954 7949
Now we can build the regression models table:
model1 <- glm(satisfaction ~ educ.ba, data = df, family = binomial, weights = round(weight))
model2 <- glm(satisfaction ~ educ.ba + feelincome, data = df, family = binomial, weights = round(weight))
model3 <- glm(satisfaction ~ educ.ba + feelincome + educ.ba*feelincome, family = binomial, data = df, weights = round(weight))
modelsummary(
list(model1, model2, model3),
fmt = 1,
estimate = c("{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}"),
statistic = NULL)
|                                         | (1)         | (2)          | (3)          |
|-----------------------------------------|-------------|--------------|--------------|
| (Intercept)                             | 1.3(0.0)*** | 2.1(0.0)***  | 2.1(0.0)***  |
| educ.baBA or more                       | 0.5(0.0)*** | 0.3(0.1)***  | 0.1(0.1)     |
| feelincomedifficult                     |             | −1.0(0.0)*** | −1.0(0.0)*** |
| educ.baBA or more × feelincomedifficult |             |              | 0.3(0.1)*    |
| Num.Obs.                                | 9903        | 9903         | 9903         |
| AIC                                     | 25535.2     | 24976.1      | 24971.8      |
| BIC                                     | 25549.6     | 24997.7      | 25000.6      |
| Log.Lik.                                | −12765.616  | −12485.051   | −12481.915   |
| RMSE                                    | 0.40        | 0.39         | 0.39         |
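Because the models are nested, a likelihood-ratio test can check whether the interaction term significantly improves the fit (a sketch):
anova(model2, model3, test = "Chisq") # compares the additive and interaction models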
We can rename the coefficients, exponentiate the estimates into odds ratios (exponentiate = TRUE), and provide a title:
models <- list()
models[['SLR']] <- glm(satisfaction ~ educ.ba, data = df, family = binomial, weights = round(weight))
models[['MLR']] <- glm(satisfaction ~ educ.ba + feelincome, data = df, family = binomial, weights = round(weight))
models[['Interaction']] <- glm(satisfaction ~ educ.ba + feelincome + educ.ba*feelincome, family = binomial, data = df, weights = round(weight))
modelsummary(models,
fmt = 1,
estimate = c("{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}"),
statistic = NULL,
exponentiate = TRUE,
coef_rename = c("educ.baBA or more" = "BA or More", "feelincomedifficult" = "Difficult Income"),
title = 'Table 2. Regression models predicting Life Satisfaction')
|                             | SLR         | MLR         | Interaction |
|-----------------------------|-------------|-------------|-------------|
| (Intercept)                 | 3.6(0.1)*** | 7.9(0.3)*** | 8.3(0.4)*** |
| BA or More                  | 1.6(0.1)*** | 1.3(0.1)*** | 1.1(0.1)    |
| Difficult Income            |             | 0.4(0.0)*** | 0.4(0.0)*** |
| BA or More:Difficult Income |             |             | 1.3(0.1)*   |
| Num.Obs.                    | 9903        | 9903        | 9903        |
| AIC                         | 25535.2     | 24976.1     | 24971.8     |
| BIC                         | 25549.6     | 24997.7     | 25000.6     |
| Log.Lik.                    | −12765.616  | −12485.051  | −12481.915  |
| RMSE                        | 0.40        | 0.39        | 0.39        |
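Odds ratios can be hard to read directly; average predicted probabilities by group make the interaction more concrete. A sketch using marginaleffects:
avg_predictions(model3, by = c("educ.ba", "feelincome")) # mean predicted probability of satisfaction per group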
We can use the marginaleffects package to plot these comparisons directly:
plot_comparisons(model3, variables = "educ.ba", condition = "feelincome")
#check_model(model3)
check_model() did not run for this weighted glm, so it is left commented out.
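As a lighter-weight alternative, the summary fit indices from performance do run on this model (a sketch):
model_performance(model3) # AIC, BIC, Tjur's R2, RMSE, etc.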