### Setting up the Environment
# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "MASS", "aod", "interactions", "kableExtra", "flextable", "scales", "marginaleffects","performance") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
invisible(lapply(packages, library, character.only = TRUE)) # invisible() suppresses lapply's returned list of attached packages
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
##
##
## Attaching package: 'MASS'
##
##
## The following object is masked from 'package:dplyr':
##
## select
##
##
##
## Attaching package: 'aod'
##
##
## The following object is masked from 'package:survival':
##
## rats
##
##
##
## Attaching package: 'kableExtra'
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
##
## Attaching package: 'flextable'
##
##
## The following objects are masked from 'package:kableExtra':
##
## as_image, footnote
##
##
## The following object is masked from 'package:purrr':
##
## compose
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
### Load the Italy Dataset
# Load the Italy dataset and copy it into df, so the original data stays unmodified in italy_data:
italy_data <- read_fst("italy_data.fst")
df <- italy_data
# How many observations are in the Italy dataset:
nrow(df)
## [1] 10178
We are interested in predicting life satisfaction. This is our outcome variable: what we want to predict or explain. We can start by coding this variable of interest.
# Combine the design and post-stratification weights into a single analysis weight
df$weight <- df$dweight * df$pweight
# Simple design object (no clusters or strata) using the combined weight
survey_design <- svydesign(ids = ~1, data = df, weights = ~weight)
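Before relying on the combined weight, a quick sanity check of its distribution can be useful; a minimal sketch (output omitted):
summary(df$weight) # the combined analysis weight should be positive and non-missing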
# Recode satisfaction: stflife values of 0-5 become 0 (not satisfied with life) and 6-10 become 1 (satisfied with life); 77/88/99 are ESS missing codes
df <- df %>%
  mutate(
    # Recode the missing codes first, so they are not counted as "satisfied" below
    stflife = ifelse(stflife %in% c(77, 88, 99), NA_integer_, stflife),
    satisfaction = case_when(
      stflife < 6 ~ 0,  # 0 = not satisfied with life
      stflife >= 6 ~ 1, # 1 = satisfied with life
      TRUE ~ NA_real_
    )
  )
df <- df %>% filter(!is.na(satisfaction))
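As a quick check that the recode behaved as intended, we can tabulate the new outcome (a small sketch; output omitted here):
table(df$satisfaction, useNA = "ifany") # 0/1 counts plus any remaining NAs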
### Cleaning Variables and Subset

The variables we need as predictors are educational attainment, which we define as educ.ba by recoding the edulvla and edulvlb variables, and the feeling about the household's income, hincfel.
df <- df %>%
  mutate(
    # Recode the missing-value codes for education first, so that codes such as
    # edulvlb 5555/7777/8888 are not misclassified as "BA or more" below
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    # Binary education variable: bachelor's degree or above vs. not
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    # Explicitly make 'No BA' the reference category
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more")),
    # Recode the feeling about household's income
    feelincome = case_when(
      hincfel == 1 ~ "comfortable",   # "Living comfortably"
      hincfel %in% 2:4 ~ "difficult", # all other substantive categories
      TRUE ~ NA_character_            # everything else to NA
    )
  )
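Since a design object was created earlier, a design-based alternative to the weighted glm() calls used later would be survey::svyglm(). A minimal sketch, assuming the design is rebuilt on the recoded data (quasibinomial() avoids the non-integer-successes warning that binomial() raises with weights):
# Rebuild the design so it carries the recoded variables
design2 <- svydesign(ids = ~1, data = df, weights = ~weight)
svy_fit <- svyglm(satisfaction ~ educ.ba + feelincome, design = design2, family = quasibinomial())
summary(svy_fit)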
# italy_data is the full Italy extract from the European Social Survey; df is the cleaned copy
# Information about the data source
survey_name <- "European Social Survey"
rounds <- c("Round 9", "Round 10", "Round 11") # Specify the rounds of the survey
year_range <- "2018 - 2022" # Specify the year range of the survey rounds
# Number of observations in the cleaned sample
num_obs <- nrow(df)
# Print the information
cat("Data Source:\n")
## Data Source:
cat("Survey Name:", survey_name, "\n")
## Survey Name: European Social Survey
cat("Rounds:", paste(rounds, collapse = ", "), "\n")
## Rounds: Round 9, Round 10, Round 11
cat("Year Range:", year_range, "\n\n")
## Year Range: 2018 - 2022
In this study, we are going to look at life satisfaction, educational attainment, and income.
df <- df %>%
dplyr::select(educ.ba, feelincome, satisfaction, weight)
# Check the first few rows to verify the selection
df <- na.omit(df)
head(df)
## educ.ba feelincome satisfaction weight
## 1 No BA difficult 1 4.008902
## 2 No BA difficult 1 4.008902
## 3 No BA difficult 1 4.008902
## 4 No BA difficult 1 6.013556
## 5 No BA difficult 1 5.328524
## 6 No BA comfortable 1 5.328524
dim(df)
## [1] 9903 4
There are 9903 observations and 4 variables. We can now run some models with our variables.
model1 <- glm(satisfaction ~ educ.ba, data = df, family = binomial, weights = round(weight))
model2 <- glm(satisfaction ~ educ.ba + feelincome, data = df, family = binomial, weights = round(weight))
#model3 <- glm(satisfaction ~ educ.ba * feelincome, data = df, family = binomial, weights = round(weight)) # the interaction model is fit in the Modelling section below
modelsummary(list(model1, model2), fmt = 1, estimate = c("{estimate} ({std.error}){stars}","{estimate}({std.error}){stars}"), statistic = NULL, coef_omit = "Intercept")
|                     | (1)          | (2)           |
|---------------------|--------------|---------------|
| educ.baBA or more   | 0.5 (0.0)*** | 0.3 (0.1)***  |
| feelincomedifficult |              | −1.0 (0.0)*** |
| Num.Obs.            | 9903         | 9903          |
| AIC                 | 25535.2      | 24976.1       |
| BIC                 | 25549.6      | 24997.7       |
| Log.Lik.            | −12765.616   | −12485.051    |
| RMSE                | 0.40         | 0.39          |
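The estimates above are on the log-odds scale; to read them as odds ratios we can exponentiate them (a quick sketch):
exp(coef(model2))    # odds ratios
exp(confint(model2)) # profile-likelihood confidence intervals on the odds-ratio scale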
We need to make a summary table, so we first recode the outcome back to "No"/"Yes" labels for readability.
df <- df %>%
mutate(satisfaction = case_when(
satisfaction == 0 ~ "No",
satisfaction == 1 ~"Yes",
TRUE ~ NA_character_
))
dft1 <- df %>%
mutate(
Satisfaction = satisfaction,
Education = educ.ba,
Income = feelincome
)
table1 <- datasummary_skim(dft1 %>% dplyr::select(Satisfaction, Education, Income), type = "categorical", title = "Table 1: Descriptive statistics for main variables", output = "flextable")
table1
|              |             | N    | %    |
|--------------|-------------|------|------|
| Satisfaction | No          | 1954 | 19.7 |
|              | Yes         | 7949 | 80.3 |
| Education    | No BA       | 8448 | 85.3 |
|              | BA or more  | 1455 | 14.7 |
| Income       | comfortable | 2665 | 26.9 |
|              | difficult   | 7238 | 73.1 |
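If this table needs to go into a Word report, flextable can export it directly; a sketch with a hypothetical file name:
save_as_docx(table1, path = "table1_descriptives.docx") # hypothetical output path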
As numeric, it would be:
#italy_data <- italy_data %>%
#  mutate(
#    Satisfaction = stflife,
#    Education = edulvla,
#    Income = hincfel
#  )
#df_num <- italy_data
#df_num <- df_num %>% filter(!is.na(Education))
#df_num <- df_num %>% filter(!is.na(Income))
#table1b <- datasummary_skim(df_num %>% dplyr::select(Satisfaction, Education, Income), title = "Table 1. Descriptive statistics for main variables", output = "flextable")
The next step is to test the main relationship made explicit in our research question. With Education as the main predictor and Life Satisfaction as the outcome, we need hypotheses. Based on the literature, we have the following:
H0: There is no significant difference in life satisfaction levels between individuals with different levels of educational attainment.
H1: The higher the individual’s level of educational attainment, the higher the life satisfaction.
We may decide to use income as our interaction variable.
We can look at the tables of our variables.
table(df$satisfaction)
##
## No Yes
## 1954 7949
table(df$educ.ba)
##
## No BA BA or more
## 8448 1455
table(df$feelincome)
##
## comfortable difficult
## 2665 7238
We can now do our hypothesis testing with infer.
Step 1: Our outcome is binary categorical and our main predictor has 2 categories. We use the chi-squared statistic when comparing two categorical variables.
df <- df %>%
  mutate(
    # satisfaction was recoded to "No"/"Yes" labels above, so match on those labels
    Satisfaction = case_when(
      satisfaction == "No" ~ "No",
      satisfaction == "Yes" ~ "Yes",
      TRUE ~ NA_character_),
    Education = case_when(
      educ.ba == "BA or more" ~ "BA or more",
      educ.ba == "No BA" ~ "No BA"
    ),
    Income = case_when(
      feelincome == "comfortable" ~ "comfortable",
      feelincome == "difficult" ~ "difficult",
      TRUE ~ NA_character_ # everything else to NA
    )
  )
table(df$Satisfaction)
##
##   No  Yes
## 1954 7949
test_stat <- df %>%
specify(explanatory = educ.ba,
response = satisfaction, success = "Yes") %>%
hypothesize(null = "independence") %>%
calculate(stat = "Chisq")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
print(test_stat$stat)
## X-squared
## 41.74688
Our chi-squared value is 41.75.
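As a cross-check, the classical test on the same contingency table should give essentially the same statistic (a sketch; correct = FALSE turns off the Yates continuity correction):
chisq.test(table(df$educ.ba, df$satisfaction), correct = FALSE)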
Step 2: Simulate the null distribution
null_dist <- df %>%
specify(response = satisfaction, explanatory = educ.ba, success = "Yes") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
null_dist
## Response: satisfaction (factor)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 0.551
## 2 2 0.915
## 3 3 0.107
## 4 4 0.450
## 5 5 4.16
## 6 6 1.24
## 7 7 0.209
## 8 8 1.72
## 9 9 0.915
## 10 10 0.376
## # ℹ 990 more rows
Step 3: Calculate the p-value of the sample
p_val <- null_dist %>%
get_p_value(obs_stat = test_stat, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
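With 1 degree of freedom, the theoretical p-value for a statistic this large is vanishingly small, consistent with the simulated zero (a sketch):
pchisq(41.75, df = 1, lower.tail = FALSE) # on the order of 1e-10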
Step 4: This is what I will include in my report
conf_int <- null_dist %>%
  get_confidence_interval(level = 0.95, type = "percentile")
null_dist %>%
  visualize(bins = 10, method = "simulation", dens_color = "black") +
  shade_p_value(obs_stat = test_stat, direction = "greater") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
The test statistic is very extreme, so it falls far outside the plotted null distribution. We can manually adjust the plot:
# Store the observed test statistic as a plain numeric value for plotting
test_stat <- 41.75
conf_int <- null_dist %>%
get_confidence_interval(level = 0.95, type = "percentile")
# Visualize the null distribution with manual x-axis limits
null_dist %>%
visualize(bins = 15, method = "simulation", dens_color = "black") +
geom_vline(aes(xintercept = test_stat), color = "red", linetype = "dashed", linewidth = 1) +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int) +
coord_cartesian(xlim = c(0, max(null_dist$stat)*1.1)) +
labs(title = "Simulation-Based Null Distribution",
x = "Chi-Square",
y = "Count")
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
### Modelling

We already tested our first hypothesis, but we can add Income as a second predictor.
Suppose we also hypothesize that the relationship between life satisfaction and education depends on whether someone feels comfortable or finds it difficult on their household income. We may expect educated people to be more satisfied with their lives, but those with an education who find their income difficult may be less satisfied than those with an education who feel comfortable about it. In other words, the relationship between the predictor (education) and the response (life satisfaction) depends on the level of a second predictor, the feeling about income. We can test this with an interaction hypothesis.
# We need the outcome as 0/1, but the table shows the "No"/"Yes" labels
table(df$satisfaction)
##
## No Yes
## 1954 7949
df <- df %>%
  mutate(
    satisfaction = case_when(
      satisfaction == "Yes" ~ 1,
      satisfaction == "No" ~ 0,
      TRUE ~ NA_real_ # double NA to match the numeric 1/0 values
    )
  )
table(df$satisfaction)
##
## 0 1
## 1954 7949
Now we can build the regression models table:
model1 <- glm(satisfaction ~ educ.ba, data = df, family = binomial, weights = round(weight))
model2 <- glm(satisfaction ~ educ.ba + feelincome, data = df, family = binomial, weights = round(weight))
model3 <- glm(satisfaction ~ educ.ba + feelincome + educ.ba*feelincome, family = binomial, data = df, weights = round(weight))
modelsummary(
list(model1, model2, model3),
fmt = 1,
estimate = c("{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}"),
statistic = NULL)
|                                         | (1)         | (2)          | (3)          |
|-----------------------------------------|-------------|--------------|--------------|
| (Intercept)                             | 1.3(0.0)*** | 2.1(0.0)***  | 2.1(0.0)***  |
| educ.baBA or more                       | 0.5(0.0)*** | 0.3(0.1)***  | 0.1(0.1)     |
| feelincomedifficult                     |             | −1.0(0.0)*** | −1.0(0.0)*** |
| educ.baBA or more × feelincomedifficult |             |              | 0.3(0.1)*    |
| Num.Obs.                                | 9903        | 9903         | 9903         |
| AIC                                     | 25535.2     | 24976.1      | 24971.8      |
| BIC                                     | 25549.6     | 24997.7      | 25000.6      |
| Log.Lik.                                | −12765.616  | −12485.051   | −12481.915   |
| RMSE                                    | 0.40        | 0.39         | 0.39         |
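Because the models are nested, a likelihood-ratio test can check whether the interaction term significantly improves the fit (a sketch):
anova(model2, model3, test = "Chisq") # compares the additive and interaction models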
We can rename the coefficients, exponentiate the estimates into odds ratios (exponentiate = TRUE), and provide a title:
models <- list()
models[['SLR']] <- glm(satisfaction ~ educ.ba, data = df, family = binomial, weights = round(weight))
models[['MLR']] <- glm(satisfaction ~ educ.ba + feelincome, data = df, family = binomial, weights = round(weight))
models[['Interaction']] <- glm(satisfaction ~ educ.ba + feelincome + educ.ba*feelincome, family = binomial, data = df, weights = round(weight))
modelsummary(models,
fmt = 1,
estimate = c("{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}",
"{estimate}({std.error}){stars}"),
statistic = NULL,
exponentiate = TRUE,
coef_rename = c("educ.baBA or more" = "BA or More", "feelincomedifficult" = "Difficult Income"),
title = 'Table 2. Regression models predicting Life Satisfaction')
|                             | SLR         | MLR         | Interaction |
|-----------------------------|-------------|-------------|-------------|
| (Intercept)                 | 3.6(0.1)*** | 7.9(0.3)*** | 8.3(0.4)*** |
| BA or More                  | 1.6(0.1)*** | 1.3(0.1)*** | 1.1(0.1)    |
| Difficult Income            |             | 0.4(0.0)*** | 0.4(0.0)*** |
| BA or More:Difficult Income |             |             | 1.3(0.1)*   |
| Num.Obs.                    | 9903        | 9903        | 9903        |
| AIC                         | 25535.2     | 24976.1     | 24971.8     |
| BIC                         | 25549.6     | 24997.7     | 25000.6     |
| Log.Lik.                    | −12765.616  | −12485.051  | −12481.915  |
| RMSE                        | 0.40        | 0.39        | 0.39        |
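Odds ratios can be hard to read directly; average predicted probabilities by group make the interaction more concrete. A sketch using marginaleffects:
avg_predictions(model3, by = c("educ.ba", "feelincome")) # mean predicted probability of satisfaction per group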
We can use the marginaleffects package to plot these comparisons directly:
plot_comparisons(model3, variables = "educ.ba", condition = "feelincome")
#check_model(model3)
check_model() did not run for this weighted glm, so it is left commented out.
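As a lighter-weight alternative, the summary fit indices from performance do run on this model (a sketch):
model_performance(model3) # AIC, BIC, Tjur's R2, RMSE, etc.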