# List of packages
packages <- c("tidyverse", "modelsummary", "flextable",
"fst", "viridis", "knitr", "kableExtra", "rmarkdown", "ggridges", "questionr", "infer", "effects", "survey", "performance", "broom", "scales", "ggeffects", "marginaleffects")
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##
## Attaching package: 'flextable'
##
##
## The following object is masked from 'package:purrr':
##
## compose
##
##
## Loading required package: viridisLite
##
##
## Attaching package: 'kableExtra'
##
##
## The following objects are masked from 'package:flextable':
##
## as_image, footnote
##
##
## The following object is masked from 'package:dplyr':
##
## group_rows
##
##
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:viridis':
##
## viridis_pal
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [6] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [11] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[3]]
## [1] "flextable" "modelsummary" "lubridate" "forcats" "stringr"
## [6] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [11] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "fst" "flextable" "modelsummary" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "viridis" "viridisLite" "fst" "flextable" "modelsummary"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "knitr" "viridis" "viridisLite" "fst" "flextable"
## [6] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[7]]
## [1] "kableExtra" "knitr" "viridis" "viridisLite" "fst"
## [6] "flextable" "modelsummary" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
##
## [[8]]
## [1] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [6] "fst" "flextable" "modelsummary" "lubridate" "forcats"
## [11] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [16] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [21] "grDevices" "utils" "datasets" "methods" "base"
##
## [[9]]
## [1] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [6] "viridisLite" "fst" "flextable" "modelsummary" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[10]]
## [1] "questionr" "ggridges" "rmarkdown" "kableExtra" "knitr"
## [6] "viridis" "viridisLite" "fst" "flextable" "modelsummary"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[11]]
## [1] "infer" "questionr" "ggridges" "rmarkdown" "kableExtra"
## [6] "knitr" "viridis" "viridisLite" "fst" "flextable"
## [11] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [16] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
##
## [[12]]
## [1] "effects" "carData" "infer" "questionr" "ggridges"
## [6] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [11] "fst" "flextable" "modelsummary" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
##
## [[13]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "infer" "questionr" "ggridges" "rmarkdown"
## [11] "kableExtra" "knitr" "viridis" "viridisLite" "fst"
## [16] "flextable" "modelsummary" "lubridate" "forcats" "stringr"
## [21] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [26] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [31] "utils" "datasets" "methods" "base"
##
## [[14]]
## [1] "performance" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "infer" "questionr" "ggridges"
## [11] "rmarkdown" "kableExtra" "knitr" "viridis" "viridisLite"
## [16] "fst" "flextable" "modelsummary" "lubridate" "forcats"
## [21] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [26] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [31] "grDevices" "utils" "datasets" "methods" "base"
##
## [[15]]
## [1] "broom" "performance" "survey" "survival" "Matrix"
## [6] "grid" "effects" "carData" "infer" "questionr"
## [11] "ggridges" "rmarkdown" "kableExtra" "knitr" "viridis"
## [16] "viridisLite" "fst" "flextable" "modelsummary" "lubridate"
## [21] "forcats" "stringr" "dplyr" "purrr" "readr"
## [26] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [31] "graphics" "grDevices" "utils" "datasets" "methods"
## [36] "base"
##
## [[16]]
## [1] "scales" "broom" "performance" "survey" "survival"
## [6] "Matrix" "grid" "effects" "carData" "infer"
## [11] "questionr" "ggridges" "rmarkdown" "kableExtra" "knitr"
## [16] "viridis" "viridisLite" "fst" "flextable" "modelsummary"
## [21] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [26] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [31] "stats" "graphics" "grDevices" "utils" "datasets"
## [36] "methods" "base"
##
## [[17]]
## [1] "ggeffects" "scales" "broom" "performance" "survey"
## [6] "survival" "Matrix" "grid" "effects" "carData"
## [11] "infer" "questionr" "ggridges" "rmarkdown" "kableExtra"
## [16] "knitr" "viridis" "viridisLite" "fst" "flextable"
## [21] "modelsummary" "lubridate" "forcats" "stringr" "dplyr"
## [26] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [31] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [36] "datasets" "methods" "base"
##
## [[18]]
## [1] "marginaleffects" "ggeffects" "scales" "broom"
## [5] "performance" "survey" "survival" "Matrix"
## [9] "grid" "effects" "carData" "infer"
## [13] "questionr" "ggridges" "rmarkdown" "kableExtra"
## [17] "knitr" "viridis" "viridisLite" "fst"
## [21] "flextable" "modelsummary" "lubridate" "forcats"
## [25] "stringr" "dplyr" "purrr" "readr"
## [29] "tidyr" "tibble" "ggplot2" "tidyverse"
## [33] "stats" "graphics" "grDevices" "utils"
## [37] "datasets" "methods" "base"
spain_data <- read.fst("spain_data.fst")
spain_clean <- spain_data %>%
mutate(
hinctnta = ifelse(hinctnta %in% c(77, 88, 99), NA, hinctnta),
health = ifelse(health %in% c(7, 8, 9), NA, health),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888, 9999), NA, edulvlb),
)
spain_clean <- spain_clean %>% filter(!is.na(hinctnta), !is.na(edulvlb), !is.na(health))
table(spain_clean$hinctnta)
##
## 1 2 3 4 5 6 7 8 9 10
## 933 1131 1124 1109 949 908 754 689 690 609
table(spain_clean$health)
##
## 1 2 3 4 5
## 1600 3810 2588 797 101
##PART 1 of FINDINGS section: Education and Health
spain_task1 <- spain_clean %>%
mutate(
hinctnta = case_when(
hinctnta == 1 ~ "1st Decile",
hinctnta == 2 ~ "2nd Decile",
hinctnta == 3 ~ "3rd Decile",
hinctnta == 4 ~ "4th Decile",
hinctnta == 5 ~ "5th Decile",
hinctnta == 6 ~ "6th Decile",
hinctnta == 7 ~ "7th Decile",
hinctnta == 8 ~ "8th Decile",
hinctnta == 9 ~ "9th Decile",
hinctnta == 10 ~ "10th Decile",
hinctnta %in% c(77, 88, 99) ~ NA_character_,
TRUE ~ NA_character_
),
Health = case_when(
health == 1 ~ "Very Good",
health == 2 ~ "Good",
health == 3 ~ "Fair",
health == 4 ~ "bad",
health == 5 ~ "Very Bad",
health %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ NA_character_
),
educ.ba = case_when(
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
) %>%
filter(!is.na(health), !is.na(educ.ba), !is.na(hinctnta))
table(spain_task1$health)
##
## 1 2 3 4 5
## 1600 3810 2588 797 101
table(spain_task1$educ.ba)
##
## No BA BA or more
## 6649 2247
mytab <- table(spain_task1$health, spain_task1$educ.ba)
mytab
##
## No BA BA or more
## 1 1084 516
## 2 2696 1114
## 3 2072 516
## 4 703 94
## 5 94 7
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
expected <- outer(rsums, csums) / N
expected
## No BA BA or more
## 1 1195.86331 404.13669
## 2 2847.64951 962.35049
## 3 1934.30890 653.69110
## 4 595.68941 201.31059
## 5 75.48887 25.51113
spain_task2 <- spain_data %>%
mutate(
health = recode(as.character(health),
'1' = "Very Good",
'2' = "Good",
'3' = "Fair",
'4' = "Bad",
'5' = "Very Bad",
'7' = NA_character_,
'8' = NA_character_,
'9' = NA_character_),
educ.ba = case_when(
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
) %>%
filter(!is.na(health), !is.na(educ.ba)
)
table(spain_task2$health)
##
## Bad Fair Good Very Bad Very Good
## 1793 5297 8538 239 3564
table(spain_task2$educ.ba)
##
## No BA BA or more
## 16564 2867
# Step 1: Calculate the test statistic
library(infer)
test_stat <- spain_task2 %>%
specify(response = health, explanatory = educ.ba) %>%
hypothesize(null = "independence") %>%
calculate(stat = "chisq")
print(test_stat$stat)
## X-squared
## 171.0094
# Steps 2 & 3: Simulate the null distribution and calculate the p-value
library(infer)
null_distribution <- spain_task2 %>%
specify(response = health, explanatory = educ.ba) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
# p-value
p_value <- null_distribution %>%
get_pvalue(obs_stat = test_stat, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_value
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
# Step 4: Visualization with CI
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize(data, bins = 10, method = "simulation", dens_color = "black") +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
# Assuming 'test_stat' is your observed test statistic value
test_stat <- 960
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
# Visualize the null distribution with manual x-axis limits
null_distribution %>%
visualize(bins = 15, method = "simulation", dens_color = "black") +
geom_vline(aes(xintercept = test_stat), color = "red", linetype = "dashed", size = 1) +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int) +
coord_cartesian(xlim = c(0, max(null_distribution$stat) * 1.1)) + # Set manual x-axis limits
labs(title = "Simulation-Based Null Distribution",
x = "Chi-Square",
y = "Count")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
##PART 1 of FINDINGS section: Income and Health
spain_clean <- spain_data %>%
mutate(
hinctnta = ifelse(hinctnta %in% c(77, 88, 99), NA, hinctnta),
health = ifelse(health %in% c(7, 8, 9), NA, health),
)
table(spain_clean$hinctnta)
##
## 1 2 3 4 5 6 7 8 9 10
## 1069 1289 1396 1400 1185 1041 908 786 772 707
table(spain_clean$health)
##
## 1 2 3 4 5
## 3564 8538 5297 1793 239
spain_task1 <- spain_clean %>%
mutate(
hinctnta = case_when(
hinctnta == 1 ~ "1st Decile",
hinctnta == 2 ~ "2nd Decile",
hinctnta == 3 ~ "3rd Decile",
hinctnta == 4 ~ "4th Decile",
hinctnta == 5 ~ "5th Decile",
hinctnta == 6 ~ "6th Decile",
hinctnta == 7 ~ "7th Decile",
hinctnta == 8 ~ "8th Decile",
hinctnta == 9 ~ "9th Decile",
hinctnta == 10 ~ "10th Decile",
hinctnta %in% c(77, 88, 99) ~ NA_character_,
TRUE ~ NA_character_
),
health = case_when(
health == 1 ~ "Very Good",
health == 2 ~ "Good",
health == 3 ~ "Fair",
health == 4 ~ "bad",
health == 5 ~ "Very Bad",
health %in% c(7, 8, 9) ~ NA_character_,
TRUE ~ NA_character_
),
)
table(spain_task1$health)
##
## bad Fair Good Very Bad Very Good
## 1793 5297 8538 239 3564
table(spain_task1$hinctnta)
##
## 10th Decile 1st Decile 2nd Decile 3rd Decile 4th Decile 5th Decile
## 707 1069 1289 1396 1400 1185
## 6th Decile 7th Decile 8th Decile 9th Decile
## 1041 908 786 772
mytab <- table(spain_task1$health, spain_task1$hinctnta)
mytab
##
## 10th Decile 1st Decile 2nd Decile 3rd Decile 4th Decile 5th Decile
## bad 23 211 177 150 113 78
## Fair 156 354 436 441 404 329
## Good 327 353 494 576 603 522
## Very Bad 3 32 18 20 14 14
## Very Good 198 118 162 208 263 241
##
## 6th Decile 7th Decile 8th Decile 9th Decile
## bad 73 46 40 42
## Fair 263 248 181 187
## Good 501 425 402 367
## Very Bad 6 6 0 2
## Very Good 197 182 163 174
rsums <- rowSums(mytab)
csums <- colSums(mytab)
N <- sum(mytab)
expected <- outer(rsums, csums) / N
expected
## 10th Decile 1st Decile 2nd Decile 3rd Decile 4th Decile 5th Decile
## bad 63.906952 96.53837 116.33416 126.09646 126.27725 107.02381
## Fair 201.109077 303.79702 366.09248 396.81353 397.38243 336.79370
## Good 306.458314 462.93844 557.86683 604.68083 605.54776 513.22015
## Very Bad 7.711752 11.64944 14.03822 15.21626 15.23807 12.91473
## Very Good 127.813905 193.07673 232.66831 252.19292 252.55449 214.04761
## 6th Decile 7th Decile 8th Decile 9th Decile
## bad 94.00740 81.985298 71.047899 69.782415
## Fair 295.83231 257.999905 223.580954 219.598596
## Good 450.80148 393.150906 340.701888 334.633406
## Very Bad 11.34402 9.893294 8.573461 8.420753
## Very Good 188.01480 163.970597 142.095798 139.564830
spain_task2 <- spain_data %>%
mutate(
health = recode(as.character(health),
'1' = "Very Good",
'2' = "Good",
'3' = "Fair",
'4' = "Bad",
'5' = "Very Bad",
'7' = NA_character_,
'8' = NA_character_,
'9' = NA_character_),
hinctnta = case_when(
hinctnta == 1 ~ "1st Decile",
hinctnta == 2 ~ "2nd Decile",
hinctnta == 3 ~ "3rd Decile",
hinctnta == 4 ~ "4th Decile",
hinctnta == 5 ~ "5th Decile",
hinctnta == 6 ~ "6th Decile",
hinctnta == 7 ~ "7th Decile",
hinctnta == 8 ~ "8th Decile",
hinctnta == 9 ~ "9th Decile",
hinctnta == 10 ~ "10th Decile",
hinctnta %in% c(77, 88, 99) ~ NA_character_,
TRUE ~ NA_character_
),
) %>%
filter(!is.na(health), !is.na(hinctnta)
)
table(spain_task2$health)
##
## Bad Fair Good Very Bad Very Good
## 953 2999 4570 115 1906
table(spain_task2$hinctnta)
##
## 10th Decile 1st Decile 2nd Decile 3rd Decile 4th Decile 5th Decile
## 707 1068 1287 1395 1397 1184
## 6th Decile 7th Decile 8th Decile 9th Decile
## 1040 907 786 772
# Step 1: Calculate the test statistic
test_stat <- spain_task2 %>%
specify(response = health, explanatory = hinctnta) %>%
hypothesize(null = "independence") %>%
calculate(stat = "chisq")
print(test_stat$stat)
## X-squared
## 538.5648
# Steps 2 & 3: Simulate the null distribution and calculate the p-value
null_distribution <- spain_task2 %>%
specify(response = health, explanatory = hinctnta) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
# p-value
p_value <- null_distribution %>%
get_pvalue(obs_stat = test_stat, direction = "greater")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_value
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
# Step 4: Visualization with CI
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize(data, bins = 10, method = "simulation", dens_color = "black") +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int)
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
# Assuming 'test_stat' is your observed test statistic value
test_stat <- 960
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
# Visualize the null distribution with manual x-axis limits
null_distribution %>%
visualize(bins = 15, method = "simulation", dens_color = "black") +
geom_vline(aes(xintercept = test_stat), color = "red", linetype = "dashed", size = 1) +
shade_p_value(obs_stat = test_stat, direction = "greater") +
shade_confidence_interval(endpoints = conf_int) +
coord_cartesian(xlim = c(0, max(null_distribution$stat) * 1.1)) + # Set manual x-axis limits
labs(title = "Simulation-Based Null Distribution",
x = "Chi-Square",
y = "Count")
## Warning in min(diff(unique_loc)): no non-missing arguments to min; returning
## Inf
##PART 2 of FINDINGS section
Second, you will provide your regression modelling output. There will be three models. First model will be your main single predictor. The second model will be your two predictor (m2ultivariate) regression model. The third model will be your interaction model involving your two predictors.
"health" %in% names(spain_task2)
## [1] TRUE
library(dplyr)
table(spain_data$health)
##
## 1 2 3 4 5 7 8 9
## 3564 8538 5297 1793 239 6 3 12
spain_task2 <- spain_data %>%
mutate(
health = recode(as.numeric(health),
`1`= 1, `2`= 2, `3`= 3, `4`= 4, `5`= 5,
.default = NA_real_),
hinctnta = case_when(
hinctnta == 1 ~ "1st Decile",
hinctnta == 2 ~ "2nd Decile",
hinctnta == 3 ~ "3rd Decile",
hinctnta == 4 ~ "4th Decile",
hinctnta == 5 ~ "5th Decile",
hinctnta == 6 ~ "6th Decile",
hinctnta == 7 ~ "7th Decile",
hinctnta == 8 ~ "8th Decile",
hinctnta == 9 ~ "9th Decile",
hinctnta == 10 ~ "10th Decile",
hinctnta %in% c(77, 88, 99) ~ NA_character_,
TRUE ~ NA_character_
),
educ.ba = case_when(
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
) %>%
filter(!is.na(health), !is.na(hinctnta), !is.na(educ.ba))
table(spain_task2$health)
##
## 1 2 3 4 5
## 1906 4570 2999 953 115
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
model1 <- lm(health ~ educ.ba, data = spain_task2) # single predictor
model2 <- lm(health ~ educ.ba + hinctnta, data = spain_task2) # two predictors
model3 <- lm(health ~ educ.ba + hinctnta + educ.ba*hinctnta, data = spain_task2) # both predictors + interaction
# Create a model summary table
modelsummary(
list(model1, model2, model3),
fmt = 1,
estimate = c("{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL
)
| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | 2.4 (0.0)*** | 2.1 (0.0)*** | 2.0 (0.1)*** |
| educ.baBA or more | −0.3 (0.0)*** | −0.2 (0.0)*** | 0.0 (0.1) |
| hinctnta1st Decile | 0.6 (0.0)*** | 0.7 (0.1)*** | |
| hinctnta2nd Decile | 0.4 (0.0)*** | 0.5 (0.1)*** | |
| hinctnta3rd Decile | 0.3 (0.0)*** | 0.4 (0.1)*** | |
| hinctnta4th Decile | 0.2 (0.0)*** | 0.3 (0.1)*** | |
| hinctnta5th Decile | 0.2 (0.0)*** | 0.2 (0.1)*** | |
| hinctnta6th Decile | 0.2 (0.0)*** | 0.2 (0.1)*** | |
| hinctnta7th Decile | 0.1 (0.0)** | 0.2 (0.1)*** | |
| hinctnta8th Decile | 0.1 (0.0) | 0.1 (0.1) | |
| hinctnta9th Decile | 0.1 (0.0)* | 0.1 (0.1)+ | |
| educ.baBA or more × hinctnta1st Decile | −0.4 (0.1)** | ||
| educ.baBA or more × hinctnta2nd Decile | −0.3 (0.1)** | ||
| educ.baBA or more × hinctnta3rd Decile | −0.3 (0.1)** | ||
| educ.baBA or more × hinctnta4th Decile | −0.1 (0.1) | ||
| educ.baBA or more × hinctnta5th Decile | 0.0 (0.1) | ||
| educ.baBA or more × hinctnta6th Decile | −0.1 (0.1) | ||
| educ.baBA or more × hinctnta7th Decile | −0.2 (0.1) | ||
| educ.baBA or more × hinctnta8th Decile | 0.0 (0.1) | ||
| educ.baBA or more × hinctnta9th Decile | 0.0 (0.1) | ||
| Num.Obs. | 10543 | 10543 | 10543 |
| R2 | 0.016 | 0.048 | 0.051 |
| R2 Adj. | 0.016 | 0.047 | 0.049 |
| AIC | 27723.4 | 27386.1 | 27376.0 |
| BIC | 27745.2 | 27473.3 | 27528.5 |
| Log.Lik. | −13858.697 | −13681.047 | −13667.007 |
| RMSE | 0.90 | 0.89 | 0.88 |
models <- list()
models[['SLR']]<- lm(health ~ educ.ba, data = spain_task2) # single predictor
models[['MLR']]<- lm(health ~ educ.ba + hinctnta, data = spain_task2) # two predictors
models[['Interaction']]<- lm(health ~ educ.ba + hinctnta + educ.ba*hinctnta, data = spain_task2) # both predictors + interaction
modelsummary(models,
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL,
exponentiate = TRUE, ## NOTE: this is to exponentiate directly, otherwise leave out or set to FALSE
coef_rename = c("educ.baBA or more" = "BA or more", "hinctnta1st Decile" = "1st Decile", "hinctnta2nd Decile" = "2nd Decile", "hinctnta3rd Decile" = "3rd Decile", "hinctnta4th Decile" = "4th Decile", "hinctnta5th Decile" = "5th Decile", "hinctnta6th Decile" = "6th Decile", "hinctnta7th Decile" = "7th Decile", "hinctnta8th Decile" = "8th Decile", "hinctnta9th Decile" = "9th Decile"),
title = 'Table 2. Regression models predicting health')
| SLR | MLR | Interaction | |
|---|---|---|---|
| (Intercept) | 10.8 (0.1)*** | 8.2 (0.3)*** | 7.7 (0.4)*** |
| BA or more | 0.8 (0.0)*** | 0.9 (0.0)*** | 1.0 (0.1) |
| 1st Decile | 1.8 (0.1)*** | 2.0 (0.1)*** | |
| 2nd Decile | 1.5 (0.1)*** | 1.7 (0.1)*** | |
| 3rd Decile | 1.4 (0.1)*** | 1.5 (0.1)*** | |
| 4th Decile | 1.2 (0.1)*** | 1.3 (0.1)*** | |
| 5th Decile | 1.2 (0.1)*** | 1.2 (0.1)*** | |
| 6th Decile | 1.2 (0.1)*** | 1.2 (0.1)*** | |
| 7th Decile | 1.1 (0.1)** | 1.2 (0.1)*** | |
| 8th Decile | 1.1 (0.0) | 1.1 (0.1) | |
| 9th Decile | 1.1 (0.1)* | 1.1 (0.1)+ | |
| BA or more:1st Decile | 0.7 (0.1)** | ||
| BA or more:2nd Decile | 0.7 (0.1)** | ||
| BA or more:3rd Decile | 0.7 (0.1)** | ||
| BA or more:4th Decile | 0.9 (0.1) | ||
| BA or more:5th Decile | 1.0 (0.1) | ||
| BA or more:6th Decile | 0.9 (0.1) | ||
| BA or more:7th Decile | 0.9 (0.1) | ||
| BA or more:8th Decile | 1.0 (0.1) | ||
| BA or more:9th Decile | 1.0 (0.1) | ||
| Num.Obs. | 10543 | 10543 | 10543 |
| R2 | 0.016 | 0.048 | 0.051 |
| R2 Adj. | 0.016 | 0.047 | 0.049 |
| AIC | 27723.4 | 27386.1 | 27376.0 |
| BIC | 27745.2 | 27473.3 | 27528.5 |
| Log.Lik. | −13858.697 | −13681.047 | −13667.007 |
| RMSE | 0.90 | 0.89 | 0.88 |
**MY code for the interaction effect was working until the last second before submission :( here is a text version of it..
install.packages(‘effects’) install.packages(‘estimability’)
interaction_plot <- effect(“educ.ba*hinctnta”, model3, na.rm=TRUE)
plot(interaction_plot, main=“Interaction effect”, xlab=“Education and Income”, ylab=“health”)
interaction_plot