# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "performance", "flextable", "broom", "scales", "ggeffects", "marginaleffects") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Version 2.0.0 of `modelsummary`, to be released soon, will introduce a
## breaking change: The default table-drawing package will be `tinytable`
## instead of `kableExtra`. All currently supported table-drawing packages
## will continue to be supported for the foreseeable future, including
## `kableExtra`, `gt`, `huxtable`, `flextable, and `DT`.
##
## You can always call the `config_modelsummary()` function to change the
## default table-drawing package in persistent fashion. To try `tinytable`
## now:
##
## config_modelsummary(factory_default = 'tinytable')
##
## To set the default back to `kableExtra`:
##
## config_modelsummary(factory_default = 'kableExtra')
##
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
##
##
## Attaching package: 'flextable'
##
##
## The following object is masked from 'package:purrr':
##
## compose
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[7]]
## [1] "performance" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[8]]
## [1] "flextable" "performance" "survey" "survival" "Matrix"
## [6] "grid" "effects" "carData" "modelsummary" "fst"
## [11] "infer" "lubridate" "forcats" "stringr" "dplyr"
## [16] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
##
## [[9]]
## [1] "broom" "flextable" "performance" "survey" "survival"
## [6] "Matrix" "grid" "effects" "carData" "modelsummary"
## [11] "fst" "infer" "lubridate" "forcats" "stringr"
## [16] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [21] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [26] "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "scales" "broom" "flextable" "performance" "survey"
## [6] "survival" "Matrix" "grid" "effects" "carData"
## [11] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "ggeffects" "scales" "broom" "flextable" "performance"
## [6] "survey" "survival" "Matrix" "grid" "effects"
## [11] "carData" "modelsummary" "fst" "infer" "lubridate"
## [16] "forcats" "stringr" "dplyr" "purrr" "readr"
## [21] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [26] "graphics" "grDevices" "utils" "datasets" "methods"
## [31] "base"
##
## [[12]]
## [1] "marginaleffects" "ggeffects" "scales" "broom"
## [5] "flextable" "performance" "survey" "survival"
## [9] "Matrix" "grid" "effects" "carData"
## [13] "modelsummary" "fst" "infer" "lubridate"
## [17] "forcats" "stringr" "dplyr" "purrr"
## [21] "readr" "tidyr" "tibble" "ggplot2"
## [25] "tidyverse" "stats" "graphics" "grDevices"
## [29] "utils" "datasets" "methods" "base"
loading the data:
france_data <-read.fst("france_data.fst")
filtering and recoding our main variable of interest which is income:
france_data <- france_data %>%
filter(cntry == "FR")
france_data$weight <- france_data$dweight * france_data$pweight
survey_design <- svydesign(ids = ~1, data = france_data, weights = ~weight)
france_data <- france_data %>%
mutate(income = case_when(
hincfel == 1 ~ 1, # 1 yes I feel I have high income
hincfel == 2 ~ 0, # No to 0, I don't feel I have high income
TRUE ~ NA_real_ # Everything else to NA
))
france_data <- france_data %>% filter(!is.na(income))
recoding predictor: education
francedf <- france_data %>%
mutate(
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
essround >= 5 & edulvlb > 600 ~ "BA or more",
TRUE ~ "No BA"
),
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
)
table(francedf$educ.ba)
##
## No BA BA or more
## 9767 3082
recoding predictor: field of study
france_data <- francedf %>%
mutate(field_of_study = recode(as.character(edufld),
'1' = "general no specific field",
'2' = "law and legal services",
'3' = "humanities",
'4' = "technical and engineering",
'5' = "medical/healthservices/nursing/etc.",
'66' = NA_character_,
'99' = NA_character_,))
# check
table(france_data$field_of_study)
##
## 10 11
## 118 64
## 12 13
## 153 26
## 14 6
## 25 112
## 7 77
## 216 3
## 8 88
## 242 41
## 9 general no specific field
## 463 938
## humanities law and legal services
## 163 53
## medical/healthservices/nursing/etc. technical and engineering
## 96 576
looking at France by itself:
df <- france_data
df$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
df$year[df$essround == i] <- replacements[i]
}
income_avg <- df %>%
group_by(year) %>%
summarize(income_avg = mean(income, na.rm = TRUE))
income_plot_FR <- ggplot(income_avg, aes(x = year, y = income_avg)) +
geom_point(aes(size = income_avg, color = income_avg), alpha = 0.7) +
geom_line(aes(group = 1), color = "blue", linetype = "dashed") +
labs(
x = "Survey Year",
y = "In(Income)") +
theme_minimal() +
theme(legend.position = "none")
print(income_plot_FR)
#categorical descriptive table 1
francedf <- francedf %>%
mutate(income = case_when(
hincfel == 1 ~ "Yes", # 1 to 'Yes'
hincfel == 2 ~ "No", # 0 to 'No'
TRUE ~ NA_character_ # Keep NA values as is, but ensure character type
))
francet1 <- francedf %>%
# Relabeling variables for clarity
mutate(
income = income,
education = educ.ba,
field = edufld
)
table1 <- datasummary_skim(francet1 %>% dplyr::select(income, educ.ba, edufld), type = "categorical",
title = "Table 1. Descriptive statistics for main variables", output = "flextable")
table1
|
| N | % |
|---|---|---|---|
income | No | 7868 | 61.2 |
Yes | 4981 | 38.8 | |
educ.ba | No BA | 9767 | 76.0 |
BA or more | 3082 | 24.0 |
#numeric descriptive table 1
#Findings part one #visual by infer package #step:1
test_stat <- france_data %>%
specify(explanatory = educ.ba, # change variable name for explanatory variable
response = income) %>% # change variable name for outcome of interest
hypothesize(null = "independence") %>%
calculate(stat = "t") # replace in between quotation marks appropriate test statistic
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
print(test_stat$stat)
## t
## -28.17443
#step: 2
null_distribution <- france_data %>%
specify(explanatory = educ.ba,
response = income) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard but change if too computationally demanding
calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
#step:3
p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?
null_distribution
## Response: income (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
## replicate stat
## <int> <dbl>
## 1 1 0.117
## 2 2 1.31
## 3 3 1.65
## 4 4 -0.518
## 5 5 0.244
## 6 6 -0.264
## 7 7 0.244
## 8 8 -0.180
## 9 9 0.584
## 10 10 0.329
## # ℹ 990 more rows
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided") +
shade_confidence_interval(endpoints = conf_int)
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?
#Findings Part 2 #regression models main single predictor & two
predictor
model1 <- lm(income ~ educ.ba, data = df, weights = weight)
model2 <- lm(income ~ educ.ba + edufld, data = df, weights = weight)
modelsummary(
list(model1, model2),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}", # what we are specifying here is what we want to show in our model summary table
"{estimate} ({std.error}){stars}"),
statistic = NULL)
| Â (1) | Â Â (2) | |
|---|---|---|
| (Intercept) | 0.3 (0.0)*** | 0.3 (0.0)*** |
| educ.baBA or more | 0.3 (0.0)*** | 0.2 (0.0)*** |
| edufld | 0.0 (0.0)+ | |
| Num.Obs. | 12849 | 3405 |
| R2 | 0.054 | 0.050 |
| R2 Adj. | 0.054 | 0.049 |
| AIC | 18716.7 | 4891.0 |
| BIC | 18739.1 | 4915.6 |
| Log.Lik. | −9355.344 | −2441.519 |
| RMSE | 0.47 | 0.47 |
Interaction model
model3 <- lm(income ~ educ.ba + edufld, data = df, weights = weight)
model4 <- lm(income ~ educ.ba + edufld + educ.ba*edufld, data = df, weights = weight)
modelsummary(
list(model3, model4),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}",
"{estimate} ({std.error}){stars}"),
statistic = NULL)
| Â (1) | Â Â (2) | |
|---|---|---|
| (Intercept) | 0.3 (0.0)*** | 0.3 (0.0)*** |
| educ.baBA or more | 0.2 (0.0)*** | 0.2 (0.0)*** |
| edufld | 0.0 (0.0)+ | 0.0 (0.0)+ |
| educ.baBA or more × edufld | 0.0 (0.0) | |
| Num.Obs. | 3405 | 3405 |
| R2 | 0.050 | 0.050 |
| R2 Adj. | 0.049 | 0.049 |
| AIC | 4891.0 | 4892.4 |
| BIC | 4915.6 | 4923.1 |
| Log.Lik. | −2441.519 | −2441.201 |
| RMSE | 0.47 | 0.47 |
#visual of choice
model4 <- glm(income ~ educ.ba + edufld + edufld*educ.ba, family = binomial, data = df, weights = round(weight)) # both predictors + interaction
mydf <- predict_response(model4, terms = c("edufld", "educ.ba"))
print(mydf, collapse_tables = TRUE)
## # Predicted probabilities of income
##
## edufld | educ.ba | Predicted | 95% CI
## --------------------------------------------
## 1 | No BA | 0.33 | 0.31, 0.34
## 4 | | 0.32 | 0.31, 0.34
## 6 | | 0.32 | 0.31, 0.34
## 9 | | 0.32 | 0.31, 0.33
## 12 | | 0.32 | 0.31, 0.33
## 99 | | 0.26 | 0.22, 0.31
## 1 | BA or more | 0.54 | 0.49, 0.58
## 4 | | 0.55 | 0.52, 0.57
## 6 | | 0.55 | 0.53, 0.57
## 9 | | 0.56 | 0.54, 0.59
## 12 | | 0.57 | 0.53, 0.61
## 99 | | 0.82 | 0.25, 0.98
##
## Not all rows are shown in the output. Use `print(..., n = Inf)` to show
## all rows.
ggplot(mydf, aes(x, predicted, colour = group)) + geom_line()
#End