# List of packages
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "performance", "flextable", "broom", "scales", "ggeffects", "marginaleffects") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Version 2.0.0 of `modelsummary`, to be released soon, will introduce a
##   breaking change: The default table-drawing package will be `tinytable`
##   instead of `kableExtra`. All currently supported table-drawing packages
##   will continue to be supported for the foreseeable future, including
##   `kableExtra`, `gt`, `huxtable`, `flextable, and `DT`.
##   
##   You can always call the `config_modelsummary()` function to change the
##   default table-drawing package in persistent fashion. To try `tinytable`
##   now:
##   
##   config_modelsummary(factory_default = 'tinytable')
##   
##   To set the default back to `kableExtra`:
##   
##   config_modelsummary(factory_default = 'kableExtra')
## 
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## 
## Loading required package: grid
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: survival
## 
## 
## Attaching package: 'survey'
## 
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "performance"  "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "flextable"    "performance"  "survey"       "survival"     "Matrix"      
##  [6] "grid"         "effects"      "carData"      "modelsummary" "fst"         
## [11] "infer"        "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "broom"        "flextable"    "performance"  "survey"       "survival"    
##  [6] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [11] "fst"          "infer"        "lubridate"    "forcats"      "stringr"     
## [16] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [21] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "scales"       "broom"        "flextable"    "performance"  "survey"      
##  [6] "survival"     "Matrix"       "grid"         "effects"      "carData"     
## [11] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "ggeffects"    "scales"       "broom"        "flextable"    "performance" 
##  [6] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [11] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"        
## 
## [[12]]
##  [1] "marginaleffects" "ggeffects"       "scales"          "broom"          
##  [5] "flextable"       "performance"     "survey"          "survival"       
##  [9] "Matrix"          "grid"            "effects"         "carData"        
## [13] "modelsummary"    "fst"             "infer"           "lubridate"      
## [17] "forcats"         "stringr"         "dplyr"           "purrr"          
## [21] "readr"           "tidyr"           "tibble"          "ggplot2"        
## [25] "tidyverse"       "stats"           "graphics"        "grDevices"      
## [29] "utils"           "datasets"        "methods"         "base"

loading the data:

france_data <-read.fst("france_data.fst")

filtering and recoding our main variable of interest which is income:

france_data <- france_data %>%
  filter(cntry == "FR")
france_data$weight <- france_data$dweight * france_data$pweight
survey_design <- svydesign(ids = ~1, data = france_data, weights = ~weight)
france_data <- france_data %>%
  mutate(income = case_when(
    hincfel == 1 ~ 1,    # 1 yes I feel I have high income 
    hincfel == 2 ~ 0,    # No to 0, I don't feel I have high income
    TRUE ~ NA_real_    # Everything else to NA
  ))
france_data <- france_data %>% filter(!is.na(income))

recoding predictor: education

francedf <- france_data %>%
  mutate(
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      essround >= 5 & edulvlb > 600 ~ "BA or more",
      TRUE ~ "No BA"
    ),
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla),
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb),
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
  )
table(francedf$educ.ba)
## 
##      No BA BA or more 
##       9767       3082

recoding predictor: field of study

france_data <- francedf %>%
  mutate(field_of_study = recode(as.character(edufld), 
                      '1' = "general no specific field", 
                      '2' = "law and legal services", 
                      '3' = "humanities", 
                      '4' = "technical and engineering", 
                      '5' = "medical/healthservices/nursing/etc.",
                      '66' = NA_character_,
                      '99' = NA_character_,))

# check
table(france_data$field_of_study)
## 
##                                  10                                  11 
##                                 118                                  64 
##                                  12                                  13 
##                                 153                                  26 
##                                  14                                   6 
##                                  25                                 112 
##                                   7                                  77 
##                                 216                                   3 
##                                   8                                  88 
##                                 242                                  41 
##                                   9           general no specific field 
##                                 463                                 938 
##                          humanities              law and legal services 
##                                 163                                  53 
## medical/healthservices/nursing/etc.           technical and engineering 
##                                  96                                 576

looking at France by itself:

df <- france_data
df$year <- NA
replacements <- c(2002, 2004, 2006, 2008, 2010, 2012, 2014, 2016, 2018, 2020)
for(i in 1:10){
  df$year[df$essround == i] <- replacements[i]
}

income_avg <- df %>%
  group_by(year) %>%
  summarize(income_avg = mean(income, na.rm = TRUE))

income_plot_FR <- ggplot(income_avg, aes(x = year, y = income_avg)) +
  geom_point(aes(size = income_avg, color = income_avg), alpha = 0.7) + 
  geom_line(aes(group = 1), color = "blue", linetype = "dashed") +  
  labs(
       x = "Survey Year",
       y = "In(Income)") +
  theme_minimal() +
  theme(legend.position = "none")

print(income_plot_FR)

#categorical descriptive table 1

francedf <- francedf %>%
  mutate(income = case_when(
    hincfel == 1 ~ "Yes",    # 1 to 'Yes'
    hincfel == 2 ~ "No",     # 0 to 'No'
    TRUE ~ NA_character_     # Keep NA values as is, but ensure character type
  ))

francet1 <- francedf %>%
  # Relabeling variables for clarity
  mutate(
    income = income,      
    education = educ.ba,      
    field = edufld
  )
table1 <- datasummary_skim(francet1  %>% dplyr::select(income, educ.ba, edufld), type = "categorical",
                           title = "Table 1. Descriptive statistics for main variables", output = "flextable")

table1
Table 1. Descriptive statistics for main variables

N

%

income

No

7868

61.2

Yes

4981

38.8

educ.ba

No BA

9767

76.0

BA or more

3082

24.0

#numeric descriptive table 1

#Findings part one #visual by infer package #step:1

test_stat <- france_data %>%
  specify(explanatory = educ.ba, # change variable name for explanatory variable
          response = income) %>% # change variable name for outcome of interest
  hypothesize(null = "independence") %>%
  calculate(stat = "t") # replace in between quotation marks appropriate test statistic
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
print(test_stat$stat)
##         t 
## -28.17443

#step: 2

null_distribution <- france_data %>%
  specify(explanatory = educ.ba,
          response = income) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% # only line we are adding, use reps = 1000 as standard but change if too computationally demanding
  calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.

#step:3

p_val <- null_distribution %>% # replace name here if you assigned something other than null_distribution above
  get_pvalue(obs_stat = test_stat, direction = "two-sided") # would only replace test_stat if assigned another name in Step 1
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0
null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?

null_distribution
## Response: income (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 1,000 × 2
##    replicate   stat
##        <int>  <dbl>
##  1         1  0.117
##  2         2  1.31 
##  3         3  1.65 
##  4         4 -0.518
##  5         5  0.244
##  6         6 -0.264
##  7         7  0.244
##  8         8 -0.180
##  9         9  0.584
## 10        10  0.329
## # ℹ 990 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)
## Warning in (function (mapping = NULL, data = NULL, stat = "identity", position = "identity", : All aesthetics have length 1, but the data has 1000 rows.
## ℹ Did you mean to use `annotate()`?

#Findings Part 2 #regression models main single predictor & two predictor

model1 <- lm(income ~ educ.ba, data = df, weights = weight)
model2 <- lm(income ~ educ.ba + edufld, data = df, weights = weight)
modelsummary(
  list(model1, model2), 
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}", # what we are specifying here is what we want to show in our model summary table
                  "{estimate} ({std.error}){stars}"),
  statistic = NULL)
 (1)   (2)
(Intercept) 0.3 (0.0)*** 0.3 (0.0)***
educ.baBA or more 0.3 (0.0)*** 0.2 (0.0)***
edufld 0.0 (0.0)+
Num.Obs. 12849 3405
R2 0.054 0.050
R2 Adj. 0.054 0.049
AIC 18716.7 4891.0
BIC 18739.1 4915.6
Log.Lik. −9355.344 −2441.519
RMSE 0.47 0.47

Interaction model

model3 <- lm(income ~ educ.ba + edufld, data = df, weights = weight)
model4 <- lm(income ~ educ.ba + edufld + educ.ba*edufld, data = df, weights = weight)
modelsummary(
  list(model3, model4),
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}",
                "{estimate} ({std.error}){stars}"),
  statistic = NULL)
 (1)   (2)
(Intercept) 0.3 (0.0)*** 0.3 (0.0)***
educ.baBA or more 0.2 (0.0)*** 0.2 (0.0)***
edufld 0.0 (0.0)+ 0.0 (0.0)+
educ.baBA or more × edufld 0.0 (0.0)
Num.Obs. 3405 3405
R2 0.050 0.050
R2 Adj. 0.049 0.049
AIC 4891.0 4892.4
BIC 4915.6 4923.1
Log.Lik. −2441.519 −2441.201
RMSE 0.47 0.47

#visual of choice

model4 <- glm(income ~ educ.ba + edufld + edufld*educ.ba, family = binomial, data = df, weights = round(weight)) # both predictors + interaction
mydf <- predict_response(model4, terms = c("edufld", "educ.ba"))
print(mydf, collapse_tables = TRUE)
## # Predicted probabilities of income
## 
## edufld |    educ.ba | Predicted |     95% CI
## --------------------------------------------
##      1 |      No BA |      0.33 | 0.31, 0.34
##      4 |            |      0.32 | 0.31, 0.34
##      6 |            |      0.32 | 0.31, 0.34
##      9 |            |      0.32 | 0.31, 0.33
##     12 |            |      0.32 | 0.31, 0.33
##     99 |            |      0.26 | 0.22, 0.31
##      1 | BA or more |      0.54 | 0.49, 0.58
##      4 |            |      0.55 | 0.52, 0.57
##      6 |            |      0.55 | 0.53, 0.57
##      9 |            |      0.56 | 0.54, 0.59
##     12 |            |      0.57 | 0.53, 0.61
##     99 |            |      0.82 | 0.25, 0.98
## 
## Not all rows are shown in the output. Use `print(..., n = Inf)` to show
##   all rows.
ggplot(mydf, aes(x, predicted, colour = group)) + geom_line()

#End