SOC222 Research Report

packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "performance", "flextable", "broom", "scales", "ggeffects", "marginaleffects") # add any you need here

# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
## 
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
## 
## Loading required package: grid
## 
## Loading required package: Matrix
## 
## 
## Attaching package: 'Matrix'
## 
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Loading required package: survival
## 
## 
## Attaching package: 'survey'
## 
## 
## The following object is masked from 'package:graphics':
## 
##     dotchart
## 
## 
## 
## Attaching package: 'flextable'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     compose
## 
## 
## 
## Attaching package: 'scales'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
## [[1]]
##  [1] "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"     "readr"    
##  [7] "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"     "graphics" 
## [13] "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[2]]
##  [1] "infer"     "lubridate" "forcats"   "stringr"   "dplyr"     "purrr"    
##  [7] "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse" "stats"    
## [13] "graphics"  "grDevices" "utils"     "datasets"  "methods"   "base"     
## 
## [[3]]
##  [1] "fst"       "infer"     "lubridate" "forcats"   "stringr"   "dplyr"    
##  [7] "purrr"     "readr"     "tidyr"     "tibble"    "ggplot2"   "tidyverse"
## [13] "stats"     "graphics"  "grDevices" "utils"     "datasets"  "methods"  
## [19] "base"     
## 
## [[4]]
##  [1] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
##  [6] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [11] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [16] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[5]]
##  [1] "effects"      "carData"      "modelsummary" "fst"          "infer"       
##  [6] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [11] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [16] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [21] "methods"      "base"        
## 
## [[6]]
##  [1] "survey"       "survival"     "Matrix"       "grid"         "effects"     
##  [6] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [11] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [16] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [21] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [26] "base"        
## 
## [[7]]
##  [1] "performance"  "survey"       "survival"     "Matrix"       "grid"        
##  [6] "effects"      "carData"      "modelsummary" "fst"          "infer"       
## [11] "lubridate"    "forcats"      "stringr"      "dplyr"        "purrr"       
## [16] "readr"        "tidyr"        "tibble"       "ggplot2"      "tidyverse"   
## [21] "stats"        "graphics"     "grDevices"    "utils"        "datasets"    
## [26] "methods"      "base"        
## 
## [[8]]
##  [1] "flextable"    "performance"  "survey"       "survival"     "Matrix"      
##  [6] "grid"         "effects"      "carData"      "modelsummary" "fst"         
## [11] "infer"        "lubridate"    "forcats"      "stringr"      "dplyr"       
## [16] "purrr"        "readr"        "tidyr"        "tibble"       "ggplot2"     
## [21] "tidyverse"    "stats"        "graphics"     "grDevices"    "utils"       
## [26] "datasets"     "methods"      "base"        
## 
## [[9]]
##  [1] "broom"        "flextable"    "performance"  "survey"       "survival"    
##  [6] "Matrix"       "grid"         "effects"      "carData"      "modelsummary"
## [11] "fst"          "infer"        "lubridate"    "forcats"      "stringr"     
## [16] "dplyr"        "purrr"        "readr"        "tidyr"        "tibble"      
## [21] "ggplot2"      "tidyverse"    "stats"        "graphics"     "grDevices"   
## [26] "utils"        "datasets"     "methods"      "base"        
## 
## [[10]]
##  [1] "scales"       "broom"        "flextable"    "performance"  "survey"      
##  [6] "survival"     "Matrix"       "grid"         "effects"      "carData"     
## [11] "modelsummary" "fst"          "infer"        "lubridate"    "forcats"     
## [16] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
## [21] "tibble"       "ggplot2"      "tidyverse"    "stats"        "graphics"    
## [26] "grDevices"    "utils"        "datasets"     "methods"      "base"        
## 
## [[11]]
##  [1] "ggeffects"    "scales"       "broom"        "flextable"    "performance" 
##  [6] "survey"       "survival"     "Matrix"       "grid"         "effects"     
## [11] "carData"      "modelsummary" "fst"          "infer"        "lubridate"   
## [16] "forcats"      "stringr"      "dplyr"        "purrr"        "readr"       
## [21] "tidyr"        "tibble"       "ggplot2"      "tidyverse"    "stats"       
## [26] "graphics"     "grDevices"    "utils"        "datasets"     "methods"     
## [31] "base"        
## 
## [[12]]
##  [1] "marginaleffects" "ggeffects"       "scales"          "broom"          
##  [5] "flextable"       "performance"     "survey"          "survival"       
##  [9] "Matrix"          "grid"            "effects"         "carData"        
## [13] "modelsummary"    "fst"             "infer"           "lubridate"      
## [17] "forcats"         "stringr"         "dplyr"           "purrr"          
## [21] "readr"           "tidyr"           "tibble"          "ggplot2"        
## [25] "tidyverse"       "stats"           "graphics"        "grDevices"      
## [29] "utils"           "datasets"        "methods"         "base"
estonia_data <- read_fst("estonia_data.fst")
estonia_income <- estonia_data %>%
  filter(cntry == "EE") %>%
  dplyr::select(hinctnta)
estonia_education <- estonia_data %>%
  filter(cntry == "EE") %>%
  dplyr::select(eisced)
estonia_gender <- estonia_data %>%
  filter(cntry == "EE") %>%
  dplyr::select(gndr)
estonia_data <- estonia_data %>%
  filter(cntry == "EE")
estonia_data$weight <- estonia_data$dweight * estonia_data$pweight
survey_design <- svydesign(ids = ~1, data = estonia_data, weights = ~weight)
estonia_data <- estonia_data %>%
  mutate(income = case_when(
    hincfel == 1 ~ 1,    # the number one is in representation of higher income 
    hincfel == 2 ~ 0,    # 0 setting to be a lower income 
    TRUE ~ NA_real_    # the rest of the data collected to NA 
  ))
estonia_data <- estonia_data %>% filter(!is.na(income))
estoniadf <- estonia_data %>%
  mutate(
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more", #setting the lowest number at 5 
      essround >= 5 & edulvlb > 500 ~ "BA or more", #setting the highest number to 500
      TRUE ~ "No BA"
    ),
    edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla), #setting intergers as 77,88,99
    edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb), #setting the rest of the integers to 5555,7777,8888
    educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
  )
table(estoniadf$educ.ba)
## 
##      No BA BA or more 
##       7597       4169

Recode to Categorize for educ.ba variable needed for future code

estonia_data <- estoniadf %>%
  mutate(field_of_study = recode(as.character(edufld), 
                      '1' = "No Specialty", 
                      '2' = "Law/Government Related ",
                      '3' = "Engineering", 
                      '4' = "Social Sciences", 
                      '5' = "Medicine/ Health Industry",
                
                      '66' = NA_character_,
                      '99' = NA_character_,))
#this step above to organize by specialty- useful later  
#check work
table(estonia_data$field_of_study)
## 
##                        10                        11                        12 
##                        94                        38                       208 
##                        13                        14                         6 
##                        34                        92                       144 
##                         7                        77                         8 
##                        75                         6                       125 
##                        88                         9               Engineering 
##                        21                       296                        91 
##   Law/Government Related  Medicine/ Health Industry              No Specialty 
##                        23                       206                       757 
##           Social Sciences 
##                       622
df <- estonia_data
df$year <- NA
replacements <- c(2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017, 2019) 
for(i in 1:10){
  df$year[df$essround == i] <- replacements[i]
}

income_avg <- df %>%
  group_by(year) %>%
  summarize(income_avg = mean(income, na.rm = TRUE))

income_plot_EE <- ggplot(income_avg, aes(x = year, y = income_avg)) +
  geom_point(aes(size = income_avg, color = income_avg), alpha = 0.7) + 
  geom_line(aes(group = 1), color = "green", linetype = "dashed") +  
  labs(
       x = "Survey Year",
       y = "Income") +
  theme_minimal() +
  theme(legend.position = "none")

print(income_plot_EE)

estoniadf <- estoniadf %>%
  mutate(income = case_when(
    hincfel == 1 ~ "Yes",    
    hincfel == 2 ~ "No",   
    TRUE ~ NA_character_   #reorganize
  ))

estoniat1 <- estoniadf %>%
  # renanming 
  mutate(
    income = income,      
    education = educ.ba,      
    field = edufld
  )
table1 <- datasummary_skim(estoniat1  %>% dplyr::select(income, educ.ba, edufld), type = "categorical",
                           title = "Table 1. Main Variables", output = "flextable")

table1
Table 1. Main Variables

N

%

income

No

9677

82.2

Yes

2089

17.8

educ.ba

No BA

7597

64.6

BA or more

4169

35.4

test_stat <- estonia_data %>%
  specify(explanatory = educ.ba, 
          response = income) %>% 
  hypothesize(null = "independence") %>%
  calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
print(test_stat$stat)
##         t 
## -17.22517
null_distribution <- estonia_data %>%
  specify(explanatory = educ.ba,
          response = income) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 500, type = "permute") %>% 
  calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
p_val <- null_distribution %>% 
  get_pvalue(obs_stat = test_stat, direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
##   p_value
##     <dbl>
## 1       0

Reject null hypothesis because p-value is zero.

null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided")

null_distribution
## Response: income (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 500 × 2
##    replicate    stat
##        <int>   <dbl>
##  1         1 -0.0914
##  2         2  1.64  
##  3         3 -0.444 
##  4         4  1.64  
##  5         5 -0.544 
##  6         6  0.921 
##  7         7 -2.88  
##  8         8  0.0599
##  9         9  2.30  
## 10        10  1.23  
## # ℹ 490 more rows
conf_int <- null_distribution %>%
  get_confidence_interval(level = 0.95, type = "percentile")


null_distribution %>%
  visualize() +
  shade_p_value(obs_stat = test_stat, direction = "two-sided") +
  shade_confidence_interval(endpoints = conf_int)

model1 <- lm(income ~ educ.ba, data = df, weights = weight)
model2 <- lm(income ~ educ.ba + edufld, data = df, weights = weight)
modelsummary(
  list(model1, model2), 
  fmt = 1,
  estimate  = c( "{estimate} ({std.error}){stars}", # what is inputed here is what is supposed to be shown on the chart. 
                  "{estimate} ({std.error}){stars}"),
  statistic = NULL)
 (1)   (2)
(Intercept) 0.1 (0.0)*** 0.1 (0.0)***
educ.baBA or more 0.1 (0.0)*** 0.1 (0.0)***
edufld 0.0 (0.0)
Num.Obs. 11766 3306
R2 0.029 0.015
R2 Adj. 0.029 0.014
AIC 10627.6 2041.0
BIC 10649.7 2065.4
Log.Lik. −5310.804 −1016.478
RMSE 0.38 0.33
estonia_data <- estonia_data %>%
  mutate(
       gndr = case_when(
      gndr == 1 ~ "Male",
      gndr == 2 ~ "Female",
      TRUE ~ NA_character_
      ),
    educ.ba = case_when(
      essround < 5 & edulvla == 5 ~ "BA or more",
      TRUE ~ "No BA"
    )
  ) %>%
  filter(!is.na(gndr) & !is.na(educ.ba))
education_by_gender <- estonia_data %>%
  group_by(gndr, educ.ba) %>%
  summarise(count = n(), .groups = 'drop') %>%
  mutate(percentage = count / sum(count) * 500)
education_plot <- ggplot(education_by_gender, aes(x = educ.ba, y = count, fill = educ.ba)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~gndr) +
  labs(title = "Educational Attainment by Gender in Estonia",
       x = "Educational Level",
       y = "Number of Respondants") +
  scale_fill_manual(values = c("BA or more" ="Blue", "No BA" = "Pink")) +
  theme_minimal()

education_plot