SOC222 Research Report
packages <- c("tidyverse", "infer", "fst", "modelsummary", "effects", "survey", "performance", "flextable", "broom", "scales", "ggeffects", "marginaleffects") # add any you need here
# Install packages if they aren't installed already
new_packages <- packages[!(packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)
# Load the packages
lapply(packages, library, character.only = TRUE)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: carData
##
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
##
## Loading required package: grid
##
## Loading required package: Matrix
##
##
## Attaching package: 'Matrix'
##
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Loading required package: survival
##
##
## Attaching package: 'survey'
##
##
## The following object is masked from 'package:graphics':
##
## dotchart
##
##
##
## Attaching package: 'flextable'
##
##
## The following object is masked from 'package:purrr':
##
## compose
##
##
##
## Attaching package: 'scales'
##
##
## The following object is masked from 'package:purrr':
##
## discard
##
##
## The following object is masked from 'package:readr':
##
## col_factor
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "infer" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "fst" "infer" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[4]]
## [1] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [6] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [11] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[5]]
## [1] "effects" "carData" "modelsummary" "fst" "infer"
## [6] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [11] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [16] "stats" "graphics" "grDevices" "utils" "datasets"
## [21] "methods" "base"
##
## [[6]]
## [1] "survey" "survival" "Matrix" "grid" "effects"
## [6] "carData" "modelsummary" "fst" "infer" "lubridate"
## [11] "forcats" "stringr" "dplyr" "purrr" "readr"
## [16] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [21] "graphics" "grDevices" "utils" "datasets" "methods"
## [26] "base"
##
## [[7]]
## [1] "performance" "survey" "survival" "Matrix" "grid"
## [6] "effects" "carData" "modelsummary" "fst" "infer"
## [11] "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [16] "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [21] "stats" "graphics" "grDevices" "utils" "datasets"
## [26] "methods" "base"
##
## [[8]]
## [1] "flextable" "performance" "survey" "survival" "Matrix"
## [6] "grid" "effects" "carData" "modelsummary" "fst"
## [11] "infer" "lubridate" "forcats" "stringr" "dplyr"
## [16] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [21] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [26] "datasets" "methods" "base"
##
## [[9]]
## [1] "broom" "flextable" "performance" "survey" "survival"
## [6] "Matrix" "grid" "effects" "carData" "modelsummary"
## [11] "fst" "infer" "lubridate" "forcats" "stringr"
## [16] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [21] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [26] "utils" "datasets" "methods" "base"
##
## [[10]]
## [1] "scales" "broom" "flextable" "performance" "survey"
## [6] "survival" "Matrix" "grid" "effects" "carData"
## [11] "modelsummary" "fst" "infer" "lubridate" "forcats"
## [16] "stringr" "dplyr" "purrr" "readr" "tidyr"
## [21] "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [26] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "ggeffects" "scales" "broom" "flextable" "performance"
## [6] "survey" "survival" "Matrix" "grid" "effects"
## [11] "carData" "modelsummary" "fst" "infer" "lubridate"
## [16] "forcats" "stringr" "dplyr" "purrr" "readr"
## [21] "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [26] "graphics" "grDevices" "utils" "datasets" "methods"
## [31] "base"
##
## [[12]]
## [1] "marginaleffects" "ggeffects" "scales" "broom"
## [5] "flextable" "performance" "survey" "survival"
## [9] "Matrix" "grid" "effects" "carData"
## [13] "modelsummary" "fst" "infer" "lubridate"
## [17] "forcats" "stringr" "dplyr" "purrr"
## [21] "readr" "tidyr" "tibble" "ggplot2"
## [25] "tidyverse" "stats" "graphics" "grDevices"
## [29] "utils" "datasets" "methods" "base"
estonia_data <- read_fst("estonia_data.fst")
estonia_income <- estonia_data %>%
filter(cntry == "EE") %>%
dplyr::select(hinctnta)
estonia_education <- estonia_data %>%
filter(cntry == "EE") %>%
dplyr::select(eisced)
estonia_gender <- estonia_data %>%
filter(cntry == "EE") %>%
dplyr::select(gndr)
estonia_data <- estonia_data %>%
filter(cntry == "EE")
estonia_data$weight <- estonia_data$dweight * estonia_data$pweight
survey_design <- svydesign(ids = ~1, data = estonia_data, weights = ~weight)
estonia_data <- estonia_data %>%
mutate(income = case_when(
hincfel == 1 ~ 1, # the number one is in representation of higher income
hincfel == 2 ~ 0, # 0 setting to be a lower income
TRUE ~ NA_real_ # the rest of the data collected to NA
))
estonia_data <- estonia_data %>% filter(!is.na(income))
estoniadf <- estonia_data %>%
mutate(
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more", #setting the lowest number at 5
essround >= 5 & edulvlb > 500 ~ "BA or more", #setting the highest number to 500
TRUE ~ "No BA"
),
edulvla = ifelse(edulvla %in% c(77, 88, 99), NA_integer_, edulvla), #setting intergers as 77,88,99
edulvlb = ifelse(edulvlb %in% c(5555, 7777, 8888), NA_integer_, edulvlb), #setting the rest of the integers to 5555,7777,8888
educ.ba = factor(educ.ba, levels = c("No BA", "BA or more"))
)
table(estoniadf$educ.ba)
##
## No BA BA or more
## 7597 4169
Recode to Categorize for educ.ba variable needed for future code
estonia_data <- estoniadf %>%
mutate(field_of_study = recode(as.character(edufld),
'1' = "No Specialty",
'2' = "Law/Government Related ",
'3' = "Engineering",
'4' = "Social Sciences",
'5' = "Medicine/ Health Industry",
'66' = NA_character_,
'99' = NA_character_,))
#this step above to organize by specialty- useful later
#check work
table(estonia_data$field_of_study)
##
## 10 11 12
## 94 38 208
## 13 14 6
## 34 92 144
## 7 77 8
## 75 6 125
## 88 9 Engineering
## 21 296 91
## Law/Government Related Medicine/ Health Industry No Specialty
## 23 206 757
## Social Sciences
## 622
df <- estonia_data
df$year <- NA
replacements <- c(2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015, 2017, 2019)
for(i in 1:10){
df$year[df$essround == i] <- replacements[i]
}
income_avg <- df %>%
group_by(year) %>%
summarize(income_avg = mean(income, na.rm = TRUE))
income_plot_EE <- ggplot(income_avg, aes(x = year, y = income_avg)) +
geom_point(aes(size = income_avg, color = income_avg), alpha = 0.7) +
geom_line(aes(group = 1), color = "green", linetype = "dashed") +
labs(
x = "Survey Year",
y = "Income") +
theme_minimal() +
theme(legend.position = "none")
print(income_plot_EE)
estoniadf <- estoniadf %>%
mutate(income = case_when(
hincfel == 1 ~ "Yes",
hincfel == 2 ~ "No",
TRUE ~ NA_character_ #reorganize
))
estoniat1 <- estoniadf %>%
# renanming
mutate(
income = income,
education = educ.ba,
field = edufld
)
table1 <- datasummary_skim(estoniat1 %>% dplyr::select(income, educ.ba, edufld), type = "categorical",
title = "Table 1. Main Variables", output = "flextable")
table1
|
| N | % |
|---|---|---|---|
income | No | 9677 | 82.2 |
Yes | 2089 | 17.8 | |
educ.ba | No BA | 7597 | 64.6 |
BA or more | 4169 | 35.4 |
test_stat <- estonia_data %>%
specify(explanatory = educ.ba,
response = income) %>%
hypothesize(null = "independence") %>%
calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
print(test_stat$stat)
## t
## -17.22517
null_distribution <- estonia_data %>%
specify(explanatory = educ.ba,
response = income) %>%
hypothesize(null = "independence") %>%
generate(reps = 500, type = "permute") %>%
calculate(stat = "t")
## Warning: The statistic is based on a difference or ratio; by default, for
## difference-based statistics, the explanatory variable is subtracted in the
## order "No BA" - "BA or more", or divided in the order "No BA" / "BA or more"
## for ratio-based statistics. To specify this order yourself, supply `order =
## c("No BA", "BA or more")` to the calculate() function.
p_val <- null_distribution %>%
get_pvalue(obs_stat = test_stat, direction = "two-sided")
## Warning: Please be cautious in reporting a p-value of 0. This result is an approximation
## based on the number of `reps` chosen in the `generate()` step.
## ℹ See `get_p_value()` (`?infer::get_p_value()`) for more information.
p_val
## # A tibble: 1 × 1
## p_value
## <dbl>
## 1 0
Reject null hypothesis because p-value is zero.
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided")
null_distribution
## Response: income (numeric)
## Explanatory: educ.ba (factor)
## Null Hypothesis: independence
## # A tibble: 500 × 2
## replicate stat
## <int> <dbl>
## 1 1 -0.0914
## 2 2 1.64
## 3 3 -0.444
## 4 4 1.64
## 5 5 -0.544
## 6 6 0.921
## 7 7 -2.88
## 8 8 0.0599
## 9 9 2.30
## 10 10 1.23
## # ℹ 490 more rows
conf_int <- null_distribution %>%
get_confidence_interval(level = 0.95, type = "percentile")
null_distribution %>%
visualize() +
shade_p_value(obs_stat = test_stat, direction = "two-sided") +
shade_confidence_interval(endpoints = conf_int)
model1 <- lm(income ~ educ.ba, data = df, weights = weight)
model2 <- lm(income ~ educ.ba + edufld, data = df, weights = weight)
modelsummary(
list(model1, model2),
fmt = 1,
estimate = c( "{estimate} ({std.error}){stars}", # what is inputed here is what is supposed to be shown on the chart.
"{estimate} ({std.error}){stars}"),
statistic = NULL)
| Â (1) | Â Â (2) | |
|---|---|---|
| (Intercept) | 0.1 (0.0)*** | 0.1 (0.0)*** |
| educ.baBA or more | 0.1 (0.0)*** | 0.1 (0.0)*** |
| edufld | 0.0 (0.0) | |
| Num.Obs. | 11766 | 3306 |
| R2 | 0.029 | 0.015 |
| R2 Adj. | 0.029 | 0.014 |
| AIC | 10627.6 | 2041.0 |
| BIC | 10649.7 | 2065.4 |
| Log.Lik. | −5310.804 | −1016.478 |
| RMSE | 0.38 | 0.33 |
estonia_data <- estonia_data %>%
mutate(
gndr = case_when(
gndr == 1 ~ "Male",
gndr == 2 ~ "Female",
TRUE ~ NA_character_
),
educ.ba = case_when(
essround < 5 & edulvla == 5 ~ "BA or more",
TRUE ~ "No BA"
)
) %>%
filter(!is.na(gndr) & !is.na(educ.ba))
education_by_gender <- estonia_data %>%
group_by(gndr, educ.ba) %>%
summarise(count = n(), .groups = 'drop') %>%
mutate(percentage = count / sum(count) * 500)
education_plot <- ggplot(education_by_gender, aes(x = educ.ba, y = count, fill = educ.ba)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~gndr) +
labs(title = "Educational Attainment by Gender in Estonia",
x = "Educational Level",
y = "Number of Respondants") +
scale_fill_manual(values = c("BA or more" ="Blue", "No BA" = "Pink")) +
theme_minimal()
education_plot