options(
digits = 2,
contrasts=c("contr.treatment", "contr.treatment"),
pillar.print_max = 1000
)
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
## Loading required package: weights
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: assertthat
##
## Attaching package: 'assertthat'
## The following object is masked from 'package:tibble':
##
## has_name
## Loading required package: psych
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
## Attaching package: 'kirkegaard'
## The following object is masked from 'package:psych':
##
## rescale
## The following object is masked from 'package:assertthat':
##
## are_equal
## The following objects are masked from 'package:purrr':
##
## is_logical, is_numeric
## The following object is masked from 'package:base':
##
## +
load_packages(
haven,
rms,
mirt
)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Loading required package: stats4
theme_set(theme_bw())
#https://gssdataexplorer.norc.org/pages/show?page=gss%2Fgss_data
#2018, release 3
d = read_sav("data/GSS_spss/GSS7218_R3.sav")
# d_vars = df_var_table(d)
#wave white standardize
do_cwz = function(variable, data, whites = T) {
plyr::ddply(data, "YEAR", function(dd) {
#white subset
# browser()
if (whites) {
tibble(
var = dd[[variable]] %>% standardize(focal_group = dd$race_3way == "WHITE")
) %>% return()
}
#everybody
tibble(
var = dd[[variable]] %>% standardize()
) %>% return()
}) %>% pull(var)
}
#race simple
d %<>% mutate(
race_3way = RACE %>% as_factor() %>% fct_relevel("WHITE") %>% fct_drop()
)
d$race_3way %>% table2(include_NA = F)
#sex, age, year
d %<>% mutate(
SEX = SEX %>% as_factor(),
YEAR = YEAR %>% as.numeric(),
AGE = AGE %>% as.numeric(),
height_cm = HEIGHT * 2.54,
weight_kg = WEIGHT * 0.453592,
BMI = weight_kg / (height_cm/100)^2
)
#politics
d %<>% mutate(
pol_ideo_self = POLVIEWS %>% as_factor() %>% mapvalues(from = c("DK", "NA", "IAP"), to = rep(NA, 3)),
pol_ideo_self_num = pol_ideo_self %>% as.numeric() %>% subtract(1) %>% rescale(new_min = 0, new_max = 1)
)
d$pol_ideo_self_num %>% describe2()
#education degrees and IQ
d %<>% mutate(
WORDSUM = WORDSUM %>% as.numeric(),
wordsum_z = standardize(WORDSUM),
wordsum_wz = standardize(WORDSUM, focal_group = (race_3way == "WHITE")),
wordsum_cwz = do_cwz(variable = "WORDSUM", whites = T, data = .)
)
#wordsum g
fa_wordsum = mirt::mirt(
d %>% select(WORDA:WORDH) %>% map_df(as.numeric),
model = 1
)
## Warning: data contains response patterns with only NAs
##
Iteration: 1, Log-Lik: -104263.197, Max-Change: 0.71642
Iteration: 2, Log-Lik: -101244.504, Max-Change: 0.37105
Iteration: 3, Log-Lik: -100582.695, Max-Change: 0.18657
Iteration: 4, Log-Lik: -100460.004, Max-Change: 0.14463
Iteration: 5, Log-Lik: -100414.350, Max-Change: 0.07458
Iteration: 6, Log-Lik: -100396.651, Max-Change: 0.09150
Iteration: 7, Log-Lik: -100388.716, Max-Change: 0.04054
Iteration: 8, Log-Lik: -100385.023, Max-Change: 0.02814
Iteration: 9, Log-Lik: -100383.328, Max-Change: 0.01823
Iteration: 10, Log-Lik: -100382.175, Max-Change: 0.00372
Iteration: 11, Log-Lik: -100382.107, Max-Change: 0.00255
Iteration: 12, Log-Lik: -100382.069, Max-Change: 0.00280
Iteration: 13, Log-Lik: -100382.045, Max-Change: 0.00255
Iteration: 14, Log-Lik: -100382.024, Max-Change: 0.00107
Iteration: 15, Log-Lik: -100382.017, Max-Change: 0.00048
Iteration: 16, Log-Lik: -100382.016, Max-Change: 0.00032
Iteration: 17, Log-Lik: -100382.016, Max-Change: 0.00023
Iteration: 18, Log-Lik: -100382.015, Max-Change: 0.00022
Iteration: 19, Log-Lik: -100382.014, Max-Change: 0.00014
Iteration: 20, Log-Lik: -100382.014, Max-Change: 0.00013
Iteration: 21, Log-Lik: -100382.014, Max-Change: 0.00048
Iteration: 22, Log-Lik: -100382.014, Max-Change: 0.00015
Iteration: 23, Log-Lik: -100382.014, Max-Change: 0.00012
Iteration: 24, Log-Lik: -100382.014, Max-Change: 0.00011
Iteration: 25, Log-Lik: -100382.014, Max-Change: 0.00002
#details
fa_wordsum
##
## Call:
## mirt::mirt(data = d %>% select(WORDA:WORDH) %>% map_df(as.numeric),
## model = 1)
##
## Full-information item factor analysis with 1 factor(s).
## Converged within 1e-04 tolerance after 25 EM iterations.
## mirt version: 1.36.1
## M-step optimizer: BFGS
## EM acceleration: Ramsay
## Number of rectangular quadrature: 61
## Latent density type: Gaussian
##
## Log-likelihood = -1e+05
## Estimated parameters: 16
## AIC = 2e+05
## BIC = 2e+05; SABIC = 2e+05
fa_wordsum %>% summary
## F1 h2
## WORDA 0.495 0.245
## WORDB 0.796 0.633
## WORDC 0.559 0.313
## WORDD 0.806 0.649
## WORDE 0.725 0.526
## WORDF 0.729 0.532
## WORDG 0.479 0.229
## WORDH 0.623 0.388
##
## SS loadings: 3.5
## Proportion Var: 0.44
##
## Factor correlations:
##
## F1
## F1 1
#scores
fa_wordsum_scores = mirt::fscores(
fa_wordsum,
full.scores = TRUE,
full.scores.SE = TRUE
)
#reliability
mirt::marginal_rxx(fa_wordsum)
## [1] 0.63
# mirt::empirical_rxx(fa_wordsum_scores) #error
#save scores
d$g = fa_wordsum_scores[, 1] %>% standardize(focal_group = (d$race_3way == "WHITE"))
#without cohort changes, cohort, white standardized
d$g_cwz = do_cwz(variable = "g", whites = T, data = d)
#simple and mirt scores
GG_scatter(d, "g", "WORDSUM")
## `geom_smooth()` using formula 'y ~ x'
GG_scatter(d, "g_cwz", "wordsum_cwz")
## `geom_smooth()` using formula 'y ~ x'
#sum stats
describeBy(
d$g,
d$race_3way
)
##
## Descriptive statistics by group
## group: WHITE
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 25489 0 1 0 0.03 0.97 -3.4 1.7 5.1 -0.35 0.06 0.01
## ------------------------------------------------------------
## group: BLACK
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 4690 -0.63 1 -0.47 -0.59 0.99 -3.4 1.7 5.1 -0.33 -0.21 0.01
## ------------------------------------------------------------
## group: OTHER
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 1791 -0.68 1.1 -0.6 -0.66 1.2 -3.4 1.7 5.1 -0.21 -0.51 0.03
miss_count(d$g, reverse = T)
## [1] 31970
#fertility
d %<>% mutate(
fertility = CHILDS %>% as.numeric() %>% mapvalues(from = 9, to = NA, warn_missing = F),
)
#standardized within cohorts, and for sex (since we need the age interaction)
d$fertility_csz = plyr::ddply(d, "YEAR", function(dd) {
#remove age within this survey year
fit = ols(fertility ~ rcs(AGE) * SEX, data = dd)
tibble(
fert = resid(fit) %>% standardize()
)
}) %>% pull(fert)
#what does it look like?
d$fertility_csz %>% describe2()
d$fertility_csz %>% GG_denhist()
## Warning: Removed 412 rows containing non-finite values (stat_bin).
## Warning: Removed 412 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 412 rows containing non-finite values (stat_bin).
## Removed 412 rows containing non-finite values (stat_density).
#religion
d$OTHER %>% as_factor() %>% table2()
d$RELIG %>% as_factor() %>% table2()
d$attend_service = d$ATTEND %>% as_factor() %>% fct_drop() %>% as.ordered()
d$attend_service %>% table2()
d$attend_service_num = d$attend_service %>% as.numeric() %>% rescale(new_min = 0, new_max = 1)
#recode mormons
d %<>% mutate(
mormon = case_when(
(OTHER %in% c(59:62, 64)) ~ T,
TRUE ~ F
),
religion_group = RELIG %>% as_factor() %>% mapvalues(from = c("DK", "NA"), to = rep(NA, 2)) %>% fct_lump_min(min = 150, other_level = "OTHER") %>% fct_relevel("NONE"),
any_religion = (religion_group != "NONE")
)
d$mormon %>% table2()
d$religion_group %>% table2()
d$religion_group %>% levels()
## [1] "NONE" "PROTESTANT" "CATHOLIC" "JEWISH" "BUDDHISM"
## [6] "MOSLEM/ISLAM" "CHRISTIAN" "OTHER"
#old people, with completed fertility
d %<>% mutate(
complete_fert = (AGE >= 45 & SEX == "MALE") | (AGE >= 40 & SEX != "MALE")
)
d$complete_fert %>% table2()
#various subsets
d_c = d %>% filter(complete_fert)
d_cw = d %>% filter(complete_fert, race_3way == "WHITE")
d_cm = d %>% filter(complete_fert, mormon)
d_cwm = d %>% filter(complete_fert, race_3way == "WHITE", mormon)
d_w = d %>% filter(race_3way == "WHITE")
d_wm = d %>% filter(race_3way == "WHITE", mormon)
d_m = d %>% filter(mormon)
Define a function to run the regression for a given variable. This makes coding a lot easier and less error prone.
run_regs = function(x, data, y = "fertility") {
#make models
m1 = str_glue("{y} ~ g_cwz + {x} * g_cwz")
m2 = str_glue("{y} ~ g_cwz + {x} * g_cwz + SEX")
m3 = str_glue("{y} ~ g_cwz + {x} * g_cwz + SEX * rcs(AGE)")
m4 = str_glue("{y} ~ g_cwz + {x} * g_cwz + SEX * rcs(AGE) + rcs(YEAR)")
m5 = str_glue("{y} ~ g_cwz + {x} * g_cwz + SEX * rcs(AGE) * rcs(YEAR)")
m6 = str_glue("{y} ~ g_cwz + {x} * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way")
m7 = str_glue("{y} ~ g_cwz + {x} * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way + {x} * rcs(AGE)")
list(
ols(as.formula(m1), data = data),
ols(as.formula(m2), data = data),
ols(as.formula(m3), data = data),
ols(as.formula(m4), data = data),
ols(as.formula(m5), data = data),
ols(as.formula(m6), data = data),
ols(as.formula(m7), data = data)
)
}
run_regs_summary = function(x, data, y = "fertility") {
run_regs(x = x, data = data, y = y) %>%
summarize_models(asterisks_only = F)
}
#without cohort and age controls
run_regs2 = function(x, data, y = "fertility_csz") {
#make models
m1 = str_glue("{y} ~ g_cwz + {x} * g_cwz")
m2 = str_glue("{y} ~ g_cwz + {x} * g_cwz + race_3way")
m3 = str_glue("{y} ~ g_cwz + {x} * g_cwz + race_3way + {x} * rcs(AGE)")
list(
ols(as.formula(m1), data = data),
ols(as.formula(m2), data = data),
ols(as.formula(m3), data = data)
)
}
run_regs2_summary = function(x, data, y = "fertility_csz") {
run_regs2(x = x, data = data, y = y) %>%
summarize_models(asterisks_only = F)
}
#mormons
run_regs_summary("mormon", d)
run_regs_summary("mormon", d_c)
#any religion
run_regs_summary("any_religion", d)
run_regs_summary("any_religion", d_c)
#major groups
run_regs_summary("religion_group", d)
run_regs_summary("religion_group", d_c)
#mormons
run_regs2_summary("mormon", d)
run_regs2_summary("mormon", d_c)
#any religion
run_regs2_summary("any_religion", d)
run_regs2_summary("any_religion", d_c)
#major groups
run_regs2_summary("religion_group", d)
run_regs2_summary("religion_group", d_c)
run_regs_summary("race_3way", d)
run_regs_summary("race_3way", d_c)
run_regs_summary("pol_ideo_self", d)
run_regs_summary("pol_ideo_self", d_c)
run_regs_summary("pol_ideo_self_num", d)
run_regs_summary("pol_ideo_self_num", d_c)
#cohort standardized
run_regs2_summary("pol_ideo_self", d)
run_regs2_summary("pol_ideo_self", d_c)
run_regs2_summary("pol_ideo_self_num", d)
run_regs2_summary("pol_ideo_self_num", d_c)
Reviewer requests
#R2 requested this
#model with all predictors together, main effects
list(
#standard coding
ols(fertility ~ g_cwz + mormon * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way + attend_service * g_cwz + pol_ideo_self_num * g_cwz, data = d),
#religious service as numerical (higher power, less sensible)
ols(fertility ~ g_cwz + mormon * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way + attend_service_num * g_cwz + pol_ideo_self_num * g_cwz, data = d)
) %>% summarize_models(asterisks_only = F, collapse_nonlinear = T, add_ref_level = F)
#model with all predictors together, interaction effect for polviews x mormon
list(
#standard coding
ols(fertility ~ g_cwz + mormon * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way + attend_service * g_cwz + pol_ideo_self_num * g_cwz + pol_ideo_self_num * mormon, data = d),
#religious service as numerical (higher power, less sensible)
ols(fertility ~ g_cwz + mormon * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way + attend_service_num * g_cwz + pol_ideo_self_num * g_cwz + pol_ideo_self_num * mormon, data = d)
) %>% summarize_models(asterisks_only = F, collapse_nonlinear = T, add_ref_level = F)
#model with all predictors together, interaction effect for polviews x rel attendance
list(
#standard coding
ols(fertility ~ g_cwz + mormon * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way + attend_service * g_cwz + pol_ideo_self_num * g_cwz + pol_ideo_self_num * attend_service, data = d),
#religious service as numerical (higher power, less sensible)
ols(fertility ~ g_cwz + mormon * g_cwz + SEX * rcs(AGE) * rcs(YEAR) + race_3way + attend_service_num * g_cwz + pol_ideo_self_num * g_cwz + pol_ideo_self_num * attend_service_num, data = d)
) %>% summarize_models(asterisks_only = F, collapse_nonlinear = T, add_ref_level = F)
write_sessioninfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mirt_1.36.1 rms_6.3-0 SparseM_1.81
## [4] haven_2.5.0 kirkegaard_2022-05-06 psych_2.2.3
## [7] assertthat_0.2.1 weights_1.0.4 Hmisc_4.7-0
## [10] Formula_1.2-4 survival_3.2-13 lattice_0.20-45
## [13] magrittr_2.0.3 forcats_0.5.1 stringr_1.4.0
## [16] dplyr_1.0.9 purrr_0.3.4 readr_2.1.2
## [19] tidyr_1.2.0 tibble_3.1.7 ggplot2_3.3.6
## [22] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.1-1 minqa_1.2.4 colorspace_2.0-3
## [4] ellipsis_0.3.2 htmlTable_2.4.0 base64enc_0.1-3
## [7] fs_1.5.2 rstudioapi_0.13 mice_3.14.0
## [10] farver_2.1.0 Deriv_4.1.3 MatrixModels_0.5-0
## [13] fansi_1.0.3 mvtnorm_1.1-3 lubridate_1.8.0
## [16] xml2_1.3.3 codetools_0.2-18 splines_4.2.0
## [19] mnormt_2.0.2 knitr_1.39 jsonlite_1.8.0
## [22] nloptr_2.0.1 broom_0.8.0 cluster_2.1.3
## [25] dbplyr_2.1.1 png_0.1-7 compiler_4.2.0
## [28] httr_1.4.3 backports_1.4.1 Matrix_1.4-1
## [31] fastmap_1.1.0 cli_3.3.0 htmltools_0.5.2
## [34] quantreg_5.93 tools_4.2.0 gtable_0.3.0
## [37] glue_1.6.2 Rcpp_1.0.8.3 cellranger_1.1.0
## [40] jquerylib_0.1.4 vctrs_0.4.1 gdata_2.18.0
## [43] nlme_3.1-157 xfun_0.30 lme4_1.1-29
## [46] rvest_1.0.2 lifecycle_1.0.1 dcurver_0.9.2
## [49] gtools_3.9.2 polspline_1.1.20 MASS_7.3-57
## [52] zoo_1.8-10 scales_1.2.0 hms_1.1.1
## [55] parallel_4.2.0 sandwich_3.0-1 RColorBrewer_1.1-3
## [58] yaml_2.3.5 pbapply_1.5-0 gridExtra_2.3
## [61] sass_0.4.1 rpart_4.1.16 latticeExtra_0.6-29
## [64] stringi_1.7.6 highr_0.9 permute_0.9-7
## [67] checkmate_2.1.0 boot_1.3-28 rlang_1.0.2
## [70] pkgconfig_2.0.3 evaluate_0.15 labeling_0.4.2
## [73] htmlwidgets_1.5.4 tidyselect_1.1.2 plyr_1.8.7
## [76] R6_2.5.1 generics_0.1.2 multcomp_1.4-19
## [79] DBI_1.1.2 mgcv_1.8-40 pillar_1.7.0
## [82] foreign_0.8-82 withr_2.5.0 nnet_7.3-17
## [85] modelr_0.1.8 crayon_1.5.1 utf8_1.2.2
## [88] tmvnsim_1.0-2 tzdb_0.3.0 rmarkdown_2.14
## [91] jpeg_0.1-9 grid_4.2.0 readxl_1.4.0
## [94] data.table_1.14.2 vegan_2.6-2 reprex_2.0.1
## [97] digest_0.6.29 GPArotation_2022.4-1 munsell_0.5.0
## [100] bslib_0.3.1
#upload to OSF
if (F) {
library(osfr)
#auth
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/czs48/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(osf_proj, conflicts = "overwrite", path = c(
"data",
"notebook.Rmd", "notebook.html", "sessions_info.txt"
))
}