This was updated to add more stringent Hispanic filtering and age normed IQs.
options(digits = 2)
#remotes::install_github("tidyverse/haven")
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1.9000 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
haven,
rms,
mirt,
ggeffects
)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:psych':
##
## describe
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Loading required package: stats4
## Loading required package: lattice
theme_set(theme_bw())
d = read_stata("data/GSS_stata/gss7224_r1.dta")
d_vars = df_var_table(d)
#race
d$race %>% as_factor() %>% table2(include_NA = F)
d$race_3way = d$race %>% as_factor() %>% fct_drop()
d$race_3way %>% table2()
#hispanic
d$hispanic %>% as_factor() %>% table2()
d$not_hispanic = d$hispanic==1
d$not_hispanic %>% table2()
d$ethnic2 = d$ethnic %>% as_factor() %>% as.character()
#too little data
#recode from ethnic
d$hispanic_ethnic = d$ethnic2 %in% c("mexico", "american indian", "puerto rico", "cuba", "other central america", "ecuador", "peru", "venezuela", "honduras", "panama", "nicaragua", "argentina", "costa rica", "bolivia", "guyana", "belize", "chile", "french guiana", "uruguay", "paraguay", "suriname", "other spanish")
d$hispanic_ethnic %>% table2()
#ethnic origin
d$ethnic2 %>% table2() %>% print(n=20)
## # A tibble: 107 × 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 uncodeable 13257 17.5
## 2 germany 9936 13.1
## 3 england & wales 7883 10.4
## 4 ireland 7147 9.44
## 5 africa (general) 4779 6.31
## 6 italy 3407 4.50
## 7 mexico 3011 3.98
## 8 american indian 2764 3.65
## 9 scotland 2021 2.67
## 10 poland 1763 2.33
## 11 american only 1460 1.93
## 12 france 1234 1.63
## 13 norway 1048 1.38
## 14 sweden 951 1.26
## 15 netherlands 887 1.17
## 16 russia 840 1.11
## 17 other spanish 729 0.963
## 18 czechoslovakia 723 0.955
## 19 don't know 722 0.954
## 20 puerto rico 716 0.946
## # ℹ 87 more rows
#sex
d %<>% mutate(
sex = sex %>% as_factor(),
sex_2way = case_when(
sex %in% c("female", "male") ~ sex,
.default = NA
) %>% fct_drop()
)
#year and age
d$year %<>% as.numeric()
d$age %<>% as.numeric()
#filter out hispanics
d %<>% filter(not_hispanic, !hispanic_ethnic)
#catch indians in Ethnic variable
#but we can't use this for whites, too many countries
d$Indian = (d$ethnic2 == "india")
d$Indian %>% sum()
## [1] 310
#wordsum items
wordsum_items = d %>% select(worda:wordj)
#IRT fit
wordsum_irt = mirt(
wordsum_items %>% map_df(as.numeric)
)
## Warning: data contains response patterns with only NAs
#scores
wordsum_irt_scores = fscores(wordsum_irt)
#save the white standard scores
d$wordsum_g = wordsum_irt_scores[, 1] %>% standardize()
#age controlled
wordsum_age_adj = kirkegaard::make_norms(
d$wordsum_g,
age = d$age,
norm_group = d$race_3way=="white"
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
d$wordsum_IQ = wordsum_age_adj$data$IQ
#examine the missing data structure
d %>% select(
Indian, race_3way, age, born, wordsum_IQ, sex_2way, year, hispanic_ethnic
) %>%
miss_patterns()
#then subset to variables with no missing
d_indian = d %>% select(
Indian, race_3way, age, born, wordsum_IQ, year, sex_2way, hispanic_ethnic
) %>%
miss_filter() %>%
filter(
Indian | race_3way == "white"
)
#we need special encoding for race here
d_indian$Indian2 = tibble(
Indian = d_indian$Indian,
White = d_indian$race_3way=="white",
US_born = d_indian$born==1
) %>%
encode_combinations() %>%
fct_relevel("White, US_born")
#counts
d_indian$Indian2 %>% table2()
#desc by group
describe2(
d_indian$wordsum_IQ,
d_indian$Indian2
)
## New names:
## • `` -> `...1`
#ols
ols_list = list(
ols(wordsum_IQ ~ Indian2, data = d_indian),
ols(wordsum_IQ ~ Indian2 + sex_2way, data = d_indian),
ols(wordsum_IQ ~ Indian2 + year + sex_2way, data = d_indian)
)
ols_list %>% summarize_models()
ols_list[[3]] %>%
ggaverage(terms = c("Indian2")) %>%
plot()
#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] ggeffects_2.3.1 mirt_1.45.1 lattice_0.22-7
## [4] rms_8.0-0 Hmisc_5.2-4 haven_2.5.5
## [7] kirkegaard_2025-11-19 robustbase_0.99-6 psych_2.5.6
## [10] assertthat_0.2.1 weights_1.1.2 magrittr_2.0.4
## [13] lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0
## [16] dplyr_1.1.4 purrr_1.2.0 readr_2.1.5
## [19] tidyr_1.3.1 tibble_3.3.0 ggplot2_4.0.1.9000
## [22] tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] RColorBrewer_1.1-3 rstudioapi_0.17.1 audio_0.1-11
## [4] jsonlite_2.0.0 shape_1.4.6.1 datawizard_1.3.0
## [7] TH.data_1.1-4 jomo_2.7-6 farver_2.1.2
## [10] nloptr_2.2.1 rmarkdown_2.30 vctrs_0.6.5
## [13] minqa_1.2.8 base64enc_0.1-3 htmltools_0.5.8.1
## [16] polspline_1.1.25 broom_1.0.11 Formula_1.2-5
## [19] mitml_0.4-5 dcurver_0.9.2 sass_0.4.10
## [22] parallelly_1.45.1 bslib_0.9.0 htmlwidgets_1.6.4
## [25] plyr_1.8.9 sandwich_3.1-1 testthat_3.2.3
## [28] zoo_1.8-14 cachem_1.1.0 lifecycle_1.0.4
## [31] iterators_1.0.14 pkgconfig_2.0.3 Matrix_1.7-4
## [34] R6_2.6.1 fastmap_1.2.0 rbibutils_2.3
## [37] future_1.67.0 digest_0.6.39 colorspace_2.1-2
## [40] vegan_2.7-2 labeling_0.4.3 progressr_0.16.0
## [43] timechange_0.3.0 gdata_3.0.1 mgcv_1.9-3
## [46] compiler_4.5.2 withr_3.0.2 htmlTable_2.4.3
## [49] S7_0.2.1 backports_1.5.0 R.utils_2.13.0
## [52] pan_1.9 MASS_7.3-65 quantreg_6.1
## [55] sessioninfo_1.2.3 GPArotation_2025.3-1 gtools_3.9.5
## [58] permute_0.9-8 tools_4.5.2 foreign_0.8-90
## [61] future.apply_1.20.0 clipr_0.8.0 nnet_7.3-20
## [64] R.oo_1.27.1 glue_1.8.0 nlme_3.1-168
## [67] grid_4.5.2 checkmate_2.3.3 cluster_2.1.8.1
## [70] generics_0.1.4 gtable_0.3.6 tzdb_0.5.0
## [73] R.methodsS3_1.8.2 data.table_1.17.8 hms_1.1.3
## [76] utf8_1.2.6 Deriv_4.2.0 foreach_1.5.2
## [79] pillar_1.11.1 splines_4.5.2 survival_3.8-3
## [82] SparseM_1.84-2 tidyselect_1.2.1 pbapply_1.7-4
## [85] knitr_1.50 reformulas_0.4.1 gridExtra_2.3
## [88] xfun_0.53 brio_1.1.5 DEoptimR_1.1-4
## [91] stringi_1.8.7 yaml_2.3.10 boot_1.3-32
## [94] evaluate_1.0.5 codetools_0.2-20 beepr_2.0
## [97] cli_3.6.5 rpart_4.1.24 Rdpack_2.6.4
## [100] jquerylib_0.1.4 Rcpp_1.1.0 globals_0.18.0
## [103] parallel_4.5.2 MatrixModels_0.5-4 marginaleffects_0.30.0
## [106] lme4_1.1-37 listenv_0.9.1 glmnet_4.1-10
## [109] mvtnorm_1.3-3 SimDesign_2.21 scales_1.4.0
## [112] insight_1.4.2 rlang_1.1.6.9000 multcomp_1.4-28
## [115] mnormt_2.1.1 mice_3.18.0
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")
#OSF
if (F) {
library(osfr)
#login
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/XXX/")
#upload all files in project
#overwrite existing (versioning)
osf_upload(
osf_proj,
path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"),
conflicts = "overwrite"
)
}