Update note

This was updated to add more stringent Hispanic filtering and age normed IQs.

Init

options(digits = 2)

#remotes::install_github("tidyverse/haven")
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  haven,
  rms,
  mirt,
  ggeffects
  )
## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:psych':
## 
##     describe
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Loading required package: stats4
## Loading required package: lattice
theme_set(theme_bw())

Functions

Data

d = read_stata("data/GSS_stata/gss7224_r1.dta")
d_vars = df_var_table(d)

Recode

#race
d$race %>% as_factor() %>% table2(include_NA = F)
d$race_3way = d$race %>% as_factor() %>% fct_drop()
d$race_3way %>% table2()
#hispanic
d$hispanic %>% as_factor() %>% table2()
d$not_hispanic = d$hispanic==1
d$not_hispanic %>% table2()
d$ethnic2 = d$ethnic %>% as_factor() %>% as.character()

#too little data
#recode from ethnic
d$hispanic_ethnic = d$ethnic2 %in% c("mexico", "american indian", "puerto rico", "cuba", "other central america", "ecuador", "peru", "venezuela", "honduras", "panama", "nicaragua", "argentina", "costa rica", "bolivia", "guyana", "belize", "chile", "french guiana", "uruguay", "paraguay", "suriname", "other spanish")

d$hispanic_ethnic %>% table2()
#ethnic origin
d$ethnic2 %>% table2() %>% print(n=20)
## # A tibble: 107 × 3
##    Group            Count Percent
##    <chr>            <dbl>   <dbl>
##  1 uncodeable       13257  17.5  
##  2 germany           9936  13.1  
##  3 england & wales   7883  10.4  
##  4 ireland           7147   9.44 
##  5 africa (general)  4779   6.31 
##  6 italy             3407   4.50 
##  7 mexico            3011   3.98 
##  8 american indian   2764   3.65 
##  9 scotland          2021   2.67 
## 10 poland            1763   2.33 
## 11 american only     1460   1.93 
## 12 france            1234   1.63 
## 13 norway            1048   1.38 
## 14 sweden             951   1.26 
## 15 netherlands        887   1.17 
## 16 russia             840   1.11 
## 17 other spanish      729   0.963
## 18 czechoslovakia     723   0.955
## 19 don't know         722   0.954
## 20 puerto rico        716   0.946
## # ℹ 87 more rows
#sex
d %<>% mutate(
  sex = sex %>% as_factor(),
  sex_2way = case_when(
    sex %in% c("female", "male") ~ sex,
    .default = NA
  ) %>% fct_drop()
)

#year and age
d$year %<>% as.numeric()
d$age %<>% as.numeric()

#filter out hispanics
d %<>% filter(not_hispanic, !hispanic_ethnic)

#catch indians in Ethnic variable
#but we can't use this for whites, too many countries
d$Indian = (d$ethnic2 == "india")

d$Indian %>% sum()
## [1] 310

Analysis

Score WORDSUM

#wordsum items
wordsum_items = d %>% select(worda:wordj)

#IRT fit
wordsum_irt = mirt(
  wordsum_items %>% map_df(as.numeric)
)
## Warning: data contains response patterns with only NAs
#scores
wordsum_irt_scores = fscores(wordsum_irt)

#save the white standard scores
d$wordsum_g = wordsum_irt_scores[, 1] %>% standardize()

#age controlled
wordsum_age_adj = kirkegaard::make_norms(
  d$wordsum_g,
  age = d$age,
  norm_group = d$race_3way=="white"
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
d$wordsum_IQ = wordsum_age_adj$data$IQ

Indian IQ

#examine the missing data structure
d %>% select(
  Indian, race_3way, age, born, wordsum_IQ, sex_2way, year, hispanic_ethnic
) %>% 
  miss_patterns()
#then subset to variables with no missing
d_indian = d %>% select(
  Indian, race_3way, age, born, wordsum_IQ, year, sex_2way, hispanic_ethnic
) %>% 
  miss_filter() %>% 
  filter(
    Indian | race_3way == "white"
  )

#we need special encoding for race here
d_indian$Indian2 = tibble(
  Indian = d_indian$Indian,
  White = d_indian$race_3way=="white",
  US_born = d_indian$born==1
) %>% 
  encode_combinations() %>% 
  fct_relevel("White, US_born")

#counts
d_indian$Indian2 %>% table2()
#desc by group
describe2(
  d_indian$wordsum_IQ,
  d_indian$Indian2
)
## New names:
## • `` -> `...1`
#ols
ols_list = list(
  ols(wordsum_IQ ~ Indian2, data = d_indian),
  ols(wordsum_IQ ~ Indian2 + sex_2way, data = d_indian),
  ols(wordsum_IQ ~ Indian2 + year + sex_2way, data = d_indian)
)

ols_list %>% summarize_models()
ols_list[[3]] %>% 
  ggaverage(terms = c("Indian2")) %>% 
  plot()

Meta

#versions
write_sessioninfo()
## R version 4.5.2 (2025-10-31)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggeffects_2.3.1       mirt_1.45.1           lattice_0.22-7       
##  [4] rms_8.0-0             Hmisc_5.2-4           haven_2.5.5          
##  [7] kirkegaard_2025-11-19 robustbase_0.99-6     psych_2.5.6          
## [10] assertthat_0.2.1      weights_1.1.2         magrittr_2.0.4       
## [13] lubridate_1.9.4       forcats_1.0.1         stringr_1.6.0        
## [16] dplyr_1.1.4           purrr_1.2.0           readr_2.1.5          
## [19] tidyr_1.3.1           tibble_3.3.0          ggplot2_4.0.1.9000   
## [22] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.17.1      audio_0.1-11          
##   [4] jsonlite_2.0.0         shape_1.4.6.1          datawizard_1.3.0      
##   [7] TH.data_1.1-4          jomo_2.7-6             farver_2.1.2          
##  [10] nloptr_2.2.1           rmarkdown_2.30         vctrs_0.6.5           
##  [13] minqa_1.2.8            base64enc_0.1-3        htmltools_0.5.8.1     
##  [16] polspline_1.1.25       broom_1.0.11           Formula_1.2-5         
##  [19] mitml_0.4-5            dcurver_0.9.2          sass_0.4.10           
##  [22] parallelly_1.45.1      bslib_0.9.0            htmlwidgets_1.6.4     
##  [25] plyr_1.8.9             sandwich_3.1-1         testthat_3.2.3        
##  [28] zoo_1.8-14             cachem_1.1.0           lifecycle_1.0.4       
##  [31] iterators_1.0.14       pkgconfig_2.0.3        Matrix_1.7-4          
##  [34] R6_2.6.1               fastmap_1.2.0          rbibutils_2.3         
##  [37] future_1.67.0          digest_0.6.39          colorspace_2.1-2      
##  [40] vegan_2.7-2            labeling_0.4.3         progressr_0.16.0      
##  [43] timechange_0.3.0       gdata_3.0.1            mgcv_1.9-3            
##  [46] compiler_4.5.2         withr_3.0.2            htmlTable_2.4.3       
##  [49] S7_0.2.1               backports_1.5.0        R.utils_2.13.0        
##  [52] pan_1.9                MASS_7.3-65            quantreg_6.1          
##  [55] sessioninfo_1.2.3      GPArotation_2025.3-1   gtools_3.9.5          
##  [58] permute_0.9-8          tools_4.5.2            foreign_0.8-90        
##  [61] future.apply_1.20.0    clipr_0.8.0            nnet_7.3-20           
##  [64] R.oo_1.27.1            glue_1.8.0             nlme_3.1-168          
##  [67] grid_4.5.2             checkmate_2.3.3        cluster_2.1.8.1       
##  [70] generics_0.1.4         gtable_0.3.6           tzdb_0.5.0            
##  [73] R.methodsS3_1.8.2      data.table_1.17.8      hms_1.1.3             
##  [76] utf8_1.2.6             Deriv_4.2.0            foreach_1.5.2         
##  [79] pillar_1.11.1          splines_4.5.2          survival_3.8-3        
##  [82] SparseM_1.84-2         tidyselect_1.2.1       pbapply_1.7-4         
##  [85] knitr_1.50             reformulas_0.4.1       gridExtra_2.3         
##  [88] xfun_0.53              brio_1.1.5             DEoptimR_1.1-4        
##  [91] stringi_1.8.7          yaml_2.3.10            boot_1.3-32           
##  [94] evaluate_1.0.5         codetools_0.2-20       beepr_2.0             
##  [97] cli_3.6.5              rpart_4.1.24           Rdpack_2.6.4          
## [100] jquerylib_0.1.4        Rcpp_1.1.0             globals_0.18.0        
## [103] parallel_4.5.2         MatrixModels_0.5-4     marginaleffects_0.30.0
## [106] lme4_1.1-37            listenv_0.9.1          glmnet_4.1-10         
## [109] mvtnorm_1.3-3          SimDesign_2.21         scales_1.4.0          
## [112] insight_1.4.2          rlang_1.1.6.9000       multcomp_1.4-28       
## [115] mnormt_2.1.1           mice_3.18.0
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}