This was updated to add more stringent Hispanic filtering and age normed IQs.
options(digits = 2)
#remotes::install_github("tidyverse/haven")
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1.9000 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
## Loading required package: robustbase
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
haven,
rms,
mirt,
ggeffects
)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:psych':
##
## describe
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Loading required package: stats4
## Loading required package: lattice
theme_set(theme_bw())
d = read_stata("data/GSS_stata/gss7224_r1.dta")
d_vars = df_var_table(d)
#race
d$race %>% as_factor() %>% table2(include_NA = F)
d$race_3way = d$race %>% as_factor() %>% fct_drop()
d$race_3way %>% table2()
#hispanic
d$hispanic %>% as_factor() %>% table2()
d$not_hispanic = d$hispanic==1
d$not_hispanic %>% table2()
d$ethnic2 = d$ethnic %>% as_factor() %>% as.character()
#too little data
#recode from ethnic
d$hispanic_ethnic = d$ethnic2 %in% c("mexico", "american indian", "puerto rico", "cuba", "other central america", "ecuador", "peru", "venezuela", "honduras", "panama", "nicaragua", "argentina", "costa rica", "bolivia", "guyana", "belize", "chile", "french guiana", "uruguay", "paraguay", "suriname", "other spanish")
d$hispanic_ethnic %>% table2()
#ethnic origin
d$ethnic2 %>% table2() %>% print(n=20)
## # A tibble: 107 × 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 uncodeable 13257 17.5
## 2 germany 9936 13.1
## 3 england & wales 7883 10.4
## 4 ireland 7147 9.44
## 5 africa (general) 4779 6.31
## 6 italy 3407 4.50
## 7 mexico 3011 3.98
## 8 american indian 2764 3.65
## 9 scotland 2021 2.67
## 10 poland 1763 2.33
## 11 american only 1460 1.93
## 12 france 1234 1.63
## 13 norway 1048 1.38
## 14 sweden 951 1.26
## 15 netherlands 887 1.17
## 16 russia 840 1.11
## 17 other spanish 729 0.963
## 18 czechoslovakia 723 0.955
## 19 don't know 722 0.954
## 20 puerto rico 716 0.946
## # ℹ 87 more rows
#sex
d %<>% mutate(
sex = sex %>% as_factor(),
sex_2way = case_when(
sex %in% c("female", "male") ~ sex,
.default = NA
) %>% fct_drop()
)
#year and age
d$year %<>% as.numeric()
d$age %<>% as.numeric()
#filter out hispanics
d %<>% filter(not_hispanic, !hispanic_ethnic)
#catch indians in Ethnic variable
#but we can't use this for whites, too many countries
d$Indian = (d$ethnic2 == "india")
d$Indian %>% sum()
## [1] 310
#wordsum items
wordsum_items = d %>% select(worda:wordj)
#IRT fit
wordsum_irt = mirt(
wordsum_items %>% map_df(as.numeric)
)
## Warning: data contains response patterns with only NAs
#scores
wordsum_irt_scores = fscores(wordsum_irt)
#save the white standard scores
d$wordsum_g = wordsum_irt_scores[, 1] %>% standardize()
#age controlled
wordsum_age_adj = kirkegaard::make_norms(
d$wordsum_g,
age = d$age,
norm_group = d$race_3way=="white"
)
## Detected linear effect of age on the score (p = <0.001***). Model used.
## Detected variance effect of age on the score (p = <0.001***). Model used.
d$wordsum_IQ = wordsum_age_adj$data$IQ
d_bw = d %>%
filter(race_3way %in% c("white", "black")) %>%
mutate(
race_bw = race_3way %>% fct_drop()
)
d_bw$race_bw %>% table2()
#IQ by education completed
d_bw %>%
filter(educ >= 8) %>%
GG_group_means("wordsum_IQ", subgroupvar = "race_bw", groupvar = "educ", type = "point") +
geom_text(aes(label = round(mean))) +
labs(
title = "Black and White Wordsum IQs by years of education",
subtitle = "GSS",
y = "Wordsum IQ",
x = "Years of education"
)
## Missing values were removed.
GG_save("figs/bw_iq_years_of_education.png")
list(
lm(wordsum_IQ ~ race_bw, data = d_bw),
lm(wordsum_IQ ~ race_bw + educ, data = d_bw),
lm(wordsum_IQ ~ race_bw + factor(educ), data = d_bw)
) %>%
summarize_models()