Update note

This was updated to add more stringent Hispanic filtering and age normed IQs.

Init

options(digits = 2)

#remotes::install_github("tidyverse/haven")
library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4          ✔ readr     2.1.5     
## ✔ forcats   1.0.1          ✔ stringr   1.6.0     
## ✔ ggplot2   4.0.1.9000     ✔ tibble    3.3.0     
## ✔ lubridate 1.9.4          ✔ tidyr     1.3.1     
## ✔ purrr     1.2.0          
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## Loading required package: robustbase
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  haven,
  rms,
  mirt,
  ggeffects
  )

## Loading required package: Hmisc
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:psych':
## 
##     describe
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Loading required package: stats4
## Loading required package: lattice

theme_set(theme_bw())

Functions

Data

d = read_stata("data/GSS_stata/gss7224_r1.dta")
d_vars = df_var_table(d)

Recode

#race
d$race %>% as_factor() %>% table2(include_NA = F)

d$race_3way = d$race %>% as_factor() %>% fct_drop()
d$race_3way %>% table2()

#hispanic
d$hispanic %>% as_factor() %>% table2()

d$not_hispanic = d$hispanic==1
d$not_hispanic %>% table2()

d$ethnic2 = d$ethnic %>% as_factor() %>% as.character()

#too little data
#recode from ethnic
d$hispanic_ethnic = d$ethnic2 %in% c("mexico", "american indian", "puerto rico", "cuba", "other central america", "ecuador", "peru", "venezuela", "honduras", "panama", "nicaragua", "argentina", "costa rica", "bolivia", "guyana", "belize", "chile", "french guiana", "uruguay", "paraguay", "suriname", "other spanish")

d$hispanic_ethnic %>% table2()

#ethnic origin
d$ethnic2 %>% table2() %>% print(n=20)

## # A tibble: 107 × 3
##    Group            Count Percent
##    <chr>            <dbl>   <dbl>
##  1 uncodeable       13257  17.5  
##  2 germany           9936  13.1  
##  3 england & wales   7883  10.4  
##  4 ireland           7147   9.44 
##  5 africa (general)  4779   6.31 
##  6 italy             3407   4.50 
##  7 mexico            3011   3.98 
##  8 american indian   2764   3.65 
##  9 scotland          2021   2.67 
## 10 poland            1763   2.33 
## 11 american only     1460   1.93 
## 12 france            1234   1.63 
## 13 norway            1048   1.38 
## 14 sweden             951   1.26 
## 15 netherlands        887   1.17 
## 16 russia             840   1.11 
## 17 other spanish      729   0.963
## 18 czechoslovakia     723   0.955
## 19 don't know         722   0.954
## 20 puerto rico        716   0.946
## # ℹ 87 more rows

#sex
d %<>% mutate(
  sex = sex %>% as_factor(),
  sex_2way = case_when(
    sex %in% c("female", "male") ~ sex,
    .default = NA
  ) %>% fct_drop()
)

#year and age
d$year %<>% as.numeric()
d$age %<>% as.numeric()

#filter out hispanics
d %<>% filter(not_hispanic, !hispanic_ethnic)

#catch indians in Ethnic variable
#but we can't use this for whites, too many countries
d$Indian = (d$ethnic2 == "india")

d$Indian %>% sum()

## [1] 310

Analysis

Score WORDSUM

#wordsum items
wordsum_items = d %>% select(worda:wordj)

#IRT fit
wordsum_irt = mirt(
  wordsum_items %>% map_df(as.numeric)
)

## Warning: data contains response patterns with only NAs

#scores
wordsum_irt_scores = fscores(wordsum_irt)

#save the white standard scores
d$wordsum_g = wordsum_irt_scores[, 1] %>% standardize()

#age controlled
wordsum_age_adj = kirkegaard::make_norms(
  d$wordsum_g,
  age = d$age,
  norm_group = d$race_3way=="white"
)

## Detected linear effect of age on the score (p = <0.001***). Model used.

## Detected variance effect of age on the score (p = <0.001***). Model used.

d$wordsum_IQ = wordsum_age_adj$data$IQ

Results

d_bw = d %>% 
  filter(race_3way %in% c("white", "black")) %>% 
  mutate(
    race_bw = race_3way %>% fct_drop()
  )

d_bw$race_bw %>% table2()

#IQ by education completed
d_bw %>% 
  filter(educ >= 8) %>% 
  GG_group_means("wordsum_IQ", subgroupvar = "race_bw", groupvar = "educ", type = "point") +
  geom_text(aes(label = round(mean))) +
  labs(
    title = "Black and White Wordsum IQs by years of education",
    subtitle = "GSS",
    y = "Wordsum IQ",
    x = "Years of education"
  )

## Missing values were removed.

GG_save("figs/bw_iq_years_of_education.png")

list(
  lm(wordsum_IQ ~ race_bw, data = d_bw),
  lm(wordsum_IQ ~ race_bw + educ, data = d_bw),
  lm(wordsum_IQ ~ race_bw + factor(educ), data = d_bw)
) %>% 
  summarize_models()

Black and White IQ in GSS by education

11 January, 2026

Update note

Init

Functions

Data

Recode

Analysis

Score WORDSUM

Results