Init

options(digits = 2)

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## Loading required package: lattice
## 
## Loading required package: survival
## 
## Loading required package: Formula
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following objects are masked from 'package:purrr':
## 
##     is_logical, is_numeric
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  rms,
  ggeffects
  )
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve

Data

#2022 article
systemic_racism_data = read_rds("data/systemic racism data_out.rds") %>% 
  mutate(FIPS = FIPS %>% as.numeric())
systemic_racism_data_table = df_var_table(systemic_racism_data)

#2016 article
counties2016 = read_csv("data/2016 study/data_merged.csv") %>% 
  df_legalize_names()
## New names:
## • `` -> `...1`
## Rows: 3146 Columns: 93
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): County, State, temp_bins, lat_bins, lon_bins, precip_bins, elevati...
## dbl (86): ...1, FIPS, Less.Than.High.School, At.Least.High.School.Diploma, A...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
counties2016_table = df_var_table(counties2016)

#county health rankings data
#https://www.countyhealthrankings.org/explore-health-rankings/rankings-data-documentation
chr_data2022 = haven::read_sas("data/analytic_data2022.sas7bdat")
chr_data2022_table = df_var_table(chr_data2022)

#inquisitive bird
ib_data = read_csv("data/Bird FullData.csv") %>% 
  df_legalize_names() %>% 
  mutate(fips = fips %>% as.numeric())
## Rows: 3221 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): fips, County
## dbl (12): Population, Homicide Rate, Violent Crime Rate, Non-Hispanic White ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ib_data_table = df_var_table(ib_data)

#join the data we need
d = full_join(
  ib_data %>% filter(!str_detect(County, ", Puerto Rico")),
  systemic_racism_data,
  by = c("fips" = "FIPS")
) %>% full_join(
  counties2016 %>% select(FIPS, White:SIRE_homogeneity, S, temp, precip, lon, lat) %>% filter(!is.na(FIPS)),
  by = c("fips" = "FIPS")
)

#no dups!
assert_that(!anyDuplicated(d$fips))
## [1] TRUE

Analysis

USA counties, n = 3,200

First we analyze data from US counties. The data are from Kirkegaard (2016).

#derived variables: residual "other" race share, racial homogeneity, population weight, cognitive ability
d %<>% mutate(
  #share not accounted for by the five listed groups
  Other_share = 1-(Non_Hispanic_White_Share+Black_Share+Native_Share+Latino_Share+Asian_Share),
  #homogeneity as the sum of squared group shares (Herfindahl/Simpson-style index)
  race_homogeneity = rowSums(cbind(Non_Hispanic_White_Share, Black_Share, Native_Share, Latino_Share, Asian_Share, Other_share)^2),
  sqrt_pop = sqrt(popsize2019),
  #cognitive ability measure used as g below
  g = cs_mn_all
)
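The homogeneity measure computed above is the sum of squared group shares (a Herfindahl/Simpson-style index): it equals 1 when a single group makes up the whole population and falls toward 1/k when k groups are equally sized. A minimal illustration with made-up shares, not a real county:

#toy example: hypothetical group shares summing to 1
toy_shares = c(White = .70, Black = .20, Asian = .10)
sum(toy_shares^2)
## [1] 0.54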

#correlations
d %>% 
  select(S, g, Homicide_Rate, Non_Hispanic_White_Share, Black_Share, Native_Share, Latino_Share, Asian_Share, Other_share, race_homogeneity) %>% 
  wtd.cors()
##                               S      g Homicide_Rate Non_Hispanic_White_Share
## S                         1.000  0.760      -0.64004                    0.465
## g                         0.760  1.000      -0.58310                    0.571
## Homicide_Rate            -0.640 -0.583       1.00000                   -0.534
## Non_Hispanic_White_Share  0.465  0.571      -0.53419                    1.000
## Black_Share              -0.554 -0.487       0.66687                   -0.585
## Native_Share             -0.184 -0.303       0.18949                   -0.298
## Latino_Share             -0.084 -0.201      -0.00085                   -0.606
## Asian_Share               0.268  0.154      -0.04549                   -0.308
## Other_share               0.176  0.081      -0.04154                   -0.055
## race_homogeneity          0.276  0.373      -0.42069                    0.815
##                          Black_Share Native_Share Latino_Share Asian_Share
## S                             -0.554       -0.184     -0.08405       0.268
## g                             -0.487       -0.303     -0.20098       0.154
## Homicide_Rate                  0.667        0.189     -0.00085      -0.045
## Non_Hispanic_White_Share      -0.585       -0.298     -0.60565      -0.308
## Black_Share                    1.000       -0.102     -0.10521       0.033
## Native_Share                  -0.102        1.000     -0.05655      -0.032
## Latino_Share                  -0.105       -0.057      1.00000       0.166
## Asian_Share                    0.033       -0.032      0.16566       1.000
## Other_share                   -0.143        0.203     -0.11843       0.359
## race_homogeneity              -0.538       -0.128     -0.43056      -0.407
##                          Other_share race_homogeneity
## S                              0.176             0.28
## g                              0.081             0.37
## Homicide_Rate                 -0.042            -0.42
## Non_Hispanic_White_Share      -0.055             0.81
## Black_Share                   -0.143            -0.54
## Native_Share                   0.203            -0.13
## Latino_Share                  -0.118            -0.43
## Asian_Share                    0.359            -0.41
## Other_share                    1.000            -0.25
## race_homogeneity              -0.254             1.00
d %>% 
  select(S, g, Homicide_Rate, Non_Hispanic_White_Share, Black_Share, Native_Share, Latino_Share, Asian_Share, Other_share, race_homogeneity) %>% 
  GG_heatmap()

GG_save("figures/cors_heatmap.png")

#plot
GG_scatter(d, "race_homogeneity", "S", case_names = F) +
  scale_x_continuous("Racial homogeneity (based on self-reported race/ethnicity)", breaks = seq(0, 1, .1)) +
  scale_y_continuous("General social status factor (S-factor)") +
  geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

GG_save("figures/USA_homo_S.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#plot
GG_scatter(d, "race_homogeneity", "Homicide_Rate", case_names = F) +
  scale_x_continuous("Racial homogeneity (based on self-reported race/ethnicity)", breaks = seq(0, 1, .1)) +
  scale_y_continuous("Homicide rate (2010-2020)") +
  geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

GG_save("figures/USA_homo_homicide_rate.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
#models
list(
  ols(S ~ race_homogeneity, data = d),
  ols(S ~ race_homogeneity + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share, data = d),
  ols(S ~ race_homogeneity + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g, data = d),
  ols(S ~ race_homogeneity + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g + lon + lat + temp + precip, data = d)
) %>% 
  summarize_models()
## # A tibble: 14 × 5
##    `Predictor/Model` `1`              `2`              `3`              `4`     
##    <chr>             <chr>            <chr>            <chr>            <chr>   
##  1 Intercept         -1.30 (0.062***) 1.10 (0.129***)  0.39 (0.107***)  -0.15 (…
##  2 race_homogeneity  1.46 (0.091***)  -1.13 (0.135***) -0.67 (0.111***) -0.94 (…
##  3 Black_Share       <NA>             -4.99 (0.140***) -2.42 (0.133***) -1.62 (…
##  4 Native_Share      <NA>             -3.61 (0.176***) -0.90 (0.163***) -1.82 (…
##  5 Latino_Share      <NA>             -2.22 (0.128***) -0.63 (0.113***) -0.65 (…
##  6 Asian_Share       <NA>             8.94 (0.491***)  5.18 (0.414***)  4.60 (0…
##  7 Other_share       <NA>             -2.96 (0.765***) -0.18 (0.632)    -0.10 (…
##  8 g                 <NA>             <NA>             2.15 (0.056***)  2.02 (0…
##  9 lon               <NA>             <NA>             <NA>             0.00 (0…
## 10 lat               <NA>             <NA>             <NA>             0.02 (0…
## 11 temp              <NA>             <NA>             <NA>             -0.03 (…
## 12 precip            <NA>             <NA>             <NA>             0.00 (0…
## 13 R2 adj.           0.076            0.497            0.660            0.739   
## 14 N                 3123             3123             3095             1520
list(
  ols(Homicide_Rate ~ race_homogeneity, data = d),
  ols(Homicide_Rate ~ race_homogeneity +  Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share, data = d),
  ols(Homicide_Rate ~ race_homogeneity +  Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g, data = d),
    ols(Homicide_Rate ~ race_homogeneity +  Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g + lon + lat + temp + precip, data = d)
) %>% 
  summarize_models()
## # A tibble: 14 × 5
##    `Predictor/Model` `1`               `2`               `3`              `4`   
##    <chr>             <chr>             <chr>             <chr>            <chr> 
##  1 Intercept         12.28 (0.294***)  -0.37 (0.629)     0.76 (0.597)     3.87 …
##  2 race_homogeneity  -11.26 (0.434***) 2.43 (0.659***)   1.84 (0.620**)   3.03 …
##  3 Black_Share       <NA>              26.11 (0.686***)  21.24 (0.739***) 19.99…
##  4 Native_Share      <NA>              17.17 (0.858***)  11.89 (0.901***) 14.31…
##  5 Latino_Share      <NA>              5.47 (0.623***)   2.39 (0.626***)  0.56 …
##  6 Asian_Share       <NA>              -13.46 (2.400***) -5.43 (2.301)    2.86 …
##  7 Other_share       <NA>              18.91 (3.705***)  14.16 (3.491***) 17.28…
##  8 g                 <NA>              <NA>              -4.26 (0.309***) -3.72…
##  9 lon               <NA>              <NA>              <NA>             -0.03…
## 10 lat               <NA>              <NA>              <NA>             -0.16…
## 11 temp              <NA>              <NA>              <NA>             -0.02…
## 12 precip            <NA>              <NA>              <NA>             0.00 …
## 13 R2 adj.           0.177             0.528             0.572            0.593 
## 14 N                 3136              3136              3100             1524
#standardized effects
d_std = d %>% 
  select(g, S, Non_Hispanic_White_Share, Black_Share, Native_Share, Latino_Share, Asian_Share, Other_share, race_homogeneity, Homicide_Rate, lon, lat, temp, precip) %>% 
  df_standardize(exclude_range_01 = F)
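df_standardize z-scores the selected columns so the slopes in the models below can be read as standardized betas (exclude_range_01 = F presumably forces standardization even for proportion-type variables bounded in 0-1, such as the shares). A rough dplyr equivalent of the z-scoring step, shown on a few illustrative columns and ignoring that kirkegaard-specific option:

#rough equivalent (assumption): plain z-scoring of each selected column
d_std_check = d %>% 
  select(S, race_homogeneity, Black_Share) %>% 
  mutate(across(everything(), ~ as.numeric(scale(.x))))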

#models
list(
  ols(S ~ race_homogeneity, data = d_std),
  ols(S ~ race_homogeneity + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share, data = d_std),
  ols(S ~ race_homogeneity + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g, data = d_std),
  ols(S ~ race_homogeneity + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g + lon + lat + temp + precip, data = d_std)
) %>% 
  summarize_models()
## # A tibble: 14 × 5
##    `Predictor/Model` `1`             `2`              `3`              `4`      
##    <chr>             <chr>           <chr>            <chr>            <chr>    
##  1 Intercept         0.00 (0.017)    0.00 (0.013)     0.00 (0.010)     -0.01 (0…
##  2 race_homogeneity  0.28 (0.017***) -0.21 (0.025***) -0.13 (0.021***) -0.18 (0…
##  3 Black_Share       <NA>            -0.75 (0.021***) -0.36 (0.020***) -0.24 (0…
##  4 Native_Share      <NA>            -0.30 (0.015***) -0.07 (0.013***) -0.15 (0…
##  5 Latino_Share      <NA>            -0.32 (0.019***) -0.09 (0.016***) -0.09 (0…
##  6 Asian_Share       <NA>            0.27 (0.015***)  0.16 (0.013***)  0.14 (0.…
##  7 Other_share       <NA>            -0.07 (0.017***) 0.00 (0.014)     0.00 (0.…
##  8 g                 <NA>            <NA>             0.57 (0.015***)  0.53 (0.…
##  9 lon               <NA>            <NA>             <NA>             -0.05 (0…
## 10 lat               <NA>            <NA>             <NA>             0.14 (0.…
## 11 temp              <NA>            <NA>             <NA>             -0.14 (0…
## 12 precip            <NA>            <NA>             <NA>             -0.10 (0…
## 13 R2 adj.           0.076           0.497            0.660            0.739    
## 14 N                 3123            3123             3095             1520
list(
  ols(Homicide_Rate ~ race_homogeneity, data = d_std),
  ols(Homicide_Rate ~ race_homogeneity +  Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share, data = d_std),
  ols(Homicide_Rate ~ race_homogeneity +  Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g, data = d_std),
  ols(Homicide_Rate ~ race_homogeneity +  Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g + lon + lat + temp + precip, data = d_std)
) %>% 
  summarize_models()
## # A tibble: 14 × 5
##    `Predictor/Model` `1`              `2`              `3`              `4`     
##    <chr>             <chr>            <chr>            <chr>            <chr>   
##  1 Intercept         0.00 (0.016)     0.00 (0.012)     0.00 (0.011)     0.01 (0…
##  2 race_homogeneity  -0.42 (0.016***) 0.09 (0.025***)  0.07 (0.023**)   0.11 (0…
##  3 Black_Share       <NA>             0.77 (0.020***)  0.63 (0.022***)  0.59 (0…
##  4 Native_Share      <NA>             0.28 (0.014***)  0.19 (0.015***)  0.23 (0…
##  5 Latino_Share      <NA>             0.16 (0.018***)  0.07 (0.018***)  0.02 (0…
##  6 Asian_Share       <NA>             -0.08 (0.015***) -0.03 (0.014)    0.02 (0…
##  7 Other_share       <NA>             0.08 (0.016***)  0.06 (0.016***)  0.08 (0…
##  8 g                 <NA>             <NA>             -0.22 (0.016***) -0.19 (…
##  9 lon               <NA>             <NA>             <NA>             -0.08 (…
## 10 lat               <NA>             <NA>             <NA>             -0.18 (…
## 11 temp              <NA>             <NA>             <NA>             -0.02 (…
## 12 precip            <NA>             <NA>             <NA>             0.00 (0…
## 13 R2 adj.           0.177            0.528            0.572            0.593   
## 14 N                 3136             3136             3100             1524

We see the expected pattern for a situation where there is a large majority group and within-group variation in cognitive ability. The bivariate analysis shows a clear relationship between racial homogeneity and better outcomes. When we include just the main effects of the non-largest groups, however, the beta shrinks substantially (by 83%), though it still retains some validity. When we furthermore add cognitive ability, the homogeneity beta is no longer statistically significant (p > alpha), despite the large sample size. Finally, looking for non-linear effects of homogeneity also yields nothing beyond a 0.2% increase in model variance explained.
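The non-linearity check mentioned above is not shown in this section. A sketch of how it can be run with the already-loaded rms package, using a restricted cubic spline on homogeneity; the knot count and the comparison model are assumptions, not necessarily the exact specification used:

#hedged sketch: linear vs. restricted cubic spline term for homogeneity
fit_linear = ols(S ~ race_homogeneity + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g, data = d_std)
fit_spline = ols(S ~ rcs(race_homogeneity, 4) + Black_Share + Native_Share + Latino_Share + Asian_Share + Other_share + g, data = d_std)
#compare model fit; a trivial gain for the spline model indicates no real non-linearity
c(linear = fit_linear$stats["R2"], spline = fit_spline$stats["R2"])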