Init

options(
  digits = 2,
  contrasts = c("contr.treatment", "contr.treatment")
)

library(pacman)
p_load(
  kirkegaard,
  rms,
  ggeffects,
  tidymodels,
  recipes,
  parsnip,
  DirichletReg,
  randomForestSRC
)

theme_set(theme_bw())

Functions

#largest value by row
rowwise_which_max = function(x) {
  x %>% 
    as_tibble() %>% 
    plyr::aaply(.margins = 1, .fun = function(dd) {
        which.max(unlist(dd))
      }, .expand = F)
}

#rowwise get most likely group
get_most_likely = function(model = NULL, preds = NULL) {
  
  #get preds
  if ("rms" %in% class(model)) {
    # browser()
    model %>% 
      predict(type = "fitted.ind") %>% 
      rowwise_which_max() ->
      y
    
    #fix names
    y = mapvalues(y, from = unique(y) %>% sort(), to = bw_model2$freq %>% names()) %>% 
      factor(levels = bw_model2$freq %>% names())
    
  } else {
    model %>% 
      fitted() %>% 
      as_tibble() %>% 
      plyr::aaply(.margins = 1, .fun = function(dd) {
        which.max(unlist(dd))
      }, .expand = F) ->
      y
  }

  y
}

Data

#load data from PING 1 project
d = read_rds("data/data_out.rds")

#recode SIRE
v_SIREs = c("Hispanic", "Pacific_Islander", "Asian", "African_American", "American_Indian", "Other", "White")

d$SIRE_combos_all = map_chr(1:nrow(d), .f = function(row) {
  
  #extract SIREs
  SIREs = d[row, v_SIREs] %>% unlist
  
  #only one?
  if (sum(SIREs) == 1) {
    #then return the name of it
    return(names(SIREs)[which(SIREs)])
  } else if (sum(SIREs) == 0) {
    #did not select any
    return("None")
  } else {
    #return combo
    return(str_c(names(SIREs)[which(SIREs)], collapse = ", "))
  }
})

#results?
d$SIRE_combos_all %>% table2() %>% print(n=Inf)

## # A tibble: 45 × 3
##    Group                                                      Count Percent
##    <chr>                                                      <dbl>   <dbl>
##  1 White                                                        571 41.0   
##  2 African_American                                             140 10.1   
##  3 Hispanic, White                                              140 10.1   
##  4 Asian                                                        122  8.77  
##  5 Hispanic                                                      71  5.10  
##  6 Asian, White                                                  60  4.31  
##  7 Pacific_Islander, Asian, White                                32  2.30  
##  8 Hispanic, African_American                                    29  2.08  
##  9 African_American, White                                       25  1.80  
## 10 Other                                                         24  1.73  
## 11 Pacific_Islander, Asian                                       17  1.22  
## 12 Pacific_Islander                                              16  1.15  
## 13 Hispanic, American_Indian, White                              13  0.935 
## 14 Hispanic, Asian                                               13  0.935 
## 15 Hispanic, Pacific_Islander, Asian, White                      13  0.935 
## 16 American_Indian, White                                        11  0.791 
## 17 Hispanic, Pacific_Islander, Asian                             11  0.791 
## 18 Pacific_Islander, White                                        9  0.647 
## 19 Hispanic, American_Indian                                      7  0.503 
## 20 Hispanic, Asian, White                                         6  0.431 
## 21 African_American, American_Indian                              5  0.359 
## 22 Asian, American_Indian, White                                  5  0.359 
## 23 Hispanic, Pacific_Islander                                     5  0.359 
## 24 Pacific_Islander, Asian, American_Indian, White                5  0.359 
## 25 American_Indian                                                4  0.288 
## 26 Asian, African_American                                        4  0.288 
## 27 Hispanic, African_American, American_Indian                    4  0.288 
## 28 African_American, American_Indian, White                       3  0.216 
## 29 Pacific_Islander, African_American, White                      3  0.216 
## 30 Pacific_Islander, Asian, African_American, White               3  0.216 
## 31 Asian, African_American, White                                 2  0.144 
## 32 Hispanic, African_American, American_Indian, White             2  0.144 
## 33 Hispanic, Asian, American_Indian, White                        2  0.144 
## 34 Hispanic, Pacific_Islander, Asian, African_American            2  0.144 
## 35 Hispanic, Pacific_Islander, Asian, African_American, White     2  0.144 
## 36 Hispanic, Pacific_Islander, White                              2  0.144 
## 37 Hispanic, African_American, White                              1  0.0719
## 38 Hispanic, Asian, African_American, White                       1  0.0719
## 39 Hispanic, Asian, American_Indian                               1  0.0719
## 40 Hispanic, Pacific_Islander, African_American, White            1  0.0719
## 41 Hispanic, Pacific_Islander, American_Indian                    1  0.0719
## 42 Hispanic, Pacific_Islander, Asian, American_Indian             1  0.0719
## 43 Pacific_Islander, Asian, African_American                      1  0.0719
## 44 Pacific_Islander, Asian, American_Indian                       1  0.0719
## 45 <NA>                                                           0  0

#lumped old
d$SIRE_common_old = d$SIRE_common

d$SIRE_common = fct_lump_min(d$SIRE_combos_all, min = 20, other_level = "Remainder")

#compare
d$SIRE_common %>% table2() %>% print(n=Inf)

## # A tibble: 12 × 3
##    Group                          Count Percent
##    <chr>                          <dbl>   <dbl>
##  1 White                            571   41.0 
##  2 Remainder                        177   12.7 
##  3 African_American                 140   10.1 
##  4 Hispanic, White                  140   10.1 
##  5 Asian                            122    8.77
##  6 Hispanic                          71    5.10
##  7 Asian, White                      60    4.31
##  8 Pacific_Islander, Asian, White    32    2.30
##  9 Hispanic, African_American        29    2.08
## 10 African_American, White           25    1.80
## 11 Other                             24    1.73
## 12 <NA>                               0    0

d$SIRE_common_old %>% table2() %>% print(n=Inf)

## # A tibble: 10 × 3
##    Group                          Count Percent
##    <chr>                          <dbl>   <dbl>
##  1 White                            571   41.0 
##  2 Other                            226   16.2 
##  3 African_American                 140   10.1 
##  4 Hispanic, White                  140   10.1 
##  5 Asian                            122    8.77
##  6 Hispanic                          71    5.10
##  7 Asian, White                      60    4.31
##  8 Pacific_Islander, Asian, White    32    2.30
##  9 Hispanic, African_American        29    2.08
## 10 <NA>                               0    0

#remove persons without relevant data
#no such persons
# d %<>% filter(
#   !is.na(European),
#   !is.na(White)
# )

#subsets
BWs = d %>% filter(White | African_American, !Asian, !Hispanic, !Other, !American_Indian, !Pacific_Islander, (African+European) > .95)

#relevel factors
BWs$BW = case_when(
  BWs$White & BWs$African_American ~ "White + Black",
  BWs$White & !BWs$African_American ~ "White",
  !BWs$White & BWs$African_American ~ "Black"
) %>% ordered(levels = c("White", "White + Black", "Black"))

BWs$BW %>% table2()

BWs$justwhite = BWs$BW %>% as.character() %>% str_detect("Black", negate = T)

Simple results

d$Age %>% describe2()

d$Sex %>% table2(include_NA = F)

d$SIRE_standard %>% table2(include_NA = F)

Simple models, simple data

Data restricted to people who report being white, black, or both. We use simple models: logistic ordinal regression and linear regression.

SIRE from ancestry

Predict self-report from ancestry data.

#ancetries correlate close to -1.0
GG_scatter(BWs, "European", "African")

## `geom_smooth()` using formula 'y ~ x'

#models
(bw_model1 = lrm(BW ~ European, data = BWs)) #variant 1

## Logistic Regression Model
##  
##  lrm(formula = BW ~ European, data = BWs)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           649    LR chi2     728.19    R2       0.933    C       0.994    
##   White        497    d.f.             1    g        4.967    Dxy     0.987    
##   White + Black 22    Pr(> chi2) <0.0001    gr     143.576    gamma   0.995    
##   Black        130                          gp       0.349    tau-a   0.368    
##  max |deriv| 7e-09                          Brier    0.008                     
##  
##                   Coef     S.E.   Wald Z Pr(>|Z|)
##  y>=White + Black  12.3886 1.6947  7.31  <0.0001 
##  y>=Black           8.0003 0.9404  8.51  <0.0001 
##  European         -17.1091 1.8503 -9.25  <0.0001 
##

(bw_model2 = lrm(BW ~ African, data = BWs)) #2

## Logistic Regression Model
##  
##  lrm(formula = BW ~ African, data = BWs)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           649    LR chi2     726.47    R2       0.932    C       0.994    
##   White        497    d.f.             1    g        4.854    Dxy     0.988    
##   White + Black 22    Pr(> chi2) <0.0001    gr     128.215    gamma   0.996    
##   Black        130                          gp       0.348    tau-a   0.368    
##  max |deriv| 4e-09                          Brier    0.008                     
##  
##                   Coef    S.E.   Wald Z Pr(>|Z|)
##  y>=White + Black -4.6422 0.4304 -10.79 <0.0001 
##  y>=Black         -8.9295 1.0750  -8.31 <0.0001 
##  African          16.9240 1.8143   9.33 <0.0001 
##

#visualize
predict_SIRE = tibble(
  African = seq(0, 1, length.out = 10e3),
  European = 1 - African
)

#SIRE, from African
bw_model2_preds = predict(bw_model2, newdata = predict_SIRE, type = "fitted.ind")
predict_SIRE$prob_White = bw_model2_preds[, 1]
predict_SIRE$prob_White_and_Black = bw_model2_preds[, 2]
predict_SIRE$prob_Black = bw_model2_preds[, 3]


#ancestry only, SIRE
predict_SIRE %>% 
  pivot_longer(cols = starts_with("prob_")) %>% 
  mutate(
    name = case_when(
      name == "prob_White" ~ "White",
      name == "prob_White_and_Black" ~ "White + Black",
      name == "prob_Black" ~ "Black"
    ) %>% ordered(levels = c("White", "White + Black", "Black"))
  ) %>% 
  ggplot(aes(African, value, color = name)) +
  geom_line() +
  theme_bw() +
  scale_color_discrete("Social race") +
  scale_x_continuous("African ancestry (genetic)", labels = scales::percent) +
  scale_y_continuous("Probability of being...", labels = scales::percent)

GG_save("figs/AFR_to_SIRE.png")

#concordance in real data
BWs$fitted_ml = bw_model2 %>% get_most_likely() %>% ordered()

#confusion matrix
table(true = BWs$BW, predicted = BWs$fitted_ml)

##                predicted
## true            White White + Black Black
##   White           496             0     1
##   White + Black     5            13     4
##   Black             0             3   127

#concordances
(BWs$BW == BWs$fitted_ml) %>% mean()

## [1] 0.98

BWs %>% 
  group_by(BW) %>% 
  summarise(concordance = mean(BW == fitted_ml))

#simplified case with just white vs. black (including BW)
(bw_model2b = lrm(justwhite ~ African, data = BWs))

## Logistic Regression Model
##  
##  lrm(formula = justwhite ~ African, data = BWs)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           649    LR chi2     646.64    R2       0.951    C       0.995    
##   FALSE        152    d.f.             1    g        5.288    Dxy     0.991    
##   TRUE         497    Pr(> chi2) <0.0001    gr     197.948    gamma   0.998    
##  max |deriv| 2e-09                          gp       0.349    tau-a   0.356    
##                                             Brier    0.008                     
##  
##            Coef     S.E.   Wald Z Pr(>|Z|)
##  Intercept   4.7813 0.4792  9.98  <0.0001 
##  African   -18.4383 2.7559 -6.69  <0.0001 
##

BWs$fitted_ml_white = bw_model2b %>% predict(type = "fitted.ind") %>% round()
(BWs$justwhite == BWs$fitted_ml_white) %>% mean() %>% print(digits = 3)

## [1] 0.991

Ancestry from SIRE

Predict ancestry values from self-report data.

(bw_model1 = ols(European ~ BW, data = BWs)) #variant 1

## Linear Regression Model
##  
##  ols(formula = European ~ BW, data = BWs)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs     649    LR chi2    2014.99    R2       0.955    
##  sigma0.0715    d.f.             2    R2 adj   0.955    
##  d.f.    646    Pr(> chi2)  0.0000    g        0.278    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.8857447  0.0006753  0.0062453  0.0062453  0.4367222 
##  
##  
##                   Coef    S.E.   t       Pr(>|t|)
##  Intercept         0.9938 0.0032  309.91 <0.0001 
##  BW=White + Black -0.3952 0.0156  -25.37 <0.0001 
##  BW=Black         -0.8198 0.0070 -116.41 <0.0001 
##

(bw_model2 = ols(African ~ BW, data = BWs)) #works a little better in this case

## Linear Regression Model
##  
##  ols(formula = African ~ BW, data = BWs)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs     649    LR chi2    2014.40    R2       0.955    
##  sigma0.0714    d.f.             2    R2 adj   0.955    
##  d.f.    646    Pr(> chi2)  0.0000    g        0.278    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -0.431846 -0.003267 -0.003267 -0.003267  0.888723 
##  
##  
##                   Coef   S.E.   t      Pr(>|t|)
##  Intercept        0.0033 0.0032   1.02 0.3078  
##  BW=White + Black 0.3969 0.0155  25.53 <0.0001 
##  BW=Black         0.8179 0.0070 116.33 <0.0001 
##

#show the predictions
bw_model1 %>% ggpredict() %>% plot()

## $BW

bw_model2 %>% ggpredict() %>% plot()

## $BW

#of course, this is quite similar to just doing violin plots and seeing the validity
BWs %>% 
  ggplot(aes(BW, European, fill = BW)) +
  geom_violin(scale = "width")

BWs %>% 
  ggplot(aes(BW, African, fill = BW)) +
  geom_violin(scale = "width") +
  scale_y_continuous("African ancestry", labels = scales::percent) +
  scale_x_discrete("Social race") +
  scale_fill_discrete("Social race", guide = "none")

GG_save("figs/SIRE_to_AFR.png")

Simple models, standard encoding of race

We use all the data. We use simple models: multinomial regression and dirichlet regression.

SIRE from ancestry

Predict self-report from ancestry data.

#predict what distribution?
d$SIRE_standard %>% table2()

#make a recipe
all_recipe = recipe(SIRE_standard ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = multinom_reg() %>% 
  set_engine("nnet")

#resampling method
#actually the same as above
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  fit_resamples(
    all_folds,
    control = control_resamples(save_pred = TRUE)
    )

## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold04: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold07: internal: No observations were detected in `truth` for level(s): 'Pacifi...

## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold16: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold18: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Americ...

## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Americ...

#metrics
collect_metrics(all_fit)

#concordance in real data
all_fit %>% 
  collect_predictions() %>% 
  arrange(.row) %>% 
  pull(.pred_class) ->
  d$fitted


#confusion matrix
table(true = d$SIRE_standard, predicted = d$fitted)

##                   predicted
## true               White African_American American_Indian Asian Hispanic
##   White              556                1               0     0        7
##   African_American     0              133               0     0        4
##   American_Indian      2                1               0     0        1
##   Asian                1                0               0   106        2
##   Hispanic            37               12               0     5      245
##   Multi_ethnic        21                9               0    13        9
##   Other                9                5               0     4        2
##   Pacific_Islander     0                0               0     0        0
##                   predicted
## true               Multi_ethnic Other Pacific_Islander
##   White                       7     0                0
##   African_American            3     0                0
##   American_Indian             0     0                0
##   Asian                      13     0                0
##   Hispanic                   29     0                0
##   Multi_ethnic              125     3                6
##   Other                       4     0                0
##   Pacific_Islander            7     0                9

#concordances
(d$SIRE_standard == d$fitted) %>% mean()

## [1] 0.84

d %>% 
  group_by(SIRE_standard) %>% 
  summarise(concordance = mean(SIRE_standard == fitted))

Ancestry from SIRE

Predict ancestry values from self-report data.

#direchlet reg
#prep Y data
#check data first, do the rows sum to 1?
d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% rowSums() %>% table()

## .
## 0.99999       1 1.00001 
##      32    1241     118

#close enough not to worry!
#transform
direct_y = DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European))

## Warning in DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, : not all rows sum up to 1 => normalization forced
##   some entries are 0 or 1 => transformation forced

#fit
dirichreg_fit = DirichReg(direct_y ~ SIRE_standard, data = d, model = "alternative")

dirichreg_fit %>% summary()

## Call:
## DirichReg(formula = direct_y ~ SIRE_standard, data = d, model = "alternative")
## 
## Standardized Residuals:
##                  Min     1Q  Median     3Q  Max
## African        -0.91  -0.46   -0.43  -0.18  7.5
## Amerindian     -0.86  -0.46   -0.44  -0.43  3.4
## East_Asian     -2.91  -0.43   -0.42  -0.15  7.2
## Oceanian       -0.76  -0.45   -0.42  -0.42  1.2
## Central_Asian  -0.58  -0.44   -0.44  -0.44  7.6
## European       -2.90  -0.35    0.66   1.08  6.6
## 
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: African
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Amerindian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     0.0187     0.0585    0.32  0.74979    
## SIRE_standardAfrican_American  -2.5552     0.1112  -22.98  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.0409     0.6953   -0.06  0.95308    
## SIRE_standardAsian             -0.0258     0.1392   -0.19  0.85317    
## SIRE_standardHispanic           0.3254     0.0934    3.49  0.00049 ***
## SIRE_standardMulti_ethnic      -0.2793     0.1172   -2.38  0.01717 *  
## SIRE_standardOther             -0.6419     0.2852   -2.25  0.02438 *  
## SIRE_standardPacific_Islander  -0.1629     0.3543   -0.46  0.64562    
## ------------------------------------------------------------------
## Coefficients for variable no. 3: East_Asian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -0.0284     0.0585   -0.49    0.627    
## SIRE_standardAfrican_American  -2.5360     0.1111  -22.82  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.3000     0.6980   -0.43    0.667    
## SIRE_standardAsian              2.5754     0.1151   22.37  < 2e-16 ***
## SIRE_standardHispanic          -0.5519     0.0954   -5.79  7.2e-09 ***
## SIRE_standardMulti_ethnic       1.1203     0.1117   10.03  < 2e-16 ***
## SIRE_standardOther             -0.6342     0.2852   -2.22    0.026 *  
## SIRE_standardPacific_Islander   2.3691     0.2842    8.34  < 2e-16 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Oceanian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -0.0496     0.0585   -0.85    0.397    
## SIRE_standardAfrican_American  -2.5348     0.1111  -22.81  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.0907     0.6965   -0.13    0.896    
## SIRE_standardAsian              0.1769     0.1390    1.27    0.203    
## SIRE_standardHispanic          -0.7576     0.0956   -7.93  2.2e-15 ***
## SIRE_standardMulti_ethnic       0.0243     0.1169    0.21    0.835    
## SIRE_standardOther             -0.6829     0.2854   -2.39    0.017 *  
## SIRE_standardPacific_Islander   1.5380     0.3268    4.71  2.5e-06 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 5: Central_Asian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     0.0194     0.0585    0.33    0.741    
## SIRE_standardAfrican_American  -2.5764     0.1112  -23.17   <2e-16 ***
## SIRE_standardAmerican_Indian   -0.3478     0.6980   -0.50    0.618    
## SIRE_standardAsian              0.2067     0.1387    1.49    0.136    
## SIRE_standardHispanic          -0.8868     0.0956   -9.28   <2e-16 ***
## SIRE_standardMulti_ethnic      -0.2691     0.1172   -2.30    0.022 *  
## SIRE_standardOther             -0.2907     0.2837   -1.02    0.306    
## SIRE_standardPacific_Islander  -0.1636     0.3543   -0.46    0.644    
## ------------------------------------------------------------------
## Coefficients for variable no. 6: European
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     2.7443     0.0495   55.41  < 2e-16 ***
## SIRE_standardAfrican_American  -3.9422     0.1080  -36.50  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.4092     0.5362   -0.76     0.45    
## SIRE_standardAsian             -2.6218     0.1353  -19.38  < 2e-16 ***
## SIRE_standardHispanic          -1.2190     0.0761  -16.02  < 2e-16 ***
## SIRE_standardMulti_ethnic      -0.8665     0.0940   -9.22  < 2e-16 ***
## SIRE_standardOther             -1.0020     0.2275   -4.40  1.1e-05 ***
## SIRE_standardPacific_Islander  -2.3967     0.3502   -6.84  7.7e-12 ***
## ------------------------------------------------------------------
## 
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9739     0.0216    45.1   <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood: 32591 on 41 df (186 BFGS + 2 NR Iterations)
## AIC: -65099, BIC: -64884
## Number of Observations: 1391
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative

#prep model preds
dirichreg_fit_preds = fitted(dirichreg_fit) %>%
  as.data.frame() %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
dirichreg_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_count() +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)

## `geom_smooth()` using formula 'y ~ x'

#overall correlation, across variables
GG_scatter(dirichreg_fit_preds, "pred_value", "true_value")

## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
dirichreg_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})

Simple models, common combination encoding

Complex self-report data. Get the exact combination right.
Same simple models as before.

SIRE from ancestry

Predict ancestry values from self-report data.

#predict what distribution?
d$SIRE_common %>% table2()

#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = multinom_reg() %>% 
  set_engine("nnet")

#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  fit_resamples(all_folds)

## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...

## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...

## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...

## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...

#metrics
collect_metrics(all_fit)

Ancestry from SIRE

Predict ancestry values from self-report data.

#direchlet reg
#prep Y data
#check data first, do the rows sum to 1?
d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% rowSums() %>% table()

## .
## 0.99999       1 1.00001 
##      32    1241     118

#close enough not to worry!
#transform
direct_y = DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European))

## Warning in DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, : not all rows sum up to 1 => normalization forced
##   some entries are 0 or 1 => transformation forced

#fit
dirichreg_fit = DirichReg(direct_y ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d, model = "alternative")

dirichreg_fit %>% summary()

## Call:
## DirichReg(formula = direct_y ~ Hispanic + Pacific_Islander + Asian +
## African_American + American_Indian + Other + White, data = d, model =
## "alternative")
## 
## Standardized Residuals:
##                  Min     1Q  Median     3Q    Max
## African        -2.11  -0.44   -0.44  -0.15   8.37
## Amerindian     -1.33  -0.45   -0.45  -0.32   2.90
## East_Asian     -3.85  -0.44   -0.44  -0.30   6.64
## Oceanian       -1.01  -0.44   -0.42  -0.42   0.59
## Central_Asian  -0.58  -0.45   -0.43  -0.40  11.86
## European       -3.33  -0.15    0.80   1.08   5.18
## 
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: African
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Amerindian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.1677     0.0951   -1.76    0.078 .  
## HispanicTRUE           0.8352     0.0841    9.92   <2e-16 ***
## Pacific_IslanderTRUE  -0.2997     0.1373   -2.18    0.029 *  
## AsianTRUE              0.0814     0.1022    0.80    0.426    
## African_AmericanTRUE  -2.5227     0.1081  -23.35   <2e-16 ***
## American_IndianTRUE    0.1738     0.1613    1.08    0.281    
## OtherTRUE             -0.4950     0.2937   -1.69    0.092 .  
## WhiteTRUE              0.1888     0.0911    2.07    0.038 *  
## ------------------------------------------------------------------
## Coefficients for variable no. 3: East_Asian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.29761    0.08929    3.33  0.00086 ***
## HispanicTRUE         -0.83067    0.08641   -9.61  < 2e-16 ***
## Pacific_IslanderTRUE  0.74637    0.12082    6.18  6.5e-10 ***
## AsianTRUE             2.18801    0.09410   23.25  < 2e-16 ***
## African_AmericanTRUE -2.74589    0.10610  -25.88  < 2e-16 ***
## American_IndianTRUE   0.00175    0.15871    0.01  0.99121    
## OtherTRUE            -1.00153    0.29216   -3.43  0.00061 ***
## WhiteTRUE            -0.30568    0.08369   -3.65  0.00026 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Oceanian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.4077     0.0969   -4.20  2.6e-05 ***
## HispanicTRUE          -0.5220     0.0893   -5.85  5.0e-09 ***
## Pacific_IslanderTRUE   1.2534     0.1338    9.37  < 2e-16 ***
## AsianTRUE              0.2949     0.1027    2.87   0.0041 ** 
## African_AmericanTRUE  -2.2268     0.1076  -20.69  < 2e-16 ***
## American_IndianTRUE    0.0109     0.1669    0.07   0.9480    
## OtherTRUE             -0.3695     0.2946   -1.25   0.2098    
## WhiteTRUE              0.3084     0.0920    3.35   0.0008 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 5: Central_Asian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.5133     0.0961   -5.34  9.1e-08 ***
## HispanicTRUE          -0.4859     0.0880   -5.52  3.4e-08 ***
## Pacific_IslanderTRUE  -0.0153     0.1369   -0.11     0.91    
## AsianTRUE              0.4645     0.1014    4.58  4.6e-06 ***
## African_AmericanTRUE  -2.0995     0.1068  -19.66  < 2e-16 ***
## American_IndianTRUE    0.0181     0.1666    0.11     0.91    
## OtherTRUE              0.2225     0.2923    0.76     0.45    
## WhiteTRUE              0.4655     0.0913    5.10  3.4e-07 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 6: European
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            1.2473     0.0862   14.48  < 2e-16 ***
## HispanicTRUE          -0.4430     0.0731   -6.06  1.4e-09 ***
## Pacific_IslanderTRUE  -0.3185     0.1232   -2.59   0.0097 ** 
## AsianTRUE             -0.5920     0.0904   -6.55  5.9e-11 ***
## African_AmericanTRUE  -2.4100     0.0978  -24.64  < 2e-16 ***
## American_IndianTRUE    0.2369     0.1380    1.72   0.0862 .  
## OtherTRUE              0.6528     0.2338    2.79   0.0052 ** 
## WhiteTRUE              1.6093     0.0837   19.22  < 2e-16 ***
## ------------------------------------------------------------------
## 
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.159      0.023    50.4   <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood: 32869 on 41 df (201 BFGS + 2 NR Iterations)
## AIC: -65657, BIC: -65442
## Number of Observations: 1391
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative

#prep model preds
dirichreg_fit_preds = fitted(dirichreg_fit) %>%
  as.data.frame() %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
dirichreg_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_count() +
  geom_smooth(method = "lm") +
  scale_x_continuous("Predicted value", labels = scales::percent) +
  scale_y_continuous("Genetic ancestry", labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)

## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/common_dirichlet.png")

## `geom_smooth()` using formula 'y ~ x'

#overall correlation, across variables
GG_scatter(dirichreg_fit_preds, "pred_value", "true_value")

## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
cor.test(dirichreg_fit_preds$pred_value, dirichreg_fit_preds$true_value)

## 
##  Pearson's product-moment correlation
## 
## data:  dirichreg_fit_preds$pred_value and dirichreg_fit_preds$true_value
## t = 219, df = 8344, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.92 0.93
## sample estimates:
##  cor 
## 0.92

cor(dirichreg_fit_preds$pred_value, dirichreg_fit_preds$true_value)^2

## [1] 0.85

dirichreg_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})

RF, common combination encoding

We apply standard random forest regression/classification
Complex self-report data. Get the exact combination right. Same as above.

SIRE from ancestry

Predict ancestry values from self-report data.

#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = rand_forest(mtry = tune()) %>% 
  set_engine("ranger") %>% 
  set_mode("classification")

#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  tune_grid(
    resamples = all_folds,
    grid = 25,
    control = control_grid(
      save_pred = TRUE
    )
  )

## i Creating pre-processing data to finalize unknown parameter: mtry

## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...

## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...

## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...

## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...

#metrics
collect_metrics(all_fit)

Ancestry from SIRE

Predict ancestry values from self-report data.

#fit to all variables at once
rf_all_fit = rfsrc(Multivar(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d)
rf_all_fit

##                          Sample size: 1391
##                      Number of trees: 500
##            Forest terminal node size: 5
##        Average no. of terminal nodes: 29.716
## No. of variables tried at each split: 3
##               Total no. of variables: 7
##               Total no. of responses: 6
##          User has requested response: African
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 879
##                             Analysis: mRF-R
##                               Family: regr+
##                       Splitting rule: mv.mse *random*
##        Number of random split points: 10
##                      (OOB) R squared: 0.87389042
##    (OOB) Requested performance error: 0.009397

#get predictions out of sample
rf_all_fit_preds = rf_all_fit$regrOutput %>% map_df(function(x) {
  x$predicted.oob
}) %>% {
  #unitize
  ./rowSums(.)
} %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
rf_all_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)

## `geom_smooth()` using formula 'y ~ x'

#overall correlation, across variables
GG_scatter(rf_all_fit_preds, "pred_value", "true_value")

## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
rf_all_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})

SVM, common combination encoding

SIRE from ancestry

Predict ancestry values from self-report data.

#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = svm_rbf(
  cost = tune(),
  rbf_sigma = tune()
  ) %>% 
  set_mode("classification")

#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  tune_grid(
    resamples = all_folds,
    grid = 25,
    control = control_grid(
      save_pred = TRUE
    )
  )

## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...

## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...

## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...

## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...

## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...

## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...

## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...

#metrics
collect_metrics(all_fit)

show_best(all_fit, metric = "roc_auc")

show_best(all_fit, metric = "accuracy")

Supplement

Regular OLS for multivariate ancestry prediction

This does not enforce that predictions stay within 0-1, or sum to 1. Still, it works reasonably well.

#just regular ols
all_fit = lm(cbind(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d)
all_fit$coefficients

##                      African Amerindian East_Asian Oceanian Central_Asian
## (Intercept)           0.1269      0.063      0.227  0.01513         0.041
## HispanicTRUE          0.0127      0.174     -0.091 -0.00721        -0.020
## Pacific_IslanderTRUE -0.0305     -0.031      0.166  0.07589        -0.048
## AsianTRUE            -0.0734     -0.048      0.442 -0.01033         0.061
## African_AmericanTRUE  0.5799     -0.069     -0.176 -0.01174        -0.024
## American_IndianTRUE  -0.0099      0.013     -0.025  0.00021        -0.015
## OtherTRUE             0.1295     -0.054     -0.220 -0.01382         0.134
## WhiteTRUE            -0.1090     -0.045     -0.202 -0.01249        -0.028
##                      European
## (Intercept)             0.526
## HispanicTRUE           -0.069
## Pacific_IslanderTRUE   -0.132
## AsianTRUE              -0.371
## African_AmericanTRUE   -0.298
## American_IndianTRUE     0.037
## OtherTRUE               0.024
## WhiteTRUE               0.397

#prep model preds
all_fit_preds = all_fit$fitted.values %>%
  as.data.frame() %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
all_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_count() +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)

## `geom_smooth()` using formula 'y ~ x'

#overall correlation, across variables
GG_scatter(all_fit_preds, "pred_value", "true_value")

## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
all_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})

#bounds
(all_fit_preds$pred_value<0) %>% mean()

## [1] 0.15

(all_fit_preds$pred_value>1) %>% mean()

## [1] 0

Genetic ancestry and social race are nearly interchangeable

23 December, 2021

Init

Functions

Data

Simple results

Simple models, simple data

SIRE from ancestry

Ancestry from SIRE

Simple models, standard encoding of race

SIRE from ancestry

Ancestry from SIRE

Simple models, common combination encoding

SIRE from ancestry

Ancestry from SIRE

RF, common combination encoding

SIRE from ancestry

Ancestry from SIRE

SVM, common combination encoding

SIRE from ancestry

Supplement

Regular OLS for multivariate ancestry prediction

Meta