Init

options(
  digits = 2,
  contrasts = c("contr.treatment", "contr.treatment")
)

library(pacman)
p_load(
  kirkegaard,
  rms,
  ggeffects,
  tidymodels,
  recipes,
  parsnip,
  DirichletReg,
  randomForestSRC
)

theme_set(theme_bw())

Functions

#largest value by row
rowwise_which_max = function(x) {
  x %>% 
    as_tibble() %>% 
    plyr::aaply(.margins = 1, .fun = function(dd) {
        which.max(unlist(dd))
      }, .expand = F)
}

#rowwise get most likely group
get_most_likely = function(model = NULL, preds = NULL) {
  
  #get preds
  if ("rms" %in% class(model)) {
    # browser()
    model %>% 
      predict(type = "fitted.ind") %>% 
      rowwise_which_max() ->
      y
    
    #fix names
    y = mapvalues(y, from = unique(y) %>% sort(), to = bw_model2$freq %>% names()) %>% 
      factor(levels = bw_model2$freq %>% names())
    
  } else {
    model %>% 
      fitted() %>% 
      as_tibble() %>% 
      plyr::aaply(.margins = 1, .fun = function(dd) {
        which.max(unlist(dd))
      }, .expand = F) ->
      y
  }

  y
}

Data

#load data from PING 1 project
d = read_rds("data/data_out.rds")

#recode SIRE
v_SIREs = c("Hispanic", "Pacific_Islander", "Asian", "African_American", "American_Indian", "Other", "White")

d$SIRE_combos_all = map_chr(1:nrow(d), .f = function(row) {
  
  #extract SIREs
  SIREs = d[row, v_SIREs] %>% unlist
  
  #only one?
  if (sum(SIREs) == 1) {
    #then return the name of it
    return(names(SIREs)[which(SIREs)])
  } else if (sum(SIREs) == 0) {
    #did not select any
    return("None")
  } else {
    #return combo
    return(str_c(names(SIREs)[which(SIREs)], collapse = ", "))
  }
})

#results?
d$SIRE_combos_all %>% table2() %>% print(n=Inf)
## # A tibble: 45 × 3
##    Group                                                      Count Percent
##    <chr>                                                      <dbl>   <dbl>
##  1 White                                                        571 41.0   
##  2 African_American                                             140 10.1   
##  3 Hispanic, White                                              140 10.1   
##  4 Asian                                                        122  8.77  
##  5 Hispanic                                                      71  5.10  
##  6 Asian, White                                                  60  4.31  
##  7 Pacific_Islander, Asian, White                                32  2.30  
##  8 Hispanic, African_American                                    29  2.08  
##  9 African_American, White                                       25  1.80  
## 10 Other                                                         24  1.73  
## 11 Pacific_Islander, Asian                                       17  1.22  
## 12 Pacific_Islander                                              16  1.15  
## 13 Hispanic, American_Indian, White                              13  0.935 
## 14 Hispanic, Asian                                               13  0.935 
## 15 Hispanic, Pacific_Islander, Asian, White                      13  0.935 
## 16 American_Indian, White                                        11  0.791 
## 17 Hispanic, Pacific_Islander, Asian                             11  0.791 
## 18 Pacific_Islander, White                                        9  0.647 
## 19 Hispanic, American_Indian                                      7  0.503 
## 20 Hispanic, Asian, White                                         6  0.431 
## 21 African_American, American_Indian                              5  0.359 
## 22 Asian, American_Indian, White                                  5  0.359 
## 23 Hispanic, Pacific_Islander                                     5  0.359 
## 24 Pacific_Islander, Asian, American_Indian, White                5  0.359 
## 25 American_Indian                                                4  0.288 
## 26 Asian, African_American                                        4  0.288 
## 27 Hispanic, African_American, American_Indian                    4  0.288 
## 28 African_American, American_Indian, White                       3  0.216 
## 29 Pacific_Islander, African_American, White                      3  0.216 
## 30 Pacific_Islander, Asian, African_American, White               3  0.216 
## 31 Asian, African_American, White                                 2  0.144 
## 32 Hispanic, African_American, American_Indian, White             2  0.144 
## 33 Hispanic, Asian, American_Indian, White                        2  0.144 
## 34 Hispanic, Pacific_Islander, Asian, African_American            2  0.144 
## 35 Hispanic, Pacific_Islander, Asian, African_American, White     2  0.144 
## 36 Hispanic, Pacific_Islander, White                              2  0.144 
## 37 Hispanic, African_American, White                              1  0.0719
## 38 Hispanic, Asian, African_American, White                       1  0.0719
## 39 Hispanic, Asian, American_Indian                               1  0.0719
## 40 Hispanic, Pacific_Islander, African_American, White            1  0.0719
## 41 Hispanic, Pacific_Islander, American_Indian                    1  0.0719
## 42 Hispanic, Pacific_Islander, Asian, American_Indian             1  0.0719
## 43 Pacific_Islander, Asian, African_American                      1  0.0719
## 44 Pacific_Islander, Asian, American_Indian                       1  0.0719
## 45 <NA>                                                           0  0
#lumped old
d$SIRE_common_old = d$SIRE_common

d$SIRE_common = fct_lump_min(d$SIRE_combos_all, min = 20, other_level = "Remainder")

#compare
d$SIRE_common %>% table2() %>% print(n=Inf)
## # A tibble: 12 × 3
##    Group                          Count Percent
##    <chr>                          <dbl>   <dbl>
##  1 White                            571   41.0 
##  2 Remainder                        177   12.7 
##  3 African_American                 140   10.1 
##  4 Hispanic, White                  140   10.1 
##  5 Asian                            122    8.77
##  6 Hispanic                          71    5.10
##  7 Asian, White                      60    4.31
##  8 Pacific_Islander, Asian, White    32    2.30
##  9 Hispanic, African_American        29    2.08
## 10 African_American, White           25    1.80
## 11 Other                             24    1.73
## 12 <NA>                               0    0
d$SIRE_common_old %>% table2() %>% print(n=Inf)
## # A tibble: 10 × 3
##    Group                          Count Percent
##    <chr>                          <dbl>   <dbl>
##  1 White                            571   41.0 
##  2 Other                            226   16.2 
##  3 African_American                 140   10.1 
##  4 Hispanic, White                  140   10.1 
##  5 Asian                            122    8.77
##  6 Hispanic                          71    5.10
##  7 Asian, White                      60    4.31
##  8 Pacific_Islander, Asian, White    32    2.30
##  9 Hispanic, African_American        29    2.08
## 10 <NA>                               0    0
#remove persons without relevant data
#no such persons
# d %<>% filter(
#   !is.na(European),
#   !is.na(White)
# )

#subsets
BWs = d %>% filter(White | African_American, !Asian, !Hispanic, !Other, !American_Indian, !Pacific_Islander, (African+European) > .95)

#relevel factors
BWs$BW = case_when(
  BWs$White & BWs$African_American ~ "White + Black",
  BWs$White & !BWs$African_American ~ "White",
  !BWs$White & BWs$African_American ~ "Black"
) %>% ordered(levels = c("White", "White + Black", "Black"))

BWs$BW %>% table2()
BWs$justwhite = BWs$BW %>% as.character() %>% str_detect("Black", negate = T)

Simple results

d$Age %>% describe2()
d$Sex %>% table2(include_NA = F)
d$SIRE_standard %>% table2(include_NA = F)

Simple models, simple data

Data restricted to people who report being white, black, or both. We use simple models: logistic ordinal regression and linear regression.

SIRE from ancestry

Predict self-report from ancestry data.

#ancetries correlate close to -1.0
GG_scatter(BWs, "European", "African")
## `geom_smooth()` using formula 'y ~ x'

#models
(bw_model1 = lrm(BW ~ European, data = BWs)) #variant 1
## Logistic Regression Model
##  
##  lrm(formula = BW ~ European, data = BWs)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           649    LR chi2     728.19    R2       0.933    C       0.994    
##   White        497    d.f.             1    g        4.967    Dxy     0.987    
##   White + Black 22    Pr(> chi2) <0.0001    gr     143.576    gamma   0.995    
##   Black        130                          gp       0.349    tau-a   0.368    
##  max |deriv| 7e-09                          Brier    0.008                     
##  
##                   Coef     S.E.   Wald Z Pr(>|Z|)
##  y>=White + Black  12.3886 1.6947  7.31  <0.0001 
##  y>=Black           8.0003 0.9404  8.51  <0.0001 
##  European         -17.1091 1.8503 -9.25  <0.0001 
## 
(bw_model2 = lrm(BW ~ African, data = BWs)) #2
## Logistic Regression Model
##  
##  lrm(formula = BW ~ African, data = BWs)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           649    LR chi2     726.47    R2       0.932    C       0.994    
##   White        497    d.f.             1    g        4.854    Dxy     0.988    
##   White + Black 22    Pr(> chi2) <0.0001    gr     128.215    gamma   0.996    
##   Black        130                          gp       0.348    tau-a   0.368    
##  max |deriv| 4e-09                          Brier    0.008                     
##  
##                   Coef    S.E.   Wald Z Pr(>|Z|)
##  y>=White + Black -4.6422 0.4304 -10.79 <0.0001 
##  y>=Black         -8.9295 1.0750  -8.31 <0.0001 
##  African          16.9240 1.8143   9.33 <0.0001 
## 
#visualize
predict_SIRE = tibble(
  African = seq(0, 1, length.out = 10e3),
  European = 1 - African
)

#SIRE, from African
bw_model2_preds = predict(bw_model2, newdata = predict_SIRE, type = "fitted.ind")
predict_SIRE$prob_White = bw_model2_preds[, 1]
predict_SIRE$prob_White_and_Black = bw_model2_preds[, 2]
predict_SIRE$prob_Black = bw_model2_preds[, 3]


#ancestry only, SIRE
predict_SIRE %>% 
  pivot_longer(cols = starts_with("prob_")) %>% 
  mutate(
    name = case_when(
      name == "prob_White" ~ "White",
      name == "prob_White_and_Black" ~ "White + Black",
      name == "prob_Black" ~ "Black"
    ) %>% ordered(levels = c("White", "White + Black", "Black"))
  ) %>% 
  ggplot(aes(African, value, color = name)) +
  geom_line() +
  theme_bw() +
  scale_color_discrete("Social race") +
  scale_x_continuous("African ancestry (genetic)", labels = scales::percent) +
  scale_y_continuous("Probability of being...", labels = scales::percent)

GG_save("figs/AFR_to_SIRE.png")

#concordance in real data
BWs$fitted_ml = bw_model2 %>% get_most_likely() %>% ordered()

#confusion matrix
table(true = BWs$BW, predicted = BWs$fitted_ml)
##                predicted
## true            White White + Black Black
##   White           496             0     1
##   White + Black     5            13     4
##   Black             0             3   127
#concordances
(BWs$BW == BWs$fitted_ml) %>% mean()
## [1] 0.98
BWs %>% 
  group_by(BW) %>% 
  summarise(concordance = mean(BW == fitted_ml))
#simplified case with just white vs. black (including BW)
(bw_model2b = lrm(justwhite ~ African, data = BWs))
## Logistic Regression Model
##  
##  lrm(formula = justwhite ~ African, data = BWs)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           649    LR chi2     646.64    R2       0.951    C       0.995    
##   FALSE        152    d.f.             1    g        5.288    Dxy     0.991    
##   TRUE         497    Pr(> chi2) <0.0001    gr     197.948    gamma   0.998    
##  max |deriv| 2e-09                          gp       0.349    tau-a   0.356    
##                                             Brier    0.008                     
##  
##            Coef     S.E.   Wald Z Pr(>|Z|)
##  Intercept   4.7813 0.4792  9.98  <0.0001 
##  African   -18.4383 2.7559 -6.69  <0.0001 
## 
BWs$fitted_ml_white = bw_model2b %>% predict(type = "fitted.ind") %>% round()
(BWs$justwhite == BWs$fitted_ml_white) %>% mean() %>% print(digits = 3)
## [1] 0.991

Ancestry from SIRE

Predict ancestry values from self-report data.

(bw_model1 = ols(European ~ BW, data = BWs)) #variant 1
## Linear Regression Model
##  
##  ols(formula = European ~ BW, data = BWs)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs     649    LR chi2    2014.99    R2       0.955    
##  sigma0.0715    d.f.             2    R2 adj   0.955    
##  d.f.    646    Pr(> chi2)  0.0000    g        0.278    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.8857447  0.0006753  0.0062453  0.0062453  0.4367222 
##  
##  
##                   Coef    S.E.   t       Pr(>|t|)
##  Intercept         0.9938 0.0032  309.91 <0.0001 
##  BW=White + Black -0.3952 0.0156  -25.37 <0.0001 
##  BW=Black         -0.8198 0.0070 -116.41 <0.0001 
## 
(bw_model2 = ols(African ~ BW, data = BWs)) #works a little better in this case
## Linear Regression Model
##  
##  ols(formula = African ~ BW, data = BWs)
##  
##                   Model Likelihood    Discrimination    
##                         Ratio Test           Indexes    
##  Obs     649    LR chi2    2014.40    R2       0.955    
##  sigma0.0714    d.f.             2    R2 adj   0.955    
##  d.f.    646    Pr(> chi2)  0.0000    g        0.278    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -0.431846 -0.003267 -0.003267 -0.003267  0.888723 
##  
##  
##                   Coef   S.E.   t      Pr(>|t|)
##  Intercept        0.0033 0.0032   1.02 0.3078  
##  BW=White + Black 0.3969 0.0155  25.53 <0.0001 
##  BW=Black         0.8179 0.0070 116.33 <0.0001 
## 
#show the predictions
bw_model1 %>% ggpredict() %>% plot()
## $BW

bw_model2 %>% ggpredict() %>% plot()
## $BW

#of course, this is quite similar to just doing violin plots and seeing the validity
BWs %>% 
  ggplot(aes(BW, European, fill = BW)) +
  geom_violin(scale = "width")

BWs %>% 
  ggplot(aes(BW, African, fill = BW)) +
  geom_violin(scale = "width") +
  scale_y_continuous("African ancestry", labels = scales::percent) +
  scale_x_discrete("Social race") +
  scale_fill_discrete("Social race", guide = "none")

GG_save("figs/SIRE_to_AFR.png")

Simple models, standard encoding of race

We use all the data. We use simple models: multinomial regression and dirichlet regression.

SIRE from ancestry

Predict self-report from ancestry data.

#predict what distribution?
d$SIRE_standard %>% table2()
#make a recipe
all_recipe = recipe(SIRE_standard ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = multinom_reg() %>% 
  set_engine("nnet")

#resampling method
#actually the same as above
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  fit_resamples(
    all_folds,
    control = control_resamples(save_pred = TRUE)
    )
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold04: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold07: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold16: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold18: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Americ...
#metrics
collect_metrics(all_fit)
#concordance in real data
all_fit %>% 
  collect_predictions() %>% 
  arrange(.row) %>% 
  pull(.pred_class) ->
  d$fitted


#confusion matrix
table(true = d$SIRE_standard, predicted = d$fitted)
##                   predicted
## true               White African_American American_Indian Asian Hispanic
##   White              556                1               0     0        7
##   African_American     0              133               0     0        4
##   American_Indian      2                1               0     0        1
##   Asian                1                0               0   106        2
##   Hispanic            37               12               0     5      245
##   Multi_ethnic        21                9               0    13        9
##   Other                9                5               0     4        2
##   Pacific_Islander     0                0               0     0        0
##                   predicted
## true               Multi_ethnic Other Pacific_Islander
##   White                       7     0                0
##   African_American            3     0                0
##   American_Indian             0     0                0
##   Asian                      13     0                0
##   Hispanic                   29     0                0
##   Multi_ethnic              125     3                6
##   Other                       4     0                0
##   Pacific_Islander            7     0                9
#concordances
(d$SIRE_standard == d$fitted) %>% mean()
## [1] 0.84
d %>% 
  group_by(SIRE_standard) %>% 
  summarise(concordance = mean(SIRE_standard == fitted))

Ancestry from SIRE

Predict ancestry values from self-report data.

#direchlet reg
#prep Y data
#check data first, do the rows sum to 1?
d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% rowSums() %>% table()
## .
## 0.99999       1 1.00001 
##      32    1241     118
#close enough not to worry!
#transform
direct_y = DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European))
## Warning in DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, : not all rows sum up to 1 => normalization forced
##   some entries are 0 or 1 => transformation forced
#fit
dirichreg_fit = DirichReg(direct_y ~ SIRE_standard, data = d, model = "alternative")

dirichreg_fit %>% summary()
## Call:
## DirichReg(formula = direct_y ~ SIRE_standard, data = d, model = "alternative")
## 
## Standardized Residuals:
##                  Min     1Q  Median     3Q  Max
## African        -0.91  -0.46   -0.43  -0.18  7.5
## Amerindian     -0.86  -0.46   -0.44  -0.43  3.4
## East_Asian     -2.91  -0.43   -0.42  -0.15  7.2
## Oceanian       -0.76  -0.45   -0.42  -0.42  1.2
## Central_Asian  -0.58  -0.44   -0.44  -0.44  7.6
## European       -2.90  -0.35    0.66   1.08  6.6
## 
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: African
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Amerindian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     0.0187     0.0585    0.32  0.74979    
## SIRE_standardAfrican_American  -2.5552     0.1112  -22.98  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.0409     0.6953   -0.06  0.95308    
## SIRE_standardAsian             -0.0258     0.1392   -0.19  0.85317    
## SIRE_standardHispanic           0.3254     0.0934    3.49  0.00049 ***
## SIRE_standardMulti_ethnic      -0.2793     0.1172   -2.38  0.01717 *  
## SIRE_standardOther             -0.6419     0.2852   -2.25  0.02438 *  
## SIRE_standardPacific_Islander  -0.1629     0.3543   -0.46  0.64562    
## ------------------------------------------------------------------
## Coefficients for variable no. 3: East_Asian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -0.0284     0.0585   -0.49    0.627    
## SIRE_standardAfrican_American  -2.5360     0.1111  -22.82  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.3000     0.6980   -0.43    0.667    
## SIRE_standardAsian              2.5754     0.1151   22.37  < 2e-16 ***
## SIRE_standardHispanic          -0.5519     0.0954   -5.79  7.2e-09 ***
## SIRE_standardMulti_ethnic       1.1203     0.1117   10.03  < 2e-16 ***
## SIRE_standardOther             -0.6342     0.2852   -2.22    0.026 *  
## SIRE_standardPacific_Islander   2.3691     0.2842    8.34  < 2e-16 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Oceanian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    -0.0496     0.0585   -0.85    0.397    
## SIRE_standardAfrican_American  -2.5348     0.1111  -22.81  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.0907     0.6965   -0.13    0.896    
## SIRE_standardAsian              0.1769     0.1390    1.27    0.203    
## SIRE_standardHispanic          -0.7576     0.0956   -7.93  2.2e-15 ***
## SIRE_standardMulti_ethnic       0.0243     0.1169    0.21    0.835    
## SIRE_standardOther             -0.6829     0.2854   -2.39    0.017 *  
## SIRE_standardPacific_Islander   1.5380     0.3268    4.71  2.5e-06 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 5: Central_Asian
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     0.0194     0.0585    0.33    0.741    
## SIRE_standardAfrican_American  -2.5764     0.1112  -23.17   <2e-16 ***
## SIRE_standardAmerican_Indian   -0.3478     0.6980   -0.50    0.618    
## SIRE_standardAsian              0.2067     0.1387    1.49    0.136    
## SIRE_standardHispanic          -0.8868     0.0956   -9.28   <2e-16 ***
## SIRE_standardMulti_ethnic      -0.2691     0.1172   -2.30    0.022 *  
## SIRE_standardOther             -0.2907     0.2837   -1.02    0.306    
## SIRE_standardPacific_Islander  -0.1636     0.3543   -0.46    0.644    
## ------------------------------------------------------------------
## Coefficients for variable no. 6: European
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     2.7443     0.0495   55.41  < 2e-16 ***
## SIRE_standardAfrican_American  -3.9422     0.1080  -36.50  < 2e-16 ***
## SIRE_standardAmerican_Indian   -0.4092     0.5362   -0.76     0.45    
## SIRE_standardAsian             -2.6218     0.1353  -19.38  < 2e-16 ***
## SIRE_standardHispanic          -1.2190     0.0761  -16.02  < 2e-16 ***
## SIRE_standardMulti_ethnic      -0.8665     0.0940   -9.22  < 2e-16 ***
## SIRE_standardOther             -1.0020     0.2275   -4.40  1.1e-05 ***
## SIRE_standardPacific_Islander  -2.3967     0.3502   -6.84  7.7e-12 ***
## ------------------------------------------------------------------
## 
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.9739     0.0216    45.1   <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood: 32591 on 41 df (186 BFGS + 2 NR Iterations)
## AIC: -65099, BIC: -64884
## Number of Observations: 1391
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
#prep model preds
dirichreg_fit_preds = fitted(dirichreg_fit) %>%
  as.data.frame() %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
dirichreg_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_count() +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'

#overall correlation, across variables
GG_scatter(dirichreg_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
dirichreg_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})

Simple models, common combination encoding

SIRE from ancestry

Predict ancestry values from self-report data.

#predict what distribution?
d$SIRE_common %>% table2()
#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = multinom_reg() %>% 
  set_engine("nnet")

#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  fit_resamples(all_folds)
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...
#metrics
collect_metrics(all_fit)

Ancestry from SIRE

Predict ancestry values from self-report data.

#direchlet reg
#prep Y data
#check data first, do the rows sum to 1?
d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% rowSums() %>% table()
## .
## 0.99999       1 1.00001 
##      32    1241     118
#close enough not to worry!
#transform
direct_y = DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European))
## Warning in DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, : not all rows sum up to 1 => normalization forced
##   some entries are 0 or 1 => transformation forced
#fit
dirichreg_fit = DirichReg(direct_y ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d, model = "alternative")

dirichreg_fit %>% summary()
## Call:
## DirichReg(formula = direct_y ~ Hispanic + Pacific_Islander + Asian +
## African_American + American_Indian + Other + White, data = d, model =
## "alternative")
## 
## Standardized Residuals:
##                  Min     1Q  Median     3Q    Max
## African        -2.11  -0.44   -0.44  -0.15   8.37
## Amerindian     -1.33  -0.45   -0.45  -0.32   2.90
## East_Asian     -3.85  -0.44   -0.44  -0.30   6.64
## Oceanian       -1.01  -0.44   -0.42  -0.42   0.59
## Central_Asian  -0.58  -0.45   -0.43  -0.40  11.86
## European       -3.33  -0.15    0.80   1.08   5.18
## 
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: African
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Amerindian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.1677     0.0951   -1.76    0.078 .  
## HispanicTRUE           0.8352     0.0841    9.92   <2e-16 ***
## Pacific_IslanderTRUE  -0.2997     0.1373   -2.18    0.029 *  
## AsianTRUE              0.0814     0.1022    0.80    0.426    
## African_AmericanTRUE  -2.5227     0.1081  -23.35   <2e-16 ***
## American_IndianTRUE    0.1738     0.1613    1.08    0.281    
## OtherTRUE             -0.4950     0.2937   -1.69    0.092 .  
## WhiteTRUE              0.1888     0.0911    2.07    0.038 *  
## ------------------------------------------------------------------
## Coefficients for variable no. 3: East_Asian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.29761    0.08929    3.33  0.00086 ***
## HispanicTRUE         -0.83067    0.08641   -9.61  < 2e-16 ***
## Pacific_IslanderTRUE  0.74637    0.12082    6.18  6.5e-10 ***
## AsianTRUE             2.18801    0.09410   23.25  < 2e-16 ***
## African_AmericanTRUE -2.74589    0.10610  -25.88  < 2e-16 ***
## American_IndianTRUE   0.00175    0.15871    0.01  0.99121    
## OtherTRUE            -1.00153    0.29216   -3.43  0.00061 ***
## WhiteTRUE            -0.30568    0.08369   -3.65  0.00026 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Oceanian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.4077     0.0969   -4.20  2.6e-05 ***
## HispanicTRUE          -0.5220     0.0893   -5.85  5.0e-09 ***
## Pacific_IslanderTRUE   1.2534     0.1338    9.37  < 2e-16 ***
## AsianTRUE              0.2949     0.1027    2.87   0.0041 ** 
## African_AmericanTRUE  -2.2268     0.1076  -20.69  < 2e-16 ***
## American_IndianTRUE    0.0109     0.1669    0.07   0.9480    
## OtherTRUE             -0.3695     0.2946   -1.25   0.2098    
## WhiteTRUE              0.3084     0.0920    3.35   0.0008 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 5: Central_Asian
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -0.5133     0.0961   -5.34  9.1e-08 ***
## HispanicTRUE          -0.4859     0.0880   -5.52  3.4e-08 ***
## Pacific_IslanderTRUE  -0.0153     0.1369   -0.11     0.91    
## AsianTRUE              0.4645     0.1014    4.58  4.6e-06 ***
## African_AmericanTRUE  -2.0995     0.1068  -19.66  < 2e-16 ***
## American_IndianTRUE    0.0181     0.1666    0.11     0.91    
## OtherTRUE              0.2225     0.2923    0.76     0.45    
## WhiteTRUE              0.4655     0.0913    5.10  3.4e-07 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 6: European
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            1.2473     0.0862   14.48  < 2e-16 ***
## HispanicTRUE          -0.4430     0.0731   -6.06  1.4e-09 ***
## Pacific_IslanderTRUE  -0.3185     0.1232   -2.59   0.0097 ** 
## AsianTRUE             -0.5920     0.0904   -6.55  5.9e-11 ***
## African_AmericanTRUE  -2.4100     0.0978  -24.64  < 2e-16 ***
## American_IndianTRUE    0.2369     0.1380    1.72   0.0862 .  
## OtherTRUE              0.6528     0.2338    2.79   0.0052 ** 
## WhiteTRUE              1.6093     0.0837   19.22  < 2e-16 ***
## ------------------------------------------------------------------
## 
## PRECISION MODEL:
## ------------------------------------------------------------------
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.159      0.023    50.4   <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-likelihood: 32869 on 41 df (201 BFGS + 2 NR Iterations)
## AIC: -65657, BIC: -65442
## Number of Observations: 1391
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
#prep model preds
dirichreg_fit_preds = fitted(dirichreg_fit) %>%
  as.data.frame() %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
dirichreg_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_count() +
  geom_smooth(method = "lm") +
  scale_x_continuous("Predicted value", labels = scales::percent) +
  scale_y_continuous("Genetic ancestry", labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/common_dirichlet.png")
## `geom_smooth()` using formula 'y ~ x'
#overall correlation, across variables
GG_scatter(dirichreg_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
cor.test(dirichreg_fit_preds$pred_value, dirichreg_fit_preds$true_value)
## 
##  Pearson's product-moment correlation
## 
## data:  dirichreg_fit_preds$pred_value and dirichreg_fit_preds$true_value
## t = 219, df = 8344, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.92 0.93
## sample estimates:
##  cor 
## 0.92
cor(dirichreg_fit_preds$pred_value, dirichreg_fit_preds$true_value)^2
## [1] 0.85
dirichreg_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})

RF, common combination encoding

SIRE from ancestry

Predict ancestry values from self-report data.

#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = rand_forest(mtry = tune()) %>% 
  set_engine("ranger") %>% 
  set_mode("classification")

#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  tune_grid(
    resamples = all_folds,
    grid = 25,
    control = control_grid(
      save_pred = TRUE
    )
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...
#metrics
collect_metrics(all_fit)

Ancestry from SIRE

Predict ancestry values from self-report data.

#fit to all variables at once
rf_all_fit = rfsrc(Multivar(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d)
rf_all_fit
##                          Sample size: 1391
##                      Number of trees: 500
##            Forest terminal node size: 5
##        Average no. of terminal nodes: 29.716
## No. of variables tried at each split: 3
##               Total no. of variables: 7
##               Total no. of responses: 6
##          User has requested response: African
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 879
##                             Analysis: mRF-R
##                               Family: regr+
##                       Splitting rule: mv.mse *random*
##        Number of random split points: 10
##                      (OOB) R squared: 0.87389042
##    (OOB) Requested performance error: 0.009397
#get predictions out of sample
rf_all_fit_preds = rf_all_fit$regrOutput %>% map_df(function(x) {
  x$predicted.oob
}) %>% {
  #unitize
  ./rowSums(.)
} %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
rf_all_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_point() +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'

#overall correlation, across variables
GG_scatter(rf_all_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
rf_all_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})

SVM, common combination encoding

SIRE from ancestry

Predict ancestry values from self-report data.

#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)

#make a model
#swap to multinom_reg() + nnet
all_model = svm_rbf(
  cost = tune(),
  rbf_sigma = tune()
  ) %>% 
  set_mode("classification")

#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)

#make workflow
all_wf = workflow() %>% 
  add_model(all_model) %>% 
  add_recipe(all_recipe)

#fit
all_fit = all_wf %>% 
  tune_grid(
    resamples = all_folds,
    grid = 25,
    control = control_grid(
      save_pred = TRUE
    )
  )
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...
#metrics
collect_metrics(all_fit)
show_best(all_fit, metric = "roc_auc")
show_best(all_fit, metric = "accuracy")

Supplement

Regular OLS for multivariate ancestry prediction

This does not enforce that predictions stay within 0-1, or sum to 1. Still, it works reasonably well.

#just regular ols
all_fit = lm(cbind(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d)
all_fit$coefficients
##                      African Amerindian East_Asian Oceanian Central_Asian
## (Intercept)           0.1269      0.063      0.227  0.01513         0.041
## HispanicTRUE          0.0127      0.174     -0.091 -0.00721        -0.020
## Pacific_IslanderTRUE -0.0305     -0.031      0.166  0.07589        -0.048
## AsianTRUE            -0.0734     -0.048      0.442 -0.01033         0.061
## African_AmericanTRUE  0.5799     -0.069     -0.176 -0.01174        -0.024
## American_IndianTRUE  -0.0099      0.013     -0.025  0.00021        -0.015
## OtherTRUE             0.1295     -0.054     -0.220 -0.01382         0.134
## WhiteTRUE            -0.1090     -0.045     -0.202 -0.01249        -0.028
##                      European
## (Intercept)             0.526
## HispanicTRUE           -0.069
## Pacific_IslanderTRUE   -0.132
## AsianTRUE              -0.371
## African_AmericanTRUE   -0.298
## American_IndianTRUE     0.037
## OtherTRUE               0.024
## WhiteTRUE               0.397
#prep model preds
all_fit_preds = all_fit$fitted.values %>%
  as.data.frame() %>% 
  mutate(id = 1:n()) %>% 
  pivot_longer(cols = -id, values_to = "pred_value") %>% 
  bind_cols(
    d %>% 
      select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% 
      mutate(id = 1:n()) %>% 
      pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>% 
      select(-id)
  )

#plot results
all_fit_preds %>% 
  ggplot(aes(pred_value, true_value, color = name)) +
  geom_count() +
  geom_smooth(method = "lm") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_discrete("Ancestry") +
  facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'

#overall correlation, across variables
GG_scatter(all_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'

#each correlation by ancestry
all_fit_preds %>% plyr::ddply("name", function(dd) {
  wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})
#bounds
(all_fit_preds$pred_value<0) %>% mean()
## [1] 0.15
(all_fit_preds$pred_value>1) %>% mean()
## [1] 0

Meta

write_sessioninfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kernlab_0.9-29         ranger_0.13.1          nnet_7.3-16           
##  [4] vctrs_0.3.8            randomForestSRC_2.14.0 DirichletReg_0.7-1    
##  [7] yardstick_0.0.8        workflowsets_0.1.0     workflows_0.2.4       
## [10] tune_0.1.6             rsample_0.1.1          recipes_0.1.17        
## [13] parsnip_0.1.7          modeldata_0.1.1        infer_1.0.0           
## [16] dials_0.0.10           scales_1.1.1           broom_0.7.10          
## [19] tidymodels_0.1.4       ggeffects_1.1.1        rms_6.2-0             
## [22] SparseM_1.81           kirkegaard_2021-12-14  rlang_0.4.12          
## [25] metafor_3.0-2          Matrix_1.4-0           psych_2.1.9           
## [28] magrittr_2.0.1         assertthat_0.2.1       weights_1.0.4         
## [31] Hmisc_4.6-0            Formula_1.2-4          survival_3.2-13       
## [34] lattice_0.20-45        forcats_0.5.1          stringr_1.4.0         
## [37] dplyr_1.0.7            purrr_0.3.4            readr_2.1.0           
## [40] tidyr_1.1.4            tibble_3.1.6           ggplot2_3.3.5         
## [43] tidyverse_1.3.1        pacman_0.5.1          
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2           tidyselect_1.1.1     lme4_1.1-27.1       
##   [4] htmlwidgets_1.5.4    grid_4.1.2           pROC_1.18.0         
##   [7] miscTools_0.6-26     munsell_0.5.0        codetools_0.2-18    
##  [10] future_1.23.0        withr_2.4.2          colorspace_2.0-2    
##  [13] highr_0.9            knitr_1.36           rstudioapi_0.13     
##  [16] stats4_4.1.2         listenv_0.8.0        labeling_0.4.2      
##  [19] mnormt_2.0.2         DiceDesign_1.9       farver_2.1.0        
##  [22] parallelly_1.28.1    generics_0.1.1       TH.data_1.1-0       
##  [25] ipred_0.9-12         xfun_0.28            R6_2.5.1            
##  [28] lhs_1.1.3            multcomp_1.4-17      gtable_0.3.0        
##  [31] globals_0.14.0       conquer_1.2.1        sandwich_3.0-1      
##  [34] timeDate_3043.102    MatrixModels_0.5-0   splines_4.1.2       
##  [37] ModelMetrics_1.2.2.2 checkmate_2.0.0      yaml_2.2.1          
##  [40] reshape2_1.4.4       modelr_0.1.8         backports_1.3.0     
##  [43] caret_6.0-90         DiagrammeR_1.0.6.1   tools_4.1.2         
##  [46] lava_1.6.10          ellipsis_0.3.2       jquerylib_0.1.4     
##  [49] multilevel_2.6       RColorBrewer_1.1-2   Rcpp_1.0.7          
##  [52] plyr_1.8.6           base64enc_0.1-3      visNetwork_2.1.0    
##  [55] rpart_4.1-15         zoo_1.8-9            haven_2.4.3         
##  [58] cluster_2.1.2        fs_1.5.0             furrr_0.2.3         
##  [61] data.table_1.14.2    reprex_2.0.1         GPfit_1.0-8         
##  [64] tmvnsim_1.0-2        mvtnorm_1.1-3        matrixStats_0.61.0  
##  [67] hms_1.1.1            evaluate_0.14        jpeg_0.1-9          
##  [70] readxl_1.3.1         gridExtra_2.3        compiler_4.1.2      
##  [73] psychometric_2.2     mice_3.13.0          crayon_1.4.2        
##  [76] minqa_1.2.4          htmltools_0.5.2      mgcv_1.8-38         
##  [79] tzdb_0.2.0           lubridate_1.8.0      DBI_1.1.1           
##  [82] sjlabelled_1.1.8     dbplyr_2.1.1         MASS_7.3-54         
##  [85] boot_1.3-28          data.tree_1.0.0      cli_3.1.0           
##  [88] gdata_2.18.0         parallel_4.1.2       insight_0.14.5      
##  [91] gower_0.2.2          pkgconfig_2.0.3      foreign_0.8-81      
##  [94] xml2_1.3.2           foreach_1.5.1        bslib_0.3.1         
##  [97] hardhat_0.1.6        prodlim_2019.11.13   rvest_1.0.2         
## [100] digest_0.6.28        rmarkdown_2.11       cellranger_1.1.0    
## [103] htmlTable_2.3.0      maxLik_1.5-2         gtools_3.9.2        
## [106] quantreg_5.86        nloptr_1.2.2.3       lifecycle_1.0.1     
## [109] nlme_3.1-152         jsonlite_1.7.2       viridisLite_0.4.0   
## [112] fansi_0.5.0          pillar_1.6.4         fastmap_1.1.0       
## [115] httr_1.4.2           glue_1.5.0           png_0.1-7           
## [118] iterators_1.0.13     class_7.3-19         stringi_1.7.5       
## [121] sass_0.4.0           polspline_1.1.19     latticeExtra_0.6-29 
## [124] mathjaxr_1.4-0       future.apply_1.8.1
#upload to OSF
#avoid uploading the data in case they freak out again
if (F) {
  library(osfr)
  
  #auth
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/qxvg8/")
  
  #upload files
  #overwrite existing (versioning)
  osf_upload(osf_proj, conflicts = "overwrite", 
             path = c(
               "figs",
               "data/public",
               "notebook.html",
               "notebook.Rmd",
               "Family income, parental education and brain structure.pdf"
             ))
}