options(
digits = 2,
contrasts = c("contr.treatment", "contr.treatment")
)
library(pacman)
p_load(
kirkegaard,
rms,
ggeffects,
tidymodels,
recipes,
parsnip,
DirichletReg,
randomForestSRC
)
theme_set(theme_bw())
#largest value by row
rowwise_which_max = function(x) {
x %>%
as_tibble() %>%
plyr::aaply(.margins = 1, .fun = function(dd) {
which.max(unlist(dd))
}, .expand = F)
}
#rowwise get most likely group
get_most_likely = function(model = NULL, preds = NULL) {
#get preds
if ("rms" %in% class(model)) {
# browser()
model %>%
predict(type = "fitted.ind") %>%
rowwise_which_max() ->
y
#fix names
y = mapvalues(y, from = unique(y) %>% sort(), to = bw_model2$freq %>% names()) %>%
factor(levels = bw_model2$freq %>% names())
} else {
model %>%
fitted() %>%
as_tibble() %>%
plyr::aaply(.margins = 1, .fun = function(dd) {
which.max(unlist(dd))
}, .expand = F) ->
y
}
y
}
#load data from PING 1 project
d = read_rds("data/data_out.rds")
#recode SIRE
v_SIREs = c("Hispanic", "Pacific_Islander", "Asian", "African_American", "American_Indian", "Other", "White")
d$SIRE_combos_all = map_chr(1:nrow(d), .f = function(row) {
#extract SIREs
SIREs = d[row, v_SIREs] %>% unlist
#only one?
if (sum(SIREs) == 1) {
#then return the name of it
return(names(SIREs)[which(SIREs)])
} else if (sum(SIREs) == 0) {
#did not select any
return("None")
} else {
#return combo
return(str_c(names(SIREs)[which(SIREs)], collapse = ", "))
}
})
#results?
d$SIRE_combos_all %>% table2() %>% print(n=Inf)
## # A tibble: 45 × 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 White 571 41.0
## 2 African_American 140 10.1
## 3 Hispanic, White 140 10.1
## 4 Asian 122 8.77
## 5 Hispanic 71 5.10
## 6 Asian, White 60 4.31
## 7 Pacific_Islander, Asian, White 32 2.30
## 8 Hispanic, African_American 29 2.08
## 9 African_American, White 25 1.80
## 10 Other 24 1.73
## 11 Pacific_Islander, Asian 17 1.22
## 12 Pacific_Islander 16 1.15
## 13 Hispanic, American_Indian, White 13 0.935
## 14 Hispanic, Asian 13 0.935
## 15 Hispanic, Pacific_Islander, Asian, White 13 0.935
## 16 American_Indian, White 11 0.791
## 17 Hispanic, Pacific_Islander, Asian 11 0.791
## 18 Pacific_Islander, White 9 0.647
## 19 Hispanic, American_Indian 7 0.503
## 20 Hispanic, Asian, White 6 0.431
## 21 African_American, American_Indian 5 0.359
## 22 Asian, American_Indian, White 5 0.359
## 23 Hispanic, Pacific_Islander 5 0.359
## 24 Pacific_Islander, Asian, American_Indian, White 5 0.359
## 25 American_Indian 4 0.288
## 26 Asian, African_American 4 0.288
## 27 Hispanic, African_American, American_Indian 4 0.288
## 28 African_American, American_Indian, White 3 0.216
## 29 Pacific_Islander, African_American, White 3 0.216
## 30 Pacific_Islander, Asian, African_American, White 3 0.216
## 31 Asian, African_American, White 2 0.144
## 32 Hispanic, African_American, American_Indian, White 2 0.144
## 33 Hispanic, Asian, American_Indian, White 2 0.144
## 34 Hispanic, Pacific_Islander, Asian, African_American 2 0.144
## 35 Hispanic, Pacific_Islander, Asian, African_American, White 2 0.144
## 36 Hispanic, Pacific_Islander, White 2 0.144
## 37 Hispanic, African_American, White 1 0.0719
## 38 Hispanic, Asian, African_American, White 1 0.0719
## 39 Hispanic, Asian, American_Indian 1 0.0719
## 40 Hispanic, Pacific_Islander, African_American, White 1 0.0719
## 41 Hispanic, Pacific_Islander, American_Indian 1 0.0719
## 42 Hispanic, Pacific_Islander, Asian, American_Indian 1 0.0719
## 43 Pacific_Islander, Asian, African_American 1 0.0719
## 44 Pacific_Islander, Asian, American_Indian 1 0.0719
## 45 <NA> 0 0
#lumped old
d$SIRE_common_old = d$SIRE_common
d$SIRE_common = fct_lump_min(d$SIRE_combos_all, min = 20, other_level = "Remainder")
#compare
d$SIRE_common %>% table2() %>% print(n=Inf)
## # A tibble: 12 × 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 White 571 41.0
## 2 Remainder 177 12.7
## 3 African_American 140 10.1
## 4 Hispanic, White 140 10.1
## 5 Asian 122 8.77
## 6 Hispanic 71 5.10
## 7 Asian, White 60 4.31
## 8 Pacific_Islander, Asian, White 32 2.30
## 9 Hispanic, African_American 29 2.08
## 10 African_American, White 25 1.80
## 11 Other 24 1.73
## 12 <NA> 0 0
d$SIRE_common_old %>% table2() %>% print(n=Inf)
## # A tibble: 10 × 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 White 571 41.0
## 2 Other 226 16.2
## 3 African_American 140 10.1
## 4 Hispanic, White 140 10.1
## 5 Asian 122 8.77
## 6 Hispanic 71 5.10
## 7 Asian, White 60 4.31
## 8 Pacific_Islander, Asian, White 32 2.30
## 9 Hispanic, African_American 29 2.08
## 10 <NA> 0 0
#remove persons without relevant data
#no such persons
# d %<>% filter(
# !is.na(European),
# !is.na(White)
# )
#subsets
BWs = d %>% filter(White | African_American, !Asian, !Hispanic, !Other, !American_Indian, !Pacific_Islander, (African+European) > .95)
#relevel factors
BWs$BW = case_when(
BWs$White & BWs$African_American ~ "White + Black",
BWs$White & !BWs$African_American ~ "White",
!BWs$White & BWs$African_American ~ "Black"
) %>% ordered(levels = c("White", "White + Black", "Black"))
BWs$BW %>% table2()
BWs$justwhite = BWs$BW %>% as.character() %>% str_detect("Black", negate = T)
d$Age %>% describe2()
d$Sex %>% table2(include_NA = F)
d$SIRE_standard %>% table2(include_NA = F)
Data restricted to people who report being white, black, or both. We use simple models: logistic ordinal regression and linear regression.
Predict self-report from ancestry data.
#ancetries correlate close to -1.0
GG_scatter(BWs, "European", "African")
## `geom_smooth()` using formula 'y ~ x'
#models
(bw_model1 = lrm(BW ~ European, data = BWs)) #variant 1
## Logistic Regression Model
##
## lrm(formula = BW ~ European, data = BWs)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 649 LR chi2 728.19 R2 0.933 C 0.994
## White 497 d.f. 1 g 4.967 Dxy 0.987
## White + Black 22 Pr(> chi2) <0.0001 gr 143.576 gamma 0.995
## Black 130 gp 0.349 tau-a 0.368
## max |deriv| 7e-09 Brier 0.008
##
## Coef S.E. Wald Z Pr(>|Z|)
## y>=White + Black 12.3886 1.6947 7.31 <0.0001
## y>=Black 8.0003 0.9404 8.51 <0.0001
## European -17.1091 1.8503 -9.25 <0.0001
##
(bw_model2 = lrm(BW ~ African, data = BWs)) #2
## Logistic Regression Model
##
## lrm(formula = BW ~ African, data = BWs)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 649 LR chi2 726.47 R2 0.932 C 0.994
## White 497 d.f. 1 g 4.854 Dxy 0.988
## White + Black 22 Pr(> chi2) <0.0001 gr 128.215 gamma 0.996
## Black 130 gp 0.348 tau-a 0.368
## max |deriv| 4e-09 Brier 0.008
##
## Coef S.E. Wald Z Pr(>|Z|)
## y>=White + Black -4.6422 0.4304 -10.79 <0.0001
## y>=Black -8.9295 1.0750 -8.31 <0.0001
## African 16.9240 1.8143 9.33 <0.0001
##
#visualize
predict_SIRE = tibble(
African = seq(0, 1, length.out = 10e3),
European = 1 - African
)
#SIRE, from African
bw_model2_preds = predict(bw_model2, newdata = predict_SIRE, type = "fitted.ind")
predict_SIRE$prob_White = bw_model2_preds[, 1]
predict_SIRE$prob_White_and_Black = bw_model2_preds[, 2]
predict_SIRE$prob_Black = bw_model2_preds[, 3]
#ancestry only, SIRE
predict_SIRE %>%
pivot_longer(cols = starts_with("prob_")) %>%
mutate(
name = case_when(
name == "prob_White" ~ "White",
name == "prob_White_and_Black" ~ "White + Black",
name == "prob_Black" ~ "Black"
) %>% ordered(levels = c("White", "White + Black", "Black"))
) %>%
ggplot(aes(African, value, color = name)) +
geom_line() +
theme_bw() +
scale_color_discrete("Social race") +
scale_x_continuous("African ancestry (genetic)", labels = scales::percent) +
scale_y_continuous("Probability of being...", labels = scales::percent)
GG_save("figs/AFR_to_SIRE.png")
#concordance in real data
BWs$fitted_ml = bw_model2 %>% get_most_likely() %>% ordered()
#confusion matrix
table(true = BWs$BW, predicted = BWs$fitted_ml)
## predicted
## true White White + Black Black
## White 496 0 1
## White + Black 5 13 4
## Black 0 3 127
#concordances
(BWs$BW == BWs$fitted_ml) %>% mean()
## [1] 0.98
BWs %>%
group_by(BW) %>%
summarise(concordance = mean(BW == fitted_ml))
#simplified case with just white vs. black (including BW)
(bw_model2b = lrm(justwhite ~ African, data = BWs))
## Logistic Regression Model
##
## lrm(formula = justwhite ~ African, data = BWs)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 649 LR chi2 646.64 R2 0.951 C 0.995
## FALSE 152 d.f. 1 g 5.288 Dxy 0.991
## TRUE 497 Pr(> chi2) <0.0001 gr 197.948 gamma 0.998
## max |deriv| 2e-09 gp 0.349 tau-a 0.356
## Brier 0.008
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 4.7813 0.4792 9.98 <0.0001
## African -18.4383 2.7559 -6.69 <0.0001
##
BWs$fitted_ml_white = bw_model2b %>% predict(type = "fitted.ind") %>% round()
(BWs$justwhite == BWs$fitted_ml_white) %>% mean() %>% print(digits = 3)
## [1] 0.991
Predict ancestry values from self-report data.
(bw_model1 = ols(European ~ BW, data = BWs)) #variant 1
## Linear Regression Model
##
## ols(formula = European ~ BW, data = BWs)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 649 LR chi2 2014.99 R2 0.955
## sigma0.0715 d.f. 2 R2 adj 0.955
## d.f. 646 Pr(> chi2) 0.0000 g 0.278
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.8857447 0.0006753 0.0062453 0.0062453 0.4367222
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.9938 0.0032 309.91 <0.0001
## BW=White + Black -0.3952 0.0156 -25.37 <0.0001
## BW=Black -0.8198 0.0070 -116.41 <0.0001
##
(bw_model2 = ols(African ~ BW, data = BWs)) #works a little better in this case
## Linear Regression Model
##
## ols(formula = African ~ BW, data = BWs)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 649 LR chi2 2014.40 R2 0.955
## sigma0.0714 d.f. 2 R2 adj 0.955
## d.f. 646 Pr(> chi2) 0.0000 g 0.278
##
## Residuals
##
## Min 1Q Median 3Q Max
## -0.431846 -0.003267 -0.003267 -0.003267 0.888723
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0033 0.0032 1.02 0.3078
## BW=White + Black 0.3969 0.0155 25.53 <0.0001
## BW=Black 0.8179 0.0070 116.33 <0.0001
##
#show the predictions
bw_model1 %>% ggpredict() %>% plot()
## $BW
bw_model2 %>% ggpredict() %>% plot()
## $BW
#of course, this is quite similar to just doing violin plots and seeing the validity
BWs %>%
ggplot(aes(BW, European, fill = BW)) +
geom_violin(scale = "width")
BWs %>%
ggplot(aes(BW, African, fill = BW)) +
geom_violin(scale = "width") +
scale_y_continuous("African ancestry", labels = scales::percent) +
scale_x_discrete("Social race") +
scale_fill_discrete("Social race", guide = "none")
GG_save("figs/SIRE_to_AFR.png")
We use all the data. We use simple models: multinomial regression and dirichlet regression.
Predict self-report from ancestry data.
#predict what distribution?
d$SIRE_standard %>% table2()
#make a recipe
all_recipe = recipe(SIRE_standard ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)
#make a model
#swap to multinom_reg() + nnet
all_model = multinom_reg() %>%
set_engine("nnet")
#resampling method
#actually the same as above
set.seed(1)
all_folds = vfold_cv(d, v = 20)
#make workflow
all_wf = workflow() %>%
add_model(all_model) %>%
add_recipe(all_recipe)
#fit
all_fit = all_wf %>%
fit_resamples(
all_folds,
control = control_resamples(save_pred = TRUE)
)
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold04: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold07: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold16: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold18: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Americ...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Americ...
#metrics
collect_metrics(all_fit)
#concordance in real data
all_fit %>%
collect_predictions() %>%
arrange(.row) %>%
pull(.pred_class) ->
d$fitted
#confusion matrix
table(true = d$SIRE_standard, predicted = d$fitted)
## predicted
## true White African_American American_Indian Asian Hispanic
## White 556 1 0 0 7
## African_American 0 133 0 0 4
## American_Indian 2 1 0 0 1
## Asian 1 0 0 106 2
## Hispanic 37 12 0 5 245
## Multi_ethnic 21 9 0 13 9
## Other 9 5 0 4 2
## Pacific_Islander 0 0 0 0 0
## predicted
## true Multi_ethnic Other Pacific_Islander
## White 7 0 0
## African_American 3 0 0
## American_Indian 0 0 0
## Asian 13 0 0
## Hispanic 29 0 0
## Multi_ethnic 125 3 6
## Other 4 0 0
## Pacific_Islander 7 0 9
#concordances
(d$SIRE_standard == d$fitted) %>% mean()
## [1] 0.84
d %>%
group_by(SIRE_standard) %>%
summarise(concordance = mean(SIRE_standard == fitted))
Predict ancestry values from self-report data.
#direchlet reg
#prep Y data
#check data first, do the rows sum to 1?
d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% rowSums() %>% table()
## .
## 0.99999 1 1.00001
## 32 1241 118
#close enough not to worry!
#transform
direct_y = DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European))
## Warning in DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, : not all rows sum up to 1 => normalization forced
## some entries are 0 or 1 => transformation forced
#fit
dirichreg_fit = DirichReg(direct_y ~ SIRE_standard, data = d, model = "alternative")
dirichreg_fit %>% summary()
## Call:
## DirichReg(formula = direct_y ~ SIRE_standard, data = d, model = "alternative")
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## African -0.91 -0.46 -0.43 -0.18 7.5
## Amerindian -0.86 -0.46 -0.44 -0.43 3.4
## East_Asian -2.91 -0.43 -0.42 -0.15 7.2
## Oceanian -0.76 -0.45 -0.42 -0.42 1.2
## Central_Asian -0.58 -0.44 -0.44 -0.44 7.6
## European -2.90 -0.35 0.66 1.08 6.6
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: African
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Amerindian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0187 0.0585 0.32 0.74979
## SIRE_standardAfrican_American -2.5552 0.1112 -22.98 < 2e-16 ***
## SIRE_standardAmerican_Indian -0.0409 0.6953 -0.06 0.95308
## SIRE_standardAsian -0.0258 0.1392 -0.19 0.85317
## SIRE_standardHispanic 0.3254 0.0934 3.49 0.00049 ***
## SIRE_standardMulti_ethnic -0.2793 0.1172 -2.38 0.01717 *
## SIRE_standardOther -0.6419 0.2852 -2.25 0.02438 *
## SIRE_standardPacific_Islander -0.1629 0.3543 -0.46 0.64562
## ------------------------------------------------------------------
## Coefficients for variable no. 3: East_Asian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0284 0.0585 -0.49 0.627
## SIRE_standardAfrican_American -2.5360 0.1111 -22.82 < 2e-16 ***
## SIRE_standardAmerican_Indian -0.3000 0.6980 -0.43 0.667
## SIRE_standardAsian 2.5754 0.1151 22.37 < 2e-16 ***
## SIRE_standardHispanic -0.5519 0.0954 -5.79 7.2e-09 ***
## SIRE_standardMulti_ethnic 1.1203 0.1117 10.03 < 2e-16 ***
## SIRE_standardOther -0.6342 0.2852 -2.22 0.026 *
## SIRE_standardPacific_Islander 2.3691 0.2842 8.34 < 2e-16 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Oceanian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0496 0.0585 -0.85 0.397
## SIRE_standardAfrican_American -2.5348 0.1111 -22.81 < 2e-16 ***
## SIRE_standardAmerican_Indian -0.0907 0.6965 -0.13 0.896
## SIRE_standardAsian 0.1769 0.1390 1.27 0.203
## SIRE_standardHispanic -0.7576 0.0956 -7.93 2.2e-15 ***
## SIRE_standardMulti_ethnic 0.0243 0.1169 0.21 0.835
## SIRE_standardOther -0.6829 0.2854 -2.39 0.017 *
## SIRE_standardPacific_Islander 1.5380 0.3268 4.71 2.5e-06 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 5: Central_Asian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0194 0.0585 0.33 0.741
## SIRE_standardAfrican_American -2.5764 0.1112 -23.17 <2e-16 ***
## SIRE_standardAmerican_Indian -0.3478 0.6980 -0.50 0.618
## SIRE_standardAsian 0.2067 0.1387 1.49 0.136
## SIRE_standardHispanic -0.8868 0.0956 -9.28 <2e-16 ***
## SIRE_standardMulti_ethnic -0.2691 0.1172 -2.30 0.022 *
## SIRE_standardOther -0.2907 0.2837 -1.02 0.306
## SIRE_standardPacific_Islander -0.1636 0.3543 -0.46 0.644
## ------------------------------------------------------------------
## Coefficients for variable no. 6: European
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.7443 0.0495 55.41 < 2e-16 ***
## SIRE_standardAfrican_American -3.9422 0.1080 -36.50 < 2e-16 ***
## SIRE_standardAmerican_Indian -0.4092 0.5362 -0.76 0.45
## SIRE_standardAsian -2.6218 0.1353 -19.38 < 2e-16 ***
## SIRE_standardHispanic -1.2190 0.0761 -16.02 < 2e-16 ***
## SIRE_standardMulti_ethnic -0.8665 0.0940 -9.22 < 2e-16 ***
## SIRE_standardOther -1.0020 0.2275 -4.40 1.1e-05 ***
## SIRE_standardPacific_Islander -2.3967 0.3502 -6.84 7.7e-12 ***
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.9739 0.0216 45.1 <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 32591 on 41 df (186 BFGS + 2 NR Iterations)
## AIC: -65099, BIC: -64884
## Number of Observations: 1391
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
#prep model preds
dirichreg_fit_preds = fitted(dirichreg_fit) %>%
as.data.frame() %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, values_to = "pred_value") %>%
bind_cols(
d %>%
select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>%
select(-id)
)
#plot results
dirichreg_fit_preds %>%
ggplot(aes(pred_value, true_value, color = name)) +
geom_count() +
geom_smooth(method = "lm") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
scale_color_discrete("Ancestry") +
facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'
#overall correlation, across variables
GG_scatter(dirichreg_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'
#each correlation by ancestry
dirichreg_fit_preds %>% plyr::ddply("name", function(dd) {
wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})
Predict ancestry values from self-report data.
#predict what distribution?
d$SIRE_common %>% table2()
#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)
#make a model
#swap to multinom_reg() + nnet
all_model = multinom_reg() %>%
set_engine("nnet")
#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)
#make workflow
all_wf = workflow() %>%
add_model(all_model) %>%
add_recipe(all_recipe)
#fit
all_fit = all_wf %>%
fit_resamples(all_folds)
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...
#metrics
collect_metrics(all_fit)
Predict ancestry values from self-report data.
#direchlet reg
#prep Y data
#check data first, do the rows sum to 1?
d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>% rowSums() %>% table()
## .
## 0.99999 1 1.00001
## 32 1241 118
#close enough not to worry!
#transform
direct_y = DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European))
## Warning in DR_data(d %>% select(African, Amerindian, East_Asian, Oceanian, : not all rows sum up to 1 => normalization forced
## some entries are 0 or 1 => transformation forced
#fit
dirichreg_fit = DirichReg(direct_y ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d, model = "alternative")
dirichreg_fit %>% summary()
## Call:
## DirichReg(formula = direct_y ~ Hispanic + Pacific_Islander + Asian +
## African_American + American_Indian + Other + White, data = d, model =
## "alternative")
##
## Standardized Residuals:
## Min 1Q Median 3Q Max
## African -2.11 -0.44 -0.44 -0.15 8.37
## Amerindian -1.33 -0.45 -0.45 -0.32 2.90
## East_Asian -3.85 -0.44 -0.44 -0.30 6.64
## Oceanian -1.01 -0.44 -0.42 -0.42 0.59
## Central_Asian -0.58 -0.45 -0.43 -0.40 11.86
## European -3.33 -0.15 0.80 1.08 5.18
##
## MEAN MODELS:
## ------------------------------------------------------------------
## Coefficients for variable no. 1: African
## - variable omitted (reference category) -
## ------------------------------------------------------------------
## Coefficients for variable no. 2: Amerindian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1677 0.0951 -1.76 0.078 .
## HispanicTRUE 0.8352 0.0841 9.92 <2e-16 ***
## Pacific_IslanderTRUE -0.2997 0.1373 -2.18 0.029 *
## AsianTRUE 0.0814 0.1022 0.80 0.426
## African_AmericanTRUE -2.5227 0.1081 -23.35 <2e-16 ***
## American_IndianTRUE 0.1738 0.1613 1.08 0.281
## OtherTRUE -0.4950 0.2937 -1.69 0.092 .
## WhiteTRUE 0.1888 0.0911 2.07 0.038 *
## ------------------------------------------------------------------
## Coefficients for variable no. 3: East_Asian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.29761 0.08929 3.33 0.00086 ***
## HispanicTRUE -0.83067 0.08641 -9.61 < 2e-16 ***
## Pacific_IslanderTRUE 0.74637 0.12082 6.18 6.5e-10 ***
## AsianTRUE 2.18801 0.09410 23.25 < 2e-16 ***
## African_AmericanTRUE -2.74589 0.10610 -25.88 < 2e-16 ***
## American_IndianTRUE 0.00175 0.15871 0.01 0.99121
## OtherTRUE -1.00153 0.29216 -3.43 0.00061 ***
## WhiteTRUE -0.30568 0.08369 -3.65 0.00026 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 4: Oceanian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4077 0.0969 -4.20 2.6e-05 ***
## HispanicTRUE -0.5220 0.0893 -5.85 5.0e-09 ***
## Pacific_IslanderTRUE 1.2534 0.1338 9.37 < 2e-16 ***
## AsianTRUE 0.2949 0.1027 2.87 0.0041 **
## African_AmericanTRUE -2.2268 0.1076 -20.69 < 2e-16 ***
## American_IndianTRUE 0.0109 0.1669 0.07 0.9480
## OtherTRUE -0.3695 0.2946 -1.25 0.2098
## WhiteTRUE 0.3084 0.0920 3.35 0.0008 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 5: Central_Asian
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5133 0.0961 -5.34 9.1e-08 ***
## HispanicTRUE -0.4859 0.0880 -5.52 3.4e-08 ***
## Pacific_IslanderTRUE -0.0153 0.1369 -0.11 0.91
## AsianTRUE 0.4645 0.1014 4.58 4.6e-06 ***
## African_AmericanTRUE -2.0995 0.1068 -19.66 < 2e-16 ***
## American_IndianTRUE 0.0181 0.1666 0.11 0.91
## OtherTRUE 0.2225 0.2923 0.76 0.45
## WhiteTRUE 0.4655 0.0913 5.10 3.4e-07 ***
## ------------------------------------------------------------------
## Coefficients for variable no. 6: European
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2473 0.0862 14.48 < 2e-16 ***
## HispanicTRUE -0.4430 0.0731 -6.06 1.4e-09 ***
## Pacific_IslanderTRUE -0.3185 0.1232 -2.59 0.0097 **
## AsianTRUE -0.5920 0.0904 -6.55 5.9e-11 ***
## African_AmericanTRUE -2.4100 0.0978 -24.64 < 2e-16 ***
## American_IndianTRUE 0.2369 0.1380 1.72 0.0862 .
## OtherTRUE 0.6528 0.2338 2.79 0.0052 **
## WhiteTRUE 1.6093 0.0837 19.22 < 2e-16 ***
## ------------------------------------------------------------------
##
## PRECISION MODEL:
## ------------------------------------------------------------------
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.159 0.023 50.4 <2e-16 ***
## ------------------------------------------------------------------
## Significance codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-likelihood: 32869 on 41 df (201 BFGS + 2 NR Iterations)
## AIC: -65657, BIC: -65442
## Number of Observations: 1391
## Links: Logit (Means) and Log (Precision)
## Parametrization: alternative
#prep model preds
dirichreg_fit_preds = fitted(dirichreg_fit) %>%
as.data.frame() %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, values_to = "pred_value") %>%
bind_cols(
d %>%
select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>%
select(-id)
)
#plot results
dirichreg_fit_preds %>%
ggplot(aes(pred_value, true_value, color = name)) +
geom_count() +
geom_smooth(method = "lm") +
scale_x_continuous("Predicted value", labels = scales::percent) +
scale_y_continuous("Genetic ancestry", labels = scales::percent) +
scale_color_discrete("Ancestry") +
facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/common_dirichlet.png")
## `geom_smooth()` using formula 'y ~ x'
#overall correlation, across variables
GG_scatter(dirichreg_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'
#each correlation by ancestry
cor.test(dirichreg_fit_preds$pred_value, dirichreg_fit_preds$true_value)
##
## Pearson's product-moment correlation
##
## data: dirichreg_fit_preds$pred_value and dirichreg_fit_preds$true_value
## t = 219, df = 8344, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.92 0.93
## sample estimates:
## cor
## 0.92
cor(dirichreg_fit_preds$pred_value, dirichreg_fit_preds$true_value)^2
## [1] 0.85
dirichreg_fit_preds %>% plyr::ddply("name", function(dd) {
wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})
Predict ancestry values from self-report data.
#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)
#make a model
#swap to multinom_reg() + nnet
all_model = rand_forest(mtry = tune()) %>%
set_engine("ranger") %>%
set_mode("classification")
#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)
#make workflow
all_wf = workflow() %>%
add_model(all_model) %>%
add_recipe(all_recipe)
#fit
all_fit = all_wf %>%
tune_grid(
resamples = all_folds,
grid = 25,
control = control_grid(
save_pred = TRUE
)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...
#metrics
collect_metrics(all_fit)
Predict ancestry values from self-report data.
#fit to all variables at once
rf_all_fit = rfsrc(Multivar(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d)
rf_all_fit
## Sample size: 1391
## Number of trees: 500
## Forest terminal node size: 5
## Average no. of terminal nodes: 29.716
## No. of variables tried at each split: 3
## Total no. of variables: 7
## Total no. of responses: 6
## User has requested response: African
## Resampling used to grow trees: swor
## Resample size used to grow trees: 879
## Analysis: mRF-R
## Family: regr+
## Splitting rule: mv.mse *random*
## Number of random split points: 10
## (OOB) R squared: 0.87389042
## (OOB) Requested performance error: 0.009397
#get predictions out of sample
rf_all_fit_preds = rf_all_fit$regrOutput %>% map_df(function(x) {
x$predicted.oob
}) %>% {
#unitize
./rowSums(.)
} %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, values_to = "pred_value") %>%
bind_cols(
d %>%
select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>%
select(-id)
)
#plot results
rf_all_fit_preds %>%
ggplot(aes(pred_value, true_value, color = name)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
scale_color_discrete("Ancestry") +
facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'
#overall correlation, across variables
GG_scatter(rf_all_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'
#each correlation by ancestry
rf_all_fit_preds %>% plyr::ddply("name", function(dd) {
wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})
Predict ancestry values from self-report data.
#make a recipe
all_recipe = recipe(SIRE_common ~ African + Amerindian + East_Asian + Oceanian + Central_Asian, data = d)
#make a model
#swap to multinom_reg() + nnet
all_model = svm_rbf(
cost = tune(),
rbf_sigma = tune()
) %>%
set_mode("classification")
#resampling method
set.seed(1)
all_folds = vfold_cv(d, v = 20)
#make workflow
all_wf = workflow() %>%
add_model(all_model) %>%
add_recipe(all_recipe)
#fit
all_fit = all_wf %>%
tune_grid(
resamples = all_folds,
grid = 25,
control = control_grid(
save_pred = TRUE
)
)
## ! Fold01: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold02: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold03: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold05: internal: No observations were detected in `truth` for level(s): 'Hispan...
## ! Fold06: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold08: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold09: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold10: internal: No observations were detected in `truth` for level(s): 'Asian,...
## ! Fold11: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold12: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold14: internal: No observations were detected in `truth` for level(s): 'Other'...
## ! Fold15: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold17: internal: No observations were detected in `truth` for level(s): 'Africa...
## ! Fold19: internal: No observations were detected in `truth` for level(s): 'Pacifi...
## ! Fold20: internal: No observations were detected in `truth` for level(s): 'Hispan...
#metrics
collect_metrics(all_fit)
show_best(all_fit, metric = "roc_auc")
show_best(all_fit, metric = "accuracy")
This does not enforce that predictions stay within 0-1, or sum to 1. Still, it works reasonably well.
#just regular ols
all_fit = lm(cbind(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) ~ Hispanic + Pacific_Islander + Asian + African_American + American_Indian + Other + White, data = d)
all_fit$coefficients
## African Amerindian East_Asian Oceanian Central_Asian
## (Intercept) 0.1269 0.063 0.227 0.01513 0.041
## HispanicTRUE 0.0127 0.174 -0.091 -0.00721 -0.020
## Pacific_IslanderTRUE -0.0305 -0.031 0.166 0.07589 -0.048
## AsianTRUE -0.0734 -0.048 0.442 -0.01033 0.061
## African_AmericanTRUE 0.5799 -0.069 -0.176 -0.01174 -0.024
## American_IndianTRUE -0.0099 0.013 -0.025 0.00021 -0.015
## OtherTRUE 0.1295 -0.054 -0.220 -0.01382 0.134
## WhiteTRUE -0.1090 -0.045 -0.202 -0.01249 -0.028
## European
## (Intercept) 0.526
## HispanicTRUE -0.069
## Pacific_IslanderTRUE -0.132
## AsianTRUE -0.371
## African_AmericanTRUE -0.298
## American_IndianTRUE 0.037
## OtherTRUE 0.024
## WhiteTRUE 0.397
#prep model preds
all_fit_preds = all_fit$fitted.values %>%
as.data.frame() %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, values_to = "pred_value") %>%
bind_cols(
d %>%
select(African, Amerindian, East_Asian, Oceanian, Central_Asian, European) %>%
mutate(id = 1:n()) %>%
pivot_longer(cols = -id, names_to = "true_name", values_to = "true_value") %>%
select(-id)
)
#plot results
all_fit_preds %>%
ggplot(aes(pred_value, true_value, color = name)) +
geom_count() +
geom_smooth(method = "lm") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
scale_color_discrete("Ancestry") +
facet_wrap("name", nrow = 3)
## `geom_smooth()` using formula 'y ~ x'
#overall correlation, across variables
GG_scatter(all_fit_preds, "pred_value", "true_value")
## `geom_smooth()` using formula 'y ~ x'
#each correlation by ancestry
all_fit_preds %>% plyr::ddply("name", function(dd) {
wtd.cor(dd$pred_value, dd$true_value) %>% as.data.frame()
})
#bounds
(all_fit_preds$pred_value<0) %>% mean()
## [1] 0.15
(all_fit_preds$pred_value>1) %>% mean()
## [1] 0
write_sessioninfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kernlab_0.9-29 ranger_0.13.1 nnet_7.3-16
## [4] vctrs_0.3.8 randomForestSRC_2.14.0 DirichletReg_0.7-1
## [7] yardstick_0.0.8 workflowsets_0.1.0 workflows_0.2.4
## [10] tune_0.1.6 rsample_0.1.1 recipes_0.1.17
## [13] parsnip_0.1.7 modeldata_0.1.1 infer_1.0.0
## [16] dials_0.0.10 scales_1.1.1 broom_0.7.10
## [19] tidymodels_0.1.4 ggeffects_1.1.1 rms_6.2-0
## [22] SparseM_1.81 kirkegaard_2021-12-14 rlang_0.4.12
## [25] metafor_3.0-2 Matrix_1.4-0 psych_2.1.9
## [28] magrittr_2.0.1 assertthat_0.2.1 weights_1.0.4
## [31] Hmisc_4.6-0 Formula_1.2-4 survival_3.2-13
## [34] lattice_0.20-45 forcats_0.5.1 stringr_1.4.0
## [37] dplyr_1.0.7 purrr_0.3.4 readr_2.1.0
## [40] tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5
## [43] tidyverse_1.3.1 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.1.1 lme4_1.1-27.1
## [4] htmlwidgets_1.5.4 grid_4.1.2 pROC_1.18.0
## [7] miscTools_0.6-26 munsell_0.5.0 codetools_0.2-18
## [10] future_1.23.0 withr_2.4.2 colorspace_2.0-2
## [13] highr_0.9 knitr_1.36 rstudioapi_0.13
## [16] stats4_4.1.2 listenv_0.8.0 labeling_0.4.2
## [19] mnormt_2.0.2 DiceDesign_1.9 farver_2.1.0
## [22] parallelly_1.28.1 generics_0.1.1 TH.data_1.1-0
## [25] ipred_0.9-12 xfun_0.28 R6_2.5.1
## [28] lhs_1.1.3 multcomp_1.4-17 gtable_0.3.0
## [31] globals_0.14.0 conquer_1.2.1 sandwich_3.0-1
## [34] timeDate_3043.102 MatrixModels_0.5-0 splines_4.1.2
## [37] ModelMetrics_1.2.2.2 checkmate_2.0.0 yaml_2.2.1
## [40] reshape2_1.4.4 modelr_0.1.8 backports_1.3.0
## [43] caret_6.0-90 DiagrammeR_1.0.6.1 tools_4.1.2
## [46] lava_1.6.10 ellipsis_0.3.2 jquerylib_0.1.4
## [49] multilevel_2.6 RColorBrewer_1.1-2 Rcpp_1.0.7
## [52] plyr_1.8.6 base64enc_0.1-3 visNetwork_2.1.0
## [55] rpart_4.1-15 zoo_1.8-9 haven_2.4.3
## [58] cluster_2.1.2 fs_1.5.0 furrr_0.2.3
## [61] data.table_1.14.2 reprex_2.0.1 GPfit_1.0-8
## [64] tmvnsim_1.0-2 mvtnorm_1.1-3 matrixStats_0.61.0
## [67] hms_1.1.1 evaluate_0.14 jpeg_0.1-9
## [70] readxl_1.3.1 gridExtra_2.3 compiler_4.1.2
## [73] psychometric_2.2 mice_3.13.0 crayon_1.4.2
## [76] minqa_1.2.4 htmltools_0.5.2 mgcv_1.8-38
## [79] tzdb_0.2.0 lubridate_1.8.0 DBI_1.1.1
## [82] sjlabelled_1.1.8 dbplyr_2.1.1 MASS_7.3-54
## [85] boot_1.3-28 data.tree_1.0.0 cli_3.1.0
## [88] gdata_2.18.0 parallel_4.1.2 insight_0.14.5
## [91] gower_0.2.2 pkgconfig_2.0.3 foreign_0.8-81
## [94] xml2_1.3.2 foreach_1.5.1 bslib_0.3.1
## [97] hardhat_0.1.6 prodlim_2019.11.13 rvest_1.0.2
## [100] digest_0.6.28 rmarkdown_2.11 cellranger_1.1.0
## [103] htmlTable_2.3.0 maxLik_1.5-2 gtools_3.9.2
## [106] quantreg_5.86 nloptr_1.2.2.3 lifecycle_1.0.1
## [109] nlme_3.1-152 jsonlite_1.7.2 viridisLite_0.4.0
## [112] fansi_0.5.0 pillar_1.6.4 fastmap_1.1.0
## [115] httr_1.4.2 glue_1.5.0 png_0.1-7
## [118] iterators_1.0.13 class_7.3-19 stringi_1.7.5
## [121] sass_0.4.0 polspline_1.1.19 latticeExtra_0.6-29
## [124] mathjaxr_1.4-0 future.apply_1.8.1
#upload to OSF
#avoid uploading the data in case they freak out again
if (F) {
library(osfr)
#auth
osf_auth(readr::read_lines("~/.config/osf_token"))
#the project we will use
osf_proj = osf_retrieve_node("https://osf.io/qxvg8/")
#upload files
#overwrite existing (versioning)
osf_upload(osf_proj, conflicts = "overwrite",
path = c(
"figs",
"data/public",
"notebook.html",
"notebook.Rmd",
"Family income, parental education and brain structure.pdf"
))
}