library(data.table)
library(knitr)
library(kableExtra)
library(ggplot2)
library(grf)
library(boot)
library(ranger)
This notebook predicts the prevalence of heterosexual (straight) sexual identity (sexid == 1) by state, based on 2017 responses.
It only predicts these prevalences for states that have 2017 data for the sexual identity question, so that the quality of the predictions can be evaluated.
Each state’s prediction is generated by fitting a model on the data from all the other states with responses to the relevant question. Finally, I report each state’s prevalence prediction interval alongside the confidence interval reported by the CDC.
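Schematically, each estimate comes out of a leave-one-state-out loop. Here is a toy sketch of that structure with a trivial stand-in "model" (just the training-set mean); the real model and interval construction are defined below.
loso_toy = data.table(sitecode = rep(c("A", "B", "C"), each = 4), Y = rbinom(12, 1, 0.8))
rbindlist(lapply(loso_toy[, unique(sitecode)], function(s) {
  mu = loso_toy[sitecode != s, mean(Y)]  # "fit" on all other states
  data.table(state = s, pred = mu)       # "predict" the held-out state
}))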
states_combined_dt = fread('~/Downloads/SADCQN_2017_State.csv')
subset_dt = states_combined_dt[year == 2017]
states_w_responses = subset_dt[!is.na(sexid), unique(sitecode)]
subset_dt = subset_dt[sitecode %in% states_w_responses]
subset_dt[, Y := ifelse(sexid==1, 1, 0)]
subset_dt[, table(Y, useNA="always")]
## Y
## 0 1 <NA>
## 13749 69866 3020
get_prediction_prevalence_interval_for_state = function(prediction_state, dt) {
# Filter to training data that does not include prediction state and has responses to relevant question
train_dt = dt[sitecode != prediction_state & !is.na(sexid)]
# Define feature matrix (all other questions; 0s filled for non-response)
qs = grep("qn", names(train_dt), value = T)
preds = c('age', 'sex', 'grade', 'race4', 'race7', 'bmi', 'bmipct', qs)
train_mm = model.matrix.lm(as.formula(paste("~", paste(preds, collapse = "+"))),
data=train_dt, na.action = "na.pass")
train_mm[is.na(train_mm)] = 0
# Train Random Forest (all obs equally weighted; not using response weights)
ranf = ranger(
x = train_mm,
y = train_dt[, as.factor(Y)],
#case.weights = train_dt[, weight/max(weight)],
splitrule = "hellinger",
num.threads = 8,
probability = TRUE,
num.trees = 200,
min.node.size = 100,
class.weights = c(0.5, 0.5)
)
# Make predictions for survey responses from prediction state
predict_dt = dt[sitecode == prediction_state]
predict_mm = model.matrix.lm(as.formula(paste("~", paste(preds, collapse = "+"))),
data=predict_dt, na.action = "na.pass")
predict_mm[is.na(predict_mm)] = 0
predict_dt[, pred := predict(ranf, data=predict_mm)$predictions[,2]]
# Generate prevalence prediction interval by simulating responses with RF probabilities and bootstrapping those responses.
bt = boot(
data = predict_dt,
R = 500,
parallel = "multicore",
ncpus = 8,
statistic = function(data, ind) {
bs_dt = copy(data)[ind]
bs_dt[, sim_outcome := rbinom(.N, 1, pred)]
weighted.mean(bs_dt[, sim_outcome], bs_dt[, weight])
})
btci = boot.ci(bt, type="norm")
return(data.table(
state = prediction_state,
lb = btci$normal[2],
pe = btci$t0,
ub = btci$normal[3]
))
}
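# Aside: model.matrix.lm() with na.action = "na.pass" keeps rows that contain
# missing predictors (plain model.matrix() would silently drop them), which is
# what allows the zero-fill for non-response above. Toy illustration on made-up data:
mm_toy = data.frame(a = c(1, NA, 3), b = c(NA, 2, 3))
mm = model.matrix.lm(~ a + b, data = mm_toy, na.action = "na.pass")
mm[is.na(mm)] = 0  # all three rows survive, with NAs replaced by 0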
# TEST
get_prediction_prevalence_interval_for_state("AR", subset_dt)
## state lb pe ub
## 1: AR 0.7773636 0.8042003 0.8234579
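To see the simulate-then-resample step in isolation, here is the same interval construction run on made-up prediction probabilities and weights (toy data, not the survey):
set.seed(1)
toy_dt = data.table(pred = runif(1000, 0.7, 0.9), weight = runif(1000, 0.5, 2))
toy_bt = boot(
  data = toy_dt,
  R = 200,
  statistic = function(data, ind) {
    bs_dt = data[ind]
    bs_dt[, sim_outcome := rbinom(.N, 1, pred)]  # simulate one response per row
    weighted.mean(bs_dt[, sim_outcome], bs_dt[, weight])
  })
boot.ci(toy_bt, type = "norm")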
Generate prevalence predictions for all states with reported 2017 prevalences and YRBS data
states_to_predict_prevalences_for = c(
"AZ", "AR", "CA", "CO", "CT", "DE", "FL", "HI",
"IL", "IA", "KY", "ME", "MD", "MA", "MI", "NE",
"NV", "NH", "NM", "NY", "NC", "ND", "OK", "PE",
"RI", "SC", "TX", "VT", "WV", "WI"
)
states_to_predict_prevalences_for = intersect(
states_to_predict_prevalences_for,
states_combined_dt[year == 2017, unique(sitecode)]
)
state_pred_prevalences = lapply(
states_to_predict_prevalences_for,
function(x) get_prediction_prevalence_interval_for_state(x, subset_dt))
state_pred_prevalences_dt = rbindlist(state_pred_prevalences)
state_pred_prevalences_dt = state_pred_prevalences_dt[, .(state, source="Predicted", lb, pe, ub)]
state_pred_prevalences_dt
## state source lb pe ub
## 1: AR Predicted 0.7839105 0.8077443 0.8338004
## 2: CA Predicted 0.7920725 0.8223515 0.8303852
## 3: CO Predicted 0.8015656 0.8192774 0.8436876
## 4: DE Predicted 0.7977792 0.8218855 0.8323944
## 5: FL Predicted 0.8272247 0.8370352 0.8465538
## 6: HI Predicted 0.7349328 0.7645957 0.7676759
## 7: IL Predicted 0.8194219 0.8389399 0.8535150
## 8: IA Predicted 0.8301382 0.8576316 0.8729963
## 9: KY Predicted 0.8122003 0.8297958 0.8495359
## 10: ME Predicted 0.8088160 0.8170099 0.8252589
## 11: MI Predicted 0.8165287 0.8412279 0.8554139
## 12: NE Predicted 0.8278458 0.8503363 0.8721054
## 13: NV Predicted 0.7936053 0.8183229 0.8316333
## 14: NH Predicted 0.7649778 0.7735722 0.7841233
## 15: NY Predicted 0.7198647 0.7342966 0.7529940
## 16: NC Predicted 0.8096432 0.8265769 0.8388776
## 17: ND Predicted 0.7821303 0.8008289 0.8187600
## 18: OK Predicted 0.8429576 0.8587368 0.8868388
## 19: RI Predicted 0.7875980 0.8085268 0.8300954
## 20: SC Predicted 0.7882436 0.8138065 0.8329257
## 21: WV Predicted 0.8487687 0.8593614 0.8884545
## 22: WI Predicted 0.8136601 0.8332330 0.8474723
## state source lb pe ub
Load CDC-reported values for comparison. (Source: https://www.cdc.gov/healthyyouth/data/yrbs/2017_tables/introduction.htm#t4_down)
raw_true_dt = fread("~/Downloads/yrbs2017ReportTable4.csv")
raw_true_dt[, c("lb", "ub") := tstrsplit(HeteroCI, "–", fixed=TRUE)]
raw_true_dt[, lb := as.numeric(gsub("\\(","",lb))/100]
raw_true_dt[, ub := as.numeric(gsub("\\)","",ub))/100]
raw_true_dt[, pe := HeteroPct/100]
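# Sanity check of the CI parsing on a single made-up string; this assumes the
# HeteroCI column is formatted like "(76.6–87.0)" (inferred from the code above):
ci_toy = data.table(HeteroCI = "(76.6–87.0)")
ci_toy[, c("lb", "ub") := tstrsplit(HeteroCI, "–", fixed = TRUE)]
ci_toy[, .(lb = as.numeric(gsub("\\(", "", lb))/100,
           ub = as.numeric(gsub("\\)", "", ub))/100)]
# -> lb = 0.766, ub = 0.870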
# state abbreviations
state_xw = unique(states_combined_dt[, .(sitename, state = sitecode)])
state_xw[, Region := tstrsplit(sitename, "\\(", keep=1)]
state_xw[, Region := trimws(Region)]
true_dt = raw_true_dt[, .(Region, lb, pe, ub)]
true_dt = merge(true_dt, state_xw, by='Region')
true_dt = true_dt[, .(state, source = 'True', lb, pe, ub)]
true_dt = true_dt[state %in% state_pred_prevalences_dt[,unique(state)]]
true_dt
## state source lb pe ub
## 1: AR True 0.766 0.824 0.870
## 2: CA True 0.847 0.872 0.894
## 3: CO True 0.808 0.849 0.883
## 4: DE True 0.842 0.863 0.882
## 5: FL True 0.833 0.843 0.853
## 6: HI True 0.826 0.842 0.856
## 7: IL True 0.827 0.847 0.865
## 8: IA True 0.846 0.871 0.893
## 9: KY True 0.820 0.847 0.872
## 10: ME True 0.834 0.847 0.858
## 11: MI True 0.821 0.851 0.876
## 12: NE True 0.833 0.865 0.893
## 13: NV True 0.805 0.830 0.852
## 14: NH True 0.849 0.858 0.866
## 15: NY True 0.773 0.799 0.823
## 16: NC True 0.830 0.851 0.871
## 17: ND True 0.851 0.868 0.883
## 18: OK True 0.828 0.864 0.893
## 19: RI True 0.810 0.839 0.864
## 20: SC True 0.804 0.836 0.863
## 21: WV True 0.847 0.880 0.907
## 22: WI True 0.834 0.859 0.882
## state source lb pe ub
Merge predicted and true values for evaluation
eval_dt = rbind(state_pred_prevalences_dt, true_dt)
ggplot(eval_dt,
aes(x=NA, color=source, y=pe, ymin=lb, ymax=ub)) + geom_pointrange(position = position_dodge(width=1)) +
facet_wrap(~state, ncol = 11) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
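As a numeric complement to the plot, one simple summary is the share of states whose prediction interval overlaps the CDC confidence interval (a quick extra check, not shown in the tables above):
overlap_dt = merge(
  state_pred_prevalences_dt[, .(state, pred_lb = lb, pred_ub = ub)],
  true_dt[, .(state, true_lb = lb, true_ub = ub)],
  by = "state")
# Two intervals overlap iff each lower bound is at or below the other's upper bound
overlap_dt[, overlaps := pred_lb <= true_ub & true_lb <= pred_ub]
overlap_dt[, mean(overlaps)]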
Finally, I fit a Random Forest model on all states with data to see which predictors the model leans on most when generating the predicted responses.
# Define feature matrix (all other questions; 0s filled for non-response)
train_dt = subset_dt[!is.na(sexid)]
train_dt[, Y := ifelse(sexid==1, 1, 0)]
qs = grep("qn", names(train_dt), value = T)
preds = c('age', 'sex', 'grade', 'race4', 'race7', 'bmi', 'bmipct', qs)
train_mm = model.matrix.lm(as.formula(paste("~", paste(preds, collapse = "+"))),
data=train_dt, na.action = "na.pass")
train_mm[is.na(train_mm)] = 0
# Train Random Forest (all obs equally weighted; not using response weights)
ranf = ranger(
x = train_mm,
y = train_dt[, as.factor(Y)],
#case.weights = train_dt[, weight/max(weight)],
splitrule = "hellinger",
num.threads = 8,
probability = TRUE,
num.trees = 200,
min.node.size = 100,
class.weights = c(0.5, 0.5),
importance = "impurity"
)
varimp_dt =
data.table(
var = names(ranf$variable.importance),
imp = as.numeric(ranf$variable.importance)
)
# top 20
varimp_dt[order(-imp)[1:20]]
## var imp
## 1: bmi 30.827257
## 2: bmipct 30.776251
## 3: age 13.872051
## 4: race7 12.338699
## 5: grade 11.097241
## 6: race4 8.977958
## 7: sex 8.393406
## 8: qn25 6.969098
## 9: qn42 6.293655
## 10: qn88 6.251604
## 11: qn11 6.233816
## 12: qn41 6.021210
## 13: qn81 5.983249
## 14: qn43 5.758539
## 15: qn26 5.651594
## 16: qn89 5.559255
## 17: qn83 5.554190
## 18: qn79 5.450251
## 19: qn17 5.280817
## 20: qnpa7day 5.077914
The questions in the top 20 may change slightly between runs due to randomness. In total, 173 predictors were given to the model, 166 of which were responses to individual survey questions.
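For a visual version of the importance ranking, the top 20 can also be plotted straight from varimp_dt (a quick sketch using the ggplot2 loaded above):
top20_dt = varimp_dt[order(-imp)][1:20]
ggplot(top20_dt, aes(x = reorder(var, imp), y = imp)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Impurity importance")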