library(data.table)
library(knitr)
library(kableExtra)
library(ggplot2)

Goal: Predict weights with demographics so we can generate prevalence estimates from unweighted data.

Load state survey data

states_combined_dt = fread('~/Downloads/SADCQN_2017_State.csv')

Inspect weights

states_combined_dt[year==2017, summary(weight)]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.38   33.00   84.24  112.47 2247.18
states_combined_dt[year==2017, uniqueN(weight)]
## [1] 30860
ggplot(states_combined_dt[year==2017], aes(x=log(1+weight), color=factor(sex))) +
  facet_grid(grade~race4) + geom_density()

There appears to be a ton of variation in weights so they’re likely difficult to predict.

But let’s try to predict them anyways and generate a prevalance estimate using these predictions as weights to compare with the true estimate with real weights.

library(grf)
mm = model.matrix.lm(~age + sex + grade + race4 + race7, data=states_combined_dt[year==2017], na.action = "na.pass")
mm[is.na(mm)] = 0
rfm = regression_forest(
  X = mm,
  Y = states_combined_dt[year==2017, weight],
  num.threads = 8
)
# Var imp
cbind(colnames(rfm$X.orig), variable_importance(rfm))
##      [,1]          [,2]                
## [1,] "(Intercept)" "0"                 
## [2,] "age"         "0.0551563152273913"
## [3,] "sex"         "0.0211602953825601"
## [4,] "grade"       "0.089729136006405" 
## [5,] "race4"       "0.648300595827476" 
## [6,] "race7"       "0.185653657556167"
# R-sq
RSS = var(states_combined_dt[year==2017, weight] - predict(rfm)$predictions)
TSS = var(states_combined_dt[year==2017, weight])
1 - RSS/TSS
## [1] 0.01773006

Horrible!

Let’s see how these weights do at estimating prevalence

# True prevalence
library(boot)
states_combined_dt[year == 2017, pred_weight := predict(rfm)$predictions]
subset_dt = states_combined_dt[year == 2017 & sexpart2 == 3 & sitename == "New York (NY)", .(qn64, weight, pred_weight)]
bt = boot(
  data = subset_dt,
  R = 500,
  statistic = function(data, ind) weighted.mean(data[ind, qn64==2], data[ind, weight], na.rm=T))
quantile(bt$t, c(.025, 0.5, 0.975))
##      2.5%       50%     97.5% 
## 0.4919417 0.6002045 0.7180812

Prevalence with estimated weights

bt = boot(
  data = subset_dt,
  R = 500,
  statistic = function(data, ind) weighted.mean(data[ind, qn64==2], data[ind, pred_weight], na.rm=T))
quantile(bt$t, c(.025, 0.5, 0.975))
##      2.5%       50%     97.5% 
## 0.5268696 0.5870305 0.6559917

Similar but narrower interval, probably due to RF smoothing weights.

Could maybe fix this by randomly drawing from residuals from predictions.

Let’s continue this work if we think we’ll have access to unweighted data.