library(data.table)
library(knitr)
library(kableExtra)
library(ggplot2)
Load state survey data
states_combined_dt = fread('~/Downloads/SADCQN_2017_State.csv')
Inspect weights
states_combined_dt[year==2017, summary(weight)]
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.38 33.00 84.24 112.47 2247.18
states_combined_dt[year==2017, uniqueN(weight)]
## [1] 30860
ggplot(states_combined_dt[year==2017], aes(x=log(1+weight), color=factor(sex))) +
facet_grid(grade~race4) + geom_density()
There appears to be a ton of variation in weights so they’re likely difficult to predict.
But let’s try to predict them anyways and generate a prevalance estimate using these predictions as weights to compare with the true estimate with real weights.
library(grf)
mm = model.matrix.lm(~age + sex + grade + race4 + race7, data=states_combined_dt[year==2017], na.action = "na.pass")
mm[is.na(mm)] = 0
rfm = regression_forest(
X = mm,
Y = states_combined_dt[year==2017, weight],
num.threads = 8
)
# Var imp
cbind(colnames(rfm$X.orig), variable_importance(rfm))
## [,1] [,2]
## [1,] "(Intercept)" "0"
## [2,] "age" "0.0551563152273913"
## [3,] "sex" "0.0211602953825601"
## [4,] "grade" "0.089729136006405"
## [5,] "race4" "0.648300595827476"
## [6,] "race7" "0.185653657556167"
# R-sq
RSS = var(states_combined_dt[year==2017, weight] - predict(rfm)$predictions)
TSS = var(states_combined_dt[year==2017, weight])
1 - RSS/TSS
## [1] 0.01773006
Horrible!
Let’s see how these weights do at estimating prevalence
# True prevalence
library(boot)
states_combined_dt[year == 2017, pred_weight := predict(rfm)$predictions]
subset_dt = states_combined_dt[year == 2017 & sexpart2 == 3 & sitename == "New York (NY)", .(qn64, weight, pred_weight)]
bt = boot(
data = subset_dt,
R = 500,
statistic = function(data, ind) weighted.mean(data[ind, qn64==2], data[ind, weight], na.rm=T))
quantile(bt$t, c(.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## 0.4919417 0.6002045 0.7180812
Prevalence with estimated weights
bt = boot(
data = subset_dt,
R = 500,
statistic = function(data, ind) weighted.mean(data[ind, qn64==2], data[ind, pred_weight], na.rm=T))
quantile(bt$t, c(.025, 0.5, 0.975))
## 2.5% 50% 97.5%
## 0.5268696 0.5870305 0.6559917
Similar but narrower interval, probably due to RF smoothing weights.
Could maybe fix this by randomly drawing from residuals from predictions.
Let’s continue this work if we think we’ll have access to unweighted data.