Set up environment

#####STEP 0-1: Reset environment #####
rm(list=ls())
knitr::opts_chunk$set(echo = TRUE)
options(repos = structure(c(CRAN = "http://cran.rstudio.com/")))

#####STEP 0-2: Install packages #####
list.of.packages <- c("grf", "metafor", "splitstackshape", "dplyr", "tidyverse", "foreach", "cowplot",
                     "reshape2", "doParallel", "survival", "readstata13", "ggplot2", "rsample", "DiagrammeR",
                     "e1071", "pscl", "pROC", "caret", "ModelMetrics", "MatchIt", "Hmisc", "scales",
                     "lmtest", "sandwich","haven", "rpms", "randomForest", "fabricatr", "gridExtra", 
                     "VIM", "mice", "missForest", "lmtest", "ivreg", "kableExtra", "missRanger")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

lapply(list.of.packages, library, character.only = TRUE)

## Warning: package 'grf' was built under R version 4.3.3

## Warning: package 'metafor' was built under R version 4.3.3

## Warning: package 'metadat' was built under R version 4.3.3

## Warning: package 'numDeriv' was built under R version 4.3.1

## Warning: package 'splitstackshape' was built under R version 4.3.3

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tibble' was built under R version 4.3.3

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'purrr' was built under R version 4.3.3

## Warning: package 'stringr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## Warning: package 'foreach' was built under R version 4.3.3

## Warning: package 'cowplot' was built under R version 4.3.3

## Warning: package 'reshape2' was built under R version 4.3.3

## Warning: package 'doParallel' was built under R version 4.3.3

## Warning: package 'iterators' was built under R version 4.3.3

## Warning: package 'readstata13' was built under R version 4.3.3

## Warning: package 'rsample' was built under R version 4.3.3

## Warning: package 'DiagrammeR' was built under R version 4.3.3

## Warning: package 'e1071' was built under R version 4.3.3

## Warning: package 'pscl' was built under R version 4.3.3

## Warning: package 'pROC' was built under R version 4.3.3

## Warning: package 'caret' was built under R version 4.3.3

## Warning: package 'ModelMetrics' was built under R version 4.3.3

## Warning: package 'MatchIt' was built under R version 4.3.3

## Warning: package 'Hmisc' was built under R version 4.3.3

## Warning: package 'scales' was built under R version 4.3.3

## Warning: package 'lmtest' was built under R version 4.3.3

## Warning: package 'zoo' was built under R version 4.3.3

## Warning: package 'sandwich' was built under R version 4.3.3

## Warning: package 'haven' was built under R version 4.3.3

## Warning: package 'rpms' was built under R version 4.3.3

## Warning: package 'randomForest' was built under R version 4.3.3

## Warning: package 'fabricatr' was built under R version 4.3.3

## Warning: package 'gridExtra' was built under R version 4.3.3

## Warning: package 'VIM' was built under R version 4.3.3

## Warning: package 'colorspace' was built under R version 4.3.3

## Warning: package 'mice' was built under R version 4.3.3

## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package

## Warning: package 'missForest' was built under R version 4.3.3

## Warning: package 'ivreg' was built under R version 4.3.3

## Warning: package 'kableExtra' was built under R version 4.3.3

## Warning: package 'missRanger' was built under R version 4.3.3

print(paste("Version of grf package:", packageVersion("grf")))

#####STEP 0-3: Basic information #####
Sys.time()
# Get detailed R session and system information
session_info <- sessionInfo()
system_info <- Sys.info()
# Combine the output
list(session_info = session_info, system_info = system_info)

Non-run-specific parameters

#####STEP 0-4: Set non-run-specific parameters #####
seedset <- 777
numthreadsset <- min(6, parallel::detectCores()) 
if (numthreadsset!= 6) {
  print("the results of grf vary by num.thread (publication paper used num.thread=6)")
} 

cat("number of threads (affects grf results):", numthreadsset,"\n")

## number of threads (affects grf results): 6

# Set printing options
options(digits = 4)

Run-specific parameters

#####STEP 0-5: Set run-specific parameters #####
published_paper_run <- 0 # ANALYST FORM
if (published_paper_run == 1) {
  print("save intermediate files into Cleaned_input_data/As_published/empirical/ folders")
  warning("Changing this setting to 1 overwrites the input files required for replicating on different platforms.")
} else {
  print("save intermediate files into Cleaned_input_data/Testing/empirical/ folder")
}

## [1] "save intermediate files into Cleaned_input_data/Testing/empirical/ folder"

Set file paths

#####STEP 0-6: Set file paths #####
# Set the processed path based on published_paper_run value
# ***Important: for file paths to work, use the drop-down menu on "Knit" to select the "Current Working Directory" under "Knit Directory." This assumes that you have cloned the repo to your local directory.***
processedpath <- if (published_paper_run == 1) {
  "PP_Full_Analysis/Cleaned_input_data/As_published/empirical/"
} else {
  "PP_Full_Analysis/Cleaned_input_data/Testing/empirical/"
}

# Print the processed path
print(paste("Processed data path:", processedpath))

## [1] "Processed data path: PP_Full_Analysis/Cleaned_input_data/Testing/empirical/"

Load the data

#####STEP 2-1: Load data #####
newd_adjusted <- read.csv(paste0(processedpath, "1_Cleaned_Wide_Dataset.csv"))

print("Information on 1_Cleaned_Wide_Dataset:")

## [1] "Information on 1_Cleaned_Wide_Dataset:"

glimpse(newd_adjusted)

## Rows: 12,208
## Columns: 70
## $ X                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ person_id             <int> 5, 8, 16, 17, 18, 23, 24, 29, 47, 57, 59, 68, 70…
## $ weight_total_inp      <dbl> 1.1504, 0.8975, 1.0000, 1.2126, 1.0000, 1.0033, …
## $ gender_inp            <int> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ age_inp               <int> 60, 41, 39, 52, 51, 32, 34, 23, 43, 46, 38, 25, …
## $ ast_dx_pre_lottery    <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ dia_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hbp_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ chl_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ami_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ chf_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ emp_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ kid_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cancer_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ dep_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ diabetes              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hypertension          <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ highcholesterol       <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression            <int> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ phq                   <int> 1, 9, 2, 13, 2, 3, 2, 14, 11, 8, 7, 3, 2, 0, 2, …
## $ cvd_risk              <dbl> 0.1370, 0.1120, 0.0330, 0.2530, 0.1560, 0.0120, …
## $ doc_any               <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ ed_any                <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ hosp_any              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ oop_spend             <int> 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ catastrophic          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ debt                  <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ borrow                <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, …
## $ hispanic_inp          <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ race_white_inp        <int> 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ race_black_inp        <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ race_nwother_inp      <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ a1c                   <dbl> 5.037, 5.201, 5.854, 5.364, 5.527, 5.037, 5.446,…
## $ hdl_level             <dbl> 48.33, 51.33, 38.58, 51.33, 28.08, 31.08, 25.83,…
## $ chl_level             <dbl> 241.0, 229.9, 229.9, 235.4, 177.7, 173.8, 152.7,…
## $ bmi                   <dbl> 26.66, 35.23, 37.12, 24.81, 27.02, 26.26, 27.70,…
## $ sbp                   <int> 144, 134, 126, 168, 119, 98, 108, 125, 100, 104,…
## $ dbp                   <int> 81, 82, 94, 110, 79, 59, 63, 76, 77, 63, 84, 62,…
## $ prescriptions_any     <int> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ prescriptions         <int> 0, 2, 2, NA, 0, 0, 0, 3, 0, 3, 4, 0, 4, 2, 1, 4,…
## $ hypertension_med      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ cholesterol_med       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ diabetes_med          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression_med        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ household_id          <int> 100005, 102094, 140688, 100017, 100018, 115253, …
## $ eligibility           <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ numhh_list            <int> 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ ohp_all_ever_inperson <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ doc_num               <int> 0, 6, 12, 0, 0, 5, 0, 5, 1, 12, 6, 0, 3, 0, 10, …
## $ ed_num                <int> 0, 2, 1, 1, 0, 0, 0, 10, 2, 6, 0, 2, 0, 0, 0, 0,…
## $ hosp_num              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, …
## $ num_visit_pre_cens_ed <int> 0, 0, 1, NA, 2, 0, 0, 7, 0, NA, 0, 0, 1, 0, 0, 0…
## $ any_depres_pre_ed     <int> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0…
## $ charg_tot_pre_ed      <dbl> 0.0, 0.0, 1888.2, NA, 1715.3, 0.0, 0.0, 5743.9, …
## $ charg_tot_ed          <dbl> 0, 2751, 15233, NA, 0, 0, 0, 8436, 0, NA, 0, 0, …
## $ ed_charg_tot_pre_ed   <dbl> 0.0, 0.0, 1888.2, NA, 1006.3, 0.0, 0.0, 4542.4, …
## $ ed_charg_tot_ed       <dbl> 0.0, 2751.4, 7100.8, NA, 0.0, 0.0, 0.0, 7067.0, …
## $ lessHS                <int> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ HSorGED               <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ sbp_neg               <int> -144, -134, -126, -168, -119, -98, -108, -125, -…
## $ dbp_neg               <int> -81, -82, -94, -110, -79, -59, -63, -76, -77, -6…
## $ chl_level_neg         <dbl> -241.0, -229.9, -229.9, -235.4, -177.7, -173.8, …
## $ hdl_level_neg         <dbl> -48.33, -51.33, -38.58, -51.33, -28.08, -31.08, …
## $ a1c_neg               <dbl> -5.037, -5.201, -5.854, -5.364, -5.527, -5.037, …
## $ bmi_neg               <dbl> -26.66, -35.23, -37.12, -24.81, -27.02, -26.26, …
## $ phq_neg               <int> -1, -9, -2, -13, -2, -3, -2, -14, -11, -8, -7, -…
## $ cvd_risk_neg          <dbl> -0.1370, -0.1120, -0.0330, -0.2530, -0.1560, -0.…
## $ debt_neg              <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ borrow_neg            <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, …
## $ catastrophic_neg      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …

preimp_cov <- c("numhh_list", "gender_inp", "age_inp",
                "hispanic_inp", "race_white_inp", "race_black_inp", "race_nwother_inp", "ast_dx_pre_lottery",
                "dia_dx_pre_lottery", "hbp_dx_pre_lottery", "chl_dx_pre_lottery", "ami_dx_pre_lottery",
                "chf_dx_pre_lottery", "emp_dx_pre_lottery", "kid_dx_pre_lottery", "cancer_dx_pre_lottery",
                "dep_dx_pre_lottery", "charg_tot_pre_ed", "ed_charg_tot_pre_ed", "num_visit_pre_cens_ed", "any_depres_pre_ed",
                "lessHS", "HSorGED")

# Data used for imputation
newd_bsl_preimp <- newd_adjusted[, preimp_cov]

# Get all column names from newd_adjusted that are not in preimp_cov
nonimp_cols <- setdiff(colnames(newd_adjusted), preimp_cov)

# Create data frame with non-imputed columns
newd_bsl_nonimp <- newd_adjusted[, nonimp_cols]

print("Information on data for imputation:")

## [1] "Information on data for imputation:"

glimpse(newd_bsl_preimp)

## Rows: 12,208
## Columns: 23
## $ numhh_list            <int> 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender_inp            <int> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ age_inp               <int> 60, 41, 39, 52, 51, 32, 34, 23, 43, 46, 38, 25, …
## $ hispanic_inp          <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ race_white_inp        <int> 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ race_black_inp        <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ race_nwother_inp      <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ast_dx_pre_lottery    <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ dia_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hbp_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ chl_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ami_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ chf_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ emp_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ kid_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cancer_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ dep_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ charg_tot_pre_ed      <dbl> 0.0, 0.0, 1888.2, NA, 1715.3, 0.0, 0.0, 5743.9, …
## $ ed_charg_tot_pre_ed   <dbl> 0.0, 0.0, 1888.2, NA, 1006.3, 0.0, 0.0, 4542.4, …
## $ num_visit_pre_cens_ed <int> 0, 0, 1, NA, 2, 0, 0, 7, 0, NA, 0, 0, 1, 0, 0, 0…
## $ any_depres_pre_ed     <int> 0, 0, 0, NA, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, 0, 0…
## $ lessHS                <int> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ HSorGED               <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, …

print("Missing values in data for imputation:")

## [1] "Missing values in data for imputation:"

colSums(is.na(newd_bsl_preimp))

##            numhh_list            gender_inp               age_inp 
##                     0                     0                     0 
##          hispanic_inp        race_white_inp        race_black_inp 
##                     0                     0                     0 
##      race_nwother_inp    ast_dx_pre_lottery    dia_dx_pre_lottery 
##                     0                     0                     0 
##    hbp_dx_pre_lottery    chl_dx_pre_lottery    ami_dx_pre_lottery 
##                     0                     0                     0 
##    chf_dx_pre_lottery    emp_dx_pre_lottery    kid_dx_pre_lottery 
##                     0                     0                     0 
## cancer_dx_pre_lottery    dep_dx_pre_lottery      charg_tot_pre_ed 
##                     0                     0                  2053 
##   ed_charg_tot_pre_ed num_visit_pre_cens_ed     any_depres_pre_ed 
##                  2058                  2055                  2050 
##                lessHS               HSorGED 
##                     0                     0

print("Information on data not for imputation:")

## [1] "Information on data not for imputation:"

glimpse(newd_bsl_nonimp)

## Rows: 12,208
## Columns: 47
## $ X                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ person_id             <int> 5, 8, 16, 17, 18, 23, 24, 29, 47, 57, 59, 68, 70…
## $ weight_total_inp      <dbl> 1.1504, 0.8975, 1.0000, 1.2126, 1.0000, 1.0033, …
## $ diabetes              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hypertension          <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ highcholesterol       <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression            <int> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ phq                   <int> 1, 9, 2, 13, 2, 3, 2, 14, 11, 8, 7, 3, 2, 0, 2, …
## $ cvd_risk              <dbl> 0.1370, 0.1120, 0.0330, 0.2530, 0.1560, 0.0120, …
## $ doc_any               <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ ed_any                <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ hosp_any              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ oop_spend             <int> 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ catastrophic          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ debt                  <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ borrow                <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, …
## $ a1c                   <dbl> 5.037, 5.201, 5.854, 5.364, 5.527, 5.037, 5.446,…
## $ hdl_level             <dbl> 48.33, 51.33, 38.58, 51.33, 28.08, 31.08, 25.83,…
## $ chl_level             <dbl> 241.0, 229.9, 229.9, 235.4, 177.7, 173.8, 152.7,…
## $ bmi                   <dbl> 26.66, 35.23, 37.12, 24.81, 27.02, 26.26, 27.70,…
## $ sbp                   <int> 144, 134, 126, 168, 119, 98, 108, 125, 100, 104,…
## $ dbp                   <int> 81, 82, 94, 110, 79, 59, 63, 76, 77, 63, 84, 62,…
## $ prescriptions_any     <int> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ prescriptions         <int> 0, 2, 2, NA, 0, 0, 0, 3, 0, 3, 4, 0, 4, 2, 1, 4,…
## $ hypertension_med      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ cholesterol_med       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ diabetes_med          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression_med        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ household_id          <int> 100005, 102094, 140688, 100017, 100018, 115253, …
## $ eligibility           <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ ohp_all_ever_inperson <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ doc_num               <int> 0, 6, 12, 0, 0, 5, 0, 5, 1, 12, 6, 0, 3, 0, 10, …
## $ ed_num                <int> 0, 2, 1, 1, 0, 0, 0, 10, 2, 6, 0, 2, 0, 0, 0, 0,…
## $ hosp_num              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, …
## $ charg_tot_ed          <dbl> 0, 2751, 15233, NA, 0, 0, 0, 8436, 0, NA, 0, 0, …
## $ ed_charg_tot_ed       <dbl> 0.0, 2751.4, 7100.8, NA, 0.0, 0.0, 0.0, 7067.0, …
## $ sbp_neg               <int> -144, -134, -126, -168, -119, -98, -108, -125, -…
## $ dbp_neg               <int> -81, -82, -94, -110, -79, -59, -63, -76, -77, -6…
## $ chl_level_neg         <dbl> -241.0, -229.9, -229.9, -235.4, -177.7, -173.8, …
## $ hdl_level_neg         <dbl> -48.33, -51.33, -38.58, -51.33, -28.08, -31.08, …
## $ a1c_neg               <dbl> -5.037, -5.201, -5.854, -5.364, -5.527, -5.037, …
## $ bmi_neg               <dbl> -26.66, -35.23, -37.12, -24.81, -27.02, -26.26, …
## $ phq_neg               <int> -1, -9, -2, -13, -2, -3, -2, -14, -11, -8, -7, -…
## $ cvd_risk_neg          <dbl> -0.1370, -0.1120, -0.0330, -0.2530, -0.1560, -0.…
## $ debt_neg              <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ borrow_neg            <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, …
## $ catastrophic_neg      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …

print("Missing values in data not for imputation:")

## [1] "Missing values in data not for imputation:"

colSums(is.na(newd_bsl_nonimp))

##                     X             person_id      weight_total_inp 
##                     0                     0                     0 
##              diabetes          hypertension       highcholesterol 
##                    41                   281                   332 
##            depression                   phq              cvd_risk 
##                   131                    64                  2801 
##               doc_any                ed_any              hosp_any 
##                    17                    18                    17 
##             oop_spend          catastrophic                  debt 
##                    28                   424                   114 
##                borrow                   a1c             hdl_level 
##                    10                    89                    57 
##             chl_level                   bmi                   sbp 
##                    55                    54                    41 
##                   dbp     prescriptions_any         prescriptions 
##                    41                     2                   311 
##      hypertension_med       cholesterol_med          diabetes_med 
##                     0                     0                     0 
##        depression_med          household_id           eligibility 
##                     0                     0                     0 
## ohp_all_ever_inperson               doc_num                ed_num 
##                     0                    64                    47 
##              hosp_num          charg_tot_ed       ed_charg_tot_ed 
##                    47                  2055                  2056 
##               sbp_neg               dbp_neg         chl_level_neg 
##                    41                    41                    55 
##         hdl_level_neg               a1c_neg               bmi_neg 
##                    57                    89                    54 
##               phq_neg          cvd_risk_neg              debt_neg 
##                    64                  2801                   114 
##            borrow_neg      catastrophic_neg 
##                    10                   424

Perform imputation

#####STEP 2-2: Random forest imputation #####

newd_preimp <- missRanger(newd_bsl_preimp, 
                         num.trees = 100, 
                         pmm.k = 5, 
                         verbose = 0, 
                         seed = seedset, 
                         num.threads = numthreadsset)

# Convert 'numhh_list' to factor
newd_preimp$numhh_list <- as.factor(newd_preimp$numhh_list)

# Combine imputed data and nonimputed data
newd_imp <- cbind(newd_preimp, newd_bsl_nonimp)

print("Summary statistics of the imputed data:")

## [1] "Summary statistics of the imputed data:"

summary(newd_preimp)

##  numhh_list   gender_inp       age_inp      hispanic_inp  race_white_inp 
##  1:9239     Min.   :0.000   Min.   :19.0   Min.   :0.00   Min.   :0.000  
##  2:2951     1st Qu.:0.000   1st Qu.:31.0   1st Qu.:0.00   1st Qu.:0.000  
##  3:  18     Median :1.000   Median :41.0   Median :0.00   Median :1.000  
##             Mean   :0.566   Mean   :40.8   Mean   :0.18   Mean   :0.687  
##             3rd Qu.:1.000   3rd Qu.:50.0   3rd Qu.:0.00   3rd Qu.:1.000  
##             Max.   :1.000   Max.   :71.0   Max.   :1.00   Max.   :1.000  
##  race_black_inp  race_nwother_inp ast_dx_pre_lottery dia_dx_pre_lottery
##  Min.   :0.000   Min.   :0.000    Min.   :0.000      Min.   :0.0000    
##  1st Qu.:0.000   1st Qu.:0.000    1st Qu.:0.000      1st Qu.:0.0000    
##  Median :0.000   Median :0.000    Median :0.000      Median :0.0000    
##  Mean   :0.103   Mean   :0.144    Mean   :0.193      Mean   :0.0711    
##  3rd Qu.:0.000   3rd Qu.:0.000    3rd Qu.:0.000      3rd Qu.:0.0000    
##  Max.   :1.000   Max.   :1.000    Max.   :1.000      Max.   :1.0000    
##  hbp_dx_pre_lottery chl_dx_pre_lottery ami_dx_pre_lottery chf_dx_pre_lottery
##  Min.   :0.000      Min.   :0.000      Min.   :0.0000     Min.   :0.0000    
##  1st Qu.:0.000      1st Qu.:0.000      1st Qu.:0.0000     1st Qu.:0.0000    
##  Median :0.000      Median :0.000      Median :0.0000     Median :0.0000    
##  Mean   :0.182      Mean   :0.127      Mean   :0.0197     Mean   :0.0111    
##  3rd Qu.:0.000      3rd Qu.:0.000      3rd Qu.:0.0000     3rd Qu.:0.0000    
##  Max.   :1.000      Max.   :1.000      Max.   :1.0000     Max.   :1.0000    
##  emp_dx_pre_lottery kid_dx_pre_lottery cancer_dx_pre_lottery dep_dx_pre_lottery
##  Min.   :0.000      Min.   :0.0000     Min.   :0.0000        Min.   :0.000     
##  1st Qu.:0.000      1st Qu.:0.0000     1st Qu.:0.0000        1st Qu.:0.000     
##  Median :0.000      Median :0.0000     Median :0.0000        Median :0.000     
##  Mean   :0.022      Mean   :0.0186     Mean   :0.0428        Mean   :0.341     
##  3rd Qu.:0.000      3rd Qu.:0.0000     3rd Qu.:0.0000        3rd Qu.:1.000     
##  Max.   :1.000      Max.   :1.0000     Max.   :1.0000        Max.   :1.000     
##  charg_tot_pre_ed ed_charg_tot_pre_ed num_visit_pre_cens_ed any_depres_pre_ed
##  Min.   :     0   Min.   :    0       Min.   : 0.000        Min.   :0.0000   
##  1st Qu.:     0   1st Qu.:    0       1st Qu.: 0.000        1st Qu.:0.0000   
##  Median :     0   Median :    0       Median : 0.000        Median :0.0000   
##  Mean   :  2216   Mean   :  913       Mean   : 0.798        Mean   :0.0157   
##  3rd Qu.:   774   3rd Qu.:  626       3rd Qu.: 1.000        3rd Qu.:0.0000   
##  Max.   :180055   Max.   :41246       Max.   :17.000        Max.   :1.0000   
##      lessHS         HSorGED     
##  Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.000  
##  Mean   :0.205   Mean   :0.454  
##  3rd Qu.:0.000   3rd Qu.:1.000  
##  Max.   :1.000   Max.   :1.000

print("Missing values in the imputed data:")

## [1] "Missing values in the imputed data:"

colSums(is.na(newd_preimp))

##            numhh_list            gender_inp               age_inp 
##                     0                     0                     0 
##          hispanic_inp        race_white_inp        race_black_inp 
##                     0                     0                     0 
##      race_nwother_inp    ast_dx_pre_lottery    dia_dx_pre_lottery 
##                     0                     0                     0 
##    hbp_dx_pre_lottery    chl_dx_pre_lottery    ami_dx_pre_lottery 
##                     0                     0                     0 
##    chf_dx_pre_lottery    emp_dx_pre_lottery    kid_dx_pre_lottery 
##                     0                     0                     0 
## cancer_dx_pre_lottery    dep_dx_pre_lottery      charg_tot_pre_ed 
##                     0                     0                     0 
##   ed_charg_tot_pre_ed num_visit_pre_cens_ed     any_depres_pre_ed 
##                     0                     0                     0 
##                lessHS               HSorGED 
##                     0                     0

print("Information on newd_imp:")

## [1] "Information on newd_imp:"

glimpse(newd_imp)

## Rows: 12,208
## Columns: 70
## $ numhh_list            <fct> 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender_inp            <int> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ age_inp               <int> 60, 41, 39, 52, 51, 32, 34, 23, 43, 46, 38, 25, …
## $ hispanic_inp          <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ race_white_inp        <int> 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ race_black_inp        <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ race_nwother_inp      <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ast_dx_pre_lottery    <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ dia_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hbp_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ chl_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ami_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ chf_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ emp_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ kid_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cancer_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ dep_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ charg_tot_pre_ed      <dbl> 0.0, 0.0, 1888.2, 0.0, 1715.3, 0.0, 0.0, 5743.9,…
## $ ed_charg_tot_pre_ed   <dbl> 0.0, 0.0, 1888.2, 0.0, 1006.3, 0.0, 0.0, 4542.4,…
## $ num_visit_pre_cens_ed <int> 0, 0, 1, 0, 2, 0, 0, 7, 0, 2, 0, 0, 1, 0, 0, 0, …
## $ any_depres_pre_ed     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ lessHS                <int> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ HSorGED               <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ X                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ person_id             <int> 5, 8, 16, 17, 18, 23, 24, 29, 47, 57, 59, 68, 70…
## $ weight_total_inp      <dbl> 1.1504, 0.8975, 1.0000, 1.2126, 1.0000, 1.0033, …
## $ diabetes              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hypertension          <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ highcholesterol       <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression            <int> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ phq                   <int> 1, 9, 2, 13, 2, 3, 2, 14, 11, 8, 7, 3, 2, 0, 2, …
## $ cvd_risk              <dbl> 0.1370, 0.1120, 0.0330, 0.2530, 0.1560, 0.0120, …
## $ doc_any               <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ ed_any                <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ hosp_any              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ oop_spend             <int> 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ catastrophic          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ debt                  <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ borrow                <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, …
## $ a1c                   <dbl> 5.037, 5.201, 5.854, 5.364, 5.527, 5.037, 5.446,…
## $ hdl_level             <dbl> 48.33, 51.33, 38.58, 51.33, 28.08, 31.08, 25.83,…
## $ chl_level             <dbl> 241.0, 229.9, 229.9, 235.4, 177.7, 173.8, 152.7,…
## $ bmi                   <dbl> 26.66, 35.23, 37.12, 24.81, 27.02, 26.26, 27.70,…
## $ sbp                   <int> 144, 134, 126, 168, 119, 98, 108, 125, 100, 104,…
## $ dbp                   <int> 81, 82, 94, 110, 79, 59, 63, 76, 77, 63, 84, 62,…
## $ prescriptions_any     <int> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ prescriptions         <int> 0, 2, 2, NA, 0, 0, 0, 3, 0, 3, 4, 0, 4, 2, 1, 4,…
## $ hypertension_med      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ cholesterol_med       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ diabetes_med          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression_med        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ household_id          <int> 100005, 102094, 140688, 100017, 100018, 115253, …
## $ eligibility           <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ ohp_all_ever_inperson <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ doc_num               <int> 0, 6, 12, 0, 0, 5, 0, 5, 1, 12, 6, 0, 3, 0, 10, …
## $ ed_num                <int> 0, 2, 1, 1, 0, 0, 0, 10, 2, 6, 0, 2, 0, 0, 0, 0,…
## $ hosp_num              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, …
## $ charg_tot_ed          <dbl> 0, 2751, 15233, NA, 0, 0, 0, 8436, 0, NA, 0, 0, …
## $ ed_charg_tot_ed       <dbl> 0.0, 2751.4, 7100.8, NA, 0.0, 0.0, 0.0, 7067.0, …
## $ sbp_neg               <int> -144, -134, -126, -168, -119, -98, -108, -125, -…
## $ dbp_neg               <int> -81, -82, -94, -110, -79, -59, -63, -76, -77, -6…
## $ chl_level_neg         <dbl> -241.0, -229.9, -229.9, -235.4, -177.7, -173.8, …
## $ hdl_level_neg         <dbl> -48.33, -51.33, -38.58, -51.33, -28.08, -31.08, …
## $ a1c_neg               <dbl> -5.037, -5.201, -5.854, -5.364, -5.527, -5.037, …
## $ bmi_neg               <dbl> -26.66, -35.23, -37.12, -24.81, -27.02, -26.26, …
## $ phq_neg               <int> -1, -9, -2, -13, -2, -3, -2, -14, -11, -8, -7, -…
## $ cvd_risk_neg          <dbl> -0.1370, -0.1120, -0.0330, -0.2530, -0.1560, -0.…
## $ debt_neg              <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ borrow_neg            <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, …
## $ catastrophic_neg      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …

print("Missing values in newd_imp:")

## [1] "Missing values in newd_imp:"

colSums(is.na(newd_imp))

##            numhh_list            gender_inp               age_inp 
##                     0                     0                     0 
##          hispanic_inp        race_white_inp        race_black_inp 
##                     0                     0                     0 
##      race_nwother_inp    ast_dx_pre_lottery    dia_dx_pre_lottery 
##                     0                     0                     0 
##    hbp_dx_pre_lottery    chl_dx_pre_lottery    ami_dx_pre_lottery 
##                     0                     0                     0 
##    chf_dx_pre_lottery    emp_dx_pre_lottery    kid_dx_pre_lottery 
##                     0                     0                     0 
## cancer_dx_pre_lottery    dep_dx_pre_lottery      charg_tot_pre_ed 
##                     0                     0                     0 
##   ed_charg_tot_pre_ed num_visit_pre_cens_ed     any_depres_pre_ed 
##                     0                     0                     0 
##                lessHS               HSorGED                     X 
##                     0                     0                     0 
##             person_id      weight_total_inp              diabetes 
##                     0                     0                    41 
##          hypertension       highcholesterol            depression 
##                   281                   332                   131 
##                   phq              cvd_risk               doc_any 
##                    64                  2801                    17 
##                ed_any              hosp_any             oop_spend 
##                    18                    17                    28 
##          catastrophic                  debt                borrow 
##                   424                   114                    10 
##                   a1c             hdl_level             chl_level 
##                    89                    57                    55 
##                   bmi                   sbp                   dbp 
##                    54                    41                    41 
##     prescriptions_any         prescriptions      hypertension_med 
##                     2                   311                     0 
##       cholesterol_med          diabetes_med        depression_med 
##                     0                     0                     0 
##          household_id           eligibility ohp_all_ever_inperson 
##                     0                     0                     0 
##               doc_num                ed_num              hosp_num 
##                    64                    47                    47 
##          charg_tot_ed       ed_charg_tot_ed               sbp_neg 
##                  2055                  2056                    41 
##               dbp_neg         chl_level_neg         hdl_level_neg 
##                    41                    55                    57 
##               a1c_neg               bmi_neg               phq_neg 
##                    89                    54                    64 
##          cvd_risk_neg              debt_neg            borrow_neg 
##                  2801                   114                    10 
##      catastrophic_neg 
##                   424

Write output file

#####STEP 2-3: Write output file #####
write.csv(newd_imp, paste0(processedpath, "2_Imputed_Wide_Dataset.csv"), row.names = FALSE)

2_Impute_Missing_Data

Kosuke Inoue, Revati Kapshikar, Kristine Koutout

2025-01-13

Set up environment

Non-run-specific parameters

Run-specific parameters

Set file paths

Load the data

Perform imputation

Write output file