Set up environment

#####STEP 0-1: Reset environment #####
rm(list=ls())
knitr::opts_chunk$set(echo = TRUE)
options(repos = structure(c(CRAN = "http://cran.rstudio.com/")))

#####STEP 0-2: Install packages #####
list.of.packages <- c("dplyr", "tidyverse", "grf")
new.packages <- list.of.packages[!(list.of.packages %in% 
                                 installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

lapply(list.of.packages, library, character.only = TRUE)

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tibble' was built under R version 4.3.3

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'purrr' was built under R version 4.3.3

## Warning: package 'stringr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## Warning: package 'grf' was built under R version 4.3.3

#####STEP 0-3: Basic information #####
Sys.time()
# Get detailed R session and system information
session_info <- sessionInfo()
system_info <- Sys.info()
# Combine the output
list(session_info = session_info, system_info = system_info)

Set non-run-specific parameters

#####STEP 0-4: Set parameters #####
seedset <- 777
set.seed(seedset)

# Set printing options
options(digits = 4)

Set run-specific parameters

#####STEP 0-5: Set run-specific parameters #####
published_paper_run <- 0
if (published_paper_run == 1) {
  print("save intermediate files into Cleaned_input_data/As_published folders")
  warning("Changing this setting to 1 overwrites the input files required for 
          replicating on different platforms.")
} else {
  print("save intermediate files into Cleaned_input_data/Testing folder")
}

## [1] "save intermediate files into Cleaned_input_data/Testing folder"

# Create a descriptive simulated_run_name
simulated_run_name <- "exp_tot_dx_index"

# List of all outcomes that will be simulated. These are the column names in the dataset
# that will be modified by the simulation process. The "_neg" suffix indicates these 
# outcomes have been reversed (higher values are better)
all_outcomes <- list(
    "sbp_neg"
)

# List of base outcome names (without "_neg") that have a negative treatment effect
# This is used to determine the direction of the effect in the simulation:
# - If an outcome is in this list, the treatment effect will be reversed
# - Kristine may drop this list in the future - we are not currently using it because _neg outcomes are generated in 1_Generate_Simulated_Data.Rmd
negative_treatment_effect <- c(
)

# Need to specify list of continuous and binary outcomes.
# List of continuous outcomes that will be modified by the simulation
continuous_outcomes_to_modify <- c(
    "sbp_neg"
)

# List of binary (0/1) outcomes that will be modified by the simulation
binary_outcomes_to_modify <- c(
)

# Set scalar on the plustau function - alpha
alpha_term <- 5 # Using a decent value from previous runs. Adding code snippet to test different alphas in code chunk below

# Set additive term on the plustau function - gamma
gamma_term <- "ed_charg_tot_pre_ed" # Changed from numeric value to covariate name

# Set the percentile at which to cap the tot_charg_pre_ed variable
percentile <- 0.80

Set file paths

#####STEP 0-6: Set file paths #####
# Set the processed path based on simulate_data and published_paper_run
processedpath <- if (published_paper_run == 1) {
    paste0("PP_Full_Analysis/Cleaned_input_data/As_published/simulated/")
  } else {
    paste0("PP_Full_Analysis/Cleaned_input_data/Testing/simulated/")
  }

print(paste("Processed data path:", processedpath))

## [1] "Processed data path: PP_Full_Analysis/Cleaned_input_data/Testing/simulated/"

Load empirical data

#####STEP 2b-1: Read source data #####
# Read the imputed dataset to get person_ids
source_data <- read.csv("PP_Full_Analysis/Cleaned_input_data/Testing/empirical/2_Imputed_Wide_Dataset.csv")

print("Information on source data:")

## [1] "Information on source data:"

glimpse(source_data)

## Rows: 12,208
## Columns: 70
## $ numhh_list            <int> 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender_inp            <int> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ age_inp               <int> 60, 41, 39, 52, 51, 32, 34, 23, 43, 46, 38, 25, …
## $ hispanic_inp          <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ race_white_inp        <int> 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ race_black_inp        <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ race_nwother_inp      <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ast_dx_pre_lottery    <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ dia_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hbp_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ chl_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ami_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ chf_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ emp_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ kid_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cancer_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ dep_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ charg_tot_pre_ed      <dbl> 0.0, 0.0, 1888.2, 0.0, 1715.3, 0.0, 0.0, 5743.9,…
## $ ed_charg_tot_pre_ed   <dbl> 0.0, 0.0, 1888.2, 0.0, 1006.3, 0.0, 0.0, 4542.4,…
## $ num_visit_pre_cens_ed <int> 0, 0, 1, 0, 2, 0, 0, 7, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ any_depres_pre_ed     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ lessHS                <int> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ HSorGED               <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ X                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ person_id             <int> 5, 8, 16, 17, 18, 23, 24, 29, 47, 57, 59, 68, 70…
## $ weight_total_inp      <dbl> 1.1504, 0.8975, 1.0000, 1.2126, 1.0000, 1.0033, …
## $ diabetes              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hypertension          <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ highcholesterol       <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression            <int> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ phq                   <int> 1, 9, 2, 13, 2, 3, 2, 14, 11, 8, 7, 3, 2, 0, 2, …
## $ cvd_risk              <dbl> 0.1370, 0.1120, 0.0330, 0.2530, 0.1560, 0.0120, …
## $ doc_any               <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ ed_any                <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ hosp_any              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ oop_spend             <int> 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ catastrophic          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ debt                  <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ borrow                <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, …
## $ a1c                   <dbl> 5.037, 5.201, 5.854, 5.364, 5.527, 5.037, 5.446,…
## $ hdl_level             <dbl> 48.33, 51.33, 38.58, 51.33, 28.08, 31.08, 25.83,…
## $ chl_level             <dbl> 241.0, 229.9, 229.9, 235.4, 177.7, 173.8, 152.7,…
## $ bmi                   <dbl> 26.66, 35.23, 37.12, 24.81, 27.02, 26.26, 27.70,…
## $ sbp                   <int> 144, 134, 126, 168, 119, 98, 108, 125, 100, 104,…
## $ dbp                   <int> 81, 82, 94, 110, 79, 59, 63, 76, 77, 63, 84, 62,…
## $ prescriptions_any     <int> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ prescriptions         <int> 0, 2, 2, NA, 0, 0, 0, 3, 0, 3, 4, 0, 4, 2, 1, 4,…
## $ hypertension_med      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ cholesterol_med       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ diabetes_med          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression_med        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ household_id          <int> 100005, 102094, 140688, 100017, 100018, 115253, …
## $ eligibility           <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ ohp_all_ever_inperson <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ doc_num               <int> 0, 6, 12, 0, 0, 5, 0, 5, 1, 12, 6, 0, 3, 0, 10, …
## $ ed_num                <int> 0, 2, 1, 1, 0, 0, 0, 10, 2, 6, 0, 2, 0, 0, 0, 0,…
## $ hosp_num              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, …
## $ charg_tot_ed          <dbl> 0, 2751, 15233, NA, 0, 0, 0, 8436, 0, NA, 0, 0, …
## $ ed_charg_tot_ed       <dbl> 0.0, 2751.4, 7100.8, NA, 0.0, 0.0, 0.0, 7067.0, …
## $ sbp_neg               <int> -144, -134, -126, -168, -119, -98, -108, -125, -…
## $ dbp_neg               <int> -81, -82, -94, -110, -79, -59, -63, -76, -77, -6…
## $ chl_level_neg         <dbl> -241.0, -229.9, -229.9, -235.4, -177.7, -173.8, …
## $ hdl_level_neg         <dbl> -48.33, -51.33, -38.58, -51.33, -28.08, -31.08, …
## $ a1c_neg               <dbl> -5.037, -5.201, -5.854, -5.364, -5.527, -5.037, …
## $ bmi_neg               <dbl> -26.66, -35.23, -37.12, -24.81, -27.02, -26.26, …
## $ phq_neg               <int> -1, -9, -2, -13, -2, -3, -2, -14, -11, -8, -7, -…
## $ cvd_risk_neg          <dbl> -0.1370, -0.1120, -0.0330, -0.2530, -0.1560, -0.…
## $ debt_neg              <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ borrow_neg            <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, …
## $ catastrophic_neg      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …

Create dx_index:

#####STEP 2b-2: Create diagnosis sum #####
# Create sum of diagnoses
dx_sum <- with(source_data, 
               ast_dx_pre_lottery + 
               dia_dx_pre_lottery +
               hbp_dx_pre_lottery + 
               chl_dx_pre_lottery +
               ami_dx_pre_lottery + 
               chf_dx_pre_lottery +
               emp_dx_pre_lottery + 
               kid_dx_pre_lottery +
               cancer_dx_pre_lottery + 
               dep_dx_pre_lottery)

# Normalize sum between 0 and 1
dx_sum_norm <- (dx_sum - min(dx_sum, na.rm = TRUE)) / 
               (max(dx_sum, na.rm = TRUE) - min(dx_sum, na.rm = TRUE))

print("Summary of dx_sum before normalization:")

## [1] "Summary of dx_sum before normalization:"

print(summary(dx_sum))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    1.00    1.03    2.00    8.00

print("Summary of normalized dx_sum:")

## [1] "Summary of normalized dx_sum:"

print(summary(dx_sum_norm))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.125   0.128   0.250   1.000

# Add to source_data for use in plustauO
source_data$dx_sum_norm <- dx_sum_norm

Helper functions for simulating data

Define plustau(x) to generate a sample-specific perturbation level

#####STEP 2b-3: Define plustau(x) to generate a sample-specific perturbation level #####
plustauO <- function(x, gamma_values) {  
  # Check if the column is entirely NA
  if (all(is.na(x))) {
    stop("Error: charg_tot_pre_ed contains only missing values.")
  }
  
  # Cap the prior health expenditures at the specified percentile
  cap_value <- quantile(x, percentile, na.rm = TRUE)
  capped_values <- pmin(x, cap_value)
  
  # Normalize the values between 0 and 1
  normalized_values <- (capped_values - min(capped_values, na.rm = TRUE)) / 
                      (max(capped_values, na.rm = TRUE) - min(capped_values, na.rm = TRUE))

  print("normalized_values:")
  print(summary(normalized_values))

  # cap gamma_values at the specified percentile
  gamma_values <- pmin(gamma_values, quantile(gamma_values, percentile, na.rm = TRUE))

  # Normalize gamma_values between 0 and 1
  gamma_values <- (gamma_values - min(gamma_values, na.rm = TRUE)) / 
                  (max(gamma_values, na.rm = TRUE) - min(gamma_values, na.rm = TRUE))

  print("gamma_values:")
  print(summary(gamma_values))

  # Use pre-calculated dx_sum_norm
  result <- exp(normalized_values + source_data$dx_sum_norm)
  
  return(result)
}

Define simulate_data to generate simulated data

#####STEP 2b-4: Define simulate_data to generate simulated data #####
simulate_data <- function(alphas, original_data) {
  # Create a copy of the original data to modify
  simulated_data <- data.frame(original_data)
  
  # Initialize an empty list to store the standard deviation scale factors for continuous outcomes
  std_devs <- list()
  
  # Calculate plustau once and store it
  plustau <- plustauO(simulated_data[["charg_tot_pre_ed"]], simulated_data[[gamma_term]])
  simulated_data[["plustau"]] <- plustau  # Save plustau into the dataframe
  
  # from issue: for continuous outcomes, calculate the std deviation of the outcome to scale plustauO(x)
  for (col in continuous_outcomes_to_modify) {
    std_devs[[col]] <- sd(simulated_data[[col]], na.rm = TRUE)
  }
  
  # Step 3: Modify rows where ohp_all_ever_inperson == 1
  for (i in 1:nrow(simulated_data)) {
    
    #if this is a continuous variable
    for (col in continuous_outcomes_to_modify) {
      # status quo is positive treatment effect. for these outcomes a positive treatment effect signifies benefit.
      direction <- 1
    
      if (col %in% negative_treatment_effect) { ## added just for pre "_neg" columns
        direction <- -1
      }
      
      if (simulated_data$ohp_all_ever_inperson[i] == 0) {
        # If ohp_all_ever_inperson (coverage) == 0, perturb in a "negative" direction according to whether direction == 1 or -1. 
          simulated_data[i, col] <- round(simulated_data[i, col] - (alphas * plustau[i] * std_devs[[col]] * 0.5 * direction), 2)
          
      } else if (simulated_data$ohp_all_ever_inperson[i] == 1) {
        # If ohp_all_ever_inperson (coverage) == 0, perturb in a "positive" direction according to whether direction == 1 or -1. 
          simulated_data[i, col] <- round(simulated_data[i, col] + (alphas * plustau[i] * std_devs[[col]] * 0.5 * direction), 2)
      }
    }
    
    #if this is a binary variable
    for (col in binary_outcomes_to_modify) {
      
      # positive or negative treatment effect. 
      positive_treatment_effect <- TRUE
      if (col %in% negative_treatment_effect) { ## added just for pre "_neg" columns
        positive_treatment_effect <- FALSE
      }
      
      # Check to see if original data is missing
      if (!is.na(simulated_data[i, col])) {
          if (simulated_data$ohp_all_ever_inperson[i] == 0 && positive_treatment_effect == FALSE) { ## added just for pre "_neg" columns
            # for 20% of non-compliers, set their outcome to be 1 --> making the outcome result worse for non-compliers
            if (simulated_data[i, col] == 0 && runif(1) < 0.2) {
              simulated_data[i, col] = 1;
            }
          } else if (simulated_data$ohp_all_ever_inperson[i] == 1 && positive_treatment_effect == FALSE) {  ## added just for pre "_neg" columns
            # for 20% of compliers, set their outcome to be 1 --> making the outcome result better for compliers
             if (simulated_data[i, col] == 1 && runif(1) < 0.2) {
              simulated_data[i, col] = 0;
            } 
          }
          
          else if (simulated_data$ohp_all_ever_inperson[i] == 0 && positive_treatment_effect == TRUE) {
            # for 20% of non-compliers, set their outcome to be 0 --> making the outcome result worse for non-compliers.
            if (simulated_data[i, col] == 1 && runif(1) < 0.2) {
              simulated_data[i, col] = 0;
            } 
          }
          else if (simulated_data$ohp_all_ever_inperson[i] == 1 && positive_treatment_effect == TRUE) {
            # for 20% of compliers, set their outcome to be 1 --> making the outcome result better for compliers.
            if (simulated_data[i, col] == 0 && runif(1) < 0.2) {
              simulated_data[i, col] = 1;
            }
          }
      }      
    }
  }
  
  # At the end, return person_id, simulated columns, and plustau
  simulated_cols <- c()
  for (col_name in c(binary_outcomes_to_modify, continuous_outcomes_to_modify)) {
    new_col_name <- paste0("sim_", col_name, "_alpha_", alphas, "_", simulated_run_name)
    simulated_cols <- c(simulated_cols, new_col_name)
    simulated_data[[new_col_name]] <- simulated_data[[col_name]]
  }
  
  return(simulated_data[, c("person_id", "plustau", simulated_cols)])
}

Generate simulated data

#####STEP 2b-5: Generate simulated data #####
# Create simulated dataset with new columns
new_simulated_cols = simulate_data(alpha_term, source_data)

## [1] "normalized_values:"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.262   0.562   1.000 
## [1] "gamma_values:"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   0.000   0.262   0.565   1.000

# Define the output file path
output_file <- paste0(processedpath, "2b_Simulated_Outcomes.csv")

# Merge if the file exists and no duplicate columns are found
if (file.exists(output_file)) {
  # Read the existing simulated outcomes file
  clean_simulated_data <- read.csv(output_file)
  
  # Check if any new columns already exist in clean_simulated_data
  new_col_names <- setdiff(names(new_simulated_cols), "person_id")
  existing_cols <- intersect(new_col_names, names(clean_simulated_data))

  print("new_col_names:")
  print(new_col_names)
  print("existing_cols:")
  print(existing_cols)
  
  print("Checking for duplicate column names in 2_Simulated_Outcomes.csv...")
  if (length(existing_cols) > 0) {
    stop(paste("Error: The following columns already exist in the simulated data:",
               paste(existing_cols, collapse=", ")))
  }  
  print("No duplicates found.")
  # If no duplicates found, merge new columns with existing data
  clean_simulated_data <- merge(clean_simulated_data, new_simulated_cols, by="person_id", all=TRUE)
} else {
  # Print error message if file doesn't exist
  print(paste("Error: File", output_file, "does not exist."))
}

## [1] "new_col_names:"
## [1] "plustau"                             
## [2] "sim_sbp_neg_alpha_5_exp_tot_dx_index"
## [1] "existing_cols:"
## character(0)
## [1] "Checking for duplicate column names in 2_Simulated_Outcomes.csv..."
## [1] "No duplicates found."

print("Information on original data: ")

## [1] "Information on original data: "

glimpse(source_data)

## Rows: 12,208
## Columns: 71
## $ numhh_list            <int> 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender_inp            <int> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ age_inp               <int> 60, 41, 39, 52, 51, 32, 34, 23, 43, 46, 38, 25, …
## $ hispanic_inp          <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ race_white_inp        <int> 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ race_black_inp        <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ race_nwother_inp      <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ast_dx_pre_lottery    <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ dia_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hbp_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ chl_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ami_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ chf_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ emp_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ kid_dx_pre_lottery    <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cancer_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ dep_dx_pre_lottery    <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ charg_tot_pre_ed      <dbl> 0.0, 0.0, 1888.2, 0.0, 1715.3, 0.0, 0.0, 5743.9,…
## $ ed_charg_tot_pre_ed   <dbl> 0.0, 0.0, 1888.2, 0.0, 1006.3, 0.0, 0.0, 4542.4,…
## $ num_visit_pre_cens_ed <int> 0, 0, 1, 0, 2, 0, 0, 7, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ any_depres_pre_ed     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ lessHS                <int> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ HSorGED               <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ X                     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ person_id             <int> 5, 8, 16, 17, 18, 23, 24, 29, 47, 57, 59, 68, 70…
## $ weight_total_inp      <dbl> 1.1504, 0.8975, 1.0000, 1.2126, 1.0000, 1.0033, …
## $ diabetes              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hypertension          <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ highcholesterol       <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression            <int> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ phq                   <int> 1, 9, 2, 13, 2, 3, 2, 14, 11, 8, 7, 3, 2, 0, 2, …
## $ cvd_risk              <dbl> 0.1370, 0.1120, 0.0330, 0.2530, 0.1560, 0.0120, …
## $ doc_any               <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ ed_any                <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ hosp_any              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ oop_spend             <int> 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ catastrophic          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ debt                  <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ borrow                <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, …
## $ a1c                   <dbl> 5.037, 5.201, 5.854, 5.364, 5.527, 5.037, 5.446,…
## $ hdl_level             <dbl> 48.33, 51.33, 38.58, 51.33, 28.08, 31.08, 25.83,…
## $ chl_level             <dbl> 241.0, 229.9, 229.9, 235.4, 177.7, 173.8, 152.7,…
## $ bmi                   <dbl> 26.66, 35.23, 37.12, 24.81, 27.02, 26.26, 27.70,…
## $ sbp                   <int> 144, 134, 126, 168, 119, 98, 108, 125, 100, 104,…
## $ dbp                   <int> 81, 82, 94, 110, 79, 59, 63, 76, 77, 63, 84, 62,…
## $ prescriptions_any     <int> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ prescriptions         <int> 0, 2, 2, NA, 0, 0, 0, 3, 0, 3, 4, 0, 4, 2, 1, 4,…
## $ hypertension_med      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ cholesterol_med       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ diabetes_med          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression_med        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ household_id          <int> 100005, 102094, 140688, 100017, 100018, 115253, …
## $ eligibility           <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ ohp_all_ever_inperson <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ doc_num               <int> 0, 6, 12, 0, 0, 5, 0, 5, 1, 12, 6, 0, 3, 0, 10, …
## $ ed_num                <int> 0, 2, 1, 1, 0, 0, 0, 10, 2, 6, 0, 2, 0, 0, 0, 0,…
## $ hosp_num              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, …
## $ charg_tot_ed          <dbl> 0, 2751, 15233, NA, 0, 0, 0, 8436, 0, NA, 0, 0, …
## $ ed_charg_tot_ed       <dbl> 0.0, 2751.4, 7100.8, NA, 0.0, 0.0, 0.0, 7067.0, …
## $ sbp_neg               <int> -144, -134, -126, -168, -119, -98, -108, -125, -…
## $ dbp_neg               <int> -81, -82, -94, -110, -79, -59, -63, -76, -77, -6…
## $ chl_level_neg         <dbl> -241.0, -229.9, -229.9, -235.4, -177.7, -173.8, …
## $ hdl_level_neg         <dbl> -48.33, -51.33, -38.58, -51.33, -28.08, -31.08, …
## $ a1c_neg               <dbl> -5.037, -5.201, -5.854, -5.364, -5.527, -5.037, …
## $ bmi_neg               <dbl> -26.66, -35.23, -37.12, -24.81, -27.02, -26.26, …
## $ phq_neg               <int> -1, -9, -2, -13, -2, -3, -2, -14, -11, -8, -7, -…
## $ cvd_risk_neg          <dbl> -0.1370, -0.1120, -0.0330, -0.2530, -0.1560, -0.…
## $ debt_neg              <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ borrow_neg            <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, …
## $ catastrophic_neg      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ dx_sum_norm           <dbl> 0.000, 0.125, 0.000, 0.250, 0.000, 0.000, 0.000,…

print("Information on simulated data:")

## [1] "Information on simulated data:"

print("***NOTE: this data contains all simulated outcomes that have been created - new ones will be at the end of the dataframe.***")

## [1] "***NOTE: this data contains all simulated outcomes that have been created - new ones will be at the end of the dataframe.***"

glimpse(clean_simulated_data)

## Rows: 12,208
## Columns: 12
## $ person_id                              <int> 5, 8, 16, 17, 18, 23, 24, 29, 4…
## $ sim_sbp                                <dbl> 248.979, 160.146, 251.084, 160.…
## $ sim_debt_neg_alpha_5                   <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0…
## $ sim_sbp_neg_alpha_5                    <dbl> -144.00, -134.00, -84.61, -168.…
## $ sim_hdl_level_neg_alpha_5              <dbl> -48.33, -51.33, -5.64, -51.33, …
## $ sim_debt_neg_alpha_5_presentation      <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0…
## $ sim_sbp_neg_alpha_5_presentation       <dbl> -144.00, -134.00, -84.61, -168.…
## $ sim_hdl_level_neg_alpha_5_presentation <dbl> -48.33, -51.33, -5.64, -51.33, …
## $ sim_sbp_neg_alpha_5_testing            <dbl> -185.39, -175.39, -43.21, -209.…
## $ sim_sbp_neg_alpha_5_testing_ed         <dbl> -144.00, -134.00, -43.21, -168.…
## $ plustau                                <dbl> 1.000, 1.133, 2.718, 1.284, 2.7…
## $ sim_sbp_neg_alpha_5_exp_tot_dx_index   <dbl> -185.39, -180.90, -13.48, -221.…

print("Summary of simulated data:")

## [1] "Summary of simulated data:"

summary(clean_simulated_data)

##    person_id        sim_sbp     sim_debt_neg_alpha_5 sim_sbp_neg_alpha_5
##  Min.   :    5   Min.   :-172   Min.   :0.00         Min.   :-240.4     
##  1st Qu.:18944   1st Qu.: 132   1st Qu.:0.00         1st Qu.:-137.0     
##  Median :37494   Median : 198   Median :0.00         Median :-119.0     
##  Mean   :37500   Mean   : 199   Mean   :0.43         Mean   :-122.0     
##  3rd Qu.:55947   3rd Qu.: 267   3rd Qu.:1.00         3rd Qu.:-106.0     
##  Max.   :74911   Max.   : 556   Max.   :1.00         Max.   : -30.6     
##                                 NA's   :114          NA's   :41         
##  sim_hdl_level_neg_alpha_5 sim_debt_neg_alpha_5_presentation
##  Min.   :-139.1            Min.   :0.00                     
##  1st Qu.: -62.5            1st Qu.:0.00                     
##  Median : -48.3            Median :0.00                     
##  Mean   : -50.0            Mean   :0.43                     
##  3rd Qu.: -37.8            3rd Qu.:1.00                     
##  Max.   :  13.9            Max.   :1.00                     
##  NA's   :57                NA's   :114                      
##  sim_sbp_neg_alpha_5_presentation sim_hdl_level_neg_alpha_5_presentation
##  Min.   :-240.4                   Min.   :-139.1                        
##  1st Qu.:-137.0                   1st Qu.: -62.5                        
##  Median :-119.0                   Median : -48.3                        
##  Mean   :-122.0                   Mean   : -50.0                        
##  3rd Qu.:-106.0                   3rd Qu.: -37.8                        
##  Max.   : -30.6                   Max.   :  13.9                        
##  NA's   :41                       NA's   :57                            
##  sim_sbp_neg_alpha_5_testing sim_sbp_neg_alpha_5_testing_ed    plustau    
##  Min.   :-281.8              Min.   :-281.8                 Min.   :1.00  
##  1st Qu.:-175.4              1st Qu.:-139.0                 1st Qu.:1.00  
##  Median :-154.4              Median :-119.0                 Median :1.13  
##  Mean   :-137.7              Mean   :-124.6                 Mean   :1.67  
##  3rd Qu.: -82.6              3rd Qu.:-106.0                 3rd Qu.:2.03  
##  Max.   :  10.8              Max.   :  10.8                 Max.   :7.39  
##  NA's   :41                  NA's   :41                                   
##  sim_sbp_neg_alpha_5_exp_tot_dx_index
##  Min.   :-390.9                      
##  1st Qu.:-184.9                      
##  Median :-158.9                      
##  Mean   :-142.2                      
##  3rd Qu.: -76.6                      
##  Max.   : 184.8                      
##  NA's   :41

# Define perturbation as the difference between the original value and the simulated value for each outcome in list all_outcomes
for (outcome in all_outcomes) {
  clean_simulated_data[[paste0("perturbation_", outcome)]] <- source_data[[outcome]] - clean_simulated_data[[paste0("sim_", outcome, "_alpha_", alpha_term, "_", simulated_run_name)]]

  # Print summary of perturbation statistics
  print(paste("Summary of perturbation statistics for", outcome, ":"))
  summary(clean_simulated_data[[paste0("perturbation_", outcome)]])

  # Print distribution of perturbations
  print(paste("Distribution of perturbations for", outcome, ":"))
  hist(clean_simulated_data[[paste0("perturbation_", outcome)]], main = paste("Perturbation Distribution for", outcome), xlab = "Perturbation Value")

}

## [1] "Summary of perturbation statistics for sbp_neg :"
## [1] "Distribution of perturbations for sbp_neg :"

Check results

#####STEP 2b-6: Check results #####
# Sample 20 random person IDs
set.seed(seedset) # Use same seed for reproducibility
sampled_ids <- sample(source_data$person_id, 20)

# For each outcome, create detailed breakdown
for (outcome in unlist(all_outcomes)) {
  cat("\nChecking calculations for outcome:", outcome, "\n")
  cat("----------------------------------------\n")

  # Filter source_data first to get consistent subsetting
  sampled_data <- source_data[source_data$person_id %in% sampled_ids, ]
  sampled_sim_data <- clean_simulated_data[clean_simulated_data$person_id %in% sampled_ids, ]
  
  # Create simulated column name based on our naming convention
  sim_col_name <- paste0("sim_", outcome, "_alpha_", alpha_term, "_", simulated_run_name)
  
  # Make alpha_term a vector of the same length
  alpha_vec <- rep(alpha_term, length(sampled_data$person_id))
  
  check_df <- data.frame(
    person_id = sampled_data$person_id,
    original_value = sampled_data[[outcome]],
    alpha = alpha_term,
    ohp = sampled_data$ohp_all_ever_inperson,
    original_tot_charg = sampled_data$charg_tot_pre_ed,
    gamma_value = sampled_data[[gamma_term]],
    dx_index = sampled_data$dx_sum_norm,
    plustau = sampled_sim_data$plustau,
    perturbation = sampled_sim_data[[paste0("perturbation_", outcome)]],
    simulated_value = sampled_sim_data[[sim_col_name]]
  )
  
  # Add standard deviation if continuous outcome
  if (outcome %in% continuous_outcomes_to_modify) {
    check_df$sd <- sd(source_data[[outcome]], na.rm = TRUE)
    check_df$perturbation <- with(check_df, 
                                 ifelse(ohp == 1, 
                                        alpha * plustau * sd * 0.5,
                                        -alpha * plustau * sd * 0.5))
  } else {
    check_df$sd <- NA
    check_df$perturbation <- NA
  }
  
  # Print summary
  print(check_df)
}

## 
## Checking calculations for outcome: sbp_neg 
## ----------------------------------------
##    person_id original_value alpha ohp original_tot_charg gamma_value dx_index
## 1       4302           -121     5   0                0.0         0.0    0.125
## 2       8859           -107     5   0              981.9       632.9    0.000
## 3       9677           -121     5   0                0.0         0.0    0.125
## 4      12126           -100     5   0                0.0         0.0    0.125
## 5      23911            -95     5   1                0.0         0.0    0.125
## 6      32548           -129     5   0                0.0         0.0    0.125
## 7      38146           -101     5   0                0.0         0.0    0.125
## 8      38237            -95     5   1                0.0         0.0    0.000
## 9      38272           -189     5   0             7440.5      1528.0    0.250
## 10     41167           -117     5   0             1525.8      1114.0    0.000
## 11     41754           -108     5   1              446.3       254.3    0.000
## 12     46648           -101     5   1                0.0         0.0    0.000
## 13     47189           -119     5   1                0.0         0.0    0.000
## 14     52317           -133     5   1                0.0         0.0    0.000
## 15     55990           -113     5   1              395.0       395.0    0.125
## 16     57064           -131     5   0                0.0         0.0    0.250
## 17     64458           -112     5   0              674.7       674.7    0.000
## 18     67645           -121     5   1                0.0         0.0    0.000
## 19     69119            -96     5   0            11472.2      2805.9    0.125
## 20     71464           -100     5   0                0.0         0.0    0.000
##    plustau perturbation simulated_value    sd
## 1    1.133       -46.90         -167.90 16.56
## 2    2.099       -86.90         -193.90 16.56
## 3    1.133       -46.90         -167.90 16.56
## 4    1.133       -46.90         -146.90 16.56
## 5    1.133        46.90          -48.10 16.56
## 6    1.133       -46.90         -175.90 16.56
## 7    1.133       -46.90         -147.90 16.56
## 8    1.000        41.39          -53.61 16.56
## 9    3.490      -144.47         -333.47 16.56
## 10   2.718      -112.52         -229.52 16.56
## 11   1.401        57.99          -50.01 16.56
## 12   1.000        41.39          -59.61 16.56
## 13   1.000        41.39          -77.61 16.56
## 14   1.000        41.39          -91.61 16.56
## 15   1.527        63.21          -49.79 16.56
## 16   1.284       -53.15         -184.15 16.56
## 17   1.665       -68.91         -180.91 16.56
## 18   1.000        41.39          -79.61 16.56
## 19   3.080      -127.50         -223.50 16.56
## 20   1.000       -41.39         -141.39 16.56

Write output file

#####STEP 2b-7: Write output file #####
write.csv(clean_simulated_data, output_file, row.names = FALSE)
print(paste("Output saved to:", output_file))

## [1] "Output saved to: PP_Full_Analysis/Cleaned_input_data/Testing/simulated/2b_Simulated_Outcomes.csv"

Including code to evaluate different values of alpha

alphas_to_test <- c(0, 1, 5, 10)
outcomes_to_test <- c("sbp", "a1c")
                  
iterate_through_alphas <- function(alphas) {
  for (alpha_candidate in alphas_to_test) {
    simulated_data_alpha_test = simulate_data(alpha_candidate, source_data)
  
    #Dataframe to hold forest results
    single_causal <- data.frame(
      Outcome = character(0),
      MSE = numeric(0),
      MSE_se = numeric(0),
      # RMSE = numeric(0),
      AUTOC = numeric(0),
      AUTOC_se = numeric(0),
      mean_forest_est = numeric(0),
      mean_forest_stderr = numeric(0),
      diff_forest_est = numeric(0),
      diff_forest_stderr = numeric(0)
    )


  for (outcome in outcomes_to_test) {
    tryCatch({
      cat("Processing outcome:", outcome, "\n")
      # Set outcome

      # Drop rows where the current outcome column has missing values (NA)
      simulated_data <- simulated_data_alpha_test %>%
      filter(!is.na(.data[[outcome]]))  # Use .data[[outcome]] for dynamic column referencing
      
      Y <- as.numeric(simulated_data[[outcome]])  # Use [[ ]] to dynamically reference columns
      X <- simulated_data %>%
        select(-c("person_id", "household_id", "weight_total_inp", "eligibility", "ohp_all_ever_inperson", unlist(all_outcomes)))
      

      W <- as.numeric(simulated_data$eligibility)
      cf <- causal_forest(X=X, Y=Y, W=W)

      # Get point predictions of CATE. (RK - some predictions <10 are missing values. is this expected?)
      tau_hat <- predict(cf)$predictions

      m.hat <- cf$Y.hat # E[Y|X] estimates
      e.hat <- cf$W.hat # e(X) := E[W|X] estimates (or known quantity)

      # Calculate the MSE of CATE
      mse_cate <- mean(((Y - m.hat) - ((W - e.hat) * tau_hat))^2, na.rm = TRUE)

      # Calculate the standard error of the MSE of CATE -->
      mse_cate_se <- sd(((Y - m.hat) - ((W - e.hat) * tau_hat))^2, na.rm = TRUE) / sqrt(length(Y))
      

      if (outcome %in% negative_treatment_effect) {
         # If outcome is in the list, adjust tau_hat by flipping the sign
          rate.oob <- rank_average_treatment_effect(cf, -tau_hat)
      } else {
          # Otherwise, use tau_hat as it is
        rate.oob <- rank_average_treatment_effect(cf, tau_hat)
      }

      
      rate.oob = rank_average_treatment_effect(cf, tau_hat)
      # par(mfrow = c(1, 2))
      # plot_title <- paste("TOC evaluation for", outcome, "alpha =", alpha)
      # 
      # # Plot with dynamic title
      # plot(rate.oob, main = plot_title)
      
      t.stat.oob = rate.oob$estimate / rate.oob$std.err

      # Calculate the calibration regression of CATE
      calibration_regression <- test_calibration(cf)
      mean.forest.estimate <- calibration_regression[1,1]
      differential.forest.estimate <-calibration_regression[2,1]
      mean.forest.stderr <- calibration_regression[1, 2]
      differential.forest.stderr <- calibration_regression[2, 2]

      single_causal <- rbind(single_causal, data.frame(Outcome = outcome,
                                                              MSE = round(mse_cate, 4),
                                                              MSE_se = round(mse_cate_se, 4),
                                                              AUTOC = round(rate.oob$estimate, 4),
                                                              AUTOC_se = round(rate.oob$std.err, 4),
                                                              mean_forest_est = round(mean.forest.estimate, 4),
                                                              mean_forest_stderr = round(mean.forest.stderr, 4),
                                                              diff_forest_est = round(differential.forest.estimate, 4),
                                                              diff_forest_stderr =round(differential.forest.stderr, 4)))

    }, error = function(e) {
         cat("Error in outcome:", outcome, "\n")
         cat("Error message:", e$message, "\n")
     })
  }
  print(single_causal)

}
  
}

iterate_through_alphas(alphas_to_test)

2b_Generate_Simulated_Data

Revati Kapshikar, Kristine Koutout

2025-01-21

Set up environment

Set non-run-specific parameters

Set run-specific parameters

Set file paths

Load empirical data

Create dx_index:

Helper functions for simulating data

Define plustau(x) to generate a sample-specific perturbation level

Define simulate_data to generate simulated data

Generate simulated data

Check results

Write output file

Including code to evaluate different values of alpha