Load empirical data
#####STEP 2b-1: Read source data #####
# Read the imputed dataset to get person_ids
source_data <- read.csv("PP_Full_Analysis/Cleaned_input_data/Testing/empirical/2_Imputed_Wide_Dataset.csv")
print("Information on source data:")
## [1] "Information on source data:"
## Rows: 12,208
## Columns: 70
## $ numhh_list <int> 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender_inp <int> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ age_inp <int> 60, 41, 39, 52, 51, 32, 34, 23, 43, 46, 38, 25, …
## $ hispanic_inp <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ race_white_inp <int> 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ race_black_inp <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ race_nwother_inp <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ast_dx_pre_lottery <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ dia_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hbp_dx_pre_lottery <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ chl_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ami_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ chf_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ emp_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ kid_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cancer_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ dep_dx_pre_lottery <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ charg_tot_pre_ed <dbl> 0.0, 0.0, 1888.2, 0.0, 1715.3, 0.0, 0.0, 5743.9,…
## $ ed_charg_tot_pre_ed <dbl> 0.0, 0.0, 1888.2, 0.0, 1006.3, 0.0, 0.0, 4542.4,…
## $ num_visit_pre_cens_ed <int> 0, 0, 1, 0, 2, 0, 0, 7, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ any_depres_pre_ed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ lessHS <int> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ HSorGED <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ person_id <int> 5, 8, 16, 17, 18, 23, 24, 29, 47, 57, 59, 68, 70…
## $ weight_total_inp <dbl> 1.1504, 0.8975, 1.0000, 1.2126, 1.0000, 1.0033, …
## $ diabetes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hypertension <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ highcholesterol <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression <int> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ phq <int> 1, 9, 2, 13, 2, 3, 2, 14, 11, 8, 7, 3, 2, 0, 2, …
## $ cvd_risk <dbl> 0.1370, 0.1120, 0.0330, 0.2530, 0.1560, 0.0120, …
## $ doc_any <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ ed_any <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ hosp_any <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ oop_spend <int> 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ catastrophic <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ debt <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ borrow <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, …
## $ a1c <dbl> 5.037, 5.201, 5.854, 5.364, 5.527, 5.037, 5.446,…
## $ hdl_level <dbl> 48.33, 51.33, 38.58, 51.33, 28.08, 31.08, 25.83,…
## $ chl_level <dbl> 241.0, 229.9, 229.9, 235.4, 177.7, 173.8, 152.7,…
## $ bmi <dbl> 26.66, 35.23, 37.12, 24.81, 27.02, 26.26, 27.70,…
## $ sbp <int> 144, 134, 126, 168, 119, 98, 108, 125, 100, 104,…
## $ dbp <int> 81, 82, 94, 110, 79, 59, 63, 76, 77, 63, 84, 62,…
## $ prescriptions_any <int> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ prescriptions <int> 0, 2, 2, NA, 0, 0, 0, 3, 0, 3, 4, 0, 4, 2, 1, 4,…
## $ hypertension_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ cholesterol_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ diabetes_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ household_id <int> 100005, 102094, 140688, 100017, 100018, 115253, …
## $ eligibility <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ ohp_all_ever_inperson <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ doc_num <int> 0, 6, 12, 0, 0, 5, 0, 5, 1, 12, 6, 0, 3, 0, 10, …
## $ ed_num <int> 0, 2, 1, 1, 0, 0, 0, 10, 2, 6, 0, 2, 0, 0, 0, 0,…
## $ hosp_num <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, …
## $ charg_tot_ed <dbl> 0, 2751, 15233, NA, 0, 0, 0, 8436, 0, NA, 0, 0, …
## $ ed_charg_tot_ed <dbl> 0.0, 2751.4, 7100.8, NA, 0.0, 0.0, 0.0, 7067.0, …
## $ sbp_neg <int> -144, -134, -126, -168, -119, -98, -108, -125, -…
## $ dbp_neg <int> -81, -82, -94, -110, -79, -59, -63, -76, -77, -6…
## $ chl_level_neg <dbl> -241.0, -229.9, -229.9, -235.4, -177.7, -173.8, …
## $ hdl_level_neg <dbl> -48.33, -51.33, -38.58, -51.33, -28.08, -31.08, …
## $ a1c_neg <dbl> -5.037, -5.201, -5.854, -5.364, -5.527, -5.037, …
## $ bmi_neg <dbl> -26.66, -35.23, -37.12, -24.81, -27.02, -26.26, …
## $ phq_neg <int> -1, -9, -2, -13, -2, -3, -2, -14, -11, -8, -7, -…
## $ cvd_risk_neg <dbl> -0.1370, -0.1120, -0.0330, -0.2530, -0.1560, -0.…
## $ debt_neg <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ borrow_neg <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, …
## $ catastrophic_neg <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
Generate simulated data
#####STEP 2b-5: Generate simulated data #####
# Create simulated dataset with new columns
new_simulated_cols = simulate_data(alpha_term, source_data)
## [1] "normalized_values:"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.262 0.562 1.000
## [1] "gamma_values:"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.262 0.565 1.000
# Define the output file path
output_file <- paste0(processedpath, "2b_Simulated_Outcomes.csv")
# Merge if the file exists and no duplicate columns are found
if (file.exists(output_file)) {
# Read the existing simulated outcomes file
clean_simulated_data <- read.csv(output_file)
# Check if any new columns already exist in clean_simulated_data
new_col_names <- setdiff(names(new_simulated_cols), "person_id")
existing_cols <- intersect(new_col_names, names(clean_simulated_data))
print("new_col_names:")
print(new_col_names)
print("existing_cols:")
print(existing_cols)
print("Checking for duplicate column names in 2_Simulated_Outcomes.csv...")
if (length(existing_cols) > 0) {
stop(paste("Error: The following columns already exist in the simulated data:",
paste(existing_cols, collapse=", ")))
}
print("No duplicates found.")
# If no duplicates found, merge new columns with existing data
clean_simulated_data <- merge(clean_simulated_data, new_simulated_cols, by="person_id", all=TRUE)
} else {
# Print error message if file doesn't exist
print(paste("Error: File", output_file, "does not exist."))
}
## [1] "new_col_names:"
## [1] "plustau"
## [2] "sim_sbp_neg_alpha_5_exp_tot_dx_index"
## [1] "existing_cols:"
## character(0)
## [1] "Checking for duplicate column names in 2_Simulated_Outcomes.csv..."
## [1] "No duplicates found."
print("Information on original data: ")
## [1] "Information on original data: "
## Rows: 12,208
## Columns: 71
## $ numhh_list <int> 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ gender_inp <int> 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, …
## $ age_inp <int> 60, 41, 39, 52, 51, 32, 34, 23, 43, 46, 38, 25, …
## $ hispanic_inp <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ race_white_inp <int> 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ race_black_inp <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ race_nwother_inp <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ast_dx_pre_lottery <int> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ dia_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hbp_dx_pre_lottery <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ chl_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ ami_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ chf_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ emp_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ kid_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ cancer_dx_pre_lottery <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ dep_dx_pre_lottery <int> 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, …
## $ charg_tot_pre_ed <dbl> 0.0, 0.0, 1888.2, 0.0, 1715.3, 0.0, 0.0, 5743.9,…
## $ ed_charg_tot_pre_ed <dbl> 0.0, 0.0, 1888.2, 0.0, 1006.3, 0.0, 0.0, 4542.4,…
## $ num_visit_pre_cens_ed <int> 0, 0, 1, 0, 2, 0, 0, 7, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ any_depres_pre_ed <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ lessHS <int> 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ HSorGED <int> 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ person_id <int> 5, 8, 16, 17, 18, 23, 24, 29, 47, 57, 59, 68, 70…
## $ weight_total_inp <dbl> 1.1504, 0.8975, 1.0000, 1.2126, 1.0000, 1.0033, …
## $ diabetes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ hypertension <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ highcholesterol <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression <int> 0, 0, NA, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ phq <int> 1, 9, 2, 13, 2, 3, 2, 14, 11, 8, 7, 3, 2, 0, 2, …
## $ cvd_risk <dbl> 0.1370, 0.1120, 0.0330, 0.2530, 0.1560, 0.0120, …
## $ doc_any <int> 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, …
## $ ed_any <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, …
## $ hosp_any <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ oop_spend <int> 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, …
## $ catastrophic <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, …
## $ debt <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, …
## $ borrow <int> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, …
## $ a1c <dbl> 5.037, 5.201, 5.854, 5.364, 5.527, 5.037, 5.446,…
## $ hdl_level <dbl> 48.33, 51.33, 38.58, 51.33, 28.08, 31.08, 25.83,…
## $ chl_level <dbl> 241.0, 229.9, 229.9, 235.4, 177.7, 173.8, 152.7,…
## $ bmi <dbl> 26.66, 35.23, 37.12, 24.81, 27.02, 26.26, 27.70,…
## $ sbp <int> 144, 134, 126, 168, 119, 98, 108, 125, 100, 104,…
## $ dbp <int> 81, 82, 94, 110, 79, 59, 63, 76, 77, 63, 84, 62,…
## $ prescriptions_any <int> 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, …
## $ prescriptions <int> 0, 2, 2, NA, 0, 0, 0, 3, 0, 3, 4, 0, 4, 2, 1, 4,…
## $ hypertension_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, …
## $ cholesterol_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ diabetes_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ depression_med <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, …
## $ household_id <int> 100005, 102094, 140688, 100017, 100018, 115253, …
## $ eligibility <int> 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ ohp_all_ever_inperson <int> 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, …
## $ doc_num <int> 0, 6, 12, 0, 0, 5, 0, 5, 1, 12, 6, 0, 3, 0, 10, …
## $ ed_num <int> 0, 2, 1, 1, 0, 0, 0, 10, 2, 6, 0, 2, 0, 0, 0, 0,…
## $ hosp_num <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, …
## $ charg_tot_ed <dbl> 0, 2751, 15233, NA, 0, 0, 0, 8436, 0, NA, 0, 0, …
## $ ed_charg_tot_ed <dbl> 0.0, 2751.4, 7100.8, NA, 0.0, 0.0, 0.0, 7067.0, …
## $ sbp_neg <int> -144, -134, -126, -168, -119, -98, -108, -125, -…
## $ dbp_neg <int> -81, -82, -94, -110, -79, -59, -63, -76, -77, -6…
## $ chl_level_neg <dbl> -241.0, -229.9, -229.9, -235.4, -177.7, -173.8, …
## $ hdl_level_neg <dbl> -48.33, -51.33, -38.58, -51.33, -28.08, -31.08, …
## $ a1c_neg <dbl> -5.037, -5.201, -5.854, -5.364, -5.527, -5.037, …
## $ bmi_neg <dbl> -26.66, -35.23, -37.12, -24.81, -27.02, -26.26, …
## $ phq_neg <int> -1, -9, -2, -13, -2, -3, -2, -14, -11, -8, -7, -…
## $ cvd_risk_neg <dbl> -0.1370, -0.1120, -0.0330, -0.2530, -0.1560, -0.…
## $ debt_neg <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ borrow_neg <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, …
## $ catastrophic_neg <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, …
## $ dx_sum_norm <dbl> 0.000, 0.125, 0.000, 0.250, 0.000, 0.000, 0.000,…
print("Information on simulated data:")
## [1] "Information on simulated data:"
print("***NOTE: this data contains all simulated outcomes that have been created - new ones will be at the end of the dataframe.***")
## [1] "***NOTE: this data contains all simulated outcomes that have been created - new ones will be at the end of the dataframe.***"
glimpse(clean_simulated_data)
## Rows: 12,208
## Columns: 12
## $ person_id <int> 5, 8, 16, 17, 18, 23, 24, 29, 4…
## $ sim_sbp <dbl> 248.979, 160.146, 251.084, 160.…
## $ sim_debt_neg_alpha_5 <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0…
## $ sim_sbp_neg_alpha_5 <dbl> -144.00, -134.00, -84.61, -168.…
## $ sim_hdl_level_neg_alpha_5 <dbl> -48.33, -51.33, -5.64, -51.33, …
## $ sim_debt_neg_alpha_5_presentation <int> 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0…
## $ sim_sbp_neg_alpha_5_presentation <dbl> -144.00, -134.00, -84.61, -168.…
## $ sim_hdl_level_neg_alpha_5_presentation <dbl> -48.33, -51.33, -5.64, -51.33, …
## $ sim_sbp_neg_alpha_5_testing <dbl> -185.39, -175.39, -43.21, -209.…
## $ sim_sbp_neg_alpha_5_testing_ed <dbl> -144.00, -134.00, -43.21, -168.…
## $ plustau <dbl> 1.000, 1.133, 2.718, 1.284, 2.7…
## $ sim_sbp_neg_alpha_5_exp_tot_dx_index <dbl> -185.39, -180.90, -13.48, -221.…
print("Summary of simulated data:")
## [1] "Summary of simulated data:"
summary(clean_simulated_data)
## person_id sim_sbp sim_debt_neg_alpha_5 sim_sbp_neg_alpha_5
## Min. : 5 Min. :-172 Min. :0.00 Min. :-240.4
## 1st Qu.:18944 1st Qu.: 132 1st Qu.:0.00 1st Qu.:-137.0
## Median :37494 Median : 198 Median :0.00 Median :-119.0
## Mean :37500 Mean : 199 Mean :0.43 Mean :-122.0
## 3rd Qu.:55947 3rd Qu.: 267 3rd Qu.:1.00 3rd Qu.:-106.0
## Max. :74911 Max. : 556 Max. :1.00 Max. : -30.6
## NA's :114 NA's :41
## sim_hdl_level_neg_alpha_5 sim_debt_neg_alpha_5_presentation
## Min. :-139.1 Min. :0.00
## 1st Qu.: -62.5 1st Qu.:0.00
## Median : -48.3 Median :0.00
## Mean : -50.0 Mean :0.43
## 3rd Qu.: -37.8 3rd Qu.:1.00
## Max. : 13.9 Max. :1.00
## NA's :57 NA's :114
## sim_sbp_neg_alpha_5_presentation sim_hdl_level_neg_alpha_5_presentation
## Min. :-240.4 Min. :-139.1
## 1st Qu.:-137.0 1st Qu.: -62.5
## Median :-119.0 Median : -48.3
## Mean :-122.0 Mean : -50.0
## 3rd Qu.:-106.0 3rd Qu.: -37.8
## Max. : -30.6 Max. : 13.9
## NA's :41 NA's :57
## sim_sbp_neg_alpha_5_testing sim_sbp_neg_alpha_5_testing_ed plustau
## Min. :-281.8 Min. :-281.8 Min. :1.00
## 1st Qu.:-175.4 1st Qu.:-139.0 1st Qu.:1.00
## Median :-154.4 Median :-119.0 Median :1.13
## Mean :-137.7 Mean :-124.6 Mean :1.67
## 3rd Qu.: -82.6 3rd Qu.:-106.0 3rd Qu.:2.03
## Max. : 10.8 Max. : 10.8 Max. :7.39
## NA's :41 NA's :41
## sim_sbp_neg_alpha_5_exp_tot_dx_index
## Min. :-390.9
## 1st Qu.:-184.9
## Median :-158.9
## Mean :-142.2
## 3rd Qu.: -76.6
## Max. : 184.8
## NA's :41
# Define perturbation as the difference between the original value and the simulated value for each outcome in list all_outcomes
for (outcome in all_outcomes) {
clean_simulated_data[[paste0("perturbation_", outcome)]] <- source_data[[outcome]] - clean_simulated_data[[paste0("sim_", outcome, "_alpha_", alpha_term, "_", simulated_run_name)]]
# Print summary of perturbation statistics
print(paste("Summary of perturbation statistics for", outcome, ":"))
summary(clean_simulated_data[[paste0("perturbation_", outcome)]])
# Print distribution of perturbations
print(paste("Distribution of perturbations for", outcome, ":"))
hist(clean_simulated_data[[paste0("perturbation_", outcome)]], main = paste("Perturbation Distribution for", outcome), xlab = "Perturbation Value")
}
## [1] "Summary of perturbation statistics for sbp_neg :"
## [1] "Distribution of perturbations for sbp_neg :"

Check results
#####STEP 2b-6: Check results #####
# Sample 20 random person IDs
set.seed(seedset) # Use same seed for reproducibility
sampled_ids <- sample(source_data$person_id, 20)
# For each outcome, create detailed breakdown
for (outcome in unlist(all_outcomes)) {
cat("\nChecking calculations for outcome:", outcome, "\n")
cat("----------------------------------------\n")
# Filter source_data first to get consistent subsetting
sampled_data <- source_data[source_data$person_id %in% sampled_ids, ]
sampled_sim_data <- clean_simulated_data[clean_simulated_data$person_id %in% sampled_ids, ]
# Create simulated column name based on our naming convention
sim_col_name <- paste0("sim_", outcome, "_alpha_", alpha_term, "_", simulated_run_name)
# Make alpha_term a vector of the same length
alpha_vec <- rep(alpha_term, length(sampled_data$person_id))
check_df <- data.frame(
person_id = sampled_data$person_id,
original_value = sampled_data[[outcome]],
alpha = alpha_term,
ohp = sampled_data$ohp_all_ever_inperson,
original_tot_charg = sampled_data$charg_tot_pre_ed,
gamma_value = sampled_data[[gamma_term]],
dx_index = sampled_data$dx_sum_norm,
plustau = sampled_sim_data$plustau,
perturbation = sampled_sim_data[[paste0("perturbation_", outcome)]],
simulated_value = sampled_sim_data[[sim_col_name]]
)
# Add standard deviation if continuous outcome
if (outcome %in% continuous_outcomes_to_modify) {
check_df$sd <- sd(source_data[[outcome]], na.rm = TRUE)
check_df$perturbation <- with(check_df,
ifelse(ohp == 1,
alpha * plustau * sd * 0.5,
-alpha * plustau * sd * 0.5))
} else {
check_df$sd <- NA
check_df$perturbation <- NA
}
# Print summary
print(check_df)
}
##
## Checking calculations for outcome: sbp_neg
## ----------------------------------------
## person_id original_value alpha ohp original_tot_charg gamma_value dx_index
## 1 4302 -121 5 0 0.0 0.0 0.125
## 2 8859 -107 5 0 981.9 632.9 0.000
## 3 9677 -121 5 0 0.0 0.0 0.125
## 4 12126 -100 5 0 0.0 0.0 0.125
## 5 23911 -95 5 1 0.0 0.0 0.125
## 6 32548 -129 5 0 0.0 0.0 0.125
## 7 38146 -101 5 0 0.0 0.0 0.125
## 8 38237 -95 5 1 0.0 0.0 0.000
## 9 38272 -189 5 0 7440.5 1528.0 0.250
## 10 41167 -117 5 0 1525.8 1114.0 0.000
## 11 41754 -108 5 1 446.3 254.3 0.000
## 12 46648 -101 5 1 0.0 0.0 0.000
## 13 47189 -119 5 1 0.0 0.0 0.000
## 14 52317 -133 5 1 0.0 0.0 0.000
## 15 55990 -113 5 1 395.0 395.0 0.125
## 16 57064 -131 5 0 0.0 0.0 0.250
## 17 64458 -112 5 0 674.7 674.7 0.000
## 18 67645 -121 5 1 0.0 0.0 0.000
## 19 69119 -96 5 0 11472.2 2805.9 0.125
## 20 71464 -100 5 0 0.0 0.0 0.000
## plustau perturbation simulated_value sd
## 1 1.133 -46.90 -167.90 16.56
## 2 2.099 -86.90 -193.90 16.56
## 3 1.133 -46.90 -167.90 16.56
## 4 1.133 -46.90 -146.90 16.56
## 5 1.133 46.90 -48.10 16.56
## 6 1.133 -46.90 -175.90 16.56
## 7 1.133 -46.90 -147.90 16.56
## 8 1.000 41.39 -53.61 16.56
## 9 3.490 -144.47 -333.47 16.56
## 10 2.718 -112.52 -229.52 16.56
## 11 1.401 57.99 -50.01 16.56
## 12 1.000 41.39 -59.61 16.56
## 13 1.000 41.39 -77.61 16.56
## 14 1.000 41.39 -91.61 16.56
## 15 1.527 63.21 -49.79 16.56
## 16 1.284 -53.15 -184.15 16.56
## 17 1.665 -68.91 -180.91 16.56
## 18 1.000 41.39 -79.61 16.56
## 19 3.080 -127.50 -223.50 16.56
## 20 1.000 -41.39 -141.39 16.56