Set up environment

#####STEP 0-1: Reset environment #####
rm(list=ls())
knitr::opts_chunk$set(echo = TRUE)
options(repos = structure(c(CRAN = "http://cran.rstudio.com/")))

#####STEP 0-2: Install packages #####
list.of.packages <- c("grf", "metafor", "splitstackshape", "dplyr", 
                     "tidyverse", "foreach", "cowplot", "reshape2", 
                     "doParallel", "survival", "readstata13", "ggplot2", 
                     "rsample", "DiagrammeR", "e1071", "pscl", "pROC", 
                     "caret", "ModelMetrics", "MatchIt", "Hmisc", "scales",
                     "lmtest", "sandwich", "haven", "rpms", "randomForest",
                     "fabricatr", "gridExtra", "VIM", "mice",
                     "lmtest", "ivreg", "kableExtra", "corrplot", "psych")
new.packages <- list.of.packages[!(list.of.packages %in% 
                                 installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

lapply(list.of.packages, library, character.only = TRUE)

## Warning: package 'grf' was built under R version 4.3.3

## Warning: package 'metafor' was built under R version 4.3.3

## Warning: package 'metadat' was built under R version 4.3.3

## Warning: package 'numDeriv' was built under R version 4.3.1

## Warning: package 'splitstackshape' was built under R version 4.3.3

## Warning: package 'dplyr' was built under R version 4.3.3

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tibble' was built under R version 4.3.3

## Warning: package 'tidyr' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.3

## Warning: package 'purrr' was built under R version 4.3.3

## Warning: package 'stringr' was built under R version 4.3.3

## Warning: package 'forcats' was built under R version 4.3.3

## Warning: package 'lubridate' was built under R version 4.3.3

## Warning: package 'foreach' was built under R version 4.3.3

## Warning: package 'cowplot' was built under R version 4.3.3

## Warning: package 'reshape2' was built under R version 4.3.3

## Warning: package 'doParallel' was built under R version 4.3.3

## Warning: package 'iterators' was built under R version 4.3.3

## Warning: package 'readstata13' was built under R version 4.3.3

## Warning: package 'rsample' was built under R version 4.3.3

## Warning: package 'DiagrammeR' was built under R version 4.3.3

## Warning: package 'e1071' was built under R version 4.3.3

## Warning: package 'pscl' was built under R version 4.3.3

## Warning: package 'pROC' was built under R version 4.3.3

## Warning: package 'caret' was built under R version 4.3.3

## Warning: package 'ModelMetrics' was built under R version 4.3.3

## Warning: package 'MatchIt' was built under R version 4.3.3

## Warning: package 'Hmisc' was built under R version 4.3.3

## Warning: package 'scales' was built under R version 4.3.3

## Warning: package 'lmtest' was built under R version 4.3.3

## Warning: package 'zoo' was built under R version 4.3.3

## Warning: package 'sandwich' was built under R version 4.3.3

## Warning: package 'haven' was built under R version 4.3.3

## Warning: package 'rpms' was built under R version 4.3.3

## Warning: package 'randomForest' was built under R version 4.3.3

## Warning: package 'fabricatr' was built under R version 4.3.3

## Warning: package 'gridExtra' was built under R version 4.3.3

## Warning: package 'VIM' was built under R version 4.3.3

## Warning: package 'colorspace' was built under R version 4.3.3

## Warning: package 'mice' was built under R version 4.3.3

## Warning in check_dep_version(): ABI version mismatch: 
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package

## Warning: package 'ivreg' was built under R version 4.3.3

## Warning: package 'kableExtra' was built under R version 4.3.3

## Warning: package 'corrplot' was built under R version 4.3.3

## Warning: package 'psych' was built under R version 4.3.3

print(paste("Version of grf package:", packageVersion("grf")))

#####STEP 0-3: Basic information #####
Sys.time()
# Get detailed R session and system information
session_info <- sessionInfo()
system_info <- Sys.info()
# Combine the output
list(session_info = session_info, system_info = system_info)

Set non-run-specific parameters

#####STEP 0-4: Set non-run-specific parameters #####
seedset <- 777
numthreadsset <- min(6, parallel::detectCores()) 
if (numthreadsset!= 6) {
  print("the results of grf vary by num.thread (publication paper used num.thread=6)")
} 

cat("number of threads (affects grf results):", numthreadsset,"\n")

## number of threads (affects grf results): 6

# Set printing options
options(digits = 4)

Set run-specific parameters

#####STEP 0-5: Set run-specific parameters #####
published_paper_run <- 0 # ANALYST FORM
if (published_paper_run == 1) {
  print("save output into PP_Full_Analysis/Saved_tables_figures/As_published/ folder")
  warning("Changing this setting to 1 overwrites the input files required for replicating on different platforms.")
} else {
  print("save output into PP_Full_Analysis/Saved_tables_figures/Testing/ folder")
}

## [1] "save output into PP_Full_Analysis/Saved_tables_figures/Testing/ folder"

cw_run_name <- "med" # ANALYST FORM
cw_grid <- 5 # ANALYST FORM

correlations_run_name <- "main_sim" # ANALYST FORM

# List of outcomes to analyze
outcomes_list <- c( # ANALYST FORM
  "sim_sbp_neg_alpha_5_presentation_cw0_lambda_0",
  "sim_hdl_level_neg_alpha_5_presentation_cw0_lambda_0",
  "sim_debt_neg_alpha_5_presentation_cw0_lambda_0"
)

# Define cw_name
cw_name <- paste0("cw_", cw_run_name, "_", cw_grid)

Set file paths

#####STEP 0-6: Set file paths #####
# Set the processed path based on published_paper_run
processedpath <- if (published_paper_run == 1) {
  paste0("PP_Full_Analysis/Intermediate_data/As_published/empirical/", cw_name)
} else {
  paste0("PP_Full_Analysis/Intermediate_data/Testing/empirical/", cw_name)
}

# Set the results_dir based on published_paper_run and cw_name
results_dir <- if (published_paper_run == 1) {
  paste0("PP_Full_Analysis/Saved_tables_figures/As_published/", cw_name)
} else {
  paste0("PP_Full_Analysis/Saved_tables_figures/Testing/", cw_name)
}

# If results directory does not exist, create it
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}

# Print processed path and results directory
print(paste("Processed path:", processedpath))

## [1] "Processed path: PP_Full_Analysis/Intermediate_data/Testing/empirical/cw_med_5"

print(paste("Results directory:", results_dir))

## [1] "Results directory: PP_Full_Analysis/Saved_tables_figures/Testing/cw_med_5"

Read data

#####STEP 8-1: Read and prepare data #####
# Lists to store individual outcome dataframes
cate_data <- list()
clate_data <- list()

# Read data for each outcome
for(outcome in outcomes_list) {
  # Extract base outcome name
  base_outcome <- sub("_lambda_.*", "", outcome)
  
  # Read the data file
  filename <- paste0(processedpath, "/cate_clate_results_", base_outcome, ".csv")
  if(file.exists(filename)) {
    data <- read.csv(filename)
    
    # Extract lambda number
    lambda_num <- sub(".*lambda_", "", outcome)
    
    # Get CATE column
    cate_col <- paste0("cate_lambda_", lambda_num, "_ranking_5")
    if(cate_col %in% names(data)) {
      # Create temporary dataframe with person_id and CATE value
      temp_cate <- data.frame(
        person_id = data$person_id,
        temp_val = data[[cate_col]]
      )
      names(temp_cate)[2] <- outcome  # Rename the value column to outcome name 
      cate_data[[outcome]] <- temp_cate
    } else {
      print(paste("CATE column not found for outcome:", outcome))
    }
    
    # Get CLATE column if it exists and outcome is not OHP
    if(!"ohp_all_ever_inperson" %in% outcome && "clate" %in% names(data)) {
      # Create temporary dataframe with person_id and CLATE value
      temp_clate <- data.frame(
        person_id = data$person_id,
        temp_val = data[["clate_ranking_5"]]
      )
      names(temp_clate)[2] <- outcome  # Rename the value column to outcome name
      clate_data[[outcome]] <- temp_clate
    } else {
      print(paste("CLATE column not found for outcome:", outcome))
    }
  }
}

# Merge all CATE dataframes with full outer join to preserve all person_ids
cate_df <- Reduce(function(x, y) merge(x, y, by = "person_id", all = TRUE), cate_data)

# Merge all CLATE dataframes with full outer join to preserve all person_ids
clate_df <- Reduce(function(x, y) merge(x, y, by = "person_id", all = TRUE), clate_data)

### Step 8-2: Add ohp_all_ever_inperson to cate and clate dataframes###
# Read data for ohp_all_ever_inperson
ohp_data <- read.csv(paste0(processedpath, "/cate_clate_results_ohp_all_ever_inperson_cw0.csv"))

# Create temporary dataframe for OHP data
ohp_temp <- data.frame(
  person_id = ohp_data$person_id,
  ohp_all_ever_inperson = ohp_data$cate_lambda_0_ranking_5
)

# Add OHP data to CATE dataframe
cate_df <- merge(cate_df, ohp_temp, by = "person_id", all = TRUE)

# Add OHP data to CLATE dataframe if it exists
clate_df <- merge(clate_df, ohp_temp, by = "person_id", all = TRUE)

# Print information on dataframes
print("Information on cate_df:")

## [1] "Information on cate_df:"

glimpse(cate_df)

## Rows: 12,208
## Columns: 5
## $ person_id                                           <int> 5, 8, 16, 17, 18, …
## $ sim_sbp_neg_alpha_5_presentation_cw0_lambda_0       <int> 3, 2, 5, 1, 5, 1, …
## $ sim_hdl_level_neg_alpha_5_presentation_cw0_lambda_0 <int> 1, 2, 5, 2, 5, 3, …
## $ sim_debt_neg_alpha_5_presentation_cw0_lambda_0      <int> 5, 1, 1, 5, 2, 1, …
## $ ohp_all_ever_inperson                               <int> 2, 2, 4, 5, 4, 1, …

print("Information on clate_df:")

## [1] "Information on clate_df:"

glimpse(clate_df)

## Rows: 12,208
## Columns: 5
## $ person_id                                           <int> 5, 8, 16, 17, 18, …
## $ sim_sbp_neg_alpha_5_presentation_cw0_lambda_0       <int> 2, 2, 5, 1, 5, 2, …
## $ sim_hdl_level_neg_alpha_5_presentation_cw0_lambda_0 <int> 1, 1, 5, 2, 5, 3, …
## $ sim_debt_neg_alpha_5_presentation_cw0_lambda_0      <int> 5, 1, 1, 5, 2, 1, …
## $ ohp_all_ever_inperson                               <int> 2, 2, 4, 5, 4, 1, …

Create correlation tables

CATE table

#####STEP 8-2: Create correlation tables #####
# Calculate correlations with standard errors for cates
cate_cors_with_se <- corr.test(cate_df[, -which(names(cate_df) == "person_id")], 
                              use = "pairwise.complete.obs")

# Create dataframes for correlations and SEs
cate_correlations <- cate_cors_with_se$r
cate_standard_errors <- cate_cors_with_se$se

# Convert matrices to data frames
cate_correlations_df <- data.frame(
  correlations = cate_correlations,
  standard_errors = cate_standard_errors
)

# Function to format correlation with SE
format_corr_se <- function(corr, se) {
  paste0(round(corr, 3), "<br>(", round(se, 3), ")")
}

# Create matrix to store formatted values
n <- nrow(cate_correlations)
formatted_matrix <- matrix("", n, n)
rownames(formatted_matrix) <- rownames(cate_correlations)
colnames(formatted_matrix) <- colnames(cate_correlations)

# Fill matrix with formatted values
for(i in 1:n) {
  for(j in 1:n) {
    formatted_matrix[i,j] <- format_corr_se(cate_correlations[i,j], cate_standard_errors[i,j])
  }
}

# Convert to data frame for saving
cate_correlations_df <- as.data.frame(formatted_matrix)

kable(cate_correlations_df)

	sim_sbp_neg_alpha_5_presentation_cw0_lambda_0	sim_hdl_level_neg_alpha_5_presentation_cw0_lambda_0	sim_debt_neg_alpha_5_presentation_cw0_lambda_0	ohp_all_ever_inperson
sim_sbp_neg_alpha_5_presentation_cw0_lambda_0	1 (0)	0.573 (0.007)	0.058 (0.009)	0.497 (0.008)
sim_hdl_level_neg_alpha_5_presentation_cw0_lambda_0	0.573 (0.007)	1 (0)	-0.081 (0.009)	0.308 (0.009)
sim_debt_neg_alpha_5_presentation_cw0_lambda_0	0.058 (0.009)	-0.081 (0.009)	1 (0)	0.543 (0.008)
ohp_all_ever_inperson	0.497 (0.008)	0.308 (0.009)	0.543 (0.008)	1 (0)

# Save to CSV
write.csv(cate_correlations_df,
          file.path(paste0(results_dir, correlations_run_name, "cate.csv")))

CLATE table

#####STEP 8-2: Create correlation tables #####
# Calculate correlations with standard errors for clates
clate_cors_with_se <- corr.test(clate_df[, -which(names(clate_df) == "person_id")], 
                              use = "pairwise.complete.obs")

# Create dataframes for correlations and SEs
clate_correlations <- clate_cors_with_se$r
clate_standard_errors <- clate_cors_with_se$se

# Convert matrices to data frames
clate_correlations_df <- data.frame(
  correlations = clate_correlations,
  standard_errors = clate_standard_errors
)

# Function to format correlation with SE
format_corr_se <- function(corr, se) {
  paste0(round(corr, 3), "<br>(", round(se, 3), ")")
}

# Create matrix to store formatted values
n <- nrow(clate_correlations)
formatted_matrix <- matrix("", n, n)
rownames(formatted_matrix) <- rownames(clate_correlations)
colnames(formatted_matrix) <- colnames(clate_correlations)

# Fill matrix with formatted values
for(i in 1:n) {
  for(j in 1:n) {
    formatted_matrix[i,j] <- format_corr_se(clate_correlations[i,j], clate_standard_errors[i,j])
  }
}

# Convert to data frame for saving
clate_correlations_df <- as.data.frame(formatted_matrix)

kable(clate_correlations_df)

	sim_sbp_neg_alpha_5_presentation_cw0_lambda_0	sim_hdl_level_neg_alpha_5_presentation_cw0_lambda_0	sim_debt_neg_alpha_5_presentation_cw0_lambda_0	ohp_all_ever_inperson
sim_sbp_neg_alpha_5_presentation_cw0_lambda_0	1 (0)	0.555 (0.008)	-0.14 (0.009)	0.351 (0.008)
sim_hdl_level_neg_alpha_5_presentation_cw0_lambda_0	0.555 (0.008)	1 (0)	-0.219 (0.009)	0.214 (0.009)
sim_debt_neg_alpha_5_presentation_cw0_lambda_0	-0.14 (0.009)	-0.219 (0.009)	1 (0)	0.241 (0.009)
ohp_all_ever_inperson	0.351 (0.008)	0.214 (0.009)	0.241 (0.009)	1 (0)

# Save to CSV
write.csv(clate_correlations_df,
          file.path(paste0(results_dir, correlations_run_name, "_clate.csv")))

Create correlation plot functions

#####STEP 8-3: Create CATE correlation plots #####
create_heatmap <- function(cor_matrix, se_matrix,title) {
  # Create data frame with correlations and SEs
  plot_data <- reshape2::melt(cor_matrix)
  plot_data$SE <- reshape2::melt(se_matrix)$value
  
  # Format the text labels
  plot_data$label <- sprintf("%.3f\n(%.3f)", plot_data$value, plot_data$SE)
  
  # Create heatmap
  heatmap <- ggplot(data = plot_data, 
                    aes(x = Var1, y = Var2, fill = value)) +
    geom_tile() +
    geom_text(aes(label = label), size = 3) +  # Add text labels
    scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                        midpoint = 0, limit = c(-1,1), name = "Correlation") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5)) +
    labs(title = title)
  
  return(heatmap)
}

Create plots

#####STEP 8-4: Create plots #####
# Create CATE heatmap
cate_heatmap <- create_heatmap(cate_correlations, cate_standard_errors, "CATE Correlations")

print(cate_heatmap)

# Save CATE plot
ggsave(file.path(results_dir, paste0(correlations_run_name, "_cate_correlations.png")), 
       cate_heatmap, width = 10, height = 8)
ggsave(file.path(results_dir, paste0(correlations_run_name, "_cate_correlations.pdf")), 
       cate_heatmap, width = 10, height = 8)

# Create CLATE heatmap if it exists
clate_heatmap <- create_heatmap(clate_correlations, clate_standard_errors, "CLATE Correlations")

print(clate_heatmap)

# Save CLATE plot
ggsave(file.path(results_dir, paste0(correlations_run_name, "_clate_correlations.png")), 
       clate_heatmap, width = 10, height = 8)
ggsave(file.path(results_dir, paste0(correlations_run_name, "_clate_correlations.pdf")), 
       clate_heatmap, width = 10, height = 8)

9_Correlations

Revati Kapshikar, Kristine Koutout

2025-01-20

Set up environment

Set non-run-specific parameters

Set run-specific parameters

Set file paths

Read data

Create correlation tables

CATE table

CLATE table

Create correlation plot functions

Create plots