# Calculate the mean e' using septal and lateral values
data_final$mean_e <- rowMeans(sapply(data_final[, c("ECHO_VAL52", "ECHO_VAL62")], as.numeric), na.rm = TRUE)

# Calculate the E/e' ratio using E and mean e' values
data_final$E_e_ratio <- data_final$ECHO_VAL32 / data_final$mean_e

# Compute the body surface area (BSA) where weight is in kg and height in cm
data_final$BSA <- 0.007184 * (data_final$VS_HGT1^0.725) * (data_final$VS_WGT1^0.425)

# Compute LAVI from LAV and BSA values
data_final$LAVI <- data_final$ECHO_VAL92 / data_final$BSA

# Classify E/e' status
data_final$E_e_class <- with(data_final, ifelse(is.na(E_e_ratio), NA,
                                   ifelse(E_e_ratio < 8, "Normal",
                                   ifelse(E_e_ratio <= 14, "Borderline", "Elevated"))))
library(dplyr)

data_final <- data_final %>%
  rowwise() %>%
  mutate(
    # ASE flags (ensure numeric comparisons; convert factors/characters if needed)
    LAVI_flag = ifelse(!is.na(LAVI), LAVI > 34, NA),
    E_e_flag = ifelse(!is.na(E_e_ratio), E_e_ratio > 14, NA),
    septal_e_flag = ifelse(!is.na(ECHO_VAL52), ECHO_VAL52 < 7, NA),
    lateral_e_flag = ifelse(!is.na(ECHO_VAL62), ECHO_VAL62 < 10, NA),
    
    # Count how many ASE criteria are positive
    criteria_positive = sum(c(LAVI_flag, E_e_flag, septal_e_flag, lateral_e_flag), na.rm = TRUE),
    
    # LA strain evaluation
    strain_comment = case_when(
      is.na(A4C_R_RR1) ~ "Reservoir strain missing",
      A4C_R_RR1 < 18 ~ "Abnormal LA strain – supports dysfunction",
      A4C_R_RR1 >= 18 ~ "Normal LA strain",
      TRUE ~ "Strain unclear"
    ),
    
    # Base dysfunction classification
    diastolic_function = case_when(
      criteria_positive >= 2 ~ "Diastolic Dysfunction Present",
      criteria_positive == 1 ~ "Indeterminate Diastolic Function",
      criteria_positive == 0 ~ "Normal Diastolic Function",
      TRUE ~ "Insufficient Data"
    ),
    
    # Grading diastolic dysfunction
    diastolic_grade = case_when(
      diastolic_function == "Normal Diastolic Function" ~ "Grade 0 (Normal)",
      diastolic_function == "Indeterminate Diastolic Function" ~ "Indeterminate",
      
      # Grade I: low E/e', normal LAVI, normal LA strain
      E_e_ratio <= 14 & LAVI <= 34 & A4C_R_RR1 >= 18 ~ "Grade I (Impaired relaxation)",
      
      # Grade III: assume only if reservoir strain is very low and E/e′ very high
      E_e_ratio >= 20 & A4C_R_RR1 < 12 ~ "Grade III (Restrictive)",
      
      # Grade II: 2+ abnormal criteria or abnormal LA strain
      criteria_positive >= 2 | A4C_R_RR1 < 18 ~ "Grade II (Pseudonormal)",
      
      TRUE ~ "Cannot determine grade"
    )
  ) %>%
  ungroup()
# Replace commas with dots and convert to numeric
cols_to_fix <- c("Pre_QFR", "Pre_Length", "Pre_FlowVelocity")

data_final <- data_final %>%
  mutate(across(all_of(cols_to_fix), ~ as.numeric(gsub(",", ".", .))))
data_final <- data_final%>%
  mutate(
    CMD = case_when(
      is.na(Post_IMR) ~ NA_real_,     # Preserve NA if IMR is missing
      Post_IMR >= 25 ~ 1,             # CMD present
      Post_IMR < 25 ~ 0               # CMD absent
    )
  )
library(dplyr)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Your existing processing code simplified for the table
data_final <- data_final %>%
  mutate(
    CMD = case_when(
      is.na(Post_IMR) ~ NA_real_,
      Post_IMR >= 25 ~ 1,
      Post_IMR < 25 ~ 0
    )
  )

# Define continuous variables
cont_vars <- c(
  "AGE_AT_INCLUSION", "Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1", "Complete_LVEF",
  "ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
  "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1", "HBA1C_22",
  "LDL2", "EGFR2", "VS_HGT1", "VS_WGT1", "VS_meanBP1", "VS_meanBP2", "Pre_QFR",
  "Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity", "Post_Length",
  "Post_FlowVelocity", "BSA"
)

# Convert to numeric safely (if needed)
data_final <- data_final %>%
  mutate(across(all_of(cont_vars), ~ as.numeric(gsub(",", ".", as.character(.)))))

# Calculate descriptive stats
cont_summary <- lapply(cont_vars, function(var) {
  values <- data_final[[var]]
  values <- values[!is.na(values)]
  q <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
  med <- median(values, na.rm = TRUE)
  data.frame(
    Variable = var,
    Valid_n = length(values),
    Mean = round(mean(values, na.rm = TRUE), 1),
    SD = round(sd(values, na.rm = TRUE), 1),
    Median = round(med, 1),
    `Median [IQR]` = paste0(round(med,1), " [", round(q[1],1), "–", round(q[2],1), "]"),
    `Min–Max` = paste0(round(min(values, na.rm = TRUE),1), "–", round(max(values, na.rm = TRUE),1))
  )
}) %>%
  bind_rows()

# Rename variables for readability
rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "AGE_AT_INCLUSION"= "Age at inclusion",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "ECHO_VAL32"      = "E",
  "CAP_NUM1"        = "Total ischemic time",
  "Complete_LVEF"   = "Left ventricular ejection fraction",
  "VS_WGT1"         = "Weight",
  "VS_HGT1"         = "Height",
  "VS_meanBP1"      = "Pre Mean blood pressure",
  "VS_meanBP2"      = "Post Mean blood pressure",
  "Pre_QFR"         = "Pre QFR",
  "Post_QFR"        = "Post QFR",
  "Pre_FlowVelocity"  = "Pre flow velocity",
  "Post_FlowVelocity" = "Post flow velocity",
  "Pre_IMR"         = "Pre IMR",
  "Post_IMR"        = "Post IMR",
  "BSA"             = "BSA",
  "EGFR2"           = "Renal function (eGFR)",
  "MRI1_INF_SZ_LV"  = "Infarct size (%)",
  "VS_BMI1"         = "BMI",
  "LDL2"            = "LDL cholesterol",
  "HBA1C_22"        = "HbA1c",
  "Pre_Length"      = "Pre vessel length",
  "Post_Length"     = "Post vessel length"
)

cont_summary$Variable <- rename_map[cont_summary$Variable]
cont_summary$Variable[is.na(cont_summary$Variable)] <- cont_summary$Variable[is.na(cont_summary$Variable)]

# Print nice table with kable and kableExtra
cont_summary %>%
  kable(
    caption = "Descriptive statistics for continuous variables",
    col.names = c("Variable", "Valid n", "Mean", "SD", "Median", "Median [IQR]", "Min–Max"),
    align = "lrrrrrr"
  ) %>%
  kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover"))
Descriptive statistics for continuous variables
Variable Valid n Mean SD Median Median [IQR] Min–Max
Age at inclusion 380 61.8 11.7 62.0 62 [53–70] 27–93
Post IMR 249 35.1 16.3 35.0 35 [22–46] 8–94
Infarct size (%) 257 8.8 7.7 7.0 7 [2.1–13.6] 0–32.2
Total ischemic time 380 211.6 153.6 161.0 161 [109–249.2] 30–759
Left ventricular ejection fraction 372 48.2 9.5 48.0 48 [42.8–55] 15–72
Septal e’ 337 7.6 2.1 7.6 7.6 [6.1–9] 2.3–13.7
Lateral e’ 336 9.4 3.0 9.1 9.1 [7.3–11.3] 2.8–19.1
Mean e’ 340 8.5 2.3 8.4 8.4 [6.8–9.9] 3.5–14.8
E/e’ ratio 331 0.1 0.0 0.1 0.1 [0.1–0.1] 0–0.2
LAV 303 55.6 16.7 53.0 53 [44–65] 16–134
LAVI 303 27.8 8.1 26.1 26.1 [22.4–32] 8.2–65.2
Reservoir strain 288 27.5 8.0 27.0 27 [22–33] 4–57
Conduit Strain 288 -14.8 6.0 -14.0 -14 [-18–-11] -33–13
Contraction Strain 288 -12.4 6.1 -12.0 -12 [-16–-8] -37–21
E 344 0.7 0.2 0.7 0.7 [0.5–0.8] 0.3–1.2
BMI 380 26.9 3.8 26.6 26.6 [24.2–29.2] 18.6–49.6
HbA1c 374 41.1 8.6 40.0 40 [38–42] 28–123
LDL cholesterol 377 3.8 1.0 3.8 3.8 [3.2–4.4] 1.3–9
Renal function (eGFR) 333 93.8 21.4 93.0 93 [80–106] 46–170
Height 380 176.5 9.0 176.0 176 [170–183] 153–204
Weight 380 84.2 14.3 82.2 82.2 [75–94] 53–147
Pre Mean blood pressure 354 103.3 16.3 102.5 102.5 [93–113] 61–154
Post Mean blood pressure 326 94.8 16.4 94.0 94 [82–106.8] 55–166
Pre QFR 96 0.6 0.2 0.6 0.6 [0.5–0.7] 0.2–1
Post QFR 279 1.0 0.1 1.0 1 [1–1] 0.6–1
Pre IMR 89 26.3 16.8 22.0 22 [14–34] 4–107
Pre vessel length 96 64.9 14.7 61.8 61.8 [56.4–71] 28.3–107.1
Pre flow velocity 96 17.1 8.3 14.8 14.8 [10.8–21.9] 3.9–44
Post vessel length 279 67.5 14.3 64.8 64.8 [58–74] 12.4–124.9
Post flow velocity 279 18.9 10.4 16.1 16.1 [11.9–23.1] 6.5–86.6
BSA 380 2.0 0.2 2.0 2 [1.9–2.1] 1.5–2.7
library(dplyr)
library(knitr)
library(kableExtra)

# Your Shapiro-Wilk normality test code
normality_results <- lapply(cont_vars, function(var) {
  values <- data_final[[var]]
  values <- values[!is.na(values)]
  
  if (length(values) < 3) {
    return(data.frame(Variable = var, W = NA, p_value = NA))
  }
  
  test <- shapiro.test(values)
  data.frame(Variable = var, W = as.numeric(test$statistic), p_value = as.numeric(test$p.value))
}) %>% bind_rows()

# Apply human-readable variable names
normality_results$Variable <- rename_map[normality_results$Variable]
normality_results$Variable[is.na(normality_results$Variable)] <- cont_vars[is.na(rename_map[normality_results$Variable])]

# Round the numbers nicely
normality_results <- normality_results %>%
  mutate(
    W = round(W, 3),
    p_value = signif(p_value, 3)
  )

# Create a pretty table
normality_results %>%
  kable(
    caption = "Shapiro-Wilk Normality Test Results",
    col.names = c("Variable", "W Statistic", "p-value"),
    align = c("l", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Shapiro-Wilk Normality Test Results
Variable W Statistic p-value
Age at inclusion 0.991 2.48e-02
Post IMR 0.967 1.68e-05
Infarct size (%) 0.915 0.00e+00
Total ischemic time 0.807 0.00e+00
Left ventricular ejection fraction 0.992 4.24e-02
Septal e’ 0.992 6.97e-02
Lateral e’ 0.988 6.18e-03
Mean e’ 0.989 1.15e-02
E/e’ ratio 0.896 0.00e+00
LAV 0.961 3.00e-07
LAVI 0.960 3.00e-07
Reservoir strain 0.988 1.52e-02
Conduit Strain 0.975 6.75e-05
Contraction Strain 0.960 4.00e-07
E 0.983 4.81e-04
BMI 0.953 0.00e+00
HbA1c 0.497 0.00e+00
LDL cholesterol 0.965 1.00e-07
Renal function (eGFR) 0.987 5.55e-03
Height 0.995 2.05e-01
Weight 0.982 1.01e-04
Pre Mean blood pressure 0.993 1.27e-01
Post Mean blood pressure 0.982 4.67e-04
Pre QFR 0.980 1.46e-01
Post QFR 0.649 0.00e+00
Pre IMR 0.862 1.00e-07
Pre vessel length 0.949 9.04e-04
Pre flow velocity 0.913 8.50e-06
Post vessel length 0.934 0.00e+00
Post flow velocity 0.809 0.00e+00
BSA 0.993 9.01e-02

Shapiro-Wilk normality test results: variables that do not significantly variate from the normal include septal e’, age, left ventricular ejection fraction, height, pre mean blooed pressure, pre QFR and BSA.

library(ggplot2)

# Histograms
for (var in cont_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found in dataframe! Skipping..."))
    next
  }
  
  var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
  
  p <- ggplot(data_final, aes_string(x = var)) +
    geom_histogram(bins = 30, fill = "steelblue", color = "black") +
    ggtitle(paste0("Histogram of ", var_name)) +
    theme_minimal()
  
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Boxplots
for (var in cont_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found in dataframe! Skipping..."))
    next
  }
  
  var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
  
  p <- ggplot(data_final, aes_string(x = "1", y = var)) +
    geom_boxplot(fill = "steelblue", color = "black", outlier.color = "red", outlier.shape = 16) +
    labs(title = paste("Boxplot of", var_name),
         x = "",
         y = var_name) +
    theme_minimal() +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  
  print(p)
}

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

library(dplyr)

# 1. Categorical variables
cat_vars <- c(
  "MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
  "MHCR_NPC",    # Smoking
  "MHCR_NY1",    # Hypertension
  "MHCR_NY2",    # Hypercholesterolemia
  "MHCR_NY4",    # CVA
  "MHCR_NY10",   # CABG
  "MHCR_NY11",   # PCI (other)
  "MHCR_NY13",   # Cardiovascular Family History
  "diastolic_function", "diastolic_grade", "CMD",
  "E_e_class", "criteria_positive", "strain_comment"
)

# 2. Variable label mapping
rename_map <- c(
  MORT_ALL = "All-cause mortality",
  REVASC = "Revascularization",
  NEW_ONSET_HF = "New-onset heart failure",
  HF_HOSP = "HF hospitalization",
  DEMO_SEX = "Sex",
  MHCR_NPC = "Smoking",
  MHCR_NY1 = "Hypertension",
  MHCR_NY2 = "Hypercholesterolemia",
  MHCR_NY4 = "CVA (Stroke)",
  MHCR_NY10 = "CABG history",
  MHCR_NY11 = "PCI (other vessels)",
  MHCR_NY13 = "Cardiovascular family history <65 years",
  diastolic_function = "Diastolic function",
  diastolic_grade = "Diastolic grade",
  CMD = "Coronary microvascular dysfunction",
  E_e_class = "E/e' classification",
  criteria_positive = "diastolic dysfunction criteria positive",
  strain_comment = "Strain interpretation"
)

# 3. Binary variable names to convert 0 = No, 1 = Yes
binary_vars <- c(
  "All-cause mortality", "Revascularization", "New-onset heart failure",
  "HF hospitalization", "Hypertension", "Hypercholesterolemia",
  "CVA (Stroke)", "CABG history", "PCI (other vessels)",
  "Cardiovascular family history <65 years", "Coronary microvascular dysfunction"
)

# 4. Initialize list to hold summaries
summary_list <- list()

# 5. Loop through variables
for (var in cat_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found. Skipping..."))
    next
  }

  label <- rename_map[[var]]
  counts <- table(data_final[[var]], useNA = "ifany")
  percentages <- prop.table(counts) * 100

  df_summary <- data.frame(
    Variable = label,
    Category = names(counts),
    Count = as.vector(counts),
    Percentage = round(as.vector(percentages), 2),
    stringsAsFactors = FALSE
  )

  # Recode categories
  if (label %in% binary_vars) {
    df_summary$Category[df_summary$Category == "0"] <- "No"
    df_summary$Category[df_summary$Category == "1"] <- "Yes"
  } else if (label == "Sex") {
    df_summary$Category[df_summary$Category == "1"] <- "Male"
    df_summary$Category[df_summary$Category == "2"] <- "Female"
  } else if (label == "Smoking") {
    df_summary$Category[df_summary$Category == "1"] <- "Never smoked"
    df_summary$Category[df_summary$Category == "2"] <- "Former smoker"
    df_summary$Category[df_summary$Category == "3"] <- "Current smoker"
  }

  summary_list[[label]] <- df_summary
}

# 6. Combine and clean table
summary_table <- bind_rows(summary_list)

summary_table <- summary_table %>%
  group_by(Variable) %>%
  mutate(Variable = ifelse(row_number() == 1, Variable, "")) %>%
  ungroup()

# 7. View the final table
print(summary_table)
## # A tibble: 43 × 4
##    Variable                  Category Count Percentage
##    <chr>                     <chr>    <int>      <dbl>
##  1 "All-cause mortality"     No         336      88.4 
##  2 ""                        Yes         44      11.6 
##  3 "Revascularization"       No         337      88.7 
##  4 ""                        Yes         43      11.3 
##  5 "New-onset heart failure" No         355      93.4 
##  6 ""                        Yes         25       6.58
##  7 "HF hospitalization"      No         373      98.2 
##  8 ""                        Yes          7       1.84
##  9 "Sex"                     Male       285      75   
## 10 ""                        Female      95      25   
## # ℹ 33 more rows
# Optional: Export
# write.csv(summary_table, "categorical_summary_final.csv", row.names = FALSE)

library(kableExtra)
library(dplyr)
library(knitr)

summary_table <- summary_table %>%
  mutate(VarGroup = cumsum(Variable != ""))  # mark groups

colors <- rep(c("#F0F8FF", "#FAFAD2"), length.out = length(unique(summary_table$VarGroup)))

row_colors <- colors[summary_table$VarGroup]

summary_table %>%
  select(-VarGroup) %>%
  kable("html", escape = FALSE, align = "lccc") %>%
  kable_styling(full_width = FALSE, position = "left") %>%
  row_spec(0, bold = TRUE, background = "#D3D3D3") %>%
  row_spec(1:nrow(summary_table), background = row_colors)
Variable Category Count Percentage
All-cause mortality No 336 88.42
Yes 44 11.58
Revascularization No 337 88.68
Yes 43 11.32
New-onset heart failure No 355 93.42
Yes 25 6.58
HF hospitalization No 373 98.16
Yes 7 1.84
Sex Male 285 75.00
Female 95 25.00
Smoking Never 106 27.89
Former 65 17.11
Current 209 55.00
Hypertension No 267 70.26
Yes 113 29.74
Hypercholesterolemia No 141 37.11
Yes 239 62.89
CVA (Stroke) No 377 99.21
Yes 3 0.79
CABG history No 380 100.00
PCI (other vessels) No 376 98.95
Yes 4 1.05
Cardiovascular family history No 253 66.58
Yes 127 33.42
Diastolic function Diastolic Dysfunction Present 127 33.42
Indeterminate Diastolic Function 114 30.00
Normal Diastolic Function 139 36.58
Diastolic grade Grade 0 (Normal) 139 36.58
Grade I (Impaired relaxation) 50 13.16
Grade II (Pseudonormal) 77 20.26
Indeterminate 114 30.00
Coronary microvascular dysfunction No 79 20.79
Yes 170 44.74
NA 131 34.47
E/e’ classification Normal 331 87.11
NA 49 12.89
diastolic dysfunction criteria positive 0 139 36.58
1 114 30.00
2 104 27.37
3 23 6.05
Strain interpretation Abnormal LA strain – supports dysfunction 22 5.79
Normal LA strain 266 70.00
Reservoir strain missing 92 24.21
library(dplyr)
library(knitr)
library(kableExtra)

# Your summary results dataframe
results_df <- data.frame(
  Variable = c("Septal e'", "Age at inclusion", "Left ventricular ejection fraction",
               "Height (cm)", "Pre mean blood pressure (mmHg)", "Pre QFR", "BSA"),
  Mean_CMD_0 = c(7.82, 59.8, 47.53, 175.58, 99.84, 0.56, 1.98),
  Mean_CMD_1 = c(7.72, 61.02, 48.27, 177.5, 104.32, 0.6, 2.01),
  P_value = c(0.7463, 0.3944, 0.5586, 0.1333, 0.0601, 0.347, 0.2636)
)

# Round means and p-values nicely
results_df <- results_df %>%
  mutate(
    Mean_CMD_0 = round(Mean_CMD_0, 2),
    Mean_CMD_1 = round(Mean_CMD_1, 2),
    P_value = signif(P_value, 3)
  )

# Print a nice table
results_df %>%
  kable(
    caption = "T-test results comparing CMD groups for selected normal variables",
    col.names = c("Variable", "Mean (CMD = 0)", "Mean (CMD = 1)", "P-value"),
    align = c("l", "r", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
T-test results comparing CMD groups for selected normal variables
Variable Mean (CMD = 0) Mean (CMD = 1) P-value
Septal e’ 7.82 7.72 0.7460
Age at inclusion 59.80 61.02 0.3940
Left ventricular ejection fraction 47.53 48.27 0.5590
Height (cm) 175.58 177.50 0.1330
Pre mean blood pressure (mmHg) 99.84 104.32 0.0601
Pre QFR 0.56 0.60 0.3470
BSA 1.98 2.01 0.2640
# Interpretation text you can include in your report
cat(
  "- Septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups.\n",
  "- Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant difference.\n",
  "- Left ventricular ejection fraction: Means close, p > 0.05 → no significant difference.\n",
  "- Height: Slightly taller in CMD=1, p > 0.05 → no significant difference.\n",
  "- Pre mean blood pressure: Higher in CMD=1; p-value ~0.06, close but not significant.\n",
  "- Pre QFR: Small difference, p > 0.05 → no significant difference.\n",
  "- BSA: Means very close; p > 0.05 → no significant difference.\n"
)
## - Septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups.
##  - Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant difference.
##  - Left ventricular ejection fraction: Means close, p > 0.05 → no significant difference.
##  - Height: Slightly taller in CMD=1, p > 0.05 → no significant difference.
##  - Pre mean blood pressure: Higher in CMD=1; p-value ~0.06, close but not significant.
##  - Pre QFR: Small difference, p > 0.05 → no significant difference.
##  - BSA: Means very close; p > 0.05 → no significant difference.
library(dplyr)
library(knitr)
library(kableExtra)

non_normal_vars <- c(
  "Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1", 
  "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
  "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1", 
  "HBA1C_22", "LDL2", "EGFR2", "VS_WGT1", "VS_meanBP2", 
  "Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity", 
  "Post_Length", "Post_FlowVelocity"
)

rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "AGE_AT_INCLUSION"= "Age at inclusion",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "ECHO_VAL32"      = "E",
  "CAP_NUM1"        = "Total ischemic time",
  "Complete_LVEF"   = "Left ventricular ejection fraction",
  "VS_WGT1"         = "Weight",
  "VS_HGT1"         = "Height",
  "VS_SBP1"         = "Pre Systolic blood pressure",
  "VS_DBP1"         = "Pre Diastolic blood pressure",
  "VS_meanBP1"      = "Pre Mean blood pressure",
  "VS_SBP2"         = "Post Systolic blood pressure",
  "VS_DBP2"         = "Post Diastolic blood pressure",
  "VS_meanBP2"      = "Post Mean blood pressure",
  "Pre_QFR"         = "Pre QFR",
  "Post_QFR"        = "Post QFR",
  "Pre_FlowVelocity"  = "Pre flow velocity",
  "Post_FlowVelocity" = "Post flow velocity",
  "Pre_IMR"         = "Pre IMR",
  "Post_IMR"        = "Post IMR",
  "BSA"             = "BSA",
  "EGFR2"           = "Renal function (eGFR)",
  "MRI1_INF_SZ_LV"  = "Infarct size (%)",
  "VS_BMI1"         = "BMI",
  "LDL2"            = "LDL cholesterol",
  "HBA1C_22"        = "HbA1c",
  "Pre_Length"      = "Pre vessel length",
  "Post_Length"     = "Post vessel length"
)

results <- data.frame(
  Variable = character(),
  Median_CMD_0 = numeric(),
  Median_CMD_1 = numeric(),
  P_value = numeric(),
  stringsAsFactors = FALSE
)

for (var in non_normal_vars) {
  data <- data_final %>%
    select(CMD, !!sym(var)) %>%
    filter(!is.na(CMD) & !is.na(!!sym(var)))
  
  group0 <- data %>% filter(CMD == 0) %>% pull(!!sym(var))
  group1 <- data %>% filter(CMD == 1) %>% pull(!!sym(var))
  
  if (length(group0) < 2 || length(group1) < 2) {
    warning(paste("Skipping", var, "- not enough data"))
    next
  }
  
  test <- wilcox.test(group0, group1, exact=FALSE)
  
  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[var],  # Rename variable here
      Median_CMD_0 = round(median(group0), 2),
      Median_CMD_1 = round(median(group1), 2),
      P_value = round(test$p.value, 4)
    )
  )
}

results %>%
  kable(
    caption = "Wilcoxon Rank-Sum Test Results Comparing CMD Groups on Non-Normal Variables",
    col.names = c("Variable", "Median (CMD = 0)", "Median (CMD = 1)", "P-value"),
    digits = 2,
    align = c("l", "r", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wilcoxon Rank-Sum Test Results Comparing CMD Groups on Non-Normal Variables
Variable Median (CMD = 0) Median (CMD = 1) P-value
Post_IMR Post IMR 18.00 41.50 0.00
MRI1_INF_SZ_LV Infarct size (%) 8.12 8.32 1.00
CAP_NUM1 Total ischemic time 140.00 166.50 0.11
ECHO_VAL62 Lateral e’ 9.40 9.35 0.60
mean_e Mean e’ 8.50 8.45 0.70
E_e_ratio E/e’ ratio 0.08 0.07 0.95
ECHO_VAL92 LAV 50.00 53.50 0.37
LAVI LAVI 26.23 25.97 0.70
A4C_R_RR1 Reservoir strain 26.00 27.00 0.26
A4C_CD_RR1 Conduit Strain -14.00 -15.00 0.28
A4C_CT_RR1 Contraction Strain -11.50 -12.00 0.40
ECHO_VAL32 E 0.68 0.64 0.45
VS_BMI1 BMI 26.50 26.20 0.78
HBA1C_22 HbA1c 40.00 39.00 0.87
LDL2 LDL cholesterol 3.80 3.80 0.56
EGFR2 Renal function (eGFR) 92.50 94.00 0.79
VS_WGT1 Weight 80.00 81.50 0.43
VS_meanBP2 Post Mean blood pressure 87.00 96.00 0.00
Post_QFR Post QFR 0.98 0.99 0.00
Pre_IMR Pre IMR 17.00 25.00 0.09
Pre_Length Pre vessel length 61.80 62.95 0.37
Pre_FlowVelocity Pre flow velocity 19.30 14.60 0.46
Post_Length Post vessel length 64.10 65.80 0.97
Post_FlowVelocity Post flow velocity 27.00 13.20 0.00

Interpretation of results: 1. Post IMR: Significant difference; CMD=1 group has much higher Post IMR values.

  1. Post Mean blood pressure: Significant difference; CMD=1 group has higher post mean blood pressure.

  2. Post QFR: Significant different; CMD=1 group has higher post QFR

  3. Post flow velocity: Significant difference; CMD=1 group has markedly lower post flow velocity.

  4. Pre IMR: Trend toward higher Pre IMR in CMD=1 group, but not statistically significant.

  5. Infract size:: No difference between groups; very similar infarct sizes.

  6. Total ischemic time:: No statistically significant difference, though CMD=1 tends to have longer ischemic time.

  7. all other cont. variables:: no statistically significant difference.

library(dplyr)

# 1. Variables
cat_vars <- c(
  "MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
  "MHCR_NPC", "MHCR_NY1", "MHCR_NY2", "MHCR_NY4",
  "MHCR_NY10", "MHCR_NY11", "MHCR_NY13",
  "diastolic_function", "diastolic_grade",
  "E_e_class", "criteria_positive", "strain_comment"
)

# 2. Mapping readable labels
rename_map <- c(
  MORT_ALL = "All-cause mortality",
  REVASC = "Revascularization",
  NEW_ONSET_HF = "New-onset heart failure",
  HF_HOSP = "HF hospitalization",
  DEMO_SEX = "Sex",
  MHCR_NPC = "Smoking",
  MHCR_NY1 = "Hypertension",
  MHCR_NY2 = "Hypercholesterolemia",
  MHCR_NY4 = "CVA (Stroke)",
  MHCR_NY10 = "CABG history",
  MHCR_NY11 = "PCI (other vessels)",
  MHCR_NY13 = "Cardiovascular family history <65 years",
  diastolic_function = "Diastolic function",
  diastolic_grade = "Diastolic grade",
  E_e_class = "E/e' classification",
  criteria_positive = "Diastolic dysfunction criteria positive",
  strain_comment = "Strain interpretation"
)

# 3. Run Chi-square or Fisher's test for each variable
results <- data.frame(
  Variable = character(),
  P_value = numeric(),
  Test = character(),
  stringsAsFactors = FALSE
)

for (var in cat_vars) {
  tbl <- table(data_final[[var]], data_final$CMD)
  
  # Skip if only 1 row level
  if (nrow(tbl) < 2) {
    warning(paste("Skipping", var, "- only one category present"))
    next
  }
  
  # Attempt Chi-square, fallback to Fisher if needed
  expected <- suppressWarnings(chisq.test(tbl)$expected)
  if (any(expected < 5)) {
    test <- fisher.test(tbl)
    test_name <- "Fisher's exact"
  } else {
    test <- chisq.test(tbl)
    test_name <- "Chi-square"
  }
  
  # Append results
  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[[var]],
      P_value = round(test$p.value, 4),
      Test = test_name,
      stringsAsFactors = FALSE
    )
  )
}
## Warning: Skipping MHCR_NY10 - only one category present
## Warning: Skipping E_e_class - only one category present
# 4. Print table
print(results)
##                                   Variable P_value           Test
## 1                      All-cause mortality  0.8900     Chi-square
## 2                        Revascularization  0.3971     Chi-square
## 3                  New-onset heart failure  0.8140     Chi-square
## 4                       HF hospitalization  1.0000 Fisher's exact
## 5                                      Sex  0.4337     Chi-square
## 6                                  Smoking  0.9920     Chi-square
## 7                             Hypertension  0.9179     Chi-square
## 8                     Hypercholesterolemia  0.3756     Chi-square
## 9                             CVA (Stroke)  0.5348 Fisher's exact
## 10                     PCI (other vessels)  0.5935 Fisher's exact
## 11 Cardiovascular family history <65 years  0.3128     Chi-square
## 12                      Diastolic function  0.8834     Chi-square
## 13                         Diastolic grade  0.9236     Chi-square
## 14 Diastolic dysfunction criteria positive  0.9591 Fisher's exact
## 15                   Strain interpretation  0.8597 Fisher's exact
# Optional: Pretty table using kableExtra
# install.packages("kableExtra")  # if not installed
library(kableExtra)
results %>%
  kable("html", align = "lcc", caption = "Bivariate analysis of categorical variables by CMD status") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
Bivariate analysis of categorical variables by CMD status
Variable P_value Test
All-cause mortality 0.8900 Chi-square
Revascularization 0.3971 Chi-square
New-onset heart failure 0.8140 Chi-square
HF hospitalization 1.0000 Fisher’s exact
Sex 0.4337 Chi-square
Smoking 0.9920 Chi-square
Hypertension 0.9179 Chi-square
Hypercholesterolemia 0.3756 Chi-square
CVA (Stroke) 0.5348 Fisher’s exact
PCI (other vessels) 0.5935 Fisher’s exact
Cardiovascular family history <65 years 0.3128 Chi-square
Diastolic function 0.8834 Chi-square
Diastolic grade 0.9236 Chi-square
Diastolic dysfunction criteria positive 0.9591 Fisher’s exact
Strain interpretation 0.8597 Fisher’s exact

Interpretation: p value >0.05 which means there is no significant association between CMD status and baseline characteristics i.e. no difference in mortality between CMD groups, similar revascularization rates, no association of new onset heart failure with CMD, HF hospitalizations show identical distributions - P value is 1, CMD not linked to sex differences, Smoking patterns are similar across CMD groups. No difference in diastolic function between groups.

results <- data.frame(
  Variable = character(),
  CMD_1 = character(),
  CMD_0 = character(),
  P_value = numeric(),
  Test = character(),
  stringsAsFactors = FALSE
)

for (var in cat_vars) {
  tbl <- table(data_final[[var]], data_final$CMD)

  if (nrow(tbl) < 2) {
    warning(paste("Skipping", var, "- only one category present"))
    next
  }

  # Choose statistical test
  expected <- suppressWarnings(chisq.test(tbl)$expected)
  if (any(expected < 5)) {
    test <- fisher.test(tbl)
    test_name <- "Fisher's exact"
  } else {
    test <- chisq.test(tbl)
    test_name <- "Chi-square"
  }

  # Determine the "event" row (usually coded as 1, or "Yes", etc.)
  rownames_tbl <- rownames(tbl)
  target_row <- if ("1" %in% rownames_tbl) "1" else rownames_tbl[1]

  # Calculate percentages for CMD = 1 and CMD = 0
  perc_1 <- if ("1" %in% colnames(tbl)) {
    100 * tbl[target_row, "1"] / sum(tbl[, "1"])
  } else {
    100 * tbl[target_row, 2] / sum(tbl[, 2])
  }

  perc_0 <- if ("0" %in% colnames(tbl)) {
    100 * tbl[target_row, "0"] / sum(tbl[, "0"])
  } else {
    100 * tbl[target_row, 1] / sum(tbl[, 1])
  }

  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[[var]],
      CMD_1 = sprintf("%.1f%%", perc_1),
      CMD_0 = sprintf("%.1f%%", perc_0),
      P_value = round(test$p.value, 4),
      Test = test_name,
      stringsAsFactors = FALSE
    )
  )
}
## Warning: Skipping MHCR_NY10 - only one category present
## Warning: Skipping E_e_class - only one category present
# Optional: Styled table
library(kableExtra)
results %>%
  kable("html", align = "lcccl", caption = "Bivariate analysis by CMD status (CMD = 1 vs CMD = 0)") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
Bivariate analysis by CMD status (CMD = 1 vs CMD = 0)
Variable CMD_1 CMD_0 P_value Test
All-cause mortality 12.9% 11.4% 0.8900 Chi-square
Revascularization 9.4% 13.9% 0.3971 Chi-square
New-onset heart failure 5.9% 7.6% 0.8140 Chi-square
HF hospitalization 2.4% 2.5% 1.0000 Fisher’s exact
Sex 78.8% 73.4% 0.4337 Chi-square
Smoking 24.7% 25.3% 0.9920 Chi-square
Hypertension 70.6% 72.2% 0.9179 Chi-square
Hypercholesterolemia 37.1% 30.4% 0.3756 Chi-square
CVA (Stroke) 99.4% 98.7% 0.5348 Fisher’s exact
PCI (other vessels) 98.8% 97.5% 0.5935 Fisher’s exact
Cardiovascular family history <65 years 69.4% 62.0% 0.3128 Chi-square
Diastolic function 32.9% 35.4% 0.8834 Chi-square
Diastolic grade 41.2% 38.0% 0.9236 Chi-square
Diastolic dysfunction criteria positive 25.9% 26.6% 0.9591 Fisher’s exact
Strain interpretation 5.3% 6.3% 0.8597 Fisher’s exact
library(ggplot2)
library(dplyr)

# Step 1: Filter the dataset for the two groups of interest
df_strat <- data_final %>%
  filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present")) %>%
  select(Post_IMR, diastolic_function)

# Step 2: Create the boxplot with jittered individual data points
ggplot(df_strat, aes(x = diastolic_function, y = Post_IMR, fill = diastolic_function)) +
  # Boxplot with narrower width and no outliers (to avoid duplicate points with jitter)
  geom_boxplot(width = 0.4, position = position_dodge(width = 0.6), outlier.shape = NA) +
  
  # Jittered points for individual data visibility
  geom_jitter(width = 0.15, alpha = 0.4, color = "black") +
  
  # Labels and appearance
  labs(
    title = "Post-IMR by Diastolic Function Status",
    x = "Diastolic Function",
    y = "Post-IMR"
  ) +
  
  # Custom colors
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  
  # Clean theme
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 12, angle = 0, vjust = 0.5)
  )
## Warning: Removed 82 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 82 rows containing missing values or values outside the scale range
## (`geom_point()`).

wilcox.test(Post_IMR ~ diastolic_function, data = df_strat)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Post_IMR by diastolic_function
## W = 4154, p-value = 0.8993
## alternative hypothesis: true location shift is not equal to 0

Interpretation Wilcoxon rank rum test: there is no signifficant differente in post IMR between patients wit diastolic dysfunction and patients with normal diastolic function.

library(dplyr)
library(tibble)

# Filter data to exclude 'Indeterminate'
filtered <- data_final %>%
  filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present"))

# Create contingency table
tab <- table(filtered$CMD, filtered$diastolic_function)
colnames(tab) <- c("Diastolic Dysfunction Present", "Normal Diastolic Function")
rownames(tab) <- c("CMD Absent (Post_IMR < 25)", "CMD Present (Post_IMR ≥ 25)")

# Convert contingency table to data frame
summary_table <- as.data.frame.matrix(tab) %>%
  rownames_to_column(var = "CMD Status") %>%
  mutate(Total = `Diastolic Dysfunction Present` + `Normal Diastolic Function`)

# Run chi-square test with Yates continuity correction
chi_res <- chisq.test(tab, correct = TRUE)

# Print the table
print(summary_table)
##                    CMD Status Diastolic Dysfunction Present
## 1  CMD Absent (Post_IMR < 25)                            28
## 2 CMD Present (Post_IMR ≥ 25)                            56
##   Normal Diastolic Function Total
## 1                        30    58
## 2                        70   126
# Create a data frame for test results
test_results <- data.frame(
  Statistic = c("Chi-squared", "Degrees of freedom", "p-value"),
  Value = c(round(chi_res$statistic, 3), chi_res$parameter, round(chi_res$p.value, 4))
)

# Print test results
cat("\nChi-square test results:\n")
## 
## Chi-square test results:
print(test_results)
##                    Statistic  Value
## X-squared        Chi-squared 0.1060
## df        Degrees of freedom 1.0000
##                      p-value 0.7448

Interpretation: CMD prevalence is not significantly higher among patients with diastolic dysfunction.

CMD (binary or continuous) does not differ significantly across diastolic function groups.

install.packages("corrplot", repos = "https://cloud.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/3g/ln4t2cvs3250dv7p12p13c_c0000gn/T//RtmpFGl0km/downloaded_packages
library(dplyr)
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(tibble)
library(knitr)
library(kableExtra)

# Define variables
vars <- c(
  "Post_IMR",
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1",    # Contraction Strain
  "MRI1_INF_SZ_LV", # Infarct size
  "Complete_LVEF"   # Complete LVEF
)

# Filter complete cases
corr_data <- data_final %>%
  select(all_of(vars)) %>%
  filter(complete.cases(.))

# Calculate Spearman correlation
corr_res <- rcorr(as.matrix(corr_data), type = "spearman")
corr_mat <- corr_res$r
p_mat <- corr_res$P

# Variables excluding Post_IMR
vars_except_post_imr <- setdiff(colnames(corr_mat), "Post_IMR")

# Build results dataframe
post_imr_corr <- data.frame(
  Variable = vars_except_post_imr,
  Correlation = corr_mat["Post_IMR", vars_except_post_imr],
  P_value = p_mat["Post_IMR", vars_except_post_imr]
)

# Rename variables for readability
rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "MRI1_INF_SZ_LV"  = "Infarct size",
  "Complete_LVEF"   = "Complete LVEF"
)

post_imr_corr$Variable <- rename_map[post_imr_corr$Variable]

# Round correlation and p-values for display
post_imr_corr <- post_imr_corr %>%
  mutate(
    Correlation = round(Correlation, 3),
    P_value = signif(P_value, 3)
  )

# Create a nice formatted table
post_imr_corr %>%
  kable(
    caption = "Spearman Correlation of Post_IMR with Other Variables",
    col.names = c("Variable", "Spearman Correlation (rho)", "P-value"),
    align = c("l", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  column_spec(3, color = ifelse(post_imr_corr$P_value < 0.05, "red", "black")) %>% # highlight significant p-values in red
  footnote(general = "P-values < 0.05 indicate statistically significant correlations.")
Spearman Correlation of Post_IMR with Other Variables
Variable Spearman Correlation (rho) P-value
ECHO_VAL52 Septal e’ -0.188 0.0543
ECHO_VAL62 Lateral e’ -0.168 0.0860
mean_e Mean e’ -0.206 0.0352
E_e_ratio E/e’ ratio 0.040 0.6840
ECHO_VAL92 LAV 0.092 0.3510
LAVI LAVI 0.073 0.4600
A4C_R_RR1 Reservoir strain -0.146 0.1360
A4C_CD_RR1 Conduit Strain 0.146 0.1370
A4C_CT_RR1 Contraction Strain 0.065 0.5090
MRI1_INF_SZ_LV Infarct size -0.083 0.4020
Complete_LVEF Complete LVEF -0.008 0.9350
Note:
P-values < 0.05 indicate statistically significant correlations.

Interpretation: The Mean e’ (average early diastolic mitral annular velocity) shows a statistically significant weak negative correlation with Post_IMR, meaning higher microvascular resistance (Post_IMR) tends to be associated with worse diastolic function (lower Mean e’).

Septal e’ shows a similar negative trend but just misses statistical significance (p = 0.054).

Other diastolic function markers (like E/e’ ratio, LAVI) and infarct size or LVEF do not show significant correlations with Post_IMR in your data.

Overall, the associations are weak, indicating that Post_IMR may have a subtle relationship mainly with early diastolic function markers but not strongly related to infarct size or overall systolic function (LVEF).

library(dplyr)
library(knitr)
library(kableExtra)

# Create the CMD_20 variable
data_sensitivity <- data_final %>%
  mutate(
    CMD_20 = case_when(
      is.na(Post_IMR) ~ NA_real_,
      Post_IMR >= 20 ~ 1,
      Post_IMR < 20 ~ 0
    )
  )

# Create contingency table
table_CMD20 <- table(data_sensitivity$CMD_20, data_sensitivity$diastolic_function)

# Perform Chi-squared test
chi_sq_result <- chisq.test(table_CMD20)

# Convert the contingency table to a data frame for nicer printing
table_df <- as.data.frame.matrix(table_CMD20)

# Add row names as a column
table_df <- table_df %>% 
  tibble::rownames_to_column(var = "CMD_20")

# Rename CMD_20 values for clarity
table_df$CMD_20 <- ifelse(table_df$CMD_20 == 0, "Post_IMR < 20", "Post_IMR >= 20")

# Print the contingency table nicely
cat("### Contingency Table: CMD_20 vs Diastolic Function\n")
## ### Contingency Table: CMD_20 vs Diastolic Function
table_df %>%
  kable(caption = "Contingency Table of CMD_20 and Diastolic Function") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Contingency Table of CMD_20 and Diastolic Function
CMD_20 Diastolic Dysfunction Present Indeterminate Diastolic Function Normal Diastolic Function
Post_IMR < 20 14 15 21
Post_IMR >= 20 70 50 79
# Print Chi-squared test results
cat("\n### Chi-squared Test Result\n")
## 
## ### Chi-squared Test Result
cat("Chi-squared =", round(chi_sq_result$statistic, 3), "\n")
## Chi-squared = 1.026
cat("Degrees of freedom =", chi_sq_result$parameter, "\n")
## Degrees of freedom = 2
cat("P-value =", signif(chi_sq_result$p.value, 4), "\n")
## P-value = 0.5986

Sensitivity analysis: The p-value (0.60) is not statistically significant.

This means there is no strong evidence of an association between CMD (using the more inclusive IMR ≥ 20) and diastolic dysfunction categories in your dataset.

This result is consistent with your primary analysis using IMR ≥ 25 (which also had a non-significant result).

Conclusion: Changing the CMD cutoff from 25 to 20 does not materially change the result — this suggests your original finding is robust to this threshold change.

# Calculate median ischemic time
median_time <- median(data_final$CAP_NUM1, na.rm = TRUE)

# Create stratified variable: early vs late
data_final <- data_final %>%
  mutate(
    ischemic_group = case_when(
      is.na(CAP_NUM1) ~ NA_character_,
      CAP_NUM1 <= median_time ~ "Early",
      CAP_NUM1 > median_time ~ "Late"
    )
  )

# Boxplot of Post_IMR by ischemic time group
library(ggplot2)

ggplot(data_final, aes(x = ischemic_group, y = Post_IMR)) +
  geom_boxplot(fill = "#69b3a2", alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.4, color = "black") +
  theme_minimal() +
  labs(title = "Post_IMR by Ischemic Time Group", x = "Ischemic Time Group", y = "Post IMR")
## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 131 rows containing missing values or values outside the scale range
## (`geom_point()`).

wilcox.test(Post_IMR ~ ischemic_group, data = data_final)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Post_IMR by ischemic_group
## W = 6965, p-value = 0.1704
## alternative hypothesis: true location shift is not equal to 0
library(knitr)
library(kableExtra)

# Prepare the test result as a data frame
wilcox_summary <- data.frame(
  Test = "Wilcoxon rank sum test",
  Data = "Post_IMR by ischemic_group",
  Statistic_W = 6965,
  P_value = 0.1704,
  Alternative_Hypothesis = "true location shift is not equal to 0"
)

# Print the table nicely
wilcox_summary %>%
  kable(caption = "Wilcoxon Rank Sum Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wilcoxon Rank Sum Test Result
Test Data Statistic_W P_value Alternative_Hypothesis
Wilcoxon rank sum test Post_IMR by ischemic_group 6965 0.1704 true location shift is not equal to 0

Sensitivity analysis using ischemic time: The p-value of 0.1704 indicates that there is no statistically significant difference in Post_IMR between patients who had early vs late revascularization (based on your median split of ischemic time).

ischemic time (as a binary early/late grouping) does not appear to significantly influence CMD severity (as measured by IMR) in this dataset.

library(dplyr)
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
library(knitr)
library(kableExtra)

data_final_backup <- readRDS("data_final_backup.rds")

# Convert to labeled factors
data_final5 <- data_final_backup %>%
  mutate(
    Hypertension = factor(MHCR_NY1, levels = c(0, 1), labels = c("No", "Yes")),
    Sex = factor(DEMO_SEX, levels = c(1, 2), labels = c("Male", "Female"))
  )

# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1 + Hypertension + Sex, data = data_final5)

# Tidy model output
model_table <- broom::tidy(model)

# Rename variables for readability
model_table <- model_table %>%
  mutate(
    term = case_when(
      term == "(Intercept)" ~ "Intercept",
      term == "Post_IMR" ~ "Post IMR",
      term == "CAP_NUM1" ~ "Ischemic Time",
      term == "Post_IMR:CAP_NUM1" ~ "Post IMR : Ischemic Time (Interaction)",
      term == "HypertensionNo" ~ "Hypertension: No",
      term == "HypertensionYes" ~ "Hypertension: Yes",
      term == "SexMale" ~ "Sex: Male",
      term == "SexFemale" ~ "Sex: Female",
      TRUE ~ term
    ),
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3)
  )

# Print the table with a title
cat("\n### Regression Model Results: Infarct Size ~ Post IMR * Ischemic Time + Hypertension + Sex\n\n")
## 
## ### Regression Model Results: Infarct Size ~ Post IMR * Ischemic Time + Hypertension + Sex
kable(model_table, caption = "Regression Model Summary", booktabs = TRUE) %>%
  kable_styling(full_width = FALSE)
Regression Model Summary
term estimate std.error statistic p.value
Intercept 7.5361 2.4220 3.112 0.0022
Post IMR -0.0015 0.0614 -0.025 0.9800
Ischemic Time 0.0077 0.0098 0.780 0.4370
Hypertension: Yes 0.6194 1.4026 0.442 0.6590
Sex: Female 1.1456 1.5485 0.740 0.4600
Post IMR : Ischemic Time (Interaction) 0.0000 0.0002 -0.130 0.8960

None of the variables or interaction terms are statistically significant (all p-values > 0.05).

The interaction between Post_IMR and ischemic time is not significant → this suggests that the relationship between IMR and infarct size doesn’t change significantly depending on ischemic time.

Model Fit R-squared = 0.023 → the model explains only 2.3% of the variance in infarct size.

Adjusted R-squared = -0.007 → adjusting for predictors makes the model slightly worse.

F-test p = 0.5719 → the overall model is not statistically significant.

# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)

# Show model summary
summary(model)
## 
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.066  -6.593  -1.186   4.954  22.096 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        7.845e+00  2.389e+00   3.284  0.00125 **
## Post_IMR          -1.430e-04  6.082e-02  -0.002  0.99813   
## CAP_NUM1           7.968e-03  9.789e-03   0.814  0.41681   
## Post_IMR:CAP_NUM1 -3.372e-05  2.269e-04  -0.149  0.88208   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.829 on 164 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.01805,    Adjusted R-squared:  8.288e-05 
## F-statistic: 1.005 on 3 and 164 DF,  p-value: 0.3923

ischemic time does not appear to modify the relationship between IMR and infarct size.

# Create log-transformed infarct size
data_final <- data_final %>%
  mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV))  # log1p handles 0 values safely

# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1 + 
                  factor(MHCR_NY1) + factor(DEMO_SEX), data = data_final)

summary(model_log)
## 
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1 + factor(MHCR_NY1) + 
##     factor(DEMO_SEX), data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1839 -0.5855  0.2618  0.7926  1.6269 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.754e+00  3.180e-01   5.516 1.35e-07 ***
## Post_IMR               -2.658e-03  8.066e-03  -0.330    0.742    
## CAP_NUM1                1.039e-03  1.290e-03   0.806    0.422    
## factor(MHCR_NY1)Yes     1.424e-01  1.841e-01   0.773    0.441    
## factor(DEMO_SEX)Female -4.678e-02  2.033e-01  -0.230    0.818    
## Post_IMR:CAP_NUM1       1.399e-06  2.990e-05   0.047    0.963    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.031 on 162 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.03197,    Adjusted R-squared:  0.002095 
## F-statistic:  1.07 on 5 and 162 DF,  p-value: 0.3789
library(broom)
library(dplyr)

# Create log-transformed infarct size
data_final <- data_final %>%
  mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV))  # handles zeros safely

# Run the model with interaction
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_final)

# Tidy model output
model_log_table <- broom::tidy(model_log)

# Rename variables for readability
rename_map <- c(
  "(Intercept)"       = "Intercept",
  "Post_IMR"          = "Post IMR",
  "CAP_NUM1"          = "Ischemic Time",
  "Post_IMR:CAP_NUM1" = "Post IMR : Ischemic Time (Interaction)"
)

model_log_table <- model_log_table %>%
  mutate(
    term = rename_map[term],
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3)
  )

print(model_log_table)
## # A tibble: 4 × 5
##   term                                   estimate std.error statistic    p.value
##   <chr>                                     <dbl>     <dbl>     <dbl>      <dbl>
## 1 Intercept                                1.76      0.313      5.61     8.59e-8
## 2 Post IMR                                -0.0019    0.008     -0.244    8.07e-1
## 3 Ischemic Time                            0.0011    0.0013     0.818    4.14e-1
## 4 Post IMR : Ischemic Time (Interaction)   0         0          0.042    9.67e-1
library(broom)
library(dplyr)

# Identify extreme IMR values (e.g., > 99th percentile)
imr_cutoff <- quantile(data_final$Post_IMR, 0.99, na.rm = TRUE)

# Create filtered dataset excluding extreme IMR
data_no_outliers <- data_final %>%
  filter(Post_IMR <= imr_cutoff)

# Fit model: infarct size ~ Post_IMR
model_no_outliers <- lm(MRI1_INF_SZ_LV ~ Post_IMR, data = data_no_outliers)

# Fit model with log-transformed infarct size and interaction
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_no_outliers)

# Create tidy summaries
summary_no_outliers <- broom::tidy(model_no_outliers) %>%
  mutate(Model = "Infarct Size ~ Post_IMR")

summary_log <- broom::tidy(model_log) %>%
  mutate(Model = "Log(Infarct Size) ~ Post_IMR * CAP_NUM1")

# Combine both tables
combined_results <- bind_rows(summary_no_outliers, summary_log) %>%
  select(Model, term, estimate, std.error, statistic = statistic, p.value) %>%
  mutate(
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3)
  )

print(combined_results)
## # A tibble: 6 × 6
##   Model                               term  estimate std.error statistic p.value
##   <chr>                               <chr>    <dbl>     <dbl>     <dbl>   <dbl>
## 1 Infarct Size ~ Post_IMR             (Int…   9.19      1.43       6.42  1.35e-9
## 2 Infarct Size ~ Post_IMR             Post…   0.0022    0.0374     0.058 9.54e-1
## 3 Log(Infarct Size) ~ Post_IMR * CAP… (Int…   1.76      0.313      5.61  8.59e-8
## 4 Log(Infarct Size) ~ Post_IMR * CAP… Post…  -0.0019    0.008     -0.244 8.07e-1
## 5 Log(Infarct Size) ~ Post_IMR * CAP… CAP_…   0.0011    0.0013     0.818 4.14e-1
## 6 Log(Infarct Size) ~ Post_IMR * CAP… Post…   0         0          0.042 9.67e-1
# Vector of diastolic dysfunction variables
diastolic_vars <- c("ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI", "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32")
# Select all variables of interest including dependent, independent, and covariates
vars_for_model <- c(
  diastolic_vars,      # your dependent variables (e.g. ECHO_VAL52, etc.)
  "Post_IMR",
  "AGE_AT_INCLUSION",
  "DEMO_SEX",
  "VS_BMI1",
  "MHCR_NY1",
  "MHCR_NPC",
  "HBA1C_22",
  "LDL2",
  "MHCR_NY2",
  "MHCR_NY4",
  "EGFR2",
  "CAP_NUM1"
)

# Count how many complete cases you have in these vars
sum(complete.cases(data_final[, vars_for_model]))
## [1] 128
# List of diastolic dysfunction dependent variables
diastolic_vars <- c(
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1"     # Contraction Strain
)

# Loop through each diastolic variable, run unadjusted regression, store results
results_list <- list()

for (var in diastolic_vars) {
  formula <- as.formula(paste(var, "~ Post_IMR"))
  
  model_data <- data_final %>%
    select(all_of(c(var, "Post_IMR"))) %>%
    filter(complete.cases(.))
  
  model <- lm(formula, data = model_data)
  
  tidy_res <- broom::tidy(model)
  tidy_res$Dependent_Var <- var
  
  results_list[[var]] <- tidy_res
}

all_results <- do.call(rbind, results_list)
all_results <- all_results[, c("Dependent_Var", names(all_results)[names(all_results) != "Dependent_Var"])]

print(all_results)
## # A tibble: 18 × 6
##    Dependent_Var term          estimate std.error statistic  p.value
##    <chr>         <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 ECHO_VAL52    (Intercept)   8.21      0.343     24.0     6.39e-63
##  2 ECHO_VAL52    Post_IMR     -0.0135    0.00907   -1.49    1.39e- 1
##  3 ECHO_VAL62    (Intercept)  10.2       0.508     20.1     2.84e-51
##  4 ECHO_VAL62    Post_IMR     -0.0206    0.0134    -1.54    1.25e- 1
##  5 mean_e        (Intercept)   9.17      0.384     23.9     7.16e-63
##  6 mean_e        Post_IMR     -0.0166    0.0101    -1.64    1.03e- 1
##  7 E_e_ratio     (Intercept)   0.0781    0.00473   16.5     6.59e-40
##  8 E_e_ratio     Post_IMR      0.000121  0.000124   0.973   3.31e- 1
##  9 ECHO_VAL92    (Intercept)  55.4       2.88      19.2     1.80e-47
## 10 ECHO_VAL92    Post_IMR      0.0126    0.0751     0.168   8.66e- 1
## 11 LAVI          (Intercept)  27.9       1.38      20.3     1.90e-50
## 12 LAVI          Post_IMR     -0.00107   0.0359    -0.0299  9.76e- 1
## 13 A4C_R_RR1     (Intercept)  27.6       1.37      20.1     5.90e-49
## 14 A4C_R_RR1     Post_IMR      0.000211  0.0350     0.00602 9.95e- 1
## 15 A4C_CD_RR1    (Intercept) -15.8       1.01     -15.8     2.57e-36
## 16 A4C_CD_RR1    Post_IMR      0.0124    0.0256     0.484   6.29e- 1
## 17 A4C_CT_RR1    (Intercept) -11.7       0.948    -12.4     3.92e-26
## 18 A4C_CT_RR1    Post_IMR     -0.0129    0.0242    -0.535   5.93e- 1
library(dplyr)
library(broom)

# Rename map for dependent vars
rename_map <- c(
  "ECHO_VAL52"   = "Septal e'",
  "ECHO_VAL62"   = "Lateral e'",
  "mean_e"       = "Mean e'",
  "E_e_ratio"    = "E/e' ratio",
  "ECHO_VAL92"   = "LAV",
  "LAVI"         = "LAVI",
  "A4C_R_RR1"    = "Reservoir strain",
  "A4C_CD_RR1"   = "Conduit strain",
  "A4C_CT_RR1"   = "Contraction strain"
)

# Format and clean up the results
formatted_results <- all_results %>%
  filter(term %in% c("(Intercept)", "Post_IMR")) %>%
  mutate(
    Dependent_Var = rename_map[Dependent_Var],
    term = ifelse(term == "(Intercept)", "Intercept", "Post_IMR"),
    estimate = round(estimate, 3),
    std.error = round(std.error, 3),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3),
    Significance = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      TRUE           ~ ""
    )
  ) %>%
  select(Dependent_Variable = Dependent_Var, Term = term, Estimate = estimate, 
         `Std. Error` = std.error, `t value` = statistic, `p value` = p.value, Significance)

print(formatted_results)
## # A tibble: 18 × 7
##    Dependent_Variable Term      Estimate `Std. Error` `t value` `p value`
##    <chr>              <chr>        <dbl>        <dbl>     <dbl>     <dbl>
##  1 Septal e'          Intercept    8.22         0.343    24.0    6.39e-63
##  2 Septal e'          Post_IMR    -0.013        0.009    -1.49   1.39e- 1
##  3 Lateral e'         Intercept   10.2          0.508    20.1    2.84e-51
##  4 Lateral e'         Post_IMR    -0.021        0.013    -1.54   1.25e- 1
##  5 Mean e'            Intercept    9.17         0.384    23.9    7.16e-63
##  6 Mean e'            Post_IMR    -0.017        0.01     -1.64   1.03e- 1
##  7 E/e' ratio         Intercept    0.078        0.005    16.5    6.59e-40
##  8 E/e' ratio         Post_IMR     0            0         0.973  3.31e- 1
##  9 LAV                Intercept   55.4          2.88     19.2    1.80e-47
## 10 LAV                Post_IMR     0.013        0.075     0.168  8.66e- 1
## 11 LAVI               Intercept   27.9          1.38     20.3    1.90e-50
## 12 LAVI               Post_IMR    -0.001        0.036    -0.03   9.76e- 1
## 13 Reservoir strain   Intercept   27.6          1.37     20.1    5.90e-49
## 14 Reservoir strain   Post_IMR     0            0.035     0.006  9.95e- 1
## 15 Conduit strain     Intercept  -15.8          1.00    -15.8    2.57e-36
## 16 Conduit strain     Post_IMR     0.012        0.026     0.484  6.29e- 1
## 17 Contraction strain Intercept  -11.7          0.948   -12.4    3.92e-26
## 18 Contraction strain Post_IMR    -0.013        0.024    -0.535  5.93e- 1
## # ℹ 1 more variable: Significance <chr>
library(dplyr)
library(broom)
library(knitr)
library(kableExtra)

# List of diastolic dysfunction dependent variables
diastolic_vars <- c(
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1"     # Contraction Strain
)

# Run regressions and collect results
results_list <- list()
for (var in diastolic_vars) {
  formula <- as.formula(paste(var, "~ Post_IMR"))
  model_data <- data_final %>% select(all_of(c(var, "Post_IMR"))) %>% filter(complete.cases(.))
  model <- lm(formula, data = model_data)
  tidy_res <- broom::tidy(model)
  tidy_res$Dependent_Var <- var
  results_list[[var]] <- tidy_res
}

all_results <- do.call(rbind, results_list)

# Rename map for dependent vars
rename_map <- c(
  "ECHO_VAL52"   = "Septal e'",
  "ECHO_VAL62"   = "Lateral e'",
  "mean_e"       = "Mean e'",
  "E_e_ratio"    = "E/e' ratio",
  "ECHO_VAL92"   = "LAV",
  "LAVI"         = "LAVI",
  "A4C_R_RR1"    = "Reservoir strain",
  "A4C_CD_RR1"   = "Conduit strain",
  "A4C_CT_RR1"   = "Contraction strain"
)

# Prepare the table
formatted_results <- all_results %>%
  filter(term %in% c("(Intercept)", "Post_IMR")) %>%
  mutate(
    Dependent_Var = rename_map[Dependent_Var],
    Term = ifelse(term == "(Intercept)", "Intercept", "Post_IMR"),
    Estimate = round(estimate, 3),
    `Std. Error` = round(std.error, 3),
    `t value` = round(statistic, 3),
    `p value` = signif(p.value, 3),
    Significance = case_when(
      `p value` < 0.001 ~ "***",
      `p value` < 0.01  ~ "**",
      `p value` < 0.05  ~ "*",
      TRUE              ~ ""
    )
  ) %>%
  select(`Dependent Variable` = Dependent_Var, Term, Estimate, `Std. Error`, `t value`, `p value`, Significance)

# Print with a title
cat("\n### Table: Linear regression results for diastolic variables vs Post_IMR\n\n")
## 
## ### Table: Linear regression results for diastolic variables vs Post_IMR
formatted_results %>%
  kable(format = "html", caption = "Regression Results: Diastolic Variables ~ Post_IMR") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
Regression Results: Diastolic Variables ~ Post_IMR
Dependent Variable Term Estimate Std. Error t value p value Significance
Septal e’ Intercept 8.215 0.343 23.976 0.000 ***
Septal e’ Post_IMR -0.013 0.009 -1.486 0.139
Lateral e’ Intercept 10.198 0.508 20.072 0.000 ***
Lateral e’ Post_IMR -0.021 0.013 -1.538 0.125
Mean e’ Intercept 9.171 0.384 23.873 0.000 ***
Mean e’ Post_IMR -0.017 0.010 -1.639 0.103
E/e’ ratio Intercept 0.078 0.005 16.505 0.000 ***
E/e’ ratio Post_IMR 0.000 0.000 0.973 0.331
LAV Intercept 55.379 2.878 19.241 0.000 ***
LAV Post_IMR 0.013 0.075 0.168 0.866
LAVI Intercept 27.881 1.376 20.259 0.000 ***
LAVI Post_IMR -0.001 0.036 -0.030 0.976
Reservoir strain Intercept 27.591 1.372 20.116 0.000 ***
Reservoir strain Post_IMR 0.000 0.035 0.006 0.995
Conduit strain Intercept -15.837 1.005 -15.753 0.000 ***
Conduit strain Post_IMR 0.012 0.026 0.484 0.629
Contraction strain Intercept -11.721 0.948 -12.359 0.000 ***
Contraction strain Post_IMR -0.013 0.024 -0.535 0.593

Interpretations None of the diastolic dysfunction parameters showed a statistically significant association with Post_IMR in these unadjusted models.

The direction of effect for septal, lateral, and mean e’ is negative, consistent with worse diastolic function with higher Post_IMR, but these are not statistically significant.

The E/e’ ratio and left atrial volume indices show small and non-significant effects.

Strain measures (reservoir, conduit, contraction) also show no significant relationship.

# Load necessary libraries
library(dplyr)

# Create a new dataset so original is not altered
data_final2 <- data_final %>%
  mutate(
    # Log-transform non-normal continuous dependent variables (excluding Septal e')
    log_ECHO_VAL62 = ifelse(ECHO_VAL62 > 0, log(ECHO_VAL62), NA),
log_mean_e = ifelse(mean_e > 0, log(mean_e), NA),
log_E_e_ratio = ifelse(E_e_ratio > 0, log(E_e_ratio), NA),
log_LAVI = ifelse(LAVI > 0, log(LAVI), NA),
log_A4C_R_RR1 = ifelse(A4C_R_RR1 > 0, log(A4C_R_RR1), NA),
log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA),
log_A4C_CT_RR1 = ifelse(A4C_CT_RR1 > 0, log(A4C_CT_RR1), NA)

  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA)`.
## Caused by warning in `log()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# List of dependent variables
dependent_vars <- list(
  "ECHO_VAL52",        # Septal e' (no transform)
  "log_ECHO_VAL62",    # Lateral e' (log)
  "log_mean_e",        # Mean e' (log)
  "log_E_e_ratio",     # E/e' ratio (log)
  "log_LAVI",          # LAVI (log)
  "log_A4C_R_RR1",     # Reservoir strain (log)
  "log_A4C_CD_RR1",    # Conduit strain (log)
  "log_A4C_CT_RR1"     # Contraction strain (log)
)

# Function to run a model for each dependent variable
run_model <- function(dep_var) {
  model_formula <- as.formula(paste(dep_var, "~ Post_IMR + AGE_AT_INCLUSION + factor(DEMO_SEX) + factor(MHCR_NY1) + VS_BMI1 + 
                                     factor(MHCR_NY2) + factor(MHCR_NY4) + factor(MHCR_NY11) + factor(MHCR_NY13) + 
                                     factor(MHCR_NPC) + CAP_NUM1"))

  # Filter only rows with complete data for this model
  model_data <- data_final2 %>%
    select(all.vars(model_formula)) %>%
    filter(complete.cases(.))
  
  # Only run the model if enough data is left
  if (nrow(model_data) > 10) {
    model <- lm(model_formula, data = model_data)
    return(summary(model))
  } else {
    return(paste("Not enough data for:", dep_var))
  }
}

# Run all models
results <- lapply(dependent_vars, run_model)
names(results) <- dependent_vars

# Print results
results
## $ECHO_VAL52
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6749 -1.2902 -0.0376  1.1029  5.4068 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.302e+01  1.567e+00   8.310 1.27e-14 ***
## Post_IMR                -9.664e-03  8.035e-03  -1.203  0.23043    
## AGE_AT_INCLUSION        -7.701e-02  1.410e-02  -5.461 1.36e-07 ***
## factor(DEMO_SEX)Female  -1.878e-02  3.069e-01  -0.061  0.95126    
## factor(MHCR_NY1)Yes     -7.721e-01  2.925e-01  -2.640  0.00893 ** 
## VS_BMI1                 -2.115e-02  3.820e-02  -0.554  0.58048    
## factor(MHCR_NY2)Yes     -1.727e-02  2.677e-01  -0.065  0.94862    
## factor(MHCR_NY4)Yes      9.309e-01  1.906e+00   0.488  0.62573    
## factor(MHCR_NY11)Yes    -1.286e+00  9.444e-01  -1.362  0.17462    
## factor(MHCR_NY13)Yes    -1.434e-02  2.789e-01  -0.051  0.95905    
## factor(MHCR_NPC)Former   7.765e-01  3.791e-01   2.048  0.04182 *  
## factor(MHCR_NPC)Current  7.466e-01  3.278e-01   2.277  0.02379 *  
## CAP_NUM1                -4.797e-05  8.370e-04  -0.057  0.95435    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.82 on 206 degrees of freedom
## Multiple R-squared:  0.2925, Adjusted R-squared:  0.2513 
## F-statistic: 7.098 on 12 and 206 DF,  p-value: 8.913e-11
## 
## 
## $log_ECHO_VAL62
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05876 -0.21812  0.04793  0.22229  0.73778 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.0261402  0.2774712  10.906  < 2e-16 ***
## Post_IMR                -0.0015792  0.0014137  -1.117   0.2653    
## AGE_AT_INCLUSION        -0.0100952  0.0024641  -4.097 6.03e-05 ***
## factor(DEMO_SEX)Female  -0.0035316  0.0538089  -0.066   0.9477    
## factor(MHCR_NY1)Yes     -0.1218967  0.0511617  -2.383   0.0181 *  
## VS_BMI1                 -0.0070952  0.0066448  -1.068   0.2869    
## factor(MHCR_NY2)Yes     -0.0223579  0.0471547  -0.474   0.6359    
## factor(MHCR_NY4)Yes      0.2081541  0.3342594   0.623   0.5342    
## factor(MHCR_NY11)Yes    -0.1061490  0.1654612  -0.642   0.5219    
## factor(MHCR_NY13)Yes     0.0382071  0.0491873   0.777   0.4382    
## factor(MHCR_NPC)Former   0.1572549  0.0668871   2.351   0.0197 *  
## factor(MHCR_NPC)Current  0.1090616  0.0582831   1.871   0.0627 .  
## CAP_NUM1                -0.0001493  0.0001472  -1.014   0.3116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3192 on 205 degrees of freedom
## Multiple R-squared:  0.2166, Adjusted R-squared:  0.1707 
## F-statistic: 4.722 on 12 and 205 DF,  p-value: 8.512e-07
## 
## 
## $log_mean_e
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6517 -0.1725  0.0000  0.1785  0.6115 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.844e+00  2.135e-01  13.319  < 2e-16 ***
## Post_IMR                -1.348e-03  1.092e-03  -1.235  0.21836    
## AGE_AT_INCLUSION        -9.863e-03  1.908e-03  -5.169 5.51e-07 ***
## factor(DEMO_SEX)Female  -4.480e-03  4.202e-02  -0.107  0.91519    
## factor(MHCR_NY1)Yes     -1.038e-01  3.991e-02  -2.602  0.00993 ** 
## VS_BMI1                 -5.044e-03  5.181e-03  -0.973  0.33145    
## factor(MHCR_NY2)Yes     -1.950e-02  3.655e-02  -0.533  0.59428    
## factor(MHCR_NY4)Yes      1.810e-01  2.612e-01   0.693  0.48910    
## factor(MHCR_NY11)Yes    -1.402e-01  1.293e-01  -1.084  0.27952    
## factor(MHCR_NY13)Yes     2.500e-02  3.811e-02   0.656  0.51260    
## factor(MHCR_NPC)Former   1.548e-01  5.147e-02   3.007  0.00296 ** 
## factor(MHCR_NPC)Current  1.273e-01  4.436e-02   2.870  0.00453 ** 
## CAP_NUM1                -7.967e-05  1.147e-04  -0.695  0.48804    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2495 on 208 degrees of freedom
## Multiple R-squared:  0.296,  Adjusted R-squared:  0.2554 
## F-statistic: 7.287 on 12 and 208 DF,  p-value: 4.199e-11
## 
## 
## $log_E_e_ratio
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65266 -0.20691 -0.02816  0.17271  0.85327 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -3.3617119  0.2388125 -14.077  < 2e-16 ***
## Post_IMR                 0.0003635  0.0012227   0.297 0.766581    
## AGE_AT_INCLUSION         0.0082685  0.0021354   3.872 0.000146 ***
## factor(DEMO_SEX)Female   0.1381846  0.0472004   2.928 0.003810 ** 
## factor(MHCR_NY1)Yes      0.1015971  0.0452403   2.246 0.025810 *  
## VS_BMI1                  0.0092503  0.0057972   1.596 0.112136    
## factor(MHCR_NY2)Yes      0.0013379  0.0412303   0.032 0.974145    
## factor(MHCR_NY4)Yes      0.1955202  0.2900993   0.674 0.501100    
## factor(MHCR_NY11)Yes    -0.0733406  0.1438800  -0.510 0.610797    
## factor(MHCR_NY13)Yes    -0.0265422  0.0437074  -0.607 0.544357    
## factor(MHCR_NPC)Former  -0.0552739  0.0582650  -0.949 0.343931    
## factor(MHCR_NPC)Current -0.0518744  0.0502307  -1.033 0.302975    
## CAP_NUM1                 0.0002109  0.0001279   1.649 0.100660    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2768 on 201 degrees of freedom
## Multiple R-squared:  0.2251, Adjusted R-squared:  0.1789 
## F-statistic: 4.866 on 12 and 201 DF,  p-value: 5.047e-07
## 
## 
## $log_LAVI
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1287 -0.1678  0.0117  0.1749  0.9616 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.202e+00  2.638e-01  12.138   <2e-16 ***
## Post_IMR                -3.935e-04  1.304e-03  -0.302   0.7631    
## AGE_AT_INCLUSION         2.012e-03  2.430e-03   0.828   0.4086    
## factor(DEMO_SEX)Female   1.079e-02  5.165e-02   0.209   0.8347    
## factor(MHCR_NY1)Yes      8.972e-02  4.797e-02   1.870   0.0630 .  
## VS_BMI1                  1.092e-03  6.404e-03   0.170   0.8648    
## factor(MHCR_NY2)Yes     -1.041e-01  4.522e-02  -2.302   0.0224 *  
## factor(MHCR_NY4)Yes      2.256e-01  2.166e-01   1.042   0.2988    
## factor(MHCR_NY11)Yes     2.405e-02  1.530e-01   0.157   0.8753    
## factor(MHCR_NY13)Yes    -4.597e-02  4.700e-02  -0.978   0.3293    
## factor(MHCR_NPC)Former   6.123e-03  6.457e-02   0.095   0.9245    
## factor(MHCR_NPC)Current  1.783e-02  5.457e-02   0.327   0.7442    
## CAP_NUM1                -7.802e-05  1.357e-04  -0.575   0.5661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2939 on 190 degrees of freedom
## Multiple R-squared:  0.08126,    Adjusted R-squared:  0.02323 
## F-statistic:   1.4 on 12 and 190 DF,  p-value: 0.1686
## 
## 
## $log_A4C_R_RR1
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28726 -0.16347 -0.00903  0.23255  0.58418 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.6204528  0.2662794  13.596   <2e-16 ***
## Post_IMR                 0.0010891  0.0013722   0.794   0.4285    
## AGE_AT_INCLUSION        -0.0064413  0.0024951  -2.582   0.0106 *  
## factor(DEMO_SEX)Female  -0.0353346  0.0531805  -0.664   0.5073    
## factor(MHCR_NY1)Yes     -0.0524587  0.0517049  -1.015   0.3117    
## VS_BMI1                  0.0023758  0.0062718   0.379   0.7053    
## factor(MHCR_NY2)Yes     -0.1173030  0.0481184  -2.438   0.0158 *  
## factor(MHCR_NY4)Yes     -0.3085844  0.2245066  -1.375   0.1710    
## factor(MHCR_NY11)Yes    -0.2250985  0.1579848  -1.425   0.1560    
## factor(MHCR_NY13)Yes     0.0778675  0.0482985   1.612   0.1087    
## factor(MHCR_NPC)Former   0.1179118  0.0699447   1.686   0.0936 .  
## factor(MHCR_NPC)Current  0.0574994  0.0566515   1.015   0.3115    
## CAP_NUM1                -0.0001480  0.0001452  -1.020   0.3092    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3035 on 179 degrees of freedom
## Multiple R-squared:  0.1469, Adjusted R-squared:  0.08968 
## F-statistic: 2.568 on 12 and 179 DF,  p-value: 0.003665
## 
## 
## $log_A4C_CD_RR1
## [1] "Not enough data for: log_A4C_CD_RR1"
## 
## $log_A4C_CT_RR1
## [1] "Not enough data for: log_A4C_CT_RR1"

Septal E and mean e: - Age: Significant negative association. Older age is associated with lower - Septal e’ velocity (worse diastolic function). Hypertension: Significant. Hypertensive patients tend to have lower Septal e’. - Smoking: current and former smokers have a higher septal E’. Residual confousing or selection bias?

E/e ratio age, female, hypertension

LAVI: - hypercholesterolemia

# Check levels and counts for all your categorical vars
table(data_final2$MHCR_NY2)
## 
##  No Yes 
## 141 239
table(data_final2$MHCR_NY4)
## 
##  No Yes 
## 377   3
table(data_final2$MHCR_NY11)
## 
##  No Yes 
## 376   4
table(data_final2$MHCR_NY13)
## 
##  No Yes 
## 253 127
table(data_final2$MHCR_NY1)
## 
##  No Yes 
## 267 113
table(data_final2$DEMO_SEX)
## 
##   Male Female 
##    285     95
table(data_final2$MHCR_NPC)
## 
##   Never  Former Current 
##     106      65     209
table(data_final$HF_HOSP)
## 
##   0   1 
## 373   7
table(data_final$MORT_ALL)
## 
##   0   1 
## 336  44
table(data_final$REVASC)
## 
##   0   1 
## 337  43
table(data_final$NEW_ONSET_HF)
## 
##   0   1 
## 355  25
table(data_final$MHCR_NY2)
## 
##  No Yes 
## 141 239
table(data_final$MHCR_NY4)
## 
##  No Yes 
## 377   3
table(data_final$MHCR_NY10)
## 
##   0 
## 380
table(data_final$MHCR_NY11)
## 
##  No Yes 
## 376   4
table(data_final$MHCR_NY13)
## 
##  No Yes 
## 253 127
table(data_final$MHCR_NY1)
## 
##  No Yes 
## 267 113
table(data_final$DEMO_SEX)
## 
##   Male Female 
##    285     95
table(data_final$MHCR_NPC)
## 
##   Never  Former Current 
##     106      65     209
library(dplyr)
library(broom)
library(knitr)
library(kableExtra)

# Select only the needed columns and filter complete cases without modifying data_final
model_vars <- c("MRI1_INF_SZ_LV", "Post_IMR", "AGE_AT_INCLUSION", 
                "DEMO_SEX", "MHCR_NPC", "MHCR_NY1", "CAP_NUM1", "MHCR_NY11")

model_data <- data_final %>%
  select(all_of(model_vars)) %>%
  filter(complete.cases(.))

# Run the linear regression model on filtered data (no change to original dataset)
model_infarct <- lm(
  MRI1_INF_SZ_LV ~ Post_IMR + AGE_AT_INCLUSION + DEMO_SEX +
    MHCR_NPC + MHCR_NY1 + CAP_NUM1 + MHCR_NY11,
  data = model_data
)

# Tidy model output
tidy_res <- broom::tidy(model_infarct)

# Rename variables/factor terms only in the output table
rename_map <- c(
  "Post_IMR" = "Post IMR",
  "AGE_AT_INCLUSION" = "Age",
  "DEMO_SEXfemale" = "Sex: Female",
  "DEMO_SEXmale" = "Sex: Male (ref)",
  "MHCR_NPCformer" = "Smoking: Former",
  "MHCR_NPCnever" = "Smoking: Never",
  "MHCR_NPCcurrent" = "Smoking: Current",
  "MHCR_NY1yes" = "Hypertension: Yes",
  "MHCR_NY1no" = "Hypertension: No (ref)",
  "CAP_NUM1" = "Total Ischemic Time",
  "MHCR_NY11yes" = "Prior PCI: Yes",
  "MHCR_NY11no" = "Prior PCI: No (ref)"
)

# Create readable term names for factors
tidy_res$Term_Nice <- tidy_res$term
for (nm in names(rename_map)) {
  tidy_res$Term_Nice <- gsub(nm, rename_map[[nm]], tidy_res$Term_Nice)
}

# Create and display a nice HTML table with caption
tidy_res %>%
  select(Term = Term_Nice, Estimate = estimate, `Std. Error` = std.error, `t value` = statistic, `p value` = p.value) %>%
  kable(format = "html", caption = "Linear Regression Results: Infarct Size Model (No Dataset Changes)") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
Linear Regression Results: Infarct Size Model (No Dataset Changes)
Term Estimate Std. Error t value p value
(Intercept) 2.9478511 4.4972869 0.6554732 0.5131104
Post IMR -0.0076330 0.0382051 -0.1997907 0.8418996
Age 0.1022016 0.0643504 1.5882046 0.1142266
DEMO_SEXFemale 1.1208379 1.5568780 0.7199267 0.4726271
MHCR_NPCFormer -1.9633776 1.8187009 -1.0795495 0.2819777
MHCR_NPCCurrent -1.3414150 1.5259429 -0.8790729 0.3806883
MHCR_NY1Yes -0.2376209 1.4442343 -0.1645307 0.8695223
Total Ischemic Time 0.0065865 0.0038386 1.7158668 0.0881342
MHCR_NY11Yes 3.9197696 4.6390524 0.8449505 0.3994081

no significant association between IMR en infract size

data_final <- data_final %>%
  mutate(
    MORT_ALL_DATE = format(mdy(MORT_ALL_DATE), "%d/%m/%Y"),
    NEW_ONSET_HF_DATE = format(mdy(NEW_ONSET_HF_DATE), "%d/%m/%Y"),
    HF_HOSP_DATE = format(mdy(HF_HOSP_DATE), "%d/%m/%Y"),
    REVASC_DATE = format(mdy(REVASC_DATE), "%d/%m/%Y")
  )

library(dplyr)
library(lubridate)

# Define your reference date
reference_date <- dmy("07/08/2015")

# Create a new dataset with calculated time differences in days
data_final_with_time_diff <- data_final %>%
  mutate(
    MORT_ALL_DATE = mdy(MORT_ALL_DATE),
    NEW_ONSET_HF_DATE = mdy(NEW_ONSET_HF_DATE),
    HF_HOSP_DATE = mdy(HF_HOSP_DATE),
    REVASC_DATE = mdy(REVASC_DATE),

    # Calculate difference in days
    days_since_mortality = as.numeric(MORT_ALL_DATE - reference_date),
    days_since_new_onset_HF = as.numeric(NEW_ONSET_HF_DATE - reference_date),
    days_since_HF_hosp = as.numeric(HF_HOSP_DATE - reference_date),
    days_since_revasc = as.numeric(REVASC_DATE - reference_date)
  )