New statistics

# Calculate the mean e' using septal and lateral values
data_final$mean_e <- rowMeans(sapply(data_final[, c("ECHO_VAL52", "ECHO_VAL62")], as.numeric), na.rm = TRUE)

# Calculate the E/e' ratio using E and mean e' values
data_final$E_e_ratio <- data_final$ECHO_VAL32 / data_final$mean_e

# Compute the body surface area (BSA) where weight is in kg and height in cm
data_final$BSA <- 0.007184 * (data_final$VS_HGT1^0.725) * (data_final$VS_WGT1^0.425)

# Compute LAVI from LAV and BSA values
data_final$LAVI <- data_final$ECHO_VAL92 / data_final$BSA

# Classify E/e' status
data_final$E_e_class <- with(data_final, ifelse(is.na(E_e_ratio), NA,
                                   ifelse(E_e_ratio < 8, "Normal",
                                   ifelse(E_e_ratio <= 14, "Borderline", "Elevated"))))

library(dplyr)

data_final <- data_final %>%
  rowwise() %>%
  mutate(
    # ASE flags (ensure numeric comparisons; convert factors/characters if needed)
    LAVI_flag = ifelse(!is.na(LAVI), LAVI > 34, NA),
    E_e_flag = ifelse(!is.na(E_e_ratio), E_e_ratio > 14, NA),
    septal_e_flag = ifelse(!is.na(ECHO_VAL52), ECHO_VAL52 < 7, NA),
    lateral_e_flag = ifelse(!is.na(ECHO_VAL62), ECHO_VAL62 < 10, NA),
    
    # Count how many ASE criteria are positive
    criteria_positive = sum(c(LAVI_flag, E_e_flag, septal_e_flag, lateral_e_flag), na.rm = TRUE),
    
    # LA strain evaluation
    strain_comment = case_when(
      is.na(A4C_R_RR1) ~ "Reservoir strain missing",
      A4C_R_RR1 < 18 ~ "Abnormal LA strain – supports dysfunction",
      A4C_R_RR1 >= 18 ~ "Normal LA strain",
      TRUE ~ "Strain unclear"
    ),
    
    # Base dysfunction classification
    diastolic_function = case_when(
      criteria_positive >= 2 ~ "Diastolic Dysfunction Present",
      criteria_positive == 1 ~ "Indeterminate Diastolic Function",
      criteria_positive == 0 ~ "Normal Diastolic Function",
      TRUE ~ "Insufficient Data"
    ),
    
    # Grading diastolic dysfunction
    diastolic_grade = case_when(
      diastolic_function == "Normal Diastolic Function" ~ "Grade 0 (Normal)",
      diastolic_function == "Indeterminate Diastolic Function" ~ "Indeterminate",
      
      # Grade I: low E/e', normal LAVI, normal LA strain
      E_e_ratio <= 14 & LAVI <= 34 & A4C_R_RR1 >= 18 ~ "Grade I (Impaired relaxation)",
      
      # Grade III: assume only if reservoir strain is very low and E/e′ very high
      E_e_ratio >= 20 & A4C_R_RR1 < 12 ~ "Grade III (Restrictive)",
      
      # Grade II: 2+ abnormal criteria or abnormal LA strain
      criteria_positive >= 2 | A4C_R_RR1 < 18 ~ "Grade II (Pseudonormal)",
      
      TRUE ~ "Cannot determine grade"
    )
  ) %>%
  ungroup()

# Replace commas with dots and convert to numeric
cols_to_fix <- c("Pre_QFR", "Pre_Length", "Pre_FlowVelocity")

data_final <- data_final %>%
  mutate(across(all_of(cols_to_fix), ~ as.numeric(gsub(",", ".", .))))

data_final <- data_final%>%
  mutate(
    CMD = case_when(
      is.na(Post_IMR) ~ NA_real_,     # Preserve NA if IMR is missing
      Post_IMR >= 25 ~ 1,             # CMD present
      Post_IMR < 25 ~ 0               # CMD absent
    )
  )

# Load necessary libraries
library(dplyr)

# Define continuous variables
cont_vars <- c(
  "AGE_AT_INCLUSION", "Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1", "Complete_LVEF",
  "ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
  "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1", "HBA1C_22",
  "LDL2", "EGFR2", "VS_HGT1", "VS_WGT1", "VS_meanBP1", "VS_meanBP2", "Pre_QFR",
  "Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity", "Post_Length",
  "Post_FlowVelocity", "BSA"
)

# Ensure numeric conversion where needed
data_final <- data_final %>%
  mutate(across(all_of(cont_vars), ~ as.numeric(gsub(",", ".", .))))

# Descriptive statistics
cont_summary <- cont_vars %>%
  setNames(cont_vars) %>%
  lapply(function(var) {
    values <- data_final[[var]]
    values <- values[!is.na(values)]
    if (length(values) == 0) {
      return(data.frame(
        Variable = var,
        Valid_n = 0,
        Mean = NA, SD = NA, Median = NA,
        Q1 = NA, Q3 = NA,
        IQR = NA, Min = NA, Max = NA,
        `Median [IQR]` = NA,
        `Min–Max` = NA
      ))
    }
    q <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
    med <- median(values)
    data.frame(
      Variable = var,
      Valid_n = length(values),
      Mean = mean(values),
      SD = sd(values),
      Median = med,
      Q1 = q[1],
      Q3 = q[2],
      IQR = q[2] - q[1],
      Min = min(values),
      Max = max(values),
      `Median [IQR]` = paste0(round(med, 1), " [", round(q[1], 1), "–", round(q[2], 1), "]"),
      `Min–Max` = paste0(round(min(values), 1), "–", round(max(values), 1))
    )
  }) %>%
  bind_rows() %>%
  mutate(across(c(Mean, SD, Median, Q1, Q3, IQR, Min, Max), ~ round(., 1)))

# Rename variables for readability
rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "AGE_AT_INCLUSION"= "Age at inclusion",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "ECHO_VAL32"      = "E",
  "CAP_NUM1"        = "Total ischemic time",
  "Complete_LVEF"   = "Left ventricular ejection fraction",
  "VS_WGT1"         = "Weight",
  "VS_HGT1"         = "Height",
  "VS_SBP1"         = "Pre Systolic blood pressure",
  "VS_DBP1"         = "Pre Diastolic blood pressure",
  "VS_meanBP1"      = "Pre Mean blood pressure",
  "VS_SBP2"         = "Post Systolic blood pressure",
  "VS_DBP2"         = "Post Diastolic blood pressure",
  "VS_meanBP2"      = "Post Mean blood pressure",
  "Pre_QFR"         = "Pre QFR",
  "Post_QFR"        = "Post QFR",
  "Pre_FlowVelocity"  = "Pre flow velocity",
  "Post_FlowVelocity" = "Post flow velocity",
  "Pre_IMR"         = "Pre IMR",
  "Post_IMR"        = "Post IMR",
  "BSA"             = "BSA",
  "EGFR2"           = "Renal function (eGFR)",
  "MRI1_INF_SZ_LV"  = "Infract size (%)",
  "VS_BMI1"         = "BMI",
  "LDL2"            = "LDL cholesterol",
  "HBA1C_22"        = "HbA1c",
  "Pre_Length"      = "Pre vessel length",
  "Post_Length"     = "Post vessel length"
)

# Apply renaming
cont_summary$Variable <- rename_map[cont_summary$Variable]
cont_summary$Variable[is.na(cont_summary$Variable)] <- cont_summary$Variable[is.na(cont_summary$Variable)]

# Print the final summary
print(cont_summary)

##                                    Variable Valid_n  Mean    SD Median    Q1
## 25%...1                    Age at inclusion     380  61.8  11.7   62.0  53.0
## 25%...2                            Post IMR     249  35.1  16.3   35.0  22.0
## 25%...3                    Infract size (%)     257   8.8   7.7    7.0   2.1
## 25%...4                 Total ischemic time     380 211.6 153.6  161.0 109.0
## 25%...5  Left ventricular ejection fraction     372  48.2   9.5   48.0  42.8
## 25%...6                           Septal e'     337   7.6   2.1    7.6   6.1
## 25%...7                          Lateral e'     336   9.4   3.0    9.1   7.3
## 25%...8                             Mean e'     340   8.5   2.3    8.4   6.8
## 25%...9                          E/e' ratio     331   0.1   0.0    0.1   0.1
## 25%...10                                LAV     303  55.6  16.7   53.0  44.0
## 25%...11                               LAVI     303  27.8   8.1   26.1  22.4
## 25%...12                   Reservoir strain     288  27.5   8.0   27.0  22.0
## 25%...13                     Conduit Strain     288 -14.8   6.0  -14.0 -18.0
## 25%...14                 Contraction Strain     288 -12.4   6.1  -12.0 -16.0
## 25%...15                                  E     344   0.7   0.2    0.7   0.5
## 25%...16                                BMI     380  26.9   3.8   26.6  24.2
## 25%...17                              HbA1c     374  41.1   8.6   40.0  38.0
## 25%...18                    LDL cholesterol     377   3.8   1.0    3.8   3.2
## 25%...19              Renal function (eGFR)     333  93.8  21.4   93.0  80.0
## 25%...20                             Height     380 176.5   9.0  176.0 170.0
## 25%...21                             Weight     380  84.2  14.3   82.2  75.0
## 25%...22            Pre Mean blood pressure     354 103.3  16.3  102.5  93.0
## 25%...23           Post Mean blood pressure     326  94.8  16.4   94.0  82.0
## 25%...24                            Pre QFR      96   0.6   0.2    0.6   0.5
## 25%...25                           Post QFR     279   1.0   0.1    1.0   1.0
## 25%...26                            Pre IMR      89  26.3  16.8   22.0  14.0
## 25%...27                  Pre vessel length      96  64.9  14.7   61.8  56.4
## 25%...28                  Pre flow velocity      96  17.1   8.3   14.8  10.8
## 25%...29                 Post vessel length     279  67.5  14.3   64.8  58.0
## 25%...30                 Post flow velocity     279  18.9  10.4   16.1  11.9
## 25%...31                                BSA     380   2.0   0.2    2.0   1.9
##             Q3   IQR   Min   Max     Median..IQR.    Min.Max
## 25%...1   70.0  17.0  27.0  93.0       62 [53–70]      27–93
## 25%...2   46.0  24.0   8.0  94.0       35 [22–46]       8–94
## 25%...3   13.6  11.5   0.0  32.2     7 [2.1–13.6]     0–32.2
## 25%...4  249.2 140.2  30.0 759.0  161 [109–249.2]     30–759
## 25%...5   55.0  12.2  15.0  72.0     48 [42.8–55]      15–72
## 25%...6    9.0   2.9   2.3  13.7      7.6 [6.1–9]   2.3–13.7
## 25%...7   11.3   4.0   2.8  19.1   9.1 [7.3–11.3]   2.8–19.1
## 25%...8    9.9   3.1   3.5  14.8    8.4 [6.8–9.9]   3.5–14.8
## 25%...9    0.1   0.0   0.0   0.2    0.1 [0.1–0.1]      0–0.2
## 25%...10  65.0  21.0  16.0 134.0       53 [44–65]     16–134
## 25%...11  32.0   9.6   8.2  65.2   26.1 [22.4–32]   8.2–65.2
## 25%...12  33.0  11.0   4.0  57.0       27 [22–33]       4–57
## 25%...13 -11.0   7.0 -33.0  13.0    -14 [-18–-11]     -33–13
## 25%...14  -8.0   8.0 -37.0  21.0     -12 [-16–-8]     -37–21
## 25%...15   0.8   0.3   0.3   1.2    0.7 [0.5–0.8]    0.3–1.2
## 25%...16  29.2   5.0  18.6  49.6 26.6 [24.2–29.2]  18.6–49.6
## 25%...17  42.0   4.0  28.0 123.0       40 [38–42]     28–123
## 25%...18   4.4   1.2   1.3   9.0    3.8 [3.2–4.4]      1.3–9
## 25%...19 106.0  26.0  46.0 170.0      93 [80–106]     46–170
## 25%...20 183.0  13.0 153.0 204.0    176 [170–183]    153–204
## 25%...21  94.0  19.0  53.0 147.0     82.2 [75–94]     53–147
## 25%...22 113.0  20.0  61.0 154.0   102.5 [93–113]     61–154
## 25%...23 106.8  24.8  55.0 166.0    94 [82–106.8]     55–166
## 25%...24   0.7   0.2   0.2   1.0    0.6 [0.5–0.7]      0.2–1
## 25%...25   1.0   0.0   0.6   1.0          1 [1–1]      0.6–1
## 25%...26  34.0  20.0   4.0 107.0       22 [14–34]      4–107
## 25%...27  71.0  14.6  28.3 107.1   61.8 [56.4–71] 28.3–107.1
## 25%...28  21.9  11.1   3.9  44.0 14.8 [10.8–21.9]     3.9–44
## 25%...29  74.0  16.0  12.4 124.9     64.8 [58–74] 12.4–124.9
## 25%...30  23.1  11.2   6.5  86.6 16.1 [11.9–23.1]   6.5–86.6
## 25%...31   2.1   0.3   1.5   2.7      2 [1.9–2.1]    1.5–2.7

library(dplyr)
library(ggplot2)

# Perform Shapiro-Wilk normality test and store p-values
normality_results <- lapply(cont_vars, function(var) {
  values <- data_final[[var]]
  values <- values[!is.na(values)]
  
  if (length(values) < 3) {
    return(data.frame(Variable = var, W = NA, p_value = NA))
  }
  
  test <- shapiro.test(values)
  data.frame(Variable = var, W = as.numeric(test$statistic), p_value = as.numeric(test$p.value))
}) %>% bind_rows()

# Apply human-readable variable names
normality_results$Variable <- rename_map[normality_results$Variable]
normality_results$Variable[is.na(normality_results$Variable)] <- cont_vars[is.na(rename_map[normality_results$Variable])]

# Print final result
print(normality_results)

##                              Variable         W      p_value
## 1                    Age at inclusion 0.9913013 2.476416e-02
## 2                            Post IMR 0.9670925 1.681776e-05
## 3                    Infract size (%) 0.9149106 6.299865e-11
## 4                 Total ischemic time 0.8068021 5.598844e-21
## 5  Left ventricular ejection fraction 0.9919819 4.237354e-02
## 6                           Septal e' 0.9920835 6.974405e-02
## 7                          Lateral e' 0.9877486 6.177146e-03
## 8                             Mean e' 0.9890059 1.152987e-02
## 9                          E/e' ratio 0.8960622 2.928381e-14
## 10                                LAV 0.9607447 2.742724e-07
## 11                               LAVI 0.9604630 2.513231e-07
## 12                   Reservoir strain 0.9877433 1.523506e-02
## 13                     Conduit Strain 0.9751776 6.750848e-05
## 14                 Contraction Strain 0.9601121 4.146433e-07
## 15                                  E 0.9831892 4.805298e-04
## 16                                BMI 0.9534062 1.357699e-09
## 17                              HbA1c 0.4972163 3.602033e-31
## 18                    LDL cholesterol 0.9650271 7.681380e-08
## 19              Renal function (eGFR) 0.9874452 5.547691e-03
## 20                             Height 0.9946164 2.046767e-01
## 21                             Weight 0.9818021 1.011212e-04
## 22            Pre Mean blood pressure 0.9934416 1.272261e-01
## 23           Post Mean blood pressure 0.9822412 4.667785e-04
## 24                            Pre QFR 0.9798262 1.459434e-01
## 25                           Post QFR 0.6491108 1.376403e-23
## 26                            Pre IMR 0.8616745 1.270658e-07
## 27                  Pre vessel length 0.9486955 9.043549e-04
## 28                  Pre flow velocity 0.9125430 8.519495e-06
## 29                 Post vessel length 0.9343518 8.692440e-10
## 30                 Post flow velocity 0.8085450 7.694020e-18
## 31                                BSA 0.9933110 9.005425e-02

Shapiro-Wilk normality test results: variables that do not significantly variate from the normal include septal e’, age, left ventricular ejection fraction, height, pre mean blooed pressure, pre QFR and BSA.

library(ggplot2)

# Histograms
for (var in cont_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found in dataframe! Skipping..."))
    next
  }
  
  var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
  
  p <- ggplot(data_final, aes_string(x = var)) +
    geom_histogram(bins = 30, fill = "steelblue", color = "black") +
    ggtitle(paste0("Histogram of ", var_name)) +
    theme_minimal()
  
  print(p)
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Boxplots
for (var in cont_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found in dataframe! Skipping..."))
    next
  }
  
  var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
  
  p <- ggplot(data_final, aes_string(x = "1", y = var)) +
    geom_boxplot(fill = "steelblue", color = "black", outlier.color = "red", outlier.shape = 16) +
    labs(title = paste("Boxplot of", var_name),
         x = "",
         y = var_name) +
    theme_minimal() +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  
  print(p)
}

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

library(dplyr)

# 1. Categorical variables
cat_vars <- c(
  "MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
  "MHCR_NPC",    # Smoking
  "MHCR_NY1",    # Hypertension
  "MHCR_NY2",    # Hypercholesterolemia
  "MHCR_NY4",    # CVA
  "MHCR_NY10",   # CABG
  "MHCR_NY11",   # PCI (other)
  "MHCR_NY13",   # Cardiovascular Family History
  "diastolic_function", "diastolic_grade", "CMD",
  "E_e_class", "criteria_positive", "strain_comment"
)

# 2. Variable label mapping
rename_map <- c(
  MORT_ALL = "All-cause mortality",
  REVASC = "Revascularization",
  NEW_ONSET_HF = "New-onset heart failure",
  HF_HOSP = "HF hospitalization",
  DEMO_SEX = "Sex",
  MHCR_NPC = "Smoking",
  MHCR_NY1 = "Hypertension",
  MHCR_NY2 = "Hypercholesterolemia",
  MHCR_NY4 = "CVA (Stroke)",
  MHCR_NY10 = "CABG history",
  MHCR_NY11 = "PCI (other vessels)",
  MHCR_NY13 = "Cardiovascular family history <65 years",
  diastolic_function = "Diastolic function",
  diastolic_grade = "Diastolic grade",
  CMD = "Coronary microvascular dysfunction",
  E_e_class = "E/e' classification",
  criteria_positive = "diastolic dysfunction criteria positive",
  strain_comment = "Strain interpretation"
)

# 3. Binary variable names to convert 0 = No, 1 = Yes
binary_vars <- c(
  "All-cause mortality", "Revascularization", "New-onset heart failure",
  "HF hospitalization", "Hypertension", "Hypercholesterolemia",
  "CVA (Stroke)", "CABG history", "PCI (other vessels)",
  "Cardiovascular family history <65 years", "Coronary microvascular dysfunction"
)

# 4. Initialize list to hold summaries
summary_list <- list()

# 5. Loop through variables
for (var in cat_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found. Skipping..."))
    next
  }

  label <- rename_map[[var]]
  counts <- table(data_final[[var]], useNA = "ifany")
  percentages <- prop.table(counts) * 100

  df_summary <- data.frame(
    Variable = label,
    Category = names(counts),
    Count = as.vector(counts),
    Percentage = round(as.vector(percentages), 2),
    stringsAsFactors = FALSE
  )

  # Recode categories
  if (label %in% binary_vars) {
    df_summary$Category[df_summary$Category == "0"] <- "No"
    df_summary$Category[df_summary$Category == "1"] <- "Yes"
  } else if (label == "Sex") {
    df_summary$Category[df_summary$Category == "1"] <- "Male"
    df_summary$Category[df_summary$Category == "2"] <- "Female"
  } else if (label == "Smoking") {
    df_summary$Category[df_summary$Category == "1"] <- "Never smoked"
    df_summary$Category[df_summary$Category == "2"] <- "Former smoker"
    df_summary$Category[df_summary$Category == "3"] <- "Current smoker"
  }

  summary_list[[label]] <- df_summary
}

# 6. Combine and clean table
summary_table <- bind_rows(summary_list)

summary_table <- summary_table %>%
  group_by(Variable) %>%
  mutate(Variable = ifelse(row_number() == 1, Variable, "")) %>%
  ungroup()

# 7. View the final table
print(summary_table)

## # A tibble: 43 × 4
##    Variable                  Category Count Percentage
##    <chr>                     <chr>    <int>      <dbl>
##  1 "All-cause mortality"     No         336      88.4 
##  2 ""                        Yes         44      11.6 
##  3 "Revascularization"       No         337      88.7 
##  4 ""                        Yes         43      11.3 
##  5 "New-onset heart failure" No         355      93.4 
##  6 ""                        Yes         25       6.58
##  7 "HF hospitalization"      No         373      98.2 
##  8 ""                        Yes          7       1.84
##  9 "Sex"                     Male       285      75   
## 10 ""                        Female      95      25   
## # ℹ 33 more rows

# Optional: Export
# write.csv(summary_table, "categorical_summary_final.csv", row.names = FALSE)

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(dplyr)
library(knitr)

summary_table <- summary_table %>%
  mutate(VarGroup = cumsum(Variable != ""))  # mark groups

colors <- rep(c("#F0F8FF", "#FAFAD2"), length.out = length(unique(summary_table$VarGroup)))

row_colors <- colors[summary_table$VarGroup]

summary_table %>%
  select(-VarGroup) %>%
  kable("html", escape = FALSE, align = "lccc") %>%
  kable_styling(full_width = FALSE, position = "left") %>%
  row_spec(0, bold = TRUE, background = "#D3D3D3") %>%
  row_spec(1:nrow(summary_table), background = row_colors)

Variable	Category	Count	Percentage
All-cause mortality	No	336	88.42
	Yes	44	11.58
Revascularization	No	337	88.68
	Yes	43	11.32
New-onset heart failure	No	355	93.42
	Yes	25	6.58
HF hospitalization	No	373	98.16
	Yes	7	1.84
Sex	Male	285	75.00
	Female	95	25.00
Smoking	Never smoked	106	27.89
	Former smoker	65	17.11
	Current smoker	209	55.00
Hypertension	No	267	70.26
	Yes	113	29.74
Hypercholesterolemia	No	141	37.11
	Yes	239	62.89
CVA (Stroke)	No	377	99.21
	Yes	3	0.79
CABG history	No	380	100.00
PCI (other vessels)	No	376	98.95
	Yes	4	1.05
Cardiovascular family history	No	253	66.58
	Yes	127	33.42
Diastolic function	Diastolic Dysfunction Present	127	33.42
	Indeterminate Diastolic Function	114	30.00
	Normal Diastolic Function	139	36.58
Diastolic grade	Grade 0 (Normal)	139	36.58
	Grade I (Impaired relaxation)	50	13.16
	Grade II (Pseudonormal)	77	20.26
	Indeterminate	114	30.00
Coronary microvascular dysfunction	No	79	20.79
	Yes	170	44.74
	NA	131	34.47
E/e’ classification	Normal	331	87.11
	NA	49	12.89
diastolic dysfunction criteria positive	0	139	36.58
	1	114	30.00
	2	104	27.37
	3	23	6.05
Strain interpretation	Abnormal LA strain – supports dysfunction	22	5.79
	Normal LA strain	266	70.00
	Reservoir strain missing	92	24.21

library(dplyr)

normal_vars <- c(
  "ECHO_VAL52",  # Septal e'
  "AGE_AT_INCLUSION",
  "Complete_LVEF",
  "VS_HGT1",      # Height
  "VS_meanBP1",   # Pre mean blood pressure
  "Pre_QFR",
  "BSA"
)

for (var in normal_vars) {
  # Select the variable and CMD
  data <- data_final %>%
    select(CMD, !!sym(var)) %>%
    filter(!is.na(CMD) & !is.na(!!sym(var)))

  # Separate into CMD groups
  group0 <- data %>% filter(CMD == 0) %>% pull(!!sym(var))
  group1 <- data %>% filter(CMD == 1) %>% pull(!!sym(var))

  # ✅ CHECK: do both groups have at least 2 values?
  if (length(group0) < 2 || length(group1) < 2) {
    cat("\n⚠️ Skipping", var, "- not enough data in one of the groups\n")
    next  # Skip to next variable
  }

  # Run t-test
  test <- t.test(group0, group1)

  # Print result
  cat("\nVariable:", var,
      "\nTest: t-test",
      "\nMean (CMD = 0):", round(mean(group0), 2),
      "\nMean (CMD = 1):", round(mean(group1), 2),
      "\nP-value:", round(test$p.value, 4), "\n")
}

## 
## Variable: ECHO_VAL52 
## Test: t-test 
## Mean (CMD = 0): 7.82 
## Mean (CMD = 1): 7.72 
## P-value: 0.7463 
## 
## Variable: AGE_AT_INCLUSION 
## Test: t-test 
## Mean (CMD = 0): 59.8 
## Mean (CMD = 1): 61.02 
## P-value: 0.3944 
## 
## Variable: Complete_LVEF 
## Test: t-test 
## Mean (CMD = 0): 47.53 
## Mean (CMD = 1): 48.27 
## P-value: 0.5586 
## 
## Variable: VS_HGT1 
## Test: t-test 
## Mean (CMD = 0): 175.58 
## Mean (CMD = 1): 177.5 
## P-value: 0.1333 
## 
## Variable: VS_meanBP1 
## Test: t-test 
## Mean (CMD = 0): 99.84 
## Mean (CMD = 1): 104.32 
## P-value: 0.0601 
## 
## Variable: Pre_QFR 
## Test: t-test 
## Mean (CMD = 0): 0.56 
## Mean (CMD = 1): 0.6 
## P-value: 0.347 
## 
## Variable: BSA 
## Test: t-test 
## Mean (CMD = 0): 1.98 
## Mean (CMD = 1): 2.01 
## P-value: 0.2636

# Create a summary data frame with your t-test results
results_df <- data.frame(
  Variable = c("ECHO_VAL52", "AGE_AT_INCLUSION", "Complete_LVEF", "VS_HGT1", 
               "VS_meanBP1", "Pre_QFR", "BSA"),
  Mean_CMD_0 = c(7.82, 59.8, 47.53, 175.58, 99.84, 0.56, 1.98),
  Mean_CMD_1 = c(7.72, 61.02, 48.27, 177.5, 104.32, 0.6, 2.01),
  P_value = c(0.7463, 0.3944, 0.5586, 0.1333, 0.0601, 0.347, 0.2636)
)

# Print the table nicely
print(results_df)

##           Variable Mean_CMD_0 Mean_CMD_1 P_value
## 1       ECHO_VAL52       7.82       7.72  0.7463
## 2 AGE_AT_INCLUSION      59.80      61.02  0.3944
## 3    Complete_LVEF      47.53      48.27  0.5586
## 4          VS_HGT1     175.58     177.50  0.1333
## 5       VS_meanBP1      99.84     104.32  0.0601
## 6          Pre_QFR       0.56       0.60  0.3470
## 7              BSA       1.98       2.01  0.2636

cat(
  "- septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups for Septal e'\n",
  "- Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant age difference between groups.\n",
  "- Complete_LVEF:Means close, p > 0.05 → no significant difference in left ventricular ejection fraction between groups.\n",
  "- Height: Slightly taller in CMD=1 group, but p > 0.05 → no significant height difference. \n",
  "- Pre mean blood pressure: Mean blood pressure somewhat higher in CMD=1; p-value close to 0.05 but still not quite statistically significant. \n",
  "- Pre QFR: Small mean difference, p > 0.05 → no significant difference in Pre_QFR between groups.\n",
  "- BSA: Means very close; p > 0.05 → no significant difference in BSA.\n"
)

## - septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups for Septal e'
##  - Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant age difference between groups.
##  - Complete_LVEF:Means close, p > 0.05 → no significant difference in left ventricular ejection fraction between groups.
##  - Height: Slightly taller in CMD=1 group, but p > 0.05 → no significant height difference. 
##  - Pre mean blood pressure: Mean blood pressure somewhat higher in CMD=1; p-value close to 0.05 but still not quite statistically significant. 
##  - Pre QFR: Small mean difference, p > 0.05 → no significant difference in Pre_QFR between groups.
##  - BSA: Means very close; p > 0.05 → no significant difference in BSA.

library(dplyr)

non_normal_vars <- c(
  "Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1", 
  "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
  "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1", 
  "HBA1C_22", "LDL2", "EGFR2", "VS_WGT1", "VS_meanBP2", 
  "Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity", 
  "Post_Length", "Post_FlowVelocity"
)

rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "AGE_AT_INCLUSION"= "Age at inclusion",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "ECHO_VAL32"      = "E",
  "CAP_NUM1"        = "Total ischemic time",
  "Complete_LVEF"   = "Left ventricular ejection fraction",
  "VS_WGT1"         = "Weight",
  "VS_HGT1"         = "Height",
  "VS_SBP1"         = "Pre Systolic blood pressure",
  "VS_DBP1"         = "Pre Diastolic blood pressure",
  "VS_meanBP1"      = "Pre Mean blood pressure",
  "VS_SBP2"         = "Post Systolic blood pressure",
  "VS_DBP2"         = "Post Diastolic blood pressure",
  "VS_meanBP2"      = "Post Mean blood pressure",
  "Pre_QFR"         = "Pre QFR",
  "Post_QFR"        = "Post QFR",
  "Pre_FlowVelocity"  = "Pre flow velocity",
  "Post_FlowVelocity" = "Post flow velocity",
  "Pre_IMR"         = "Pre IMR",
  "Post_IMR"        = "Post IMR",
  "BSA"             = "BSA",
  "EGFR2"           = "Renal function (eGFR)",
  "MRI1_INF_SZ_LV"  = "Infract size (%)",
  "VS_BMI1"         = "BMI",
  "LDL2"            = "LDL cholesterol",
  "HBA1C_22"        = "HbA1c",
  "Pre_Length"      = "Pre vessel length",
  "Post_Length"     = "Post vessel length"
)

results <- data.frame(
  Variable = character(),
  Median_CMD_0 = numeric(),
  Median_CMD_1 = numeric(),
  P_value = numeric(),
  stringsAsFactors = FALSE
)

for (var in non_normal_vars) {
  data <- data_final %>%
    select(CMD, !!sym(var)) %>%
    filter(!is.na(CMD) & !is.na(!!sym(var)))
  
  group0 <- data %>% filter(CMD == 0) %>% pull(!!sym(var))
  group1 <- data %>% filter(CMD == 1) %>% pull(!!sym(var))
  
  if (length(group0) < 2 || length(group1) < 2) {
    warning(paste("Skipping", var, "- not enough data"))
    next
  }
  
  test <- wilcox.test(group0, group1, exact=FALSE)
  
  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[var],  # Rename variable here
      Median_CMD_0 = round(median(group0), 2),
      Median_CMD_1 = round(median(group1), 2),
      P_value = round(test$p.value, 4)
    )
  )
}

print(results)

##                                   Variable Median_CMD_0 Median_CMD_1 P_value
## Post_IMR                          Post IMR        18.00        41.50  0.0000
## MRI1_INF_SZ_LV            Infract size (%)         8.12         8.32  0.9959
## CAP_NUM1               Total ischemic time       140.00       166.50  0.1062
## ECHO_VAL62                      Lateral e'         9.40         9.35  0.5952
## mean_e                             Mean e'         8.50         8.45  0.6994
## E_e_ratio                       E/e' ratio         0.08         0.07  0.9503
## ECHO_VAL92                             LAV        50.00        53.50  0.3664
## LAVI                                  LAVI        26.23        25.97  0.6977
## A4C_R_RR1                 Reservoir strain        26.00        27.00  0.2607
## A4C_CD_RR1                  Conduit Strain       -14.00       -15.00  0.2798
## A4C_CT_RR1              Contraction Strain       -11.50       -12.00  0.4012
## ECHO_VAL32                               E         0.68         0.64  0.4456
## VS_BMI1                                BMI        26.50        26.20  0.7753
## HBA1C_22                             HbA1c        40.00        39.00  0.8711
## LDL2                       LDL cholesterol         3.80         3.80  0.5575
## EGFR2                Renal function (eGFR)        92.50        94.00  0.7939
## VS_WGT1                             Weight        80.00        81.50  0.4290
## VS_meanBP2        Post Mean blood pressure        87.00        96.00  0.0000
## Post_QFR                          Post QFR         0.98         0.99  0.0035
## Pre_IMR                            Pre IMR        17.00        25.00  0.0857
## Pre_Length               Pre vessel length        61.80        62.95  0.3676
## Pre_FlowVelocity         Pre flow velocity        19.30        14.60  0.4604
## Post_Length             Post vessel length        64.10        65.80  0.9683
## Post_FlowVelocity       Post flow velocity        27.00        13.20  0.0000

Interpretation of results: 1. Post IMR: Significant difference; CMD=1 group has much higher Post IMR values.

Post Mean blood pressure: Significant difference; CMD=1 group has higher post mean blood pressure.
Post QFR: Significant different; CMD=1 group has higher post QFR
Post flow velocity: Significant difference; CMD=1 group has markedly lower post flow velocity.
Pre IMR: Trend toward higher Pre IMR in CMD=1 group, but not statistically significant.
Infract size:: No difference between groups; very similar infarct sizes.
Total ischemic time:: No statistically significant difference, though CMD=1 tends to have longer ischemic time.
all other cont. variables:: no statistically significant difference.

library(dplyr)

# 1. Variables
cat_vars <- c(
  "MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
  "MHCR_NPC", "MHCR_NY1", "MHCR_NY2", "MHCR_NY4",
  "MHCR_NY10", "MHCR_NY11", "MHCR_NY13",
  "diastolic_function", "diastolic_grade",
  "E_e_class", "criteria_positive", "strain_comment"
)

# 2. Mapping readable labels
rename_map <- c(
  MORT_ALL = "All-cause mortality",
  REVASC = "Revascularization",
  NEW_ONSET_HF = "New-onset heart failure",
  HF_HOSP = "HF hospitalization",
  DEMO_SEX = "Sex",
  MHCR_NPC = "Smoking",
  MHCR_NY1 = "Hypertension",
  MHCR_NY2 = "Hypercholesterolemia",
  MHCR_NY4 = "CVA (Stroke)",
  MHCR_NY10 = "CABG history",
  MHCR_NY11 = "PCI (other vessels)",
  MHCR_NY13 = "Cardiovascular family history <65 years",
  diastolic_function = "Diastolic function",
  diastolic_grade = "Diastolic grade",
  E_e_class = "E/e' classification",
  criteria_positive = "Diastolic dysfunction criteria positive",
  strain_comment = "Strain interpretation"
)

# 3. Run Chi-square or Fisher's test for each variable
results <- data.frame(
  Variable = character(),
  P_value = numeric(),
  Test = character(),
  stringsAsFactors = FALSE
)

for (var in cat_vars) {
  tbl <- table(data_final[[var]], data_final$CMD)
  
  # Skip if only 1 row level
  if (nrow(tbl) < 2) {
    warning(paste("Skipping", var, "- only one category present"))
    next
  }
  
  # Attempt Chi-square, fallback to Fisher if needed
  expected <- suppressWarnings(chisq.test(tbl)$expected)
  if (any(expected < 5)) {
    test <- fisher.test(tbl)
    test_name <- "Fisher's exact"
  } else {
    test <- chisq.test(tbl)
    test_name <- "Chi-square"
  }
  
  # Append results
  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[[var]],
      P_value = round(test$p.value, 4),
      Test = test_name,
      stringsAsFactors = FALSE
    )
  )
}

## Warning: Skipping MHCR_NY10 - only one category present

## Warning: Skipping E_e_class - only one category present

# 4. Print table
print(results)

##                                   Variable P_value           Test
## 1                      All-cause mortality  0.8900     Chi-square
## 2                        Revascularization  0.3971     Chi-square
## 3                  New-onset heart failure  0.8140     Chi-square
## 4                       HF hospitalization  1.0000 Fisher's exact
## 5                                      Sex  0.4337     Chi-square
## 6                                  Smoking  0.9920     Chi-square
## 7                             Hypertension  0.9179     Chi-square
## 8                     Hypercholesterolemia  0.3756     Chi-square
## 9                             CVA (Stroke)  0.5348 Fisher's exact
## 10                     PCI (other vessels)  0.5935 Fisher's exact
## 11 Cardiovascular family history <65 years  0.3128     Chi-square
## 12                      Diastolic function  0.8834     Chi-square
## 13                         Diastolic grade  0.9236     Chi-square
## 14 Diastolic dysfunction criteria positive  0.9591 Fisher's exact
## 15                   Strain interpretation  0.8597 Fisher's exact

# Optional: Pretty table using kableExtra
# install.packages("kableExtra")  # if not installed
library(kableExtra)
results %>%
  kable("html", align = "lcc", caption = "Bivariate analysis of categorical variables by CMD status") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))

Bivariate analysis of categorical variables by CMD status
Variable	P_value	Test
All-cause mortality	0.8900	Chi-square
Revascularization	0.3971	Chi-square
New-onset heart failure	0.8140	Chi-square
HF hospitalization	1.0000	Fisher’s exact
Sex	0.4337	Chi-square
Smoking	0.9920	Chi-square
Hypertension	0.9179	Chi-square
Hypercholesterolemia	0.3756	Chi-square
CVA (Stroke)	0.5348	Fisher’s exact
PCI (other vessels)	0.5935	Fisher’s exact
Cardiovascular family history <65 years	0.3128	Chi-square
Diastolic function	0.8834	Chi-square
Diastolic grade	0.9236	Chi-square
Diastolic dysfunction criteria positive	0.9591	Fisher’s exact
Strain interpretation	0.8597	Fisher’s exact

Interpretation: p value >0.05 which means there is no significant association between CMD status and baseline characteristics i.e. no difference in mortality between CMD groups, similar revascularization rates, no association of new onset heart failure with CMD, HF hospitalizations show identical distributions - P value is 1, CMD not linked to sex differences, Smoking patterns are similar across CMD groups. No difference in diastolic function between groups.

results <- data.frame(
  Variable = character(),
  CMD_1 = character(),
  CMD_0 = character(),
  P_value = numeric(),
  Test = character(),
  stringsAsFactors = FALSE
)

for (var in cat_vars) {
  tbl <- table(data_final[[var]], data_final$CMD)

  if (nrow(tbl) < 2) {
    warning(paste("Skipping", var, "- only one category present"))
    next
  }

  # Choose statistical test
  expected <- suppressWarnings(chisq.test(tbl)$expected)
  if (any(expected < 5)) {
    test <- fisher.test(tbl)
    test_name <- "Fisher's exact"
  } else {
    test <- chisq.test(tbl)
    test_name <- "Chi-square"
  }

  # Determine the "event" row (usually coded as 1, or "Yes", etc.)
  rownames_tbl <- rownames(tbl)
  target_row <- if ("1" %in% rownames_tbl) "1" else rownames_tbl[1]

  # Calculate percentages for CMD = 1 and CMD = 0
  perc_1 <- if ("1" %in% colnames(tbl)) {
    100 * tbl[target_row, "1"] / sum(tbl[, "1"])
  } else {
    100 * tbl[target_row, 2] / sum(tbl[, 2])
  }

  perc_0 <- if ("0" %in% colnames(tbl)) {
    100 * tbl[target_row, "0"] / sum(tbl[, "0"])
  } else {
    100 * tbl[target_row, 1] / sum(tbl[, 1])
  }

  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[[var]],
      CMD_1 = sprintf("%.1f%%", perc_1),
      CMD_0 = sprintf("%.1f%%", perc_0),
      P_value = round(test$p.value, 4),
      Test = test_name,
      stringsAsFactors = FALSE
    )
  )
}

## Warning: Skipping MHCR_NY10 - only one category present

## Warning: Skipping E_e_class - only one category present

# View result table
print(results)

##                                   Variable CMD_1 CMD_0 P_value           Test
## 1                      All-cause mortality 12.9% 11.4%  0.8900     Chi-square
## 2                        Revascularization  9.4% 13.9%  0.3971     Chi-square
## 3                  New-onset heart failure  5.9%  7.6%  0.8140     Chi-square
## 4                       HF hospitalization  2.4%  2.5%  1.0000 Fisher's exact
## 5                                      Sex 78.8% 73.4%  0.4337     Chi-square
## 6                                  Smoking 24.7% 25.3%  0.9920     Chi-square
## 7                             Hypertension 29.4% 27.8%  0.9179     Chi-square
## 8                     Hypercholesterolemia 62.9% 69.6%  0.3756     Chi-square
## 9                             CVA (Stroke)  0.6%  1.3%  0.5348 Fisher's exact
## 10                     PCI (other vessels)  1.2%  2.5%  0.5935 Fisher's exact
## 11 Cardiovascular family history <65 years 30.6% 38.0%  0.3128     Chi-square
## 12                      Diastolic function 32.9% 35.4%  0.8834     Chi-square
## 13                         Diastolic grade 41.2% 38.0%  0.9236     Chi-square
## 14 Diastolic dysfunction criteria positive 25.9% 26.6%  0.9591 Fisher's exact
## 15                   Strain interpretation  5.3%  6.3%  0.8597 Fisher's exact

# Optional: Styled table
library(kableExtra)
results %>%
  kable("html", align = "lcccl", caption = "Bivariate analysis by CMD status (CMD = 1 vs CMD = 0)") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))

Bivariate analysis by CMD status (CMD = 1 vs CMD = 0)
Variable	CMD_1	CMD_0	P_value	Test
All-cause mortality	12.9%	11.4%	0.8900	Chi-square
Revascularization	9.4%	13.9%	0.3971	Chi-square
New-onset heart failure	5.9%	7.6%	0.8140	Chi-square
HF hospitalization	2.4%	2.5%	1.0000	Fisher’s exact
Sex	78.8%	73.4%	0.4337	Chi-square
Smoking	24.7%	25.3%	0.9920	Chi-square
Hypertension	29.4%	27.8%	0.9179	Chi-square
Hypercholesterolemia	62.9%	69.6%	0.3756	Chi-square
CVA (Stroke)	0.6%	1.3%	0.5348	Fisher’s exact
PCI (other vessels)	1.2%	2.5%	0.5935	Fisher’s exact
Cardiovascular family history <65 years	30.6%	38.0%	0.3128	Chi-square
Diastolic function	32.9%	35.4%	0.8834	Chi-square
Diastolic grade	41.2%	38.0%	0.9236	Chi-square
Diastolic dysfunction criteria positive	25.9%	26.6%	0.9591	Fisher’s exact
Strain interpretation	5.3%	6.3%	0.8597	Fisher’s exact

library(ggplot2)
library(dplyr)

# Step 1: Filter the dataset for the two groups of interest
df_strat <- data_final %>%
  filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present")) %>%
  select(Post_IMR, diastolic_function)

# Step 2: Create the boxplot with jittered individual data points
ggplot(df_strat, aes(x = diastolic_function, y = Post_IMR, fill = diastolic_function)) +
  # Boxplot with narrower width and no outliers (to avoid duplicate points with jitter)
  geom_boxplot(width = 0.4, position = position_dodge(width = 0.6), outlier.shape = NA) +
  
  # Jittered points for individual data visibility
  geom_jitter(width = 0.15, alpha = 0.4, color = "black") +
  
  # Labels and appearance
  labs(
    title = "Post-IMR by Diastolic Function Status",
    x = "Diastolic Function",
    y = "Post-IMR"
  ) +
  
  # Custom colors
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  
  # Clean theme
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 12, angle = 0, vjust = 0.5)
  )

## Warning: Removed 82 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 82 rows containing missing values or values outside the scale range
## (`geom_point()`).

wilcox.test(Post_IMR ~ diastolic_function, data = df_strat)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Post_IMR by diastolic_function
## W = 4154, p-value = 0.8993
## alternative hypothesis: true location shift is not equal to 0

Interpretation: there is no signifficant differente in post IMR between patients wit diastolic dysfunction and patients with normal diastolic function.

library(dplyr)
library(tibble)

# Filter data to exclude 'Indeterminate'
filtered <- data_final %>%
  filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present"))

# Create contingency table
tab <- table(filtered$CMD, filtered$diastolic_function)
colnames(tab) <- c("Diastolic Dysfunction Present", "Normal Diastolic Function")
rownames(tab) <- c("CMD Absent (Post_IMR < 25)", "CMD Present (Post_IMR ≥ 25)")

# Convert contingency table to data frame
summary_table <- as.data.frame.matrix(tab) %>%
  rownames_to_column(var = "CMD Status") %>%
  mutate(Total = `Diastolic Dysfunction Present` + `Normal Diastolic Function`)

# Run chi-square test with Yates continuity correction
chi_res <- chisq.test(tab, correct = TRUE)

# Print the table
print(summary_table)

##                    CMD Status Diastolic Dysfunction Present
## 1  CMD Absent (Post_IMR < 25)                            28
## 2 CMD Present (Post_IMR ≥ 25)                            56
##   Normal Diastolic Function Total
## 1                        30    58
## 2                        70   126

# Create a data frame for test results
test_results <- data.frame(
  Statistic = c("Chi-squared", "Degrees of freedom", "p-value"),
  Value = c(round(chi_res$statistic, 3), chi_res$parameter, round(chi_res$p.value, 4))
)

# Print test results
cat("\nChi-square test results:\n")

## 
## Chi-square test results:

print(test_results)

##                    Statistic  Value
## X-squared        Chi-squared 0.1060
## df        Degrees of freedom 1.0000
##                      p-value 0.7448

Interpretation: CMD prevalence is not significantly higher among patients with diastolic dysfunction.

CMD (binary or continuous) does not differ significantly across diastolic function groups.

install.packages("corrplot", repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/3g/ln4t2cvs3250dv7p12p13c_c0000gn/T//RtmpJI62xe/downloaded_packages

# Load required libraries
library(dplyr)
library(Hmisc)      # for rcorr

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(tibble)     # for rownames_to_column

# Define variables of interest
vars <- c(
  "Post_IMR",
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1",    # Contraction Strain
  "MRI1_INF_SZ_LV", # Infarct size
  "Complete_LVEF"   # Complete LVEF
)

# Select and filter complete cases only for these vars
corr_data <- data_final %>%
  select(all_of(vars)) %>%
  filter(complete.cases(.))

# Calculate Spearman correlation matrix and p-values
corr_res <- rcorr(as.matrix(corr_data), type = "spearman")
corr_mat <- corr_res$r
p_mat <- corr_res$P

# -- Debug prints (can remove after confirming correctness) --
cat("Dimensions of corr_mat:", dim(corr_mat), "\n")

## Dimensions of corr_mat: 12 12

cat("Dimensions of p_mat:", dim(p_mat), "\n")

## Dimensions of p_mat: 12 12

cat("Row names corr_mat:", rownames(corr_mat), "\n")

## Row names corr_mat: Post_IMR ECHO_VAL52 ECHO_VAL62 mean_e E_e_ratio ECHO_VAL92 LAVI A4C_R_RR1 A4C_CD_RR1 A4C_CT_RR1 MRI1_INF_SZ_LV Complete_LVEF

cat("Col names corr_mat:", colnames(corr_mat), "\n")

## Col names corr_mat: Post_IMR ECHO_VAL52 ECHO_VAL62 mean_e E_e_ratio ECHO_VAL92 LAVI A4C_R_RR1 A4C_CD_RR1 A4C_CT_RR1 MRI1_INF_SZ_LV Complete_LVEF

# Extract variable names except "Post_IMR", present in both corr_mat and p_mat
vars_except_post_imr <- intersect(
  colnames(corr_mat)[colnames(corr_mat) != "Post_IMR"],
  colnames(p_mat)[colnames(p_mat) != "Post_IMR"]
)

cat("Variables except Post_IMR:", vars_except_post_imr, "\n")

## Variables except Post_IMR: ECHO_VAL52 ECHO_VAL62 mean_e E_e_ratio ECHO_VAL92 LAVI A4C_R_RR1 A4C_CD_RR1 A4C_CT_RR1 MRI1_INF_SZ_LV Complete_LVEF

# Create data frame of Post_IMR correlations safely converting to vectors
post_imr_corr <- data.frame(
  Variable = vars_except_post_imr,
  Correlation = as.numeric(corr_mat["Post_IMR", vars_except_post_imr]),
  P_value = as.numeric(p_mat["Post_IMR", vars_except_post_imr])
)

# Rename variables for readability
rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "MRI1_INF_SZ_LV"  = "Infarct size",
  "Complete_LVEF"   = "Complete LVEF"
)

# Rename variables in the post_imr_corr table for readability
post_imr_corr$Variable <- ifelse(
  post_imr_corr$Variable %in% names(rename_map),
  rename_map[post_imr_corr$Variable],
  post_imr_corr$Variable
)

# Print correlation table for Post_IMR
print(post_imr_corr)

##              Variable Correlation    P_value
## 1           Septal e' -0.18833781 0.05434929
## 2          Lateral e' -0.16835404 0.08602644
## 3             Mean e' -0.20577300 0.03521325
## 4          E/e' ratio  0.04023148 0.68365531
## 5                 LAV  0.09200763 0.35056504
## 6                LAVI  0.07281939 0.46037744
## 7    Reservoir strain -0.14628351 0.13647704
## 8      Conduit Strain  0.14597429 0.13731683
## 9  Contraction Strain  0.06510065 0.50938329
## 10       Infarct size -0.08258399 0.40229126
## 11      Complete LVEF -0.00803436 0.93516879

Interpretation: The Mean e’ (average early diastolic mitral annular velocity) shows a statistically significant weak negative correlation with Post_IMR, meaning higher microvascular resistance (Post_IMR) tends to be associated with worse diastolic function (lower Mean e’).

Septal e’ shows a similar negative trend but just misses statistical significance (p = 0.054).

Other diastolic function markers (like E/e’ ratio, LAVI) and infarct size or LVEF do not show significant correlations with Post_IMR in your data.

Overall, the associations are weak, indicating that Post_IMR may have a subtle relationship mainly with early diastolic function markers but not strongly related to infarct size or overall systolic function (LVEF).

# Create a copy of your dataset to preserve original
data_sensitivity <- data_final %>%
  mutate(
    CMD_20 = case_when(
      is.na(Post_IMR) ~ NA_real_,
      Post_IMR >= 20 ~ 1,
      Post_IMR < 20 ~ 0
    )
  )

# Chi-squared test: CMD_20 vs diastolic_function
table_CMD20 <- table(data_sensitivity$CMD_20, data_sensitivity$diastolic_function)
chi_sq_result <- chisq.test(table_CMD20)

# Print results
print(table_CMD20)

##    
##     Diastolic Dysfunction Present Indeterminate Diastolic Function
##   0                            14                               15
##   1                            70                               50
##    
##     Normal Diastolic Function
##   0                        21
##   1                        79

print(chi_sq_result)

## 
##  Pearson's Chi-squared test
## 
## data:  table_CMD20
## X-squared = 1.0264, df = 2, p-value = 0.5986

Sensitivity analysis: The p-value (0.60) is not statistically significant.

This means there is no strong evidence of an association between CMD (using the more inclusive IMR ≥ 20) and diastolic dysfunction categories in your dataset.

This result is consistent with your primary analysis using IMR ≥ 25 (which also had a non-significant result).

Conclusion: Changing the CMD cutoff from 25 to 20 does not materially change the result — this suggests your original finding is robust to this threshold change.

# Calculate median ischemic time
median_time <- median(data_final$CAP_NUM1, na.rm = TRUE)

# Create stratified variable: early vs late
data_final <- data_final %>%
  mutate(
    ischemic_group = case_when(
      is.na(CAP_NUM1) ~ NA_character_,
      CAP_NUM1 <= median_time ~ "Early",
      CAP_NUM1 > median_time ~ "Late"
    )
  )

# Boxplot of Post_IMR by ischemic time group
library(ggplot2)

ggplot(data_final, aes(x = ischemic_group, y = Post_IMR)) +
  geom_boxplot(fill = "#69b3a2", alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.4, color = "black") +
  theme_minimal() +
  labs(title = "Post_IMR by Ischemic Time Group", x = "Ischemic Time Group", y = "Post IMR")

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 131 rows containing missing values or values outside the scale range
## (`geom_point()`).

wilcox.test(Post_IMR ~ ischemic_group, data = data_final)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Post_IMR by ischemic_group
## W = 6965, p-value = 0.1704
## alternative hypothesis: true location shift is not equal to 0

Sensitivity analysis using ischemic time: The p-value of 0.1704 indicates that there is no statistically significant difference in Post_IMR between patients who had early vs late revascularization (based on your median split of ischemic time).

ischemic time (as a binary early/late grouping) does not appear to significantly influence CMD severity (as measured by IMR) in this dataset.

# Convert to labeled factors
data_final <- data_final %>%
  mutate(
    Hypertension = factor(MHCR_NY1, levels = c(0, 1), labels = c("No", "Yes")),
    Sex = factor(DEMO_SEX, levels = c(1, 2), labels = c("Male", "Female"))
  )

# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1 + Hypertension + Sex, data = data_final)

# Show model summary
summary(model)

## 
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1 + Hypertension + 
##     Sex, data = data_final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.6308  -6.6552  -0.9912   4.7475  21.0493 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        7.536e+00  2.422e+00   3.112   0.0022 **
## Post_IMR          -1.524e-03  6.143e-02  -0.025   0.9802   
## CAP_NUM1           7.662e-03  9.828e-03   0.780   0.4368   
## HypertensionYes    6.194e-01  1.403e+00   0.442   0.6594   
## SexFemale          1.146e+00  1.549e+00   0.740   0.4605   
## Post_IMR:CAP_NUM1 -2.971e-05  2.278e-04  -0.130   0.8964   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.856 on 162 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.02324,    Adjusted R-squared:  -0.006903 
## F-statistic: 0.771 on 5 and 162 DF,  p-value: 0.5719

None of the variables or interaction terms are statistically significant (all p-values > 0.05).

The interaction between Post_IMR and ischemic time is not significant → this suggests that the relationship between IMR and infarct size doesn’t change significantly depending on ischemic time.

Model Fit R-squared = 0.023 → the model explains only 2.3% of the variance in infarct size.

Adjusted R-squared = -0.007 → adjusting for predictors makes the model slightly worse.

F-test p = 0.5719 → the overall model is not statistically significant.

# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)

# Show model summary
summary(model)

## 
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.066  -6.593  -1.186   4.954  22.096 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        7.845e+00  2.389e+00   3.284  0.00125 **
## Post_IMR          -1.430e-04  6.082e-02  -0.002  0.99813   
## CAP_NUM1           7.968e-03  9.789e-03   0.814  0.41681   
## Post_IMR:CAP_NUM1 -3.372e-05  2.269e-04  -0.149  0.88208   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.829 on 164 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.01805,    Adjusted R-squared:  8.288e-05 
## F-statistic: 1.005 on 3 and 164 DF,  p-value: 0.3923

ischemic time does not appear to modify the relationship between IMR and infarct size.

# Create log-transformed infarct size
data_final <- data_final %>%
  mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV))  # log1p handles 0 values safely

# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1 + 
                  factor(MHCR_NY1) + factor(DEMO_SEX), data = data_final)

summary(model_log)

## 
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1 + factor(MHCR_NY1) + 
##     factor(DEMO_SEX), data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1839 -0.5855  0.2618  0.7926  1.6269 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.754e+00  3.180e-01   5.516 1.35e-07 ***
## Post_IMR          -2.658e-03  8.066e-03  -0.330    0.742    
## CAP_NUM1           1.039e-03  1.290e-03   0.806    0.422    
## factor(MHCR_NY1)1  1.424e-01  1.841e-01   0.773    0.441    
## factor(DEMO_SEX)2 -4.678e-02  2.033e-01  -0.230    0.818    
## Post_IMR:CAP_NUM1  1.399e-06  2.990e-05   0.047    0.963    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.031 on 162 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.03197,    Adjusted R-squared:  0.002095 
## F-statistic:  1.07 on 5 and 162 DF,  p-value: 0.3789

# Create log-transformed infarct size
data_final <- data_final %>%
  mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV))  # log1p handles 0 values safely

# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_final)

summary(model_log)

## 
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2197 -0.6001  0.2754  0.7704  1.5947 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.757e+00  3.134e-01   5.606 8.59e-08 ***
## Post_IMR          -1.949e-03  7.979e-03  -0.244    0.807    
## CAP_NUM1           1.051e-03  1.284e-03   0.818    0.414    
## Post_IMR:CAP_NUM1  1.240e-06  2.977e-05   0.042    0.967    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.027 on 164 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.02832,    Adjusted R-squared:  0.01055 
## F-statistic: 1.593 on 3 and 164 DF,  p-value: 0.193

# Identify extreme IMR values (e.g., > 99th percentile)
imr_cutoff <- quantile(data_final$Post_IMR, 0.99, na.rm = TRUE)

# Create filtered dataset
data_no_outliers <- data_final %>%
  filter(Post_IMR <= imr_cutoff)

# Rerun correlation or regression
model_no_outliers <- lm(MRI1_INF_SZ_LV ~ Post_IMR, data = data_no_outliers)
summary(model_no_outliers)

## 
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR, data = data_no_outliers)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.341 -6.330 -1.093  5.088 22.960 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.191247   1.431042   6.423 1.35e-09 ***
## Post_IMR    0.002176   0.037372   0.058    0.954    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.853 on 166 degrees of freedom
##   (79 observations deleted due to missingness)
## Multiple R-squared:  2.042e-05,  Adjusted R-squared:  -0.006004 
## F-statistic: 0.00339 on 1 and 166 DF,  p-value: 0.9536

# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_no_outliers)

summary(model_log)

## 
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_no_outliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2197 -0.6001  0.2754  0.7704  1.5947 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.757e+00  3.134e-01   5.606 8.59e-08 ***
## Post_IMR          -1.949e-03  7.979e-03  -0.244    0.807    
## CAP_NUM1           1.051e-03  1.284e-03   0.818    0.414    
## Post_IMR:CAP_NUM1  1.240e-06  2.977e-05   0.042    0.967    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.027 on 164 degrees of freedom
##   (79 observations deleted due to missingness)
## Multiple R-squared:  0.02832,    Adjusted R-squared:  0.01055 
## F-statistic: 1.593 on 3 and 164 DF,  p-value: 0.193

# Vector of diastolic dysfunction variables
diastolic_vars <- c("ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI", "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32")
# Select all variables of interest including dependent, independent, and covariates
vars_for_model <- c(
  diastolic_vars,      # your dependent variables (e.g. ECHO_VAL52, etc.)
  "Post_IMR",
  "AGE_AT_INCLUSION",
  "DEMO_SEX",
  "VS_BMI1",
  "MHCR_NY1",
  "MHCR_NPC",
  "HBA1C_22",
  "LDL2",
  "MHCR_NY2",
  "MHCR_NY4",
  "EGFR2",
  "CAP_NUM1"
)

# Count how many complete cases you have in these vars
sum(complete.cases(data_final[, vars_for_model]))

## [1] 128

# Load necessary package
library(dplyr)
library(broom)

## Warning: package 'broom' was built under R version 4.3.3

# List of diastolic dysfunction dependent variables
diastolic_vars <- c(
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1"     # Contraction Strain
)

# Loop through each diastolic variable, run unadjusted regression, store results
results_list <- list()

for (var in diastolic_vars) {
  formula <- as.formula(paste(var, "~ Post_IMR"))
  
  model_data <- data_final %>%
    select(all_of(c(var, "Post_IMR"))) %>%
    filter(complete.cases(.))
  
  model <- lm(formula, data = model_data)
  
  tidy_res <- broom::tidy(model)
  tidy_res$Dependent_Var <- var
  
  results_list[[var]] <- tidy_res
}

all_results <- do.call(rbind, results_list)
all_results <- all_results[, c("Dependent_Var", names(all_results)[names(all_results) != "Dependent_Var"])]

print(all_results)

## # A tibble: 18 × 6
##    Dependent_Var term          estimate std.error statistic  p.value
##    <chr>         <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 ECHO_VAL52    (Intercept)   8.21      0.343     24.0     6.39e-63
##  2 ECHO_VAL52    Post_IMR     -0.0135    0.00907   -1.49    1.39e- 1
##  3 ECHO_VAL62    (Intercept)  10.2       0.508     20.1     2.84e-51
##  4 ECHO_VAL62    Post_IMR     -0.0206    0.0134    -1.54    1.25e- 1
##  5 mean_e        (Intercept)   9.17      0.384     23.9     7.16e-63
##  6 mean_e        Post_IMR     -0.0166    0.0101    -1.64    1.03e- 1
##  7 E_e_ratio     (Intercept)   0.0781    0.00473   16.5     6.59e-40
##  8 E_e_ratio     Post_IMR      0.000121  0.000124   0.973   3.31e- 1
##  9 ECHO_VAL92    (Intercept)  55.4       2.88      19.2     1.80e-47
## 10 ECHO_VAL92    Post_IMR      0.0126    0.0751     0.168   8.66e- 1
## 11 LAVI          (Intercept)  27.9       1.38      20.3     1.90e-50
## 12 LAVI          Post_IMR     -0.00107   0.0359    -0.0299  9.76e- 1
## 13 A4C_R_RR1     (Intercept)  27.6       1.37      20.1     5.90e-49
## 14 A4C_R_RR1     Post_IMR      0.000211  0.0350     0.00602 9.95e- 1
## 15 A4C_CD_RR1    (Intercept) -15.8       1.01     -15.8     2.57e-36
## 16 A4C_CD_RR1    Post_IMR      0.0124    0.0256     0.484   6.29e- 1
## 17 A4C_CT_RR1    (Intercept) -11.7       0.948    -12.4     3.92e-26
## 18 A4C_CT_RR1    Post_IMR     -0.0129    0.0242    -0.535   5.93e- 1

Interpretations None of the diastolic dysfunction parameters showed a statistically significant association with Post_IMR in these unadjusted models.

The direction of effect for septal, lateral, and mean e’ is negative, consistent with worse diastolic function with higher Post_IMR, but these are not statistically significant.

The E/e’ ratio and left atrial volume indices show small and non-significant effects.

Strain measures (reservoir, conduit, contraction) also show no significant relationship.

# Load necessary libraries
library(dplyr)

# Create a new dataset so original is not altered
data_final2 <- data_final %>%
  mutate(
    # Log-transform non-normal continuous dependent variables (excluding Septal e')
    log_ECHO_VAL62 = ifelse(ECHO_VAL62 > 0, log(ECHO_VAL62), NA),
log_mean_e = ifelse(mean_e > 0, log(mean_e), NA),
log_E_e_ratio = ifelse(E_e_ratio > 0, log(E_e_ratio), NA),
log_LAVI = ifelse(LAVI > 0, log(LAVI), NA),
log_A4C_R_RR1 = ifelse(A4C_R_RR1 > 0, log(A4C_R_RR1), NA),
log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA),
log_A4C_CT_RR1 = ifelse(A4C_CT_RR1 > 0, log(A4C_CT_RR1), NA)

  )

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA)`.
## Caused by warning in `log()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

# List of dependent variables
dependent_vars <- list(
  "ECHO_VAL52",        # Septal e' (no transform)
  "log_ECHO_VAL62",    # Lateral e' (log)
  "log_mean_e",        # Mean e' (log)
  "log_E_e_ratio",     # E/e' ratio (log)
  "log_LAVI",          # LAVI (log)
  "log_A4C_R_RR1",     # Reservoir strain (log)
  "log_A4C_CD_RR1",    # Conduit strain (log)
  "log_A4C_CT_RR1"     # Contraction strain (log)
)

# Function to run a model for each dependent variable
run_model <- function(dep_var) {
  model_formula <- as.formula(paste(dep_var, "~ Post_IMR + AGE_AT_INCLUSION + factor(DEMO_SEX) + factor(MHCR_NY1) + VS_BMI1 + 
                                     factor(MHCR_NY2) + factor(MHCR_NY4) + factor(MHCR_NY11) + factor(MHCR_NY13) + 
                                     factor(MHCR_NPC) + CAP_NUM1"))

  # Filter only rows with complete data for this model
  model_data <- data_final2 %>%
    select(all.vars(model_formula)) %>%
    filter(complete.cases(.))
  
  # Only run the model if enough data is left
  if (nrow(model_data) > 10) {
    model <- lm(model_formula, data = model_data)
    return(summary(model))
  } else {
    return(paste("Not enough data for:", dep_var))
  }
}

# Run all models
results <- lapply(dependent_vars, run_model)
names(results) <- dependent_vars

# Print results
results

## $ECHO_VAL52
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6749 -1.2902 -0.0376  1.1029  5.4068 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.302e+01  1.567e+00   8.310 1.27e-14 ***
## Post_IMR           -9.664e-03  8.035e-03  -1.203  0.23043    
## AGE_AT_INCLUSION   -7.701e-02  1.410e-02  -5.461 1.36e-07 ***
## factor(DEMO_SEX)2  -1.878e-02  3.069e-01  -0.061  0.95126    
## factor(MHCR_NY1)1  -7.721e-01  2.925e-01  -2.640  0.00893 ** 
## VS_BMI1            -2.115e-02  3.820e-02  -0.554  0.58048    
## factor(MHCR_NY2)1  -1.727e-02  2.677e-01  -0.065  0.94862    
## factor(MHCR_NY4)1   9.309e-01  1.906e+00   0.488  0.62573    
## factor(MHCR_NY11)1 -1.286e+00  9.444e-01  -1.362  0.17462    
## factor(MHCR_NY13)1 -1.434e-02  2.789e-01  -0.051  0.95905    
## factor(MHCR_NPC)2   7.765e-01  3.791e-01   2.048  0.04182 *  
## factor(MHCR_NPC)3   7.466e-01  3.278e-01   2.277  0.02379 *  
## CAP_NUM1           -4.797e-05  8.370e-04  -0.057  0.95435    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.82 on 206 degrees of freedom
## Multiple R-squared:  0.2925, Adjusted R-squared:  0.2513 
## F-statistic: 7.098 on 12 and 206 DF,  p-value: 8.913e-11
## 
## 
## $log_ECHO_VAL62
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05876 -0.21812  0.04793  0.22229  0.73778 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.0261402  0.2774712  10.906  < 2e-16 ***
## Post_IMR           -0.0015792  0.0014137  -1.117   0.2653    
## AGE_AT_INCLUSION   -0.0100952  0.0024641  -4.097 6.03e-05 ***
## factor(DEMO_SEX)2  -0.0035316  0.0538089  -0.066   0.9477    
## factor(MHCR_NY1)1  -0.1218967  0.0511617  -2.383   0.0181 *  
## VS_BMI1            -0.0070952  0.0066448  -1.068   0.2869    
## factor(MHCR_NY2)1  -0.0223579  0.0471547  -0.474   0.6359    
## factor(MHCR_NY4)1   0.2081541  0.3342594   0.623   0.5342    
## factor(MHCR_NY11)1 -0.1061490  0.1654612  -0.642   0.5219    
## factor(MHCR_NY13)1  0.0382071  0.0491873   0.777   0.4382    
## factor(MHCR_NPC)2   0.1572549  0.0668871   2.351   0.0197 *  
## factor(MHCR_NPC)3   0.1090616  0.0582831   1.871   0.0627 .  
## CAP_NUM1           -0.0001493  0.0001472  -1.014   0.3116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3192 on 205 degrees of freedom
## Multiple R-squared:  0.2166, Adjusted R-squared:  0.1707 
## F-statistic: 4.722 on 12 and 205 DF,  p-value: 8.512e-07
## 
## 
## $log_mean_e
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6517 -0.1725  0.0000  0.1785  0.6115 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.844e+00  2.135e-01  13.319  < 2e-16 ***
## Post_IMR           -1.348e-03  1.092e-03  -1.235  0.21836    
## AGE_AT_INCLUSION   -9.863e-03  1.908e-03  -5.169 5.51e-07 ***
## factor(DEMO_SEX)2  -4.480e-03  4.202e-02  -0.107  0.91519    
## factor(MHCR_NY1)1  -1.038e-01  3.991e-02  -2.602  0.00993 ** 
## VS_BMI1            -5.044e-03  5.181e-03  -0.973  0.33145    
## factor(MHCR_NY2)1  -1.950e-02  3.655e-02  -0.533  0.59428    
## factor(MHCR_NY4)1   1.810e-01  2.612e-01   0.693  0.48910    
## factor(MHCR_NY11)1 -1.402e-01  1.293e-01  -1.084  0.27952    
## factor(MHCR_NY13)1  2.500e-02  3.811e-02   0.656  0.51260    
## factor(MHCR_NPC)2   1.548e-01  5.147e-02   3.007  0.00296 ** 
## factor(MHCR_NPC)3   1.273e-01  4.436e-02   2.870  0.00453 ** 
## CAP_NUM1           -7.967e-05  1.147e-04  -0.695  0.48804    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2495 on 208 degrees of freedom
## Multiple R-squared:  0.296,  Adjusted R-squared:  0.2554 
## F-statistic: 7.287 on 12 and 208 DF,  p-value: 4.199e-11
## 
## 
## $log_E_e_ratio
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65266 -0.20691 -0.02816  0.17271  0.85327 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.3617119  0.2388125 -14.077  < 2e-16 ***
## Post_IMR            0.0003635  0.0012227   0.297 0.766581    
## AGE_AT_INCLUSION    0.0082685  0.0021354   3.872 0.000146 ***
## factor(DEMO_SEX)2   0.1381846  0.0472004   2.928 0.003810 ** 
## factor(MHCR_NY1)1   0.1015971  0.0452403   2.246 0.025810 *  
## VS_BMI1             0.0092503  0.0057972   1.596 0.112136    
## factor(MHCR_NY2)1   0.0013379  0.0412303   0.032 0.974145    
## factor(MHCR_NY4)1   0.1955202  0.2900993   0.674 0.501100    
## factor(MHCR_NY11)1 -0.0733406  0.1438800  -0.510 0.610797    
## factor(MHCR_NY13)1 -0.0265422  0.0437074  -0.607 0.544357    
## factor(MHCR_NPC)2  -0.0552739  0.0582650  -0.949 0.343931    
## factor(MHCR_NPC)3  -0.0518744  0.0502307  -1.033 0.302975    
## CAP_NUM1            0.0002109  0.0001279   1.649 0.100660    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2768 on 201 degrees of freedom
## Multiple R-squared:  0.2251, Adjusted R-squared:  0.1789 
## F-statistic: 4.866 on 12 and 201 DF,  p-value: 5.047e-07
## 
## 
## $log_LAVI
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1287 -0.1678  0.0117  0.1749  0.9616 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.202e+00  2.638e-01  12.138   <2e-16 ***
## Post_IMR           -3.935e-04  1.304e-03  -0.302   0.7631    
## AGE_AT_INCLUSION    2.012e-03  2.430e-03   0.828   0.4086    
## factor(DEMO_SEX)2   1.079e-02  5.165e-02   0.209   0.8347    
## factor(MHCR_NY1)1   8.972e-02  4.797e-02   1.870   0.0630 .  
## VS_BMI1             1.092e-03  6.404e-03   0.170   0.8648    
## factor(MHCR_NY2)1  -1.041e-01  4.522e-02  -2.302   0.0224 *  
## factor(MHCR_NY4)1   2.256e-01  2.166e-01   1.042   0.2988    
## factor(MHCR_NY11)1  2.405e-02  1.530e-01   0.157   0.8753    
## factor(MHCR_NY13)1 -4.597e-02  4.700e-02  -0.978   0.3293    
## factor(MHCR_NPC)2   6.123e-03  6.457e-02   0.095   0.9245    
## factor(MHCR_NPC)3   1.783e-02  5.457e-02   0.327   0.7442    
## CAP_NUM1           -7.802e-05  1.357e-04  -0.575   0.5661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2939 on 190 degrees of freedom
## Multiple R-squared:  0.08126,    Adjusted R-squared:  0.02323 
## F-statistic:   1.4 on 12 and 190 DF,  p-value: 0.1686
## 
## 
## $log_A4C_R_RR1
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28726 -0.16347 -0.00903  0.23255  0.58418 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.6204528  0.2662794  13.596   <2e-16 ***
## Post_IMR            0.0010891  0.0013722   0.794   0.4285    
## AGE_AT_INCLUSION   -0.0064413  0.0024951  -2.582   0.0106 *  
## factor(DEMO_SEX)2  -0.0353346  0.0531805  -0.664   0.5073    
## factor(MHCR_NY1)1  -0.0524587  0.0517049  -1.015   0.3117    
## VS_BMI1             0.0023758  0.0062718   0.379   0.7053    
## factor(MHCR_NY2)1  -0.1173030  0.0481184  -2.438   0.0158 *  
## factor(MHCR_NY4)1  -0.3085844  0.2245066  -1.375   0.1710    
## factor(MHCR_NY11)1 -0.2250985  0.1579848  -1.425   0.1560    
## factor(MHCR_NY13)1  0.0778675  0.0482985   1.612   0.1087    
## factor(MHCR_NPC)2   0.1179118  0.0699447   1.686   0.0936 .  
## factor(MHCR_NPC)3   0.0574994  0.0566515   1.015   0.3115    
## CAP_NUM1           -0.0001480  0.0001452  -1.020   0.3092    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3035 on 179 degrees of freedom
## Multiple R-squared:  0.1469, Adjusted R-squared:  0.08968 
## F-statistic: 2.568 on 12 and 179 DF,  p-value: 0.003665
## 
## 
## $log_A4C_CD_RR1
## [1] "Not enough data for: log_A4C_CD_RR1"
## 
## $log_A4C_CT_RR1
## [1] "Not enough data for: log_A4C_CT_RR1"

Septal E and mean e: - Age: Significant negative association. Older age is associated with lower - Septal e’ velocity (worse diastolic function). Hypertension: Significant. Hypertensive patients tend to have lower Septal e’. - Smoking: current and former smokers have a higher septal E’. Residual confousing or selection bias?

E/e ratio age, female, hypertension

LAVI: - hypercholesterolemia

# Check levels and counts for all your categorical vars
table(data_final2$MHCR_NY2)

## 
##   0   1 
## 141 239

table(data_final2$MHCR_NY4)

## 
##   0   1 
## 377   3

table(data_final2$MHCR_NY11)

## 
##   0   1 
## 376   4

table(data_final2$MHCR_NY13)

## 
##   0   1 
## 253 127

table(data_final2$MHCR_NY1)

## 
##   0   1 
## 267 113

table(data_final2$DEMO_SEX)

## 
##   1   2 
## 285  95

table(data_final2$MHCR_NPC)

## 
##   1   2   3 
## 106  65 209

table(data_final$HF_HOSP)

## 
##   0   1 
## 373   7

table(data_final$MORT_ALL)

## 
##   0   1 
## 336  44

table(data_final$REVASC)

## 
##   0   1 
## 337  43

table(data_final$NEW_ONSET_HF)

## 
##   0   1 
## 355  25

table(data_final$MHCR_NY2)

## 
##   0   1 
## 141 239

table(data_final$MHCR_NY4)

## 
##   0   1 
## 377   3

table(data_final$MHCR_NY10)

## 
##   0 
## 380

table(data_final$MHCR_NY11)

## 
##   0   1 
## 376   4

table(data_final$MHCR_NY13)

## 
##   0   1 
## 253 127

table(data_final$MHCR_NY1)

## 
##   0   1 
## 267 113

table(data_final$DEMO_SEX)

## 
##   1   2 
## 285  95

table(data_final$MHCR_NPC)

## 
##   1   2   3 
## 106  65 209

library(dplyr)

# Step 1: Prepare a clean dataset with proper factor levels
data_final3 <- data_final %>%
  mutate(
    DEMO_SEX = factor(DEMO_SEX, levels = c(1, 2), labels = c("Male", "Female")),
    MHCR_NY1 = factor(MHCR_NY1, levels = c(0, 1), labels = c("No", "Yes")),  # Hypertension
    MHCR_NY11 = factor(MHCR_NY11, levels = c(0, 1), labels = c("No", "Yes")),  # Prior PCI
    MHCR_NPC = factor(MHCR_NPC, levels = c(1, 2, 3), labels = c("Never", "Former", "Current"))  # Smoking
  )

# Step 2: Run the multiple linear regression model
model_infarct <- lm(
  MRI1_INF_SZ_LV ~ Post_IMR + AGE_AT_INCLUSION + DEMO_SEX +
  MHCR_NPC + MHCR_NY1 + CAP_NUM1 + MHCR_NY11,
  data = data_final3
)

# Step 3: View model summary
summary(model_infarct)

## 
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR + AGE_AT_INCLUSION + DEMO_SEX + 
##     MHCR_NPC + MHCR_NY1 + CAP_NUM1 + MHCR_NY11, data = data_final3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.3480  -6.4028  -0.8143   4.5715  22.2468 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       2.947851   4.497287   0.655   0.5131  
## Post_IMR         -0.007633   0.038205  -0.200   0.8419  
## AGE_AT_INCLUSION  0.102202   0.064350   1.588   0.1142  
## DEMO_SEXFemale    1.120838   1.556878   0.720   0.4726  
## MHCR_NPCFormer   -1.963378   1.818701  -1.080   0.2820  
## MHCR_NPCCurrent  -1.341415   1.525943  -0.879   0.3807  
## MHCR_NY1Yes      -0.237621   1.444234  -0.165   0.8695  
## CAP_NUM1          0.006587   0.003839   1.716   0.0881 .
## MHCR_NY11Yes      3.919770   4.639052   0.845   0.3994  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.786 on 159 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.05836,    Adjusted R-squared:  0.01098 
## F-statistic: 1.232 on 8 and 159 DF,  p-value: 0.2838

no significant association between IMR en infract size

data_final <- data_final %>%
  mutate(
    MORT_ALL_DATE = format(mdy(MORT_ALL_DATE), "%d/%m/%Y"),
    NEW_ONSET_HF_DATE = format(mdy(NEW_ONSET_HF_DATE), "%d/%m/%Y"),
    HF_HOSP_DATE = format(mdy(HF_HOSP_DATE), "%d/%m/%Y"),
    REVASC_DATE = format(mdy(REVASC_DATE), "%d/%m/%Y")
  )

library(dplyr)
library(lubridate)

# Define your reference date
reference_date <- dmy("07/08/2015")

# Create a new dataset with calculated time differences in days
data_final_with_time_diff <- data_final %>%
  mutate(
    MORT_ALL_DATE = mdy(MORT_ALL_DATE),
    NEW_ONSET_HF_DATE = mdy(NEW_ONSET_HF_DATE),
    HF_HOSP_DATE = mdy(HF_HOSP_DATE),
    REVASC_DATE = mdy(REVASC_DATE),

    # Calculate difference in days
    days_since_mortality = as.numeric(MORT_ALL_DATE - reference_date),
    days_since_new_onset_HF = as.numeric(NEW_ONSET_HF_DATE - reference_date),
    days_since_HF_hosp = as.numeric(HF_HOSP_DATE - reference_date),
    days_since_revasc = as.numeric(REVASC_DATE - reference_date)
  )

New statistics

Anna Themistokleous

2025-08-02