New statistics

# Calculate the mean e' using septal and lateral values
data_final$mean_e <- rowMeans(sapply(data_final[, c("ECHO_VAL52", "ECHO_VAL62")], as.numeric), na.rm = TRUE)

# Calculate the E/e' ratio using E and mean e' values
data_final$E_e_ratio <- data_final$ECHO_VAL32 / data_final$mean_e

# Compute the body surface area (BSA) where weight is in kg and height in cm
data_final$BSA <- 0.007184 * (data_final$VS_HGT1^0.725) * (data_final$VS_WGT1^0.425)

# Compute LAVI from LAV and BSA values
data_final$LAVI <- data_final$ECHO_VAL92 / data_final$BSA

# Classify E/e' status
data_final$E_e_class <- with(data_final, ifelse(is.na(E_e_ratio), NA,
                                   ifelse(E_e_ratio < 8, "Normal",
                                   ifelse(E_e_ratio <= 14, "Borderline", "Elevated"))))

library(dplyr)

data_final <- data_final %>%
  rowwise() %>%
  mutate(
    # ASE flags (ensure numeric comparisons; convert factors/characters if needed)
    LAVI_flag = ifelse(!is.na(LAVI), LAVI > 34, NA),
    E_e_flag = ifelse(!is.na(E_e_ratio), E_e_ratio > 14, NA),
    septal_e_flag = ifelse(!is.na(ECHO_VAL52), ECHO_VAL52 < 7, NA),
    lateral_e_flag = ifelse(!is.na(ECHO_VAL62), ECHO_VAL62 < 10, NA),
    
    # Count how many ASE criteria are positive
    criteria_positive = sum(c(LAVI_flag, E_e_flag, septal_e_flag, lateral_e_flag), na.rm = TRUE),
    
    # LA strain evaluation
    strain_comment = case_when(
      is.na(A4C_R_RR1) ~ "Reservoir strain missing",
      A4C_R_RR1 < 18 ~ "Abnormal LA strain – supports dysfunction",
      A4C_R_RR1 >= 18 ~ "Normal LA strain",
      TRUE ~ "Strain unclear"
    ),
    
    # Base dysfunction classification
    diastolic_function = case_when(
      criteria_positive >= 2 ~ "Diastolic Dysfunction Present",
      criteria_positive == 1 ~ "Indeterminate Diastolic Function",
      criteria_positive == 0 ~ "Normal Diastolic Function",
      TRUE ~ "Insufficient Data"
    ),
    
    # Grading diastolic dysfunction
    diastolic_grade = case_when(
      diastolic_function == "Normal Diastolic Function" ~ "Grade 0 (Normal)",
      diastolic_function == "Indeterminate Diastolic Function" ~ "Indeterminate",
      
      # Grade I: low E/e', normal LAVI, normal LA strain
      E_e_ratio <= 14 & LAVI <= 34 & A4C_R_RR1 >= 18 ~ "Grade I (Impaired relaxation)",
      
      # Grade III: assume only if reservoir strain is very low and E/e′ very high
      E_e_ratio >= 20 & A4C_R_RR1 < 12 ~ "Grade III (Restrictive)",
      
      # Grade II: 2+ abnormal criteria or abnormal LA strain
      criteria_positive >= 2 | A4C_R_RR1 < 18 ~ "Grade II (Pseudonormal)",
      
      TRUE ~ "Cannot determine grade"
    )
  ) %>%
  ungroup()

# Replace commas with dots and convert to numeric
cols_to_fix <- c("Pre_QFR", "Pre_Length", "Pre_FlowVelocity")

data_final <- data_final %>%
  mutate(across(all_of(cols_to_fix), ~ as.numeric(gsub(",", ".", .))))

data_final <- data_final%>%
  mutate(
    CMD = case_when(
      is.na(Post_IMR) ~ NA_real_,     # Preserve NA if IMR is missing
      Post_IMR >= 25 ~ 1,             # CMD present
      Post_IMR < 25 ~ 0               # CMD absent
    )
  )

library(dplyr)
library(knitr)
library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

# Your existing processing code simplified for the table
data_final <- data_final %>%
  mutate(
    CMD = case_when(
      is.na(Post_IMR) ~ NA_real_,
      Post_IMR >= 25 ~ 1,
      Post_IMR < 25 ~ 0
    )
  )

# Define continuous variables
cont_vars <- c(
  "AGE_AT_INCLUSION", "Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1", "Complete_LVEF",
  "ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
  "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1", "HBA1C_22",
  "LDL2", "EGFR2", "VS_HGT1", "VS_WGT1", "VS_meanBP1", "VS_meanBP2", "Pre_QFR",
  "Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity", "Post_Length",
  "Post_FlowVelocity", "BSA"
)

# Convert to numeric safely (if needed)
data_final <- data_final %>%
  mutate(across(all_of(cont_vars), ~ as.numeric(gsub(",", ".", as.character(.)))))

# Calculate descriptive stats
cont_summary <- lapply(cont_vars, function(var) {
  values <- data_final[[var]]
  values <- values[!is.na(values)]
  q <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
  med <- median(values, na.rm = TRUE)
  data.frame(
    Variable = var,
    Valid_n = length(values),
    Mean = round(mean(values, na.rm = TRUE), 1),
    SD = round(sd(values, na.rm = TRUE), 1),
    Median = round(med, 1),
    `Median [IQR]` = paste0(round(med,1), " [", round(q[1],1), "–", round(q[2],1), "]"),
    `Min–Max` = paste0(round(min(values, na.rm = TRUE),1), "–", round(max(values, na.rm = TRUE),1))
  )
}) %>%
  bind_rows()

# Rename variables for readability
rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "AGE_AT_INCLUSION"= "Age at inclusion",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "ECHO_VAL32"      = "E",
  "CAP_NUM1"        = "Total ischemic time",
  "Complete_LVEF"   = "Left ventricular ejection fraction",
  "VS_WGT1"         = "Weight",
  "VS_HGT1"         = "Height",
  "VS_meanBP1"      = "Pre Mean blood pressure",
  "VS_meanBP2"      = "Post Mean blood pressure",
  "Pre_QFR"         = "Pre QFR",
  "Post_QFR"        = "Post QFR",
  "Pre_FlowVelocity"  = "Pre flow velocity",
  "Post_FlowVelocity" = "Post flow velocity",
  "Pre_IMR"         = "Pre IMR",
  "Post_IMR"        = "Post IMR",
  "BSA"             = "BSA",
  "EGFR2"           = "Renal function (eGFR)",
  "MRI1_INF_SZ_LV"  = "Infarct size (%)",
  "VS_BMI1"         = "BMI",
  "LDL2"            = "LDL cholesterol",
  "HBA1C_22"        = "HbA1c",
  "Pre_Length"      = "Pre vessel length",
  "Post_Length"     = "Post vessel length"
)

cont_summary$Variable <- rename_map[cont_summary$Variable]
cont_summary$Variable[is.na(cont_summary$Variable)] <- cont_summary$Variable[is.na(cont_summary$Variable)]

# Print nice table with kable and kableExtra
cont_summary %>%
  kable(
    caption = "Descriptive statistics for continuous variables",
    col.names = c("Variable", "Valid n", "Mean", "SD", "Median", "Median [IQR]", "Min–Max"),
    align = "lrrrrrr"
  ) %>%
  kable_styling(full_width = FALSE, position = "left", bootstrap_options = c("striped", "hover"))

Descriptive statistics for continuous variables
Variable	Valid n	Mean	SD	Median	Median [IQR]	Min–Max
Age at inclusion	380	61.8	11.7	62.0	62 [53–70]	27–93
Post IMR	249	35.1	16.3	35.0	35 [22–46]	8–94
Infarct size (%)	257	8.8	7.7	7.0	7 [2.1–13.6]	0–32.2
Total ischemic time	380	211.6	153.6	161.0	161 [109–249.2]	30–759
Left ventricular ejection fraction	372	48.2	9.5	48.0	48 [42.8–55]	15–72
Septal e’	337	7.6	2.1	7.6	7.6 [6.1–9]	2.3–13.7
Lateral e’	336	9.4	3.0	9.1	9.1 [7.3–11.3]	2.8–19.1
Mean e’	340	8.5	2.3	8.4	8.4 [6.8–9.9]	3.5–14.8
E/e’ ratio	331	0.1	0.0	0.1	0.1 [0.1–0.1]	0–0.2
LAV	303	55.6	16.7	53.0	53 [44–65]	16–134
LAVI	303	27.8	8.1	26.1	26.1 [22.4–32]	8.2–65.2
Reservoir strain	288	27.5	8.0	27.0	27 [22–33]	4–57
Conduit Strain	288	-14.8	6.0	-14.0	-14 [-18–-11]	-33–13
Contraction Strain	288	-12.4	6.1	-12.0	-12 [-16–-8]	-37–21
E	344	0.7	0.2	0.7	0.7 [0.5–0.8]	0.3–1.2
BMI	380	26.9	3.8	26.6	26.6 [24.2–29.2]	18.6–49.6
HbA1c	374	41.1	8.6	40.0	40 [38–42]	28–123
LDL cholesterol	377	3.8	1.0	3.8	3.8 [3.2–4.4]	1.3–9
Renal function (eGFR)	333	93.8	21.4	93.0	93 [80–106]	46–170
Height	380	176.5	9.0	176.0	176 [170–183]	153–204
Weight	380	84.2	14.3	82.2	82.2 [75–94]	53–147
Pre Mean blood pressure	354	103.3	16.3	102.5	102.5 [93–113]	61–154
Post Mean blood pressure	326	94.8	16.4	94.0	94 [82–106.8]	55–166
Pre QFR	96	0.6	0.2	0.6	0.6 [0.5–0.7]	0.2–1
Post QFR	279	1.0	0.1	1.0	1 [1–1]	0.6–1
Pre IMR	89	26.3	16.8	22.0	22 [14–34]	4–107
Pre vessel length	96	64.9	14.7	61.8	61.8 [56.4–71]	28.3–107.1
Pre flow velocity	96	17.1	8.3	14.8	14.8 [10.8–21.9]	3.9–44
Post vessel length	279	67.5	14.3	64.8	64.8 [58–74]	12.4–124.9
Post flow velocity	279	18.9	10.4	16.1	16.1 [11.9–23.1]	6.5–86.6
BSA	380	2.0	0.2	2.0	2 [1.9–2.1]	1.5–2.7

library(dplyr)
library(knitr)
library(kableExtra)

# Your Shapiro-Wilk normality test code
normality_results <- lapply(cont_vars, function(var) {
  values <- data_final[[var]]
  values <- values[!is.na(values)]
  
  if (length(values) < 3) {
    return(data.frame(Variable = var, W = NA, p_value = NA))
  }
  
  test <- shapiro.test(values)
  data.frame(Variable = var, W = as.numeric(test$statistic), p_value = as.numeric(test$p.value))
}) %>% bind_rows()

# Apply human-readable variable names
normality_results$Variable <- rename_map[normality_results$Variable]
normality_results$Variable[is.na(normality_results$Variable)] <- cont_vars[is.na(rename_map[normality_results$Variable])]

# Round the numbers nicely
normality_results <- normality_results %>%
  mutate(
    W = round(W, 3),
    p_value = signif(p_value, 3)
  )

# Create a pretty table
normality_results %>%
  kable(
    caption = "Shapiro-Wilk Normality Test Results",
    col.names = c("Variable", "W Statistic", "p-value"),
    align = c("l", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Shapiro-Wilk Normality Test Results
Variable	W Statistic	p-value
Age at inclusion	0.991	2.48e-02
Post IMR	0.967	1.68e-05
Infarct size (%)	0.915	0.00e+00
Total ischemic time	0.807	0.00e+00
Left ventricular ejection fraction	0.992	4.24e-02
Septal e’	0.992	6.97e-02
Lateral e’	0.988	6.18e-03
Mean e’	0.989	1.15e-02
E/e’ ratio	0.896	0.00e+00
LAV	0.961	3.00e-07
LAVI	0.960	3.00e-07
Reservoir strain	0.988	1.52e-02
Conduit Strain	0.975	6.75e-05
Contraction Strain	0.960	4.00e-07
E	0.983	4.81e-04
BMI	0.953	0.00e+00
HbA1c	0.497	0.00e+00
LDL cholesterol	0.965	1.00e-07
Renal function (eGFR)	0.987	5.55e-03
Height	0.995	2.05e-01
Weight	0.982	1.01e-04
Pre Mean blood pressure	0.993	1.27e-01
Post Mean blood pressure	0.982	4.67e-04
Pre QFR	0.980	1.46e-01
Post QFR	0.649	0.00e+00
Pre IMR	0.862	1.00e-07
Pre vessel length	0.949	9.04e-04
Pre flow velocity	0.913	8.50e-06
Post vessel length	0.934	0.00e+00
Post flow velocity	0.809	0.00e+00
BSA	0.993	9.01e-02

Shapiro-Wilk normality test results: variables that do not significantly variate from the normal include septal e’, age, left ventricular ejection fraction, height, pre mean blooed pressure, pre QFR and BSA.

library(ggplot2)

# Histograms
for (var in cont_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found in dataframe! Skipping..."))
    next
  }
  
  var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
  
  p <- ggplot(data_final, aes_string(x = var)) +
    geom_histogram(bins = 30, fill = "steelblue", color = "black") +
    ggtitle(paste0("Histogram of ", var_name)) +
    theme_minimal()
  
  print(p)
}

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Boxplots
for (var in cont_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found in dataframe! Skipping..."))
    next
  }
  
  var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
  
  p <- ggplot(data_final, aes_string(x = "1", y = var)) +
    geom_boxplot(fill = "steelblue", color = "black", outlier.color = "red", outlier.shape = 16) +
    labs(title = paste("Boxplot of", var_name),
         x = "",
         y = var_name) +
    theme_minimal() +
    theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
  
  print(p)
}

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

library(dplyr)

# 1. Categorical variables
cat_vars <- c(
  "MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
  "MHCR_NPC",    # Smoking
  "MHCR_NY1",    # Hypertension
  "MHCR_NY2",    # Hypercholesterolemia
  "MHCR_NY4",    # CVA
  "MHCR_NY10",   # CABG
  "MHCR_NY11",   # PCI (other)
  "MHCR_NY13",   # Cardiovascular Family History
  "diastolic_function", "diastolic_grade", "CMD",
  "E_e_class", "criteria_positive", "strain_comment"
)

# 2. Variable label mapping
rename_map <- c(
  MORT_ALL = "All-cause mortality",
  REVASC = "Revascularization",
  NEW_ONSET_HF = "New-onset heart failure",
  HF_HOSP = "HF hospitalization",
  DEMO_SEX = "Sex",
  MHCR_NPC = "Smoking",
  MHCR_NY1 = "Hypertension",
  MHCR_NY2 = "Hypercholesterolemia",
  MHCR_NY4 = "CVA (Stroke)",
  MHCR_NY10 = "CABG history",
  MHCR_NY11 = "PCI (other vessels)",
  MHCR_NY13 = "Cardiovascular family history <65 years",
  diastolic_function = "Diastolic function",
  diastolic_grade = "Diastolic grade",
  CMD = "Coronary microvascular dysfunction",
  E_e_class = "E/e' classification",
  criteria_positive = "diastolic dysfunction criteria positive",
  strain_comment = "Strain interpretation"
)

# 3. Binary variable names to convert 0 = No, 1 = Yes
binary_vars <- c(
  "All-cause mortality", "Revascularization", "New-onset heart failure",
  "HF hospitalization", "Hypertension", "Hypercholesterolemia",
  "CVA (Stroke)", "CABG history", "PCI (other vessels)",
  "Cardiovascular family history <65 years", "Coronary microvascular dysfunction"
)

# 4. Initialize list to hold summaries
summary_list <- list()

# 5. Loop through variables
for (var in cat_vars) {
  if (!var %in% colnames(data_final)) {
    message(paste("Variable", var, "not found. Skipping..."))
    next
  }

  label <- rename_map[[var]]
  counts <- table(data_final[[var]], useNA = "ifany")
  percentages <- prop.table(counts) * 100

  df_summary <- data.frame(
    Variable = label,
    Category = names(counts),
    Count = as.vector(counts),
    Percentage = round(as.vector(percentages), 2),
    stringsAsFactors = FALSE
  )

  # Recode categories
  if (label %in% binary_vars) {
    df_summary$Category[df_summary$Category == "0"] <- "No"
    df_summary$Category[df_summary$Category == "1"] <- "Yes"
  } else if (label == "Sex") {
    df_summary$Category[df_summary$Category == "1"] <- "Male"
    df_summary$Category[df_summary$Category == "2"] <- "Female"
  } else if (label == "Smoking") {
    df_summary$Category[df_summary$Category == "1"] <- "Never smoked"
    df_summary$Category[df_summary$Category == "2"] <- "Former smoker"
    df_summary$Category[df_summary$Category == "3"] <- "Current smoker"
  }

  summary_list[[label]] <- df_summary
}

# 6. Combine and clean table
summary_table <- bind_rows(summary_list)

summary_table <- summary_table %>%
  group_by(Variable) %>%
  mutate(Variable = ifelse(row_number() == 1, Variable, "")) %>%
  ungroup()

# 7. View the final table
print(summary_table)

## # A tibble: 43 × 4
##    Variable                  Category Count Percentage
##    <chr>                     <chr>    <int>      <dbl>
##  1 "All-cause mortality"     No         336      88.4 
##  2 ""                        Yes         44      11.6 
##  3 "Revascularization"       No         337      88.7 
##  4 ""                        Yes         43      11.3 
##  5 "New-onset heart failure" No         355      93.4 
##  6 ""                        Yes         25       6.58
##  7 "HF hospitalization"      No         373      98.2 
##  8 ""                        Yes          7       1.84
##  9 "Sex"                     Male       285      75   
## 10 ""                        Female      95      25   
## # ℹ 33 more rows

# Optional: Export
# write.csv(summary_table, "categorical_summary_final.csv", row.names = FALSE)

library(kableExtra)
library(dplyr)
library(knitr)

summary_table <- summary_table %>%
  mutate(VarGroup = cumsum(Variable != ""))  # mark groups

colors <- rep(c("#F0F8FF", "#FAFAD2"), length.out = length(unique(summary_table$VarGroup)))

row_colors <- colors[summary_table$VarGroup]

summary_table %>%
  select(-VarGroup) %>%
  kable("html", escape = FALSE, align = "lccc") %>%
  kable_styling(full_width = FALSE, position = "left") %>%
  row_spec(0, bold = TRUE, background = "#D3D3D3") %>%
  row_spec(1:nrow(summary_table), background = row_colors)

Variable	Category	Count	Percentage
All-cause mortality	No	336	88.42
	Yes	44	11.58
Revascularization	No	337	88.68
	Yes	43	11.32
New-onset heart failure	No	355	93.42
	Yes	25	6.58
HF hospitalization	No	373	98.16
	Yes	7	1.84
Sex	Male	285	75.00
	Female	95	25.00
Smoking	Never	106	27.89
	Former	65	17.11
	Current	209	55.00
Hypertension	No	267	70.26
	Yes	113	29.74
Hypercholesterolemia	No	141	37.11
	Yes	239	62.89
CVA (Stroke)	No	377	99.21
	Yes	3	0.79
CABG history	No	380	100.00
PCI (other vessels)	No	376	98.95
	Yes	4	1.05
Cardiovascular family history	No	253	66.58
	Yes	127	33.42
Diastolic function	Diastolic Dysfunction Present	127	33.42
	Indeterminate Diastolic Function	114	30.00
	Normal Diastolic Function	139	36.58
Diastolic grade	Grade 0 (Normal)	139	36.58
	Grade I (Impaired relaxation)	50	13.16
	Grade II (Pseudonormal)	77	20.26
	Indeterminate	114	30.00
Coronary microvascular dysfunction	No	79	20.79
	Yes	170	44.74
	NA	131	34.47
E/e’ classification	Normal	331	87.11
	NA	49	12.89
diastolic dysfunction criteria positive	0	139	36.58
	1	114	30.00
	2	104	27.37
	3	23	6.05
Strain interpretation	Abnormal LA strain – supports dysfunction	22	5.79
	Normal LA strain	266	70.00
	Reservoir strain missing	92	24.21

library(dplyr)
library(knitr)
library(kableExtra)

# Your summary results dataframe
results_df <- data.frame(
  Variable = c("Septal e'", "Age at inclusion", "Left ventricular ejection fraction",
               "Height (cm)", "Pre mean blood pressure (mmHg)", "Pre QFR", "BSA"),
  Mean_CMD_0 = c(7.82, 59.8, 47.53, 175.58, 99.84, 0.56, 1.98),
  Mean_CMD_1 = c(7.72, 61.02, 48.27, 177.5, 104.32, 0.6, 2.01),
  P_value = c(0.7463, 0.3944, 0.5586, 0.1333, 0.0601, 0.347, 0.2636)
)

# Round means and p-values nicely
results_df <- results_df %>%
  mutate(
    Mean_CMD_0 = round(Mean_CMD_0, 2),
    Mean_CMD_1 = round(Mean_CMD_1, 2),
    P_value = signif(P_value, 3)
  )

# Print a nice table
results_df %>%
  kable(
    caption = "T-test results comparing CMD groups for selected normal variables",
    col.names = c("Variable", "Mean (CMD = 0)", "Mean (CMD = 1)", "P-value"),
    align = c("l", "r", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

T-test results comparing CMD groups for selected normal variables
Variable	Mean (CMD = 0)	Mean (CMD = 1)	P-value
Septal e’	7.82	7.72	0.7460
Age at inclusion	59.80	61.02	0.3940
Left ventricular ejection fraction	47.53	48.27	0.5590
Height (cm)	175.58	177.50	0.1330
Pre mean blood pressure (mmHg)	99.84	104.32	0.0601
Pre QFR	0.56	0.60	0.3470
BSA	1.98	2.01	0.2640

# Interpretation text you can include in your report
cat(
  "- Septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups.\n",
  "- Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant difference.\n",
  "- Left ventricular ejection fraction: Means close, p > 0.05 → no significant difference.\n",
  "- Height: Slightly taller in CMD=1, p > 0.05 → no significant difference.\n",
  "- Pre mean blood pressure: Higher in CMD=1; p-value ~0.06, close but not significant.\n",
  "- Pre QFR: Small difference, p > 0.05 → no significant difference.\n",
  "- BSA: Means very close; p > 0.05 → no significant difference.\n"
)

## - Septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups.
##  - Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant difference.
##  - Left ventricular ejection fraction: Means close, p > 0.05 → no significant difference.
##  - Height: Slightly taller in CMD=1, p > 0.05 → no significant difference.
##  - Pre mean blood pressure: Higher in CMD=1; p-value ~0.06, close but not significant.
##  - Pre QFR: Small difference, p > 0.05 → no significant difference.
##  - BSA: Means very close; p > 0.05 → no significant difference.

library(dplyr)
library(knitr)
library(kableExtra)

non_normal_vars <- c(
  "Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1", 
  "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
  "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1", 
  "HBA1C_22", "LDL2", "EGFR2", "VS_WGT1", "VS_meanBP2", 
  "Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity", 
  "Post_Length", "Post_FlowVelocity"
)

rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "AGE_AT_INCLUSION"= "Age at inclusion",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "ECHO_VAL32"      = "E",
  "CAP_NUM1"        = "Total ischemic time",
  "Complete_LVEF"   = "Left ventricular ejection fraction",
  "VS_WGT1"         = "Weight",
  "VS_HGT1"         = "Height",
  "VS_SBP1"         = "Pre Systolic blood pressure",
  "VS_DBP1"         = "Pre Diastolic blood pressure",
  "VS_meanBP1"      = "Pre Mean blood pressure",
  "VS_SBP2"         = "Post Systolic blood pressure",
  "VS_DBP2"         = "Post Diastolic blood pressure",
  "VS_meanBP2"      = "Post Mean blood pressure",
  "Pre_QFR"         = "Pre QFR",
  "Post_QFR"        = "Post QFR",
  "Pre_FlowVelocity"  = "Pre flow velocity",
  "Post_FlowVelocity" = "Post flow velocity",
  "Pre_IMR"         = "Pre IMR",
  "Post_IMR"        = "Post IMR",
  "BSA"             = "BSA",
  "EGFR2"           = "Renal function (eGFR)",
  "MRI1_INF_SZ_LV"  = "Infarct size (%)",
  "VS_BMI1"         = "BMI",
  "LDL2"            = "LDL cholesterol",
  "HBA1C_22"        = "HbA1c",
  "Pre_Length"      = "Pre vessel length",
  "Post_Length"     = "Post vessel length"
)

results <- data.frame(
  Variable = character(),
  Median_CMD_0 = numeric(),
  Median_CMD_1 = numeric(),
  P_value = numeric(),
  stringsAsFactors = FALSE
)

for (var in non_normal_vars) {
  data <- data_final %>%
    select(CMD, !!sym(var)) %>%
    filter(!is.na(CMD) & !is.na(!!sym(var)))
  
  group0 <- data %>% filter(CMD == 0) %>% pull(!!sym(var))
  group1 <- data %>% filter(CMD == 1) %>% pull(!!sym(var))
  
  if (length(group0) < 2 || length(group1) < 2) {
    warning(paste("Skipping", var, "- not enough data"))
    next
  }
  
  test <- wilcox.test(group0, group1, exact=FALSE)
  
  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[var],  # Rename variable here
      Median_CMD_0 = round(median(group0), 2),
      Median_CMD_1 = round(median(group1), 2),
      P_value = round(test$p.value, 4)
    )
  )
}

results %>%
  kable(
    caption = "Wilcoxon Rank-Sum Test Results Comparing CMD Groups on Non-Normal Variables",
    col.names = c("Variable", "Median (CMD = 0)", "Median (CMD = 1)", "P-value"),
    digits = 2,
    align = c("l", "r", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Wilcoxon Rank-Sum Test Results Comparing CMD Groups on Non-Normal Variables
	Variable	Median (CMD = 0)	Median (CMD = 1)	P-value
Post_IMR	Post IMR	18.00	41.50	0.00
MRI1_INF_SZ_LV	Infarct size (%)	8.12	8.32	1.00
CAP_NUM1	Total ischemic time	140.00	166.50	0.11
ECHO_VAL62	Lateral e’	9.40	9.35	0.60
mean_e	Mean e’	8.50	8.45	0.70
E_e_ratio	E/e’ ratio	0.08	0.07	0.95
ECHO_VAL92	LAV	50.00	53.50	0.37
LAVI	LAVI	26.23	25.97	0.70
A4C_R_RR1	Reservoir strain	26.00	27.00	0.26
A4C_CD_RR1	Conduit Strain	-14.00	-15.00	0.28
A4C_CT_RR1	Contraction Strain	-11.50	-12.00	0.40
ECHO_VAL32	E	0.68	0.64	0.45
VS_BMI1	BMI	26.50	26.20	0.78
HBA1C_22	HbA1c	40.00	39.00	0.87
LDL2	LDL cholesterol	3.80	3.80	0.56
EGFR2	Renal function (eGFR)	92.50	94.00	0.79
VS_WGT1	Weight	80.00	81.50	0.43
VS_meanBP2	Post Mean blood pressure	87.00	96.00	0.00
Post_QFR	Post QFR	0.98	0.99	0.00
Pre_IMR	Pre IMR	17.00	25.00	0.09
Pre_Length	Pre vessel length	61.80	62.95	0.37
Pre_FlowVelocity	Pre flow velocity	19.30	14.60	0.46
Post_Length	Post vessel length	64.10	65.80	0.97
Post_FlowVelocity	Post flow velocity	27.00	13.20	0.00

Interpretation of results: 1. Post IMR: Significant difference; CMD=1 group has much higher Post IMR values.

Post Mean blood pressure: Significant difference; CMD=1 group has higher post mean blood pressure.
Post QFR: Significant different; CMD=1 group has higher post QFR
Post flow velocity: Significant difference; CMD=1 group has markedly lower post flow velocity.
Pre IMR: Trend toward higher Pre IMR in CMD=1 group, but not statistically significant.
Infract size:: No difference between groups; very similar infarct sizes.
Total ischemic time:: No statistically significant difference, though CMD=1 tends to have longer ischemic time.
all other cont. variables:: no statistically significant difference.

library(dplyr)

# 1. Variables
cat_vars <- c(
  "MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
  "MHCR_NPC", "MHCR_NY1", "MHCR_NY2", "MHCR_NY4",
  "MHCR_NY10", "MHCR_NY11", "MHCR_NY13",
  "diastolic_function", "diastolic_grade",
  "E_e_class", "criteria_positive", "strain_comment"
)

# 2. Mapping readable labels
rename_map <- c(
  MORT_ALL = "All-cause mortality",
  REVASC = "Revascularization",
  NEW_ONSET_HF = "New-onset heart failure",
  HF_HOSP = "HF hospitalization",
  DEMO_SEX = "Sex",
  MHCR_NPC = "Smoking",
  MHCR_NY1 = "Hypertension",
  MHCR_NY2 = "Hypercholesterolemia",
  MHCR_NY4 = "CVA (Stroke)",
  MHCR_NY10 = "CABG history",
  MHCR_NY11 = "PCI (other vessels)",
  MHCR_NY13 = "Cardiovascular family history <65 years",
  diastolic_function = "Diastolic function",
  diastolic_grade = "Diastolic grade",
  E_e_class = "E/e' classification",
  criteria_positive = "Diastolic dysfunction criteria positive",
  strain_comment = "Strain interpretation"
)

# 3. Run Chi-square or Fisher's test for each variable
results <- data.frame(
  Variable = character(),
  P_value = numeric(),
  Test = character(),
  stringsAsFactors = FALSE
)

for (var in cat_vars) {
  tbl <- table(data_final[[var]], data_final$CMD)
  
  # Skip if only 1 row level
  if (nrow(tbl) < 2) {
    warning(paste("Skipping", var, "- only one category present"))
    next
  }
  
  # Attempt Chi-square, fallback to Fisher if needed
  expected <- suppressWarnings(chisq.test(tbl)$expected)
  if (any(expected < 5)) {
    test <- fisher.test(tbl)
    test_name <- "Fisher's exact"
  } else {
    test <- chisq.test(tbl)
    test_name <- "Chi-square"
  }
  
  # Append results
  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[[var]],
      P_value = round(test$p.value, 4),
      Test = test_name,
      stringsAsFactors = FALSE
    )
  )
}

## Warning: Skipping MHCR_NY10 - only one category present

## Warning: Skipping E_e_class - only one category present

# 4. Print table
print(results)

##                                   Variable P_value           Test
## 1                      All-cause mortality  0.8900     Chi-square
## 2                        Revascularization  0.3971     Chi-square
## 3                  New-onset heart failure  0.8140     Chi-square
## 4                       HF hospitalization  1.0000 Fisher's exact
## 5                                      Sex  0.4337     Chi-square
## 6                                  Smoking  0.9920     Chi-square
## 7                             Hypertension  0.9179     Chi-square
## 8                     Hypercholesterolemia  0.3756     Chi-square
## 9                             CVA (Stroke)  0.5348 Fisher's exact
## 10                     PCI (other vessels)  0.5935 Fisher's exact
## 11 Cardiovascular family history <65 years  0.3128     Chi-square
## 12                      Diastolic function  0.8834     Chi-square
## 13                         Diastolic grade  0.9236     Chi-square
## 14 Diastolic dysfunction criteria positive  0.9591 Fisher's exact
## 15                   Strain interpretation  0.8597 Fisher's exact

# Optional: Pretty table using kableExtra
# install.packages("kableExtra")  # if not installed
library(kableExtra)
results %>%
  kable("html", align = "lcc", caption = "Bivariate analysis of categorical variables by CMD status") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))

Bivariate analysis of categorical variables by CMD status
Variable	P_value	Test
All-cause mortality	0.8900	Chi-square
Revascularization	0.3971	Chi-square
New-onset heart failure	0.8140	Chi-square
HF hospitalization	1.0000	Fisher’s exact
Sex	0.4337	Chi-square
Smoking	0.9920	Chi-square
Hypertension	0.9179	Chi-square
Hypercholesterolemia	0.3756	Chi-square
CVA (Stroke)	0.5348	Fisher’s exact
PCI (other vessels)	0.5935	Fisher’s exact
Cardiovascular family history <65 years	0.3128	Chi-square
Diastolic function	0.8834	Chi-square
Diastolic grade	0.9236	Chi-square
Diastolic dysfunction criteria positive	0.9591	Fisher’s exact
Strain interpretation	0.8597	Fisher’s exact

Interpretation: p value >0.05 which means there is no significant association between CMD status and baseline characteristics i.e. no difference in mortality between CMD groups, similar revascularization rates, no association of new onset heart failure with CMD, HF hospitalizations show identical distributions - P value is 1, CMD not linked to sex differences, Smoking patterns are similar across CMD groups. No difference in diastolic function between groups.

results <- data.frame(
  Variable = character(),
  CMD_1 = character(),
  CMD_0 = character(),
  P_value = numeric(),
  Test = character(),
  stringsAsFactors = FALSE
)

for (var in cat_vars) {
  tbl <- table(data_final[[var]], data_final$CMD)

  if (nrow(tbl) < 2) {
    warning(paste("Skipping", var, "- only one category present"))
    next
  }

  # Choose statistical test
  expected <- suppressWarnings(chisq.test(tbl)$expected)
  if (any(expected < 5)) {
    test <- fisher.test(tbl)
    test_name <- "Fisher's exact"
  } else {
    test <- chisq.test(tbl)
    test_name <- "Chi-square"
  }

  # Determine the "event" row (usually coded as 1, or "Yes", etc.)
  rownames_tbl <- rownames(tbl)
  target_row <- if ("1" %in% rownames_tbl) "1" else rownames_tbl[1]

  # Calculate percentages for CMD = 1 and CMD = 0
  perc_1 <- if ("1" %in% colnames(tbl)) {
    100 * tbl[target_row, "1"] / sum(tbl[, "1"])
  } else {
    100 * tbl[target_row, 2] / sum(tbl[, 2])
  }

  perc_0 <- if ("0" %in% colnames(tbl)) {
    100 * tbl[target_row, "0"] / sum(tbl[, "0"])
  } else {
    100 * tbl[target_row, 1] / sum(tbl[, 1])
  }

  results <- rbind(
    results,
    data.frame(
      Variable = rename_map[[var]],
      CMD_1 = sprintf("%.1f%%", perc_1),
      CMD_0 = sprintf("%.1f%%", perc_0),
      P_value = round(test$p.value, 4),
      Test = test_name,
      stringsAsFactors = FALSE
    )
  )
}

## Warning: Skipping MHCR_NY10 - only one category present

## Warning: Skipping E_e_class - only one category present

# Optional: Styled table
library(kableExtra)
results %>%
  kable("html", align = "lcccl", caption = "Bivariate analysis by CMD status (CMD = 1 vs CMD = 0)") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))

Bivariate analysis by CMD status (CMD = 1 vs CMD = 0)
Variable	CMD_1	CMD_0	P_value	Test
All-cause mortality	12.9%	11.4%	0.8900	Chi-square
Revascularization	9.4%	13.9%	0.3971	Chi-square
New-onset heart failure	5.9%	7.6%	0.8140	Chi-square
HF hospitalization	2.4%	2.5%	1.0000	Fisher’s exact
Sex	78.8%	73.4%	0.4337	Chi-square
Smoking	24.7%	25.3%	0.9920	Chi-square
Hypertension	70.6%	72.2%	0.9179	Chi-square
Hypercholesterolemia	37.1%	30.4%	0.3756	Chi-square
CVA (Stroke)	99.4%	98.7%	0.5348	Fisher’s exact
PCI (other vessels)	98.8%	97.5%	0.5935	Fisher’s exact
Cardiovascular family history <65 years	69.4%	62.0%	0.3128	Chi-square
Diastolic function	32.9%	35.4%	0.8834	Chi-square
Diastolic grade	41.2%	38.0%	0.9236	Chi-square
Diastolic dysfunction criteria positive	25.9%	26.6%	0.9591	Fisher’s exact
Strain interpretation	5.3%	6.3%	0.8597	Fisher’s exact

library(ggplot2)
library(dplyr)

# Step 1: Filter the dataset for the two groups of interest
df_strat <- data_final %>%
  filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present")) %>%
  select(Post_IMR, diastolic_function)

# Step 2: Create the boxplot with jittered individual data points
ggplot(df_strat, aes(x = diastolic_function, y = Post_IMR, fill = diastolic_function)) +
  # Boxplot with narrower width and no outliers (to avoid duplicate points with jitter)
  geom_boxplot(width = 0.4, position = position_dodge(width = 0.6), outlier.shape = NA) +
  
  # Jittered points for individual data visibility
  geom_jitter(width = 0.15, alpha = 0.4, color = "black") +
  
  # Labels and appearance
  labs(
    title = "Post-IMR by Diastolic Function Status",
    x = "Diastolic Function",
    y = "Post-IMR"
  ) +
  
  # Custom colors
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  
  # Clean theme
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "none",
    axis.text.x = element_text(size = 12, angle = 0, vjust = 0.5)
  )

## Warning: Removed 82 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 82 rows containing missing values or values outside the scale range
## (`geom_point()`).

wilcox.test(Post_IMR ~ diastolic_function, data = df_strat)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Post_IMR by diastolic_function
## W = 4154, p-value = 0.8993
## alternative hypothesis: true location shift is not equal to 0

Interpretation Wilcoxon rank rum test: there is no signifficant differente in post IMR between patients wit diastolic dysfunction and patients with normal diastolic function.

library(dplyr)
library(tibble)

# Filter data to exclude 'Indeterminate'
filtered <- data_final %>%
  filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present"))

# Create contingency table
tab <- table(filtered$CMD, filtered$diastolic_function)
colnames(tab) <- c("Diastolic Dysfunction Present", "Normal Diastolic Function")
rownames(tab) <- c("CMD Absent (Post_IMR < 25)", "CMD Present (Post_IMR ≥ 25)")

# Convert contingency table to data frame
summary_table <- as.data.frame.matrix(tab) %>%
  rownames_to_column(var = "CMD Status") %>%
  mutate(Total = `Diastolic Dysfunction Present` + `Normal Diastolic Function`)

# Run chi-square test with Yates continuity correction
chi_res <- chisq.test(tab, correct = TRUE)

# Print the table
print(summary_table)

##                    CMD Status Diastolic Dysfunction Present
## 1  CMD Absent (Post_IMR < 25)                            28
## 2 CMD Present (Post_IMR ≥ 25)                            56
##   Normal Diastolic Function Total
## 1                        30    58
## 2                        70   126

# Create a data frame for test results
test_results <- data.frame(
  Statistic = c("Chi-squared", "Degrees of freedom", "p-value"),
  Value = c(round(chi_res$statistic, 3), chi_res$parameter, round(chi_res$p.value, 4))
)

# Print test results
cat("\nChi-square test results:\n")

## 
## Chi-square test results:

print(test_results)

##                    Statistic  Value
## X-squared        Chi-squared 0.1060
## df        Degrees of freedom 1.0000
##                      p-value 0.7448

Interpretation: CMD prevalence is not significantly higher among patients with diastolic dysfunction.

CMD (binary or continuous) does not differ significantly across diastolic function groups.

install.packages("corrplot", repos = "https://cloud.r-project.org")

## 
## The downloaded binary packages are in
##  /var/folders/3g/ln4t2cvs3250dv7p12p13c_c0000gn/T//RtmpFGl0km/downloaded_packages

library(dplyr)
library(Hmisc)

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(tibble)
library(knitr)
library(kableExtra)

# Define variables
vars <- c(
  "Post_IMR",
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1",    # Contraction Strain
  "MRI1_INF_SZ_LV", # Infarct size
  "Complete_LVEF"   # Complete LVEF
)

# Filter complete cases
corr_data <- data_final %>%
  select(all_of(vars)) %>%
  filter(complete.cases(.))

# Calculate Spearman correlation
corr_res <- rcorr(as.matrix(corr_data), type = "spearman")
corr_mat <- corr_res$r
p_mat <- corr_res$P

# Variables excluding Post_IMR
vars_except_post_imr <- setdiff(colnames(corr_mat), "Post_IMR")

# Build results dataframe
post_imr_corr <- data.frame(
  Variable = vars_except_post_imr,
  Correlation = corr_mat["Post_IMR", vars_except_post_imr],
  P_value = p_mat["Post_IMR", vars_except_post_imr]
)

# Rename variables for readability
rename_map <- c(
  "ECHO_VAL52"      = "Septal e'",
  "ECHO_VAL62"      = "Lateral e'",
  "mean_e"          = "Mean e'",
  "E_e_ratio"       = "E/e' ratio",
  "ECHO_VAL92"      = "LAV",
  "LAVI"            = "LAVI",
  "A4C_R_RR1"       = "Reservoir strain",
  "A4C_CD_RR1"      = "Conduit Strain",
  "A4C_CT_RR1"      = "Contraction Strain",
  "MRI1_INF_SZ_LV"  = "Infarct size",
  "Complete_LVEF"   = "Complete LVEF"
)

post_imr_corr$Variable <- rename_map[post_imr_corr$Variable]

# Round correlation and p-values for display
post_imr_corr <- post_imr_corr %>%
  mutate(
    Correlation = round(Correlation, 3),
    P_value = signif(P_value, 3)
  )

# Create a nice formatted table
post_imr_corr %>%
  kable(
    caption = "Spearman Correlation of Post_IMR with Other Variables",
    col.names = c("Variable", "Spearman Correlation (rho)", "P-value"),
    align = c("l", "r", "r")
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  column_spec(3, color = ifelse(post_imr_corr$P_value < 0.05, "red", "black")) %>% # highlight significant p-values in red
  footnote(general = "P-values < 0.05 indicate statistically significant correlations.")

Spearman Correlation of Post_IMR with Other Variables
	Variable	Spearman Correlation (rho)	P-value
ECHO_VAL52	Septal e’	-0.188	0.0543
ECHO_VAL62	Lateral e’	-0.168	0.0860
mean_e	Mean e’	-0.206	0.0352
E_e_ratio	E/e’ ratio	0.040	0.6840
ECHO_VAL92	LAV	0.092	0.3510
LAVI	LAVI	0.073	0.4600
A4C_R_RR1	Reservoir strain	-0.146	0.1360
A4C_CD_RR1	Conduit Strain	0.146	0.1370
A4C_CT_RR1	Contraction Strain	0.065	0.5090
MRI1_INF_SZ_LV	Infarct size	-0.083	0.4020
Complete_LVEF	Complete LVEF	-0.008	0.9350
Note:
P-values < 0.05 indicate statistically significant correlations.

Interpretation: The Mean e’ (average early diastolic mitral annular velocity) shows a statistically significant weak negative correlation with Post_IMR, meaning higher microvascular resistance (Post_IMR) tends to be associated with worse diastolic function (lower Mean e’).

Septal e’ shows a similar negative trend but just misses statistical significance (p = 0.054).

Other diastolic function markers (like E/e’ ratio, LAVI) and infarct size or LVEF do not show significant correlations with Post_IMR in your data.

Overall, the associations are weak, indicating that Post_IMR may have a subtle relationship mainly with early diastolic function markers but not strongly related to infarct size or overall systolic function (LVEF).

library(dplyr)
library(knitr)
library(kableExtra)

# Create the CMD_20 variable
data_sensitivity <- data_final %>%
  mutate(
    CMD_20 = case_when(
      is.na(Post_IMR) ~ NA_real_,
      Post_IMR >= 20 ~ 1,
      Post_IMR < 20 ~ 0
    )
  )

# Create contingency table
table_CMD20 <- table(data_sensitivity$CMD_20, data_sensitivity$diastolic_function)

# Perform Chi-squared test
chi_sq_result <- chisq.test(table_CMD20)

# Convert the contingency table to a data frame for nicer printing
table_df <- as.data.frame.matrix(table_CMD20)

# Add row names as a column
table_df <- table_df %>% 
  tibble::rownames_to_column(var = "CMD_20")

# Rename CMD_20 values for clarity
table_df$CMD_20 <- ifelse(table_df$CMD_20 == 0, "Post_IMR < 20", "Post_IMR >= 20")

# Print the contingency table nicely
cat("### Contingency Table: CMD_20 vs Diastolic Function\n")

## ### Contingency Table: CMD_20 vs Diastolic Function

table_df %>%
  kable(caption = "Contingency Table of CMD_20 and Diastolic Function") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Contingency Table of CMD_20 and Diastolic Function
CMD_20	Diastolic Dysfunction Present	Indeterminate Diastolic Function	Normal Diastolic Function
Post_IMR < 20	14	15	21
Post_IMR >= 20	70	50	79

# Print Chi-squared test results
cat("\n### Chi-squared Test Result\n")

## 
## ### Chi-squared Test Result

cat("Chi-squared =", round(chi_sq_result$statistic, 3), "\n")

## Chi-squared = 1.026

cat("Degrees of freedom =", chi_sq_result$parameter, "\n")

## Degrees of freedom = 2

cat("P-value =", signif(chi_sq_result$p.value, 4), "\n")

## P-value = 0.5986

Sensitivity analysis: The p-value (0.60) is not statistically significant.

This means there is no strong evidence of an association between CMD (using the more inclusive IMR ≥ 20) and diastolic dysfunction categories in your dataset.

This result is consistent with your primary analysis using IMR ≥ 25 (which also had a non-significant result).

Conclusion: Changing the CMD cutoff from 25 to 20 does not materially change the result — this suggests your original finding is robust to this threshold change.

# Calculate median ischemic time
median_time <- median(data_final$CAP_NUM1, na.rm = TRUE)

# Create stratified variable: early vs late
data_final <- data_final %>%
  mutate(
    ischemic_group = case_when(
      is.na(CAP_NUM1) ~ NA_character_,
      CAP_NUM1 <= median_time ~ "Early",
      CAP_NUM1 > median_time ~ "Late"
    )
  )

# Boxplot of Post_IMR by ischemic time group
library(ggplot2)

ggplot(data_final, aes(x = ischemic_group, y = Post_IMR)) +
  geom_boxplot(fill = "#69b3a2", alpha = 0.6) +
  geom_jitter(width = 0.2, alpha = 0.4, color = "black") +
  theme_minimal() +
  labs(title = "Post_IMR by Ischemic Time Group", x = "Ischemic Time Group", y = "Post IMR")

## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

## Warning: Removed 131 rows containing missing values or values outside the scale range
## (`geom_point()`).

wilcox.test(Post_IMR ~ ischemic_group, data = data_final)

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Post_IMR by ischemic_group
## W = 6965, p-value = 0.1704
## alternative hypothesis: true location shift is not equal to 0

library(knitr)
library(kableExtra)

# Prepare the test result as a data frame
wilcox_summary <- data.frame(
  Test = "Wilcoxon rank sum test",
  Data = "Post_IMR by ischemic_group",
  Statistic_W = 6965,
  P_value = 0.1704,
  Alternative_Hypothesis = "true location shift is not equal to 0"
)

# Print the table nicely
wilcox_summary %>%
  kable(caption = "Wilcoxon Rank Sum Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)

Wilcoxon Rank Sum Test Result
Test	Data	Statistic_W	P_value	Alternative_Hypothesis
Wilcoxon rank sum test	Post_IMR by ischemic_group	6965	0.1704	true location shift is not equal to 0

Sensitivity analysis using ischemic time: The p-value of 0.1704 indicates that there is no statistically significant difference in Post_IMR between patients who had early vs late revascularization (based on your median split of ischemic time).

ischemic time (as a binary early/late grouping) does not appear to significantly influence CMD severity (as measured by IMR) in this dataset.

library(dplyr)
library(broom)

## Warning: package 'broom' was built under R version 4.3.3

library(knitr)
library(kableExtra)

data_final_backup <- readRDS("data_final_backup.rds")

# Convert to labeled factors
data_final5 <- data_final_backup %>%
  mutate(
    Hypertension = factor(MHCR_NY1, levels = c(0, 1), labels = c("No", "Yes")),
    Sex = factor(DEMO_SEX, levels = c(1, 2), labels = c("Male", "Female"))
  )

# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1 + Hypertension + Sex, data = data_final5)

# Tidy model output
model_table <- broom::tidy(model)

# Rename variables for readability
model_table <- model_table %>%
  mutate(
    term = case_when(
      term == "(Intercept)" ~ "Intercept",
      term == "Post_IMR" ~ "Post IMR",
      term == "CAP_NUM1" ~ "Ischemic Time",
      term == "Post_IMR:CAP_NUM1" ~ "Post IMR : Ischemic Time (Interaction)",
      term == "HypertensionNo" ~ "Hypertension: No",
      term == "HypertensionYes" ~ "Hypertension: Yes",
      term == "SexMale" ~ "Sex: Male",
      term == "SexFemale" ~ "Sex: Female",
      TRUE ~ term
    ),
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3)
  )

# Print the table with a title
cat("\n### Regression Model Results: Infarct Size ~ Post IMR * Ischemic Time + Hypertension + Sex\n\n")

## 
## ### Regression Model Results: Infarct Size ~ Post IMR * Ischemic Time + Hypertension + Sex

kable(model_table, caption = "Regression Model Summary", booktabs = TRUE) %>%
  kable_styling(full_width = FALSE)

Regression Model Summary
term	estimate	std.error	statistic	p.value
Intercept	7.5361	2.4220	3.112	0.0022
Post IMR	-0.0015	0.0614	-0.025	0.9800
Ischemic Time	0.0077	0.0098	0.780	0.4370
Hypertension: Yes	0.6194	1.4026	0.442	0.6590
Sex: Female	1.1456	1.5485	0.740	0.4600
Post IMR : Ischemic Time (Interaction)	0.0000	0.0002	-0.130	0.8960

None of the variables or interaction terms are statistically significant (all p-values > 0.05).

The interaction between Post_IMR and ischemic time is not significant → this suggests that the relationship between IMR and infarct size doesn’t change significantly depending on ischemic time.

Model Fit R-squared = 0.023 → the model explains only 2.3% of the variance in infarct size.

Adjusted R-squared = -0.007 → adjusting for predictors makes the model slightly worse.

F-test p = 0.5719 → the overall model is not statistically significant.

# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)

# Show model summary
summary(model)

## 
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.066  -6.593  -1.186   4.954  22.096 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)        7.845e+00  2.389e+00   3.284  0.00125 **
## Post_IMR          -1.430e-04  6.082e-02  -0.002  0.99813   
## CAP_NUM1           7.968e-03  9.789e-03   0.814  0.41681   
## Post_IMR:CAP_NUM1 -3.372e-05  2.269e-04  -0.149  0.88208   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.829 on 164 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.01805,    Adjusted R-squared:  8.288e-05 
## F-statistic: 1.005 on 3 and 164 DF,  p-value: 0.3923

ischemic time does not appear to modify the relationship between IMR and infarct size.

# Create log-transformed infarct size
data_final <- data_final %>%
  mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV))  # log1p handles 0 values safely

# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1 + 
                  factor(MHCR_NY1) + factor(DEMO_SEX), data = data_final)

summary(model_log)

## 
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1 + factor(MHCR_NY1) + 
##     factor(DEMO_SEX), data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1839 -0.5855  0.2618  0.7926  1.6269 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             1.754e+00  3.180e-01   5.516 1.35e-07 ***
## Post_IMR               -2.658e-03  8.066e-03  -0.330    0.742    
## CAP_NUM1                1.039e-03  1.290e-03   0.806    0.422    
## factor(MHCR_NY1)Yes     1.424e-01  1.841e-01   0.773    0.441    
## factor(DEMO_SEX)Female -4.678e-02  2.033e-01  -0.230    0.818    
## Post_IMR:CAP_NUM1       1.399e-06  2.990e-05   0.047    0.963    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.031 on 162 degrees of freedom
##   (212 observations deleted due to missingness)
## Multiple R-squared:  0.03197,    Adjusted R-squared:  0.002095 
## F-statistic:  1.07 on 5 and 162 DF,  p-value: 0.3789

library(broom)
library(dplyr)

# Create log-transformed infarct size
data_final <- data_final %>%
  mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV))  # handles zeros safely

# Run the model with interaction
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_final)

# Tidy model output
model_log_table <- broom::tidy(model_log)

# Rename variables for readability
rename_map <- c(
  "(Intercept)"       = "Intercept",
  "Post_IMR"          = "Post IMR",
  "CAP_NUM1"          = "Ischemic Time",
  "Post_IMR:CAP_NUM1" = "Post IMR : Ischemic Time (Interaction)"
)

model_log_table <- model_log_table %>%
  mutate(
    term = rename_map[term],
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3)
  )

print(model_log_table)

## # A tibble: 4 × 5
##   term                                   estimate std.error statistic    p.value
##   <chr>                                     <dbl>     <dbl>     <dbl>      <dbl>
## 1 Intercept                                1.76      0.313      5.61     8.59e-8
## 2 Post IMR                                -0.0019    0.008     -0.244    8.07e-1
## 3 Ischemic Time                            0.0011    0.0013     0.818    4.14e-1
## 4 Post IMR : Ischemic Time (Interaction)   0         0          0.042    9.67e-1

library(broom)
library(dplyr)

# Identify extreme IMR values (e.g., > 99th percentile)
imr_cutoff <- quantile(data_final$Post_IMR, 0.99, na.rm = TRUE)

# Create filtered dataset excluding extreme IMR
data_no_outliers <- data_final %>%
  filter(Post_IMR <= imr_cutoff)

# Fit model: infarct size ~ Post_IMR
model_no_outliers <- lm(MRI1_INF_SZ_LV ~ Post_IMR, data = data_no_outliers)

# Fit model with log-transformed infarct size and interaction
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_no_outliers)

# Create tidy summaries
summary_no_outliers <- broom::tidy(model_no_outliers) %>%
  mutate(Model = "Infarct Size ~ Post_IMR")

summary_log <- broom::tidy(model_log) %>%
  mutate(Model = "Log(Infarct Size) ~ Post_IMR * CAP_NUM1")

# Combine both tables
combined_results <- bind_rows(summary_no_outliers, summary_log) %>%
  select(Model, term, estimate, std.error, statistic = statistic, p.value) %>%
  mutate(
    estimate = round(estimate, 4),
    std.error = round(std.error, 4),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3)
  )

print(combined_results)

## # A tibble: 6 × 6
##   Model                               term  estimate std.error statistic p.value
##   <chr>                               <chr>    <dbl>     <dbl>     <dbl>   <dbl>
## 1 Infarct Size ~ Post_IMR             (Int…   9.19      1.43       6.42  1.35e-9
## 2 Infarct Size ~ Post_IMR             Post…   0.0022    0.0374     0.058 9.54e-1
## 3 Log(Infarct Size) ~ Post_IMR * CAP… (Int…   1.76      0.313      5.61  8.59e-8
## 4 Log(Infarct Size) ~ Post_IMR * CAP… Post…  -0.0019    0.008     -0.244 8.07e-1
## 5 Log(Infarct Size) ~ Post_IMR * CAP… CAP_…   0.0011    0.0013     0.818 4.14e-1
## 6 Log(Infarct Size) ~ Post_IMR * CAP… Post…   0         0          0.042 9.67e-1

# Vector of diastolic dysfunction variables
diastolic_vars <- c("ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI", "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32")
# Select all variables of interest including dependent, independent, and covariates
vars_for_model <- c(
  diastolic_vars,      # your dependent variables (e.g. ECHO_VAL52, etc.)
  "Post_IMR",
  "AGE_AT_INCLUSION",
  "DEMO_SEX",
  "VS_BMI1",
  "MHCR_NY1",
  "MHCR_NPC",
  "HBA1C_22",
  "LDL2",
  "MHCR_NY2",
  "MHCR_NY4",
  "EGFR2",
  "CAP_NUM1"
)

# Count how many complete cases you have in these vars
sum(complete.cases(data_final[, vars_for_model]))

## [1] 128

# List of diastolic dysfunction dependent variables
diastolic_vars <- c(
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1"     # Contraction Strain
)

# Loop through each diastolic variable, run unadjusted regression, store results
results_list <- list()

for (var in diastolic_vars) {
  formula <- as.formula(paste(var, "~ Post_IMR"))
  
  model_data <- data_final %>%
    select(all_of(c(var, "Post_IMR"))) %>%
    filter(complete.cases(.))
  
  model <- lm(formula, data = model_data)
  
  tidy_res <- broom::tidy(model)
  tidy_res$Dependent_Var <- var
  
  results_list[[var]] <- tidy_res
}

all_results <- do.call(rbind, results_list)
all_results <- all_results[, c("Dependent_Var", names(all_results)[names(all_results) != "Dependent_Var"])]

print(all_results)

## # A tibble: 18 × 6
##    Dependent_Var term          estimate std.error statistic  p.value
##    <chr>         <chr>            <dbl>     <dbl>     <dbl>    <dbl>
##  1 ECHO_VAL52    (Intercept)   8.21      0.343     24.0     6.39e-63
##  2 ECHO_VAL52    Post_IMR     -0.0135    0.00907   -1.49    1.39e- 1
##  3 ECHO_VAL62    (Intercept)  10.2       0.508     20.1     2.84e-51
##  4 ECHO_VAL62    Post_IMR     -0.0206    0.0134    -1.54    1.25e- 1
##  5 mean_e        (Intercept)   9.17      0.384     23.9     7.16e-63
##  6 mean_e        Post_IMR     -0.0166    0.0101    -1.64    1.03e- 1
##  7 E_e_ratio     (Intercept)   0.0781    0.00473   16.5     6.59e-40
##  8 E_e_ratio     Post_IMR      0.000121  0.000124   0.973   3.31e- 1
##  9 ECHO_VAL92    (Intercept)  55.4       2.88      19.2     1.80e-47
## 10 ECHO_VAL92    Post_IMR      0.0126    0.0751     0.168   8.66e- 1
## 11 LAVI          (Intercept)  27.9       1.38      20.3     1.90e-50
## 12 LAVI          Post_IMR     -0.00107   0.0359    -0.0299  9.76e- 1
## 13 A4C_R_RR1     (Intercept)  27.6       1.37      20.1     5.90e-49
## 14 A4C_R_RR1     Post_IMR      0.000211  0.0350     0.00602 9.95e- 1
## 15 A4C_CD_RR1    (Intercept) -15.8       1.01     -15.8     2.57e-36
## 16 A4C_CD_RR1    Post_IMR      0.0124    0.0256     0.484   6.29e- 1
## 17 A4C_CT_RR1    (Intercept) -11.7       0.948    -12.4     3.92e-26
## 18 A4C_CT_RR1    Post_IMR     -0.0129    0.0242    -0.535   5.93e- 1

library(dplyr)
library(broom)

# Rename map for dependent vars
rename_map <- c(
  "ECHO_VAL52"   = "Septal e'",
  "ECHO_VAL62"   = "Lateral e'",
  "mean_e"       = "Mean e'",
  "E_e_ratio"    = "E/e' ratio",
  "ECHO_VAL92"   = "LAV",
  "LAVI"         = "LAVI",
  "A4C_R_RR1"    = "Reservoir strain",
  "A4C_CD_RR1"   = "Conduit strain",
  "A4C_CT_RR1"   = "Contraction strain"
)

# Format and clean up the results
formatted_results <- all_results %>%
  filter(term %in% c("(Intercept)", "Post_IMR")) %>%
  mutate(
    Dependent_Var = rename_map[Dependent_Var],
    term = ifelse(term == "(Intercept)", "Intercept", "Post_IMR"),
    estimate = round(estimate, 3),
    std.error = round(std.error, 3),
    statistic = round(statistic, 3),
    p.value = signif(p.value, 3),
    Significance = case_when(
      p.value < 0.001 ~ "***",
      p.value < 0.01  ~ "**",
      p.value < 0.05  ~ "*",
      TRUE           ~ ""
    )
  ) %>%
  select(Dependent_Variable = Dependent_Var, Term = term, Estimate = estimate, 
         `Std. Error` = std.error, `t value` = statistic, `p value` = p.value, Significance)

print(formatted_results)

## # A tibble: 18 × 7
##    Dependent_Variable Term      Estimate `Std. Error` `t value` `p value`
##    <chr>              <chr>        <dbl>        <dbl>     <dbl>     <dbl>
##  1 Septal e'          Intercept    8.22         0.343    24.0    6.39e-63
##  2 Septal e'          Post_IMR    -0.013        0.009    -1.49   1.39e- 1
##  3 Lateral e'         Intercept   10.2          0.508    20.1    2.84e-51
##  4 Lateral e'         Post_IMR    -0.021        0.013    -1.54   1.25e- 1
##  5 Mean e'            Intercept    9.17         0.384    23.9    7.16e-63
##  6 Mean e'            Post_IMR    -0.017        0.01     -1.64   1.03e- 1
##  7 E/e' ratio         Intercept    0.078        0.005    16.5    6.59e-40
##  8 E/e' ratio         Post_IMR     0            0         0.973  3.31e- 1
##  9 LAV                Intercept   55.4          2.88     19.2    1.80e-47
## 10 LAV                Post_IMR     0.013        0.075     0.168  8.66e- 1
## 11 LAVI               Intercept   27.9          1.38     20.3    1.90e-50
## 12 LAVI               Post_IMR    -0.001        0.036    -0.03   9.76e- 1
## 13 Reservoir strain   Intercept   27.6          1.37     20.1    5.90e-49
## 14 Reservoir strain   Post_IMR     0            0.035     0.006  9.95e- 1
## 15 Conduit strain     Intercept  -15.8          1.00    -15.8    2.57e-36
## 16 Conduit strain     Post_IMR     0.012        0.026     0.484  6.29e- 1
## 17 Contraction strain Intercept  -11.7          0.948   -12.4    3.92e-26
## 18 Contraction strain Post_IMR    -0.013        0.024    -0.535  5.93e- 1
## # ℹ 1 more variable: Significance <chr>

library(dplyr)
library(broom)
library(knitr)
library(kableExtra)

# List of diastolic dysfunction dependent variables
diastolic_vars <- c(
  "ECHO_VAL52",    # Septal e'
  "ECHO_VAL62",    # Lateral e'
  "mean_e",
  "E_e_ratio",
  "ECHO_VAL92",    # LAV
  "LAVI",
  "A4C_R_RR1",     # Reservoir strain
  "A4C_CD_RR1",    # Conduit Strain
  "A4C_CT_RR1"     # Contraction Strain
)

# Run regressions and collect results
results_list <- list()
for (var in diastolic_vars) {
  formula <- as.formula(paste(var, "~ Post_IMR"))
  model_data <- data_final %>% select(all_of(c(var, "Post_IMR"))) %>% filter(complete.cases(.))
  model <- lm(formula, data = model_data)
  tidy_res <- broom::tidy(model)
  tidy_res$Dependent_Var <- var
  results_list[[var]] <- tidy_res
}

all_results <- do.call(rbind, results_list)

# Rename map for dependent vars
rename_map <- c(
  "ECHO_VAL52"   = "Septal e'",
  "ECHO_VAL62"   = "Lateral e'",
  "mean_e"       = "Mean e'",
  "E_e_ratio"    = "E/e' ratio",
  "ECHO_VAL92"   = "LAV",
  "LAVI"         = "LAVI",
  "A4C_R_RR1"    = "Reservoir strain",
  "A4C_CD_RR1"   = "Conduit strain",
  "A4C_CT_RR1"   = "Contraction strain"
)

# Prepare the table
formatted_results <- all_results %>%
  filter(term %in% c("(Intercept)", "Post_IMR")) %>%
  mutate(
    Dependent_Var = rename_map[Dependent_Var],
    Term = ifelse(term == "(Intercept)", "Intercept", "Post_IMR"),
    Estimate = round(estimate, 3),
    `Std. Error` = round(std.error, 3),
    `t value` = round(statistic, 3),
    `p value` = signif(p.value, 3),
    Significance = case_when(
      `p value` < 0.001 ~ "***",
      `p value` < 0.01  ~ "**",
      `p value` < 0.05  ~ "*",
      TRUE              ~ ""
    )
  ) %>%
  select(`Dependent Variable` = Dependent_Var, Term, Estimate, `Std. Error`, `t value`, `p value`, Significance)

# Print with a title
cat("\n### Table: Linear regression results for diastolic variables vs Post_IMR\n\n")

## 
## ### Table: Linear regression results for diastolic variables vs Post_IMR

formatted_results %>%
  kable(format = "html", caption = "Regression Results: Diastolic Variables ~ Post_IMR") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))

Regression Results: Diastolic Variables ~ Post_IMR
Dependent Variable	Term	Estimate	Std. Error	t value	p value	Significance
Septal e’	Intercept	8.215	0.343	23.976	0.000	***
Septal e’	Post_IMR	-0.013	0.009	-1.486	0.139
Lateral e’	Intercept	10.198	0.508	20.072	0.000	***
Lateral e’	Post_IMR	-0.021	0.013	-1.538	0.125
Mean e’	Intercept	9.171	0.384	23.873	0.000	***
Mean e’	Post_IMR	-0.017	0.010	-1.639	0.103
E/e’ ratio	Intercept	0.078	0.005	16.505	0.000	***
E/e’ ratio	Post_IMR	0.000	0.000	0.973	0.331
LAV	Intercept	55.379	2.878	19.241	0.000	***
LAV	Post_IMR	0.013	0.075	0.168	0.866
LAVI	Intercept	27.881	1.376	20.259	0.000	***
LAVI	Post_IMR	-0.001	0.036	-0.030	0.976
Reservoir strain	Intercept	27.591	1.372	20.116	0.000	***
Reservoir strain	Post_IMR	0.000	0.035	0.006	0.995
Conduit strain	Intercept	-15.837	1.005	-15.753	0.000	***
Conduit strain	Post_IMR	0.012	0.026	0.484	0.629
Contraction strain	Intercept	-11.721	0.948	-12.359	0.000	***
Contraction strain	Post_IMR	-0.013	0.024	-0.535	0.593

Interpretations None of the diastolic dysfunction parameters showed a statistically significant association with Post_IMR in these unadjusted models.

The direction of effect for septal, lateral, and mean e’ is negative, consistent with worse diastolic function with higher Post_IMR, but these are not statistically significant.

The E/e’ ratio and left atrial volume indices show small and non-significant effects.

Strain measures (reservoir, conduit, contraction) also show no significant relationship.

# Load necessary libraries
library(dplyr)

# Create a new dataset so original is not altered
data_final2 <- data_final %>%
  mutate(
    # Log-transform non-normal continuous dependent variables (excluding Septal e')
    log_ECHO_VAL62 = ifelse(ECHO_VAL62 > 0, log(ECHO_VAL62), NA),
log_mean_e = ifelse(mean_e > 0, log(mean_e), NA),
log_E_e_ratio = ifelse(E_e_ratio > 0, log(E_e_ratio), NA),
log_LAVI = ifelse(LAVI > 0, log(LAVI), NA),
log_A4C_R_RR1 = ifelse(A4C_R_RR1 > 0, log(A4C_R_RR1), NA),
log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA),
log_A4C_CT_RR1 = ifelse(A4C_CT_RR1 > 0, log(A4C_CT_RR1), NA)

  )

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA)`.
## Caused by warning in `log()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

# List of dependent variables
dependent_vars <- list(
  "ECHO_VAL52",        # Septal e' (no transform)
  "log_ECHO_VAL62",    # Lateral e' (log)
  "log_mean_e",        # Mean e' (log)
  "log_E_e_ratio",     # E/e' ratio (log)
  "log_LAVI",          # LAVI (log)
  "log_A4C_R_RR1",     # Reservoir strain (log)
  "log_A4C_CD_RR1",    # Conduit strain (log)
  "log_A4C_CT_RR1"     # Contraction strain (log)
)

# Function to run a model for each dependent variable
run_model <- function(dep_var) {
  model_formula <- as.formula(paste(dep_var, "~ Post_IMR + AGE_AT_INCLUSION + factor(DEMO_SEX) + factor(MHCR_NY1) + VS_BMI1 + 
                                     factor(MHCR_NY2) + factor(MHCR_NY4) + factor(MHCR_NY11) + factor(MHCR_NY13) + 
                                     factor(MHCR_NPC) + CAP_NUM1"))

  # Filter only rows with complete data for this model
  model_data <- data_final2 %>%
    select(all.vars(model_formula)) %>%
    filter(complete.cases(.))
  
  # Only run the model if enough data is left
  if (nrow(model_data) > 10) {
    model <- lm(model_formula, data = model_data)
    return(summary(model))
  } else {
    return(paste("Not enough data for:", dep_var))
  }
}

# Run all models
results <- lapply(dependent_vars, run_model)
names(results) <- dependent_vars

# Print results
results

## $ECHO_VAL52
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6749 -1.2902 -0.0376  1.1029  5.4068 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.302e+01  1.567e+00   8.310 1.27e-14 ***
## Post_IMR                -9.664e-03  8.035e-03  -1.203  0.23043    
## AGE_AT_INCLUSION        -7.701e-02  1.410e-02  -5.461 1.36e-07 ***
## factor(DEMO_SEX)Female  -1.878e-02  3.069e-01  -0.061  0.95126    
## factor(MHCR_NY1)Yes     -7.721e-01  2.925e-01  -2.640  0.00893 ** 
## VS_BMI1                 -2.115e-02  3.820e-02  -0.554  0.58048    
## factor(MHCR_NY2)Yes     -1.727e-02  2.677e-01  -0.065  0.94862    
## factor(MHCR_NY4)Yes      9.309e-01  1.906e+00   0.488  0.62573    
## factor(MHCR_NY11)Yes    -1.286e+00  9.444e-01  -1.362  0.17462    
## factor(MHCR_NY13)Yes    -1.434e-02  2.789e-01  -0.051  0.95905    
## factor(MHCR_NPC)Former   7.765e-01  3.791e-01   2.048  0.04182 *  
## factor(MHCR_NPC)Current  7.466e-01  3.278e-01   2.277  0.02379 *  
## CAP_NUM1                -4.797e-05  8.370e-04  -0.057  0.95435    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.82 on 206 degrees of freedom
## Multiple R-squared:  0.2925, Adjusted R-squared:  0.2513 
## F-statistic: 7.098 on 12 and 206 DF,  p-value: 8.913e-11
## 
## 
## $log_ECHO_VAL62
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.05876 -0.21812  0.04793  0.22229  0.73778 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.0261402  0.2774712  10.906  < 2e-16 ***
## Post_IMR                -0.0015792  0.0014137  -1.117   0.2653    
## AGE_AT_INCLUSION        -0.0100952  0.0024641  -4.097 6.03e-05 ***
## factor(DEMO_SEX)Female  -0.0035316  0.0538089  -0.066   0.9477    
## factor(MHCR_NY1)Yes     -0.1218967  0.0511617  -2.383   0.0181 *  
## VS_BMI1                 -0.0070952  0.0066448  -1.068   0.2869    
## factor(MHCR_NY2)Yes     -0.0223579  0.0471547  -0.474   0.6359    
## factor(MHCR_NY4)Yes      0.2081541  0.3342594   0.623   0.5342    
## factor(MHCR_NY11)Yes    -0.1061490  0.1654612  -0.642   0.5219    
## factor(MHCR_NY13)Yes     0.0382071  0.0491873   0.777   0.4382    
## factor(MHCR_NPC)Former   0.1572549  0.0668871   2.351   0.0197 *  
## factor(MHCR_NPC)Current  0.1090616  0.0582831   1.871   0.0627 .  
## CAP_NUM1                -0.0001493  0.0001472  -1.014   0.3116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3192 on 205 degrees of freedom
## Multiple R-squared:  0.2166, Adjusted R-squared:  0.1707 
## F-statistic: 4.722 on 12 and 205 DF,  p-value: 8.512e-07
## 
## 
## $log_mean_e
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6517 -0.1725  0.0000  0.1785  0.6115 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.844e+00  2.135e-01  13.319  < 2e-16 ***
## Post_IMR                -1.348e-03  1.092e-03  -1.235  0.21836    
## AGE_AT_INCLUSION        -9.863e-03  1.908e-03  -5.169 5.51e-07 ***
## factor(DEMO_SEX)Female  -4.480e-03  4.202e-02  -0.107  0.91519    
## factor(MHCR_NY1)Yes     -1.038e-01  3.991e-02  -2.602  0.00993 ** 
## VS_BMI1                 -5.044e-03  5.181e-03  -0.973  0.33145    
## factor(MHCR_NY2)Yes     -1.950e-02  3.655e-02  -0.533  0.59428    
## factor(MHCR_NY4)Yes      1.810e-01  2.612e-01   0.693  0.48910    
## factor(MHCR_NY11)Yes    -1.402e-01  1.293e-01  -1.084  0.27952    
## factor(MHCR_NY13)Yes     2.500e-02  3.811e-02   0.656  0.51260    
## factor(MHCR_NPC)Former   1.548e-01  5.147e-02   3.007  0.00296 ** 
## factor(MHCR_NPC)Current  1.273e-01  4.436e-02   2.870  0.00453 ** 
## CAP_NUM1                -7.967e-05  1.147e-04  -0.695  0.48804    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2495 on 208 degrees of freedom
## Multiple R-squared:  0.296,  Adjusted R-squared:  0.2554 
## F-statistic: 7.287 on 12 and 208 DF,  p-value: 4.199e-11
## 
## 
## $log_E_e_ratio
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65266 -0.20691 -0.02816  0.17271  0.85327 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -3.3617119  0.2388125 -14.077  < 2e-16 ***
## Post_IMR                 0.0003635  0.0012227   0.297 0.766581    
## AGE_AT_INCLUSION         0.0082685  0.0021354   3.872 0.000146 ***
## factor(DEMO_SEX)Female   0.1381846  0.0472004   2.928 0.003810 ** 
## factor(MHCR_NY1)Yes      0.1015971  0.0452403   2.246 0.025810 *  
## VS_BMI1                  0.0092503  0.0057972   1.596 0.112136    
## factor(MHCR_NY2)Yes      0.0013379  0.0412303   0.032 0.974145    
## factor(MHCR_NY4)Yes      0.1955202  0.2900993   0.674 0.501100    
## factor(MHCR_NY11)Yes    -0.0733406  0.1438800  -0.510 0.610797    
## factor(MHCR_NY13)Yes    -0.0265422  0.0437074  -0.607 0.544357    
## factor(MHCR_NPC)Former  -0.0552739  0.0582650  -0.949 0.343931    
## factor(MHCR_NPC)Current -0.0518744  0.0502307  -1.033 0.302975    
## CAP_NUM1                 0.0002109  0.0001279   1.649 0.100660    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2768 on 201 degrees of freedom
## Multiple R-squared:  0.2251, Adjusted R-squared:  0.1789 
## F-statistic: 4.866 on 12 and 201 DF,  p-value: 5.047e-07
## 
## 
## $log_LAVI
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1287 -0.1678  0.0117  0.1749  0.9616 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.202e+00  2.638e-01  12.138   <2e-16 ***
## Post_IMR                -3.935e-04  1.304e-03  -0.302   0.7631    
## AGE_AT_INCLUSION         2.012e-03  2.430e-03   0.828   0.4086    
## factor(DEMO_SEX)Female   1.079e-02  5.165e-02   0.209   0.8347    
## factor(MHCR_NY1)Yes      8.972e-02  4.797e-02   1.870   0.0630 .  
## VS_BMI1                  1.092e-03  6.404e-03   0.170   0.8648    
## factor(MHCR_NY2)Yes     -1.041e-01  4.522e-02  -2.302   0.0224 *  
## factor(MHCR_NY4)Yes      2.256e-01  2.166e-01   1.042   0.2988    
## factor(MHCR_NY11)Yes     2.405e-02  1.530e-01   0.157   0.8753    
## factor(MHCR_NY13)Yes    -4.597e-02  4.700e-02  -0.978   0.3293    
## factor(MHCR_NPC)Former   6.123e-03  6.457e-02   0.095   0.9245    
## factor(MHCR_NPC)Current  1.783e-02  5.457e-02   0.327   0.7442    
## CAP_NUM1                -7.802e-05  1.357e-04  -0.575   0.5661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2939 on 190 degrees of freedom
## Multiple R-squared:  0.08126,    Adjusted R-squared:  0.02323 
## F-statistic:   1.4 on 12 and 190 DF,  p-value: 0.1686
## 
## 
## $log_A4C_R_RR1
## 
## Call:
## lm(formula = model_formula, data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.28726 -0.16347 -0.00903  0.23255  0.58418 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              3.6204528  0.2662794  13.596   <2e-16 ***
## Post_IMR                 0.0010891  0.0013722   0.794   0.4285    
## AGE_AT_INCLUSION        -0.0064413  0.0024951  -2.582   0.0106 *  
## factor(DEMO_SEX)Female  -0.0353346  0.0531805  -0.664   0.5073    
## factor(MHCR_NY1)Yes     -0.0524587  0.0517049  -1.015   0.3117    
## VS_BMI1                  0.0023758  0.0062718   0.379   0.7053    
## factor(MHCR_NY2)Yes     -0.1173030  0.0481184  -2.438   0.0158 *  
## factor(MHCR_NY4)Yes     -0.3085844  0.2245066  -1.375   0.1710    
## factor(MHCR_NY11)Yes    -0.2250985  0.1579848  -1.425   0.1560    
## factor(MHCR_NY13)Yes     0.0778675  0.0482985   1.612   0.1087    
## factor(MHCR_NPC)Former   0.1179118  0.0699447   1.686   0.0936 .  
## factor(MHCR_NPC)Current  0.0574994  0.0566515   1.015   0.3115    
## CAP_NUM1                -0.0001480  0.0001452  -1.020   0.3092    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3035 on 179 degrees of freedom
## Multiple R-squared:  0.1469, Adjusted R-squared:  0.08968 
## F-statistic: 2.568 on 12 and 179 DF,  p-value: 0.003665
## 
## 
## $log_A4C_CD_RR1
## [1] "Not enough data for: log_A4C_CD_RR1"
## 
## $log_A4C_CT_RR1
## [1] "Not enough data for: log_A4C_CT_RR1"

Septal E and mean e: - Age: Significant negative association. Older age is associated with lower - Septal e’ velocity (worse diastolic function). Hypertension: Significant. Hypertensive patients tend to have lower Septal e’. - Smoking: current and former smokers have a higher septal E’. Residual confousing or selection bias?

E/e ratio age, female, hypertension

LAVI: - hypercholesterolemia

# Check levels and counts for all your categorical vars
table(data_final2$MHCR_NY2)

## 
##  No Yes 
## 141 239

table(data_final2$MHCR_NY4)

## 
##  No Yes 
## 377   3

table(data_final2$MHCR_NY11)

## 
##  No Yes 
## 376   4

table(data_final2$MHCR_NY13)

## 
##  No Yes 
## 253 127

table(data_final2$MHCR_NY1)

## 
##  No Yes 
## 267 113

table(data_final2$DEMO_SEX)

## 
##   Male Female 
##    285     95

table(data_final2$MHCR_NPC)

## 
##   Never  Former Current 
##     106      65     209

table(data_final$HF_HOSP)

## 
##   0   1 
## 373   7

table(data_final$MORT_ALL)

## 
##   0   1 
## 336  44

table(data_final$REVASC)

## 
##   0   1 
## 337  43

table(data_final$NEW_ONSET_HF)

## 
##   0   1 
## 355  25

table(data_final$MHCR_NY2)

## 
##  No Yes 
## 141 239

table(data_final$MHCR_NY4)

## 
##  No Yes 
## 377   3

table(data_final$MHCR_NY10)

## 
##   0 
## 380

table(data_final$MHCR_NY11)

## 
##  No Yes 
## 376   4

table(data_final$MHCR_NY13)

## 
##  No Yes 
## 253 127

table(data_final$MHCR_NY1)

## 
##  No Yes 
## 267 113

table(data_final$DEMO_SEX)

## 
##   Male Female 
##    285     95

table(data_final$MHCR_NPC)

## 
##   Never  Former Current 
##     106      65     209

library(dplyr)
library(broom)
library(knitr)
library(kableExtra)

# Select only the needed columns and filter complete cases without modifying data_final
model_vars <- c("MRI1_INF_SZ_LV", "Post_IMR", "AGE_AT_INCLUSION", 
                "DEMO_SEX", "MHCR_NPC", "MHCR_NY1", "CAP_NUM1", "MHCR_NY11")

model_data <- data_final %>%
  select(all_of(model_vars)) %>%
  filter(complete.cases(.))

# Run the linear regression model on filtered data (no change to original dataset)
model_infarct <- lm(
  MRI1_INF_SZ_LV ~ Post_IMR + AGE_AT_INCLUSION + DEMO_SEX +
    MHCR_NPC + MHCR_NY1 + CAP_NUM1 + MHCR_NY11,
  data = model_data
)

# Tidy model output
tidy_res <- broom::tidy(model_infarct)

# Rename variables/factor terms only in the output table
rename_map <- c(
  "Post_IMR" = "Post IMR",
  "AGE_AT_INCLUSION" = "Age",
  "DEMO_SEXfemale" = "Sex: Female",
  "DEMO_SEXmale" = "Sex: Male (ref)",
  "MHCR_NPCformer" = "Smoking: Former",
  "MHCR_NPCnever" = "Smoking: Never",
  "MHCR_NPCcurrent" = "Smoking: Current",
  "MHCR_NY1yes" = "Hypertension: Yes",
  "MHCR_NY1no" = "Hypertension: No (ref)",
  "CAP_NUM1" = "Total Ischemic Time",
  "MHCR_NY11yes" = "Prior PCI: Yes",
  "MHCR_NY11no" = "Prior PCI: No (ref)"
)

# Create readable term names for factors
tidy_res$Term_Nice <- tidy_res$term
for (nm in names(rename_map)) {
  tidy_res$Term_Nice <- gsub(nm, rename_map[[nm]], tidy_res$Term_Nice)
}

# Create and display a nice HTML table with caption
tidy_res %>%
  select(Term = Term_Nice, Estimate = estimate, `Std. Error` = std.error, `t value` = statistic, `p value` = p.value) %>%
  kable(format = "html", caption = "Linear Regression Results: Infarct Size Model (No Dataset Changes)") %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))

Linear Regression Results: Infarct Size Model (No Dataset Changes)
Term	Estimate	Std. Error	t value	p value
(Intercept)	2.9478511	4.4972869	0.6554732	0.5131104
Post IMR	-0.0076330	0.0382051	-0.1997907	0.8418996
Age	0.1022016	0.0643504	1.5882046	0.1142266
DEMO_SEXFemale	1.1208379	1.5568780	0.7199267	0.4726271
MHCR_NPCFormer	-1.9633776	1.8187009	-1.0795495	0.2819777
MHCR_NPCCurrent	-1.3414150	1.5259429	-0.8790729	0.3806883
MHCR_NY1Yes	-0.2376209	1.4442343	-0.1645307	0.8695223
Total Ischemic Time	0.0065865	0.0038386	1.7158668	0.0881342
MHCR_NY11Yes	3.9197696	4.6390524	0.8449505	0.3994081

no significant association between IMR en infract size

data_final <- data_final %>%
  mutate(
    MORT_ALL_DATE = format(mdy(MORT_ALL_DATE), "%d/%m/%Y"),
    NEW_ONSET_HF_DATE = format(mdy(NEW_ONSET_HF_DATE), "%d/%m/%Y"),
    HF_HOSP_DATE = format(mdy(HF_HOSP_DATE), "%d/%m/%Y"),
    REVASC_DATE = format(mdy(REVASC_DATE), "%d/%m/%Y")
  )

library(dplyr)
library(lubridate)

# Define your reference date
reference_date <- dmy("07/08/2015")

# Create a new dataset with calculated time differences in days
data_final_with_time_diff <- data_final %>%
  mutate(
    MORT_ALL_DATE = mdy(MORT_ALL_DATE),
    NEW_ONSET_HF_DATE = mdy(NEW_ONSET_HF_DATE),
    HF_HOSP_DATE = mdy(HF_HOSP_DATE),
    REVASC_DATE = mdy(REVASC_DATE),

    # Calculate difference in days
    days_since_mortality = as.numeric(MORT_ALL_DATE - reference_date),
    days_since_new_onset_HF = as.numeric(NEW_ONSET_HF_DATE - reference_date),
    days_since_HF_hosp = as.numeric(HF_HOSP_DATE - reference_date),
    days_since_revasc = as.numeric(REVASC_DATE - reference_date)
  )

New statistics

Anna Themistokleous

2025-08-02