# Calculate the mean e' using septal and lateral values
data_final$mean_e <- rowMeans(sapply(data_final[, c("ECHO_VAL52", "ECHO_VAL62")], as.numeric), na.rm = TRUE)
# Calculate the E/e' ratio using E and mean e' values
data_final$E_e_ratio <- data_final$ECHO_VAL32 / data_final$mean_e
# Compute the body surface area (BSA) where weight is in kg and height in cm
data_final$BSA <- 0.007184 * (data_final$VS_HGT1^0.725) * (data_final$VS_WGT1^0.425)
# Compute LAVI from LAV and BSA values
data_final$LAVI <- data_final$ECHO_VAL92 / data_final$BSA
# Classify E/e' status
data_final$E_e_class <- with(data_final, ifelse(is.na(E_e_ratio), NA,
ifelse(E_e_ratio < 8, "Normal",
ifelse(E_e_ratio <= 14, "Borderline", "Elevated"))))
library(dplyr)
data_final <- data_final %>%
rowwise() %>%
mutate(
# ASE flags (ensure numeric comparisons; convert factors/characters if needed)
LAVI_flag = ifelse(!is.na(LAVI), LAVI > 34, NA),
E_e_flag = ifelse(!is.na(E_e_ratio), E_e_ratio > 14, NA),
septal_e_flag = ifelse(!is.na(ECHO_VAL52), ECHO_VAL52 < 7, NA),
lateral_e_flag = ifelse(!is.na(ECHO_VAL62), ECHO_VAL62 < 10, NA),
# Count how many ASE criteria are positive
criteria_positive = sum(c(LAVI_flag, E_e_flag, septal_e_flag, lateral_e_flag), na.rm = TRUE),
# LA strain evaluation
strain_comment = case_when(
is.na(A4C_R_RR1) ~ "Reservoir strain missing",
A4C_R_RR1 < 18 ~ "Abnormal LA strain – supports dysfunction",
A4C_R_RR1 >= 18 ~ "Normal LA strain",
TRUE ~ "Strain unclear"
),
# Base dysfunction classification
diastolic_function = case_when(
criteria_positive >= 2 ~ "Diastolic Dysfunction Present",
criteria_positive == 1 ~ "Indeterminate Diastolic Function",
criteria_positive == 0 ~ "Normal Diastolic Function",
TRUE ~ "Insufficient Data"
),
# Grading diastolic dysfunction
diastolic_grade = case_when(
diastolic_function == "Normal Diastolic Function" ~ "Grade 0 (Normal)",
diastolic_function == "Indeterminate Diastolic Function" ~ "Indeterminate",
# Grade I: low E/e', normal LAVI, normal LA strain
E_e_ratio <= 14 & LAVI <= 34 & A4C_R_RR1 >= 18 ~ "Grade I (Impaired relaxation)",
# Grade III: assume only if reservoir strain is very low and E/e′ very high
E_e_ratio >= 20 & A4C_R_RR1 < 12 ~ "Grade III (Restrictive)",
# Grade II: 2+ abnormal criteria or abnormal LA strain
criteria_positive >= 2 | A4C_R_RR1 < 18 ~ "Grade II (Pseudonormal)",
TRUE ~ "Cannot determine grade"
)
) %>%
ungroup()
# Replace commas with dots and convert to numeric
cols_to_fix <- c("Pre_QFR", "Pre_Length", "Pre_FlowVelocity")
data_final <- data_final %>%
mutate(across(all_of(cols_to_fix), ~ as.numeric(gsub(",", ".", .))))
data_final <- data_final%>%
mutate(
CMD = case_when(
is.na(Post_IMR) ~ NA_real_, # Preserve NA if IMR is missing
Post_IMR >= 25 ~ 1, # CMD present
Post_IMR < 25 ~ 0 # CMD absent
)
)
# Load necessary libraries
library(dplyr)
# Define continuous variables
cont_vars <- c(
"AGE_AT_INCLUSION", "Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1", "Complete_LVEF",
"ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
"A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1", "HBA1C_22",
"LDL2", "EGFR2", "VS_HGT1", "VS_WGT1", "VS_meanBP1", "VS_meanBP2", "Pre_QFR",
"Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity", "Post_Length",
"Post_FlowVelocity", "BSA"
)
# Ensure numeric conversion where needed
data_final <- data_final %>%
mutate(across(all_of(cont_vars), ~ as.numeric(gsub(",", ".", .))))
# Descriptive statistics
cont_summary <- cont_vars %>%
setNames(cont_vars) %>%
lapply(function(var) {
values <- data_final[[var]]
values <- values[!is.na(values)]
if (length(values) == 0) {
return(data.frame(
Variable = var,
Valid_n = 0,
Mean = NA, SD = NA, Median = NA,
Q1 = NA, Q3 = NA,
IQR = NA, Min = NA, Max = NA,
`Median [IQR]` = NA,
`Min–Max` = NA
))
}
q <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE)
med <- median(values)
data.frame(
Variable = var,
Valid_n = length(values),
Mean = mean(values),
SD = sd(values),
Median = med,
Q1 = q[1],
Q3 = q[2],
IQR = q[2] - q[1],
Min = min(values),
Max = max(values),
`Median [IQR]` = paste0(round(med, 1), " [", round(q[1], 1), "–", round(q[2], 1), "]"),
`Min–Max` = paste0(round(min(values), 1), "–", round(max(values), 1))
)
}) %>%
bind_rows() %>%
mutate(across(c(Mean, SD, Median, Q1, Q3, IQR, Min, Max), ~ round(., 1)))
# Rename variables for readability
rename_map <- c(
"ECHO_VAL52" = "Septal e'",
"AGE_AT_INCLUSION"= "Age at inclusion",
"ECHO_VAL62" = "Lateral e'",
"mean_e" = "Mean e'",
"E_e_ratio" = "E/e' ratio",
"ECHO_VAL92" = "LAV",
"LAVI" = "LAVI",
"A4C_R_RR1" = "Reservoir strain",
"A4C_CD_RR1" = "Conduit Strain",
"A4C_CT_RR1" = "Contraction Strain",
"ECHO_VAL32" = "E",
"CAP_NUM1" = "Total ischemic time",
"Complete_LVEF" = "Left ventricular ejection fraction",
"VS_WGT1" = "Weight",
"VS_HGT1" = "Height",
"VS_SBP1" = "Pre Systolic blood pressure",
"VS_DBP1" = "Pre Diastolic blood pressure",
"VS_meanBP1" = "Pre Mean blood pressure",
"VS_SBP2" = "Post Systolic blood pressure",
"VS_DBP2" = "Post Diastolic blood pressure",
"VS_meanBP2" = "Post Mean blood pressure",
"Pre_QFR" = "Pre QFR",
"Post_QFR" = "Post QFR",
"Pre_FlowVelocity" = "Pre flow velocity",
"Post_FlowVelocity" = "Post flow velocity",
"Pre_IMR" = "Pre IMR",
"Post_IMR" = "Post IMR",
"BSA" = "BSA",
"EGFR2" = "Renal function (eGFR)",
"MRI1_INF_SZ_LV" = "Infract size (%)",
"VS_BMI1" = "BMI",
"LDL2" = "LDL cholesterol",
"HBA1C_22" = "HbA1c",
"Pre_Length" = "Pre vessel length",
"Post_Length" = "Post vessel length"
)
# Apply renaming
cont_summary$Variable <- rename_map[cont_summary$Variable]
cont_summary$Variable[is.na(cont_summary$Variable)] <- cont_summary$Variable[is.na(cont_summary$Variable)]
# Print the final summary
print(cont_summary)
## Variable Valid_n Mean SD Median Q1
## 25%...1 Age at inclusion 380 61.8 11.7 62.0 53.0
## 25%...2 Post IMR 249 35.1 16.3 35.0 22.0
## 25%...3 Infract size (%) 257 8.8 7.7 7.0 2.1
## 25%...4 Total ischemic time 380 211.6 153.6 161.0 109.0
## 25%...5 Left ventricular ejection fraction 372 48.2 9.5 48.0 42.8
## 25%...6 Septal e' 337 7.6 2.1 7.6 6.1
## 25%...7 Lateral e' 336 9.4 3.0 9.1 7.3
## 25%...8 Mean e' 340 8.5 2.3 8.4 6.8
## 25%...9 E/e' ratio 331 0.1 0.0 0.1 0.1
## 25%...10 LAV 303 55.6 16.7 53.0 44.0
## 25%...11 LAVI 303 27.8 8.1 26.1 22.4
## 25%...12 Reservoir strain 288 27.5 8.0 27.0 22.0
## 25%...13 Conduit Strain 288 -14.8 6.0 -14.0 -18.0
## 25%...14 Contraction Strain 288 -12.4 6.1 -12.0 -16.0
## 25%...15 E 344 0.7 0.2 0.7 0.5
## 25%...16 BMI 380 26.9 3.8 26.6 24.2
## 25%...17 HbA1c 374 41.1 8.6 40.0 38.0
## 25%...18 LDL cholesterol 377 3.8 1.0 3.8 3.2
## 25%...19 Renal function (eGFR) 333 93.8 21.4 93.0 80.0
## 25%...20 Height 380 176.5 9.0 176.0 170.0
## 25%...21 Weight 380 84.2 14.3 82.2 75.0
## 25%...22 Pre Mean blood pressure 354 103.3 16.3 102.5 93.0
## 25%...23 Post Mean blood pressure 326 94.8 16.4 94.0 82.0
## 25%...24 Pre QFR 96 0.6 0.2 0.6 0.5
## 25%...25 Post QFR 279 1.0 0.1 1.0 1.0
## 25%...26 Pre IMR 89 26.3 16.8 22.0 14.0
## 25%...27 Pre vessel length 96 64.9 14.7 61.8 56.4
## 25%...28 Pre flow velocity 96 17.1 8.3 14.8 10.8
## 25%...29 Post vessel length 279 67.5 14.3 64.8 58.0
## 25%...30 Post flow velocity 279 18.9 10.4 16.1 11.9
## 25%...31 BSA 380 2.0 0.2 2.0 1.9
## Q3 IQR Min Max Median..IQR. Min.Max
## 25%...1 70.0 17.0 27.0 93.0 62 [53–70] 27–93
## 25%...2 46.0 24.0 8.0 94.0 35 [22–46] 8–94
## 25%...3 13.6 11.5 0.0 32.2 7 [2.1–13.6] 0–32.2
## 25%...4 249.2 140.2 30.0 759.0 161 [109–249.2] 30–759
## 25%...5 55.0 12.2 15.0 72.0 48 [42.8–55] 15–72
## 25%...6 9.0 2.9 2.3 13.7 7.6 [6.1–9] 2.3–13.7
## 25%...7 11.3 4.0 2.8 19.1 9.1 [7.3–11.3] 2.8–19.1
## 25%...8 9.9 3.1 3.5 14.8 8.4 [6.8–9.9] 3.5–14.8
## 25%...9 0.1 0.0 0.0 0.2 0.1 [0.1–0.1] 0–0.2
## 25%...10 65.0 21.0 16.0 134.0 53 [44–65] 16–134
## 25%...11 32.0 9.6 8.2 65.2 26.1 [22.4–32] 8.2–65.2
## 25%...12 33.0 11.0 4.0 57.0 27 [22–33] 4–57
## 25%...13 -11.0 7.0 -33.0 13.0 -14 [-18–-11] -33–13
## 25%...14 -8.0 8.0 -37.0 21.0 -12 [-16–-8] -37–21
## 25%...15 0.8 0.3 0.3 1.2 0.7 [0.5–0.8] 0.3–1.2
## 25%...16 29.2 5.0 18.6 49.6 26.6 [24.2–29.2] 18.6–49.6
## 25%...17 42.0 4.0 28.0 123.0 40 [38–42] 28–123
## 25%...18 4.4 1.2 1.3 9.0 3.8 [3.2–4.4] 1.3–9
## 25%...19 106.0 26.0 46.0 170.0 93 [80–106] 46–170
## 25%...20 183.0 13.0 153.0 204.0 176 [170–183] 153–204
## 25%...21 94.0 19.0 53.0 147.0 82.2 [75–94] 53–147
## 25%...22 113.0 20.0 61.0 154.0 102.5 [93–113] 61–154
## 25%...23 106.8 24.8 55.0 166.0 94 [82–106.8] 55–166
## 25%...24 0.7 0.2 0.2 1.0 0.6 [0.5–0.7] 0.2–1
## 25%...25 1.0 0.0 0.6 1.0 1 [1–1] 0.6–1
## 25%...26 34.0 20.0 4.0 107.0 22 [14–34] 4–107
## 25%...27 71.0 14.6 28.3 107.1 61.8 [56.4–71] 28.3–107.1
## 25%...28 21.9 11.1 3.9 44.0 14.8 [10.8–21.9] 3.9–44
## 25%...29 74.0 16.0 12.4 124.9 64.8 [58–74] 12.4–124.9
## 25%...30 23.1 11.2 6.5 86.6 16.1 [11.9–23.1] 6.5–86.6
## 25%...31 2.1 0.3 1.5 2.7 2 [1.9–2.1] 1.5–2.7
library(dplyr)
library(ggplot2)
# Perform Shapiro-Wilk normality test and store p-values
normality_results <- lapply(cont_vars, function(var) {
values <- data_final[[var]]
values <- values[!is.na(values)]
if (length(values) < 3) {
return(data.frame(Variable = var, W = NA, p_value = NA))
}
test <- shapiro.test(values)
data.frame(Variable = var, W = as.numeric(test$statistic), p_value = as.numeric(test$p.value))
}) %>% bind_rows()
# Apply human-readable variable names
normality_results$Variable <- rename_map[normality_results$Variable]
normality_results$Variable[is.na(normality_results$Variable)] <- cont_vars[is.na(rename_map[normality_results$Variable])]
# Print final result
print(normality_results)
## Variable W p_value
## 1 Age at inclusion 0.9913013 2.476416e-02
## 2 Post IMR 0.9670925 1.681776e-05
## 3 Infract size (%) 0.9149106 6.299865e-11
## 4 Total ischemic time 0.8068021 5.598844e-21
## 5 Left ventricular ejection fraction 0.9919819 4.237354e-02
## 6 Septal e' 0.9920835 6.974405e-02
## 7 Lateral e' 0.9877486 6.177146e-03
## 8 Mean e' 0.9890059 1.152987e-02
## 9 E/e' ratio 0.8960622 2.928381e-14
## 10 LAV 0.9607447 2.742724e-07
## 11 LAVI 0.9604630 2.513231e-07
## 12 Reservoir strain 0.9877433 1.523506e-02
## 13 Conduit Strain 0.9751776 6.750848e-05
## 14 Contraction Strain 0.9601121 4.146433e-07
## 15 E 0.9831892 4.805298e-04
## 16 BMI 0.9534062 1.357699e-09
## 17 HbA1c 0.4972163 3.602033e-31
## 18 LDL cholesterol 0.9650271 7.681380e-08
## 19 Renal function (eGFR) 0.9874452 5.547691e-03
## 20 Height 0.9946164 2.046767e-01
## 21 Weight 0.9818021 1.011212e-04
## 22 Pre Mean blood pressure 0.9934416 1.272261e-01
## 23 Post Mean blood pressure 0.9822412 4.667785e-04
## 24 Pre QFR 0.9798262 1.459434e-01
## 25 Post QFR 0.6491108 1.376403e-23
## 26 Pre IMR 0.8616745 1.270658e-07
## 27 Pre vessel length 0.9486955 9.043549e-04
## 28 Pre flow velocity 0.9125430 8.519495e-06
## 29 Post vessel length 0.9343518 8.692440e-10
## 30 Post flow velocity 0.8085450 7.694020e-18
## 31 BSA 0.9933110 9.005425e-02
Shapiro-Wilk normality test results: variables that do not significantly variate from the normal include septal e’, age, left ventricular ejection fraction, height, pre mean blooed pressure, pre QFR and BSA.
library(ggplot2)
# Histograms
for (var in cont_vars) {
if (!var %in% colnames(data_final)) {
message(paste("Variable", var, "not found in dataframe! Skipping..."))
next
}
var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
p <- ggplot(data_final, aes_string(x = var)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
ggtitle(paste0("Histogram of ", var_name)) +
theme_minimal()
print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Boxplots
for (var in cont_vars) {
if (!var %in% colnames(data_final)) {
message(paste("Variable", var, "not found in dataframe! Skipping..."))
next
}
var_name <- ifelse(is.na(rename_map[[var]]), var, rename_map[[var]])
p <- ggplot(data_final, aes_string(x = "1", y = var)) +
geom_boxplot(fill = "steelblue", color = "black", outlier.color = "red", outlier.shape = 16) +
labs(title = paste("Boxplot of", var_name),
x = "",
y = var_name) +
theme_minimal() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
print(p)
}
## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 123 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 43 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 44 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 40 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 49 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 77 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 92 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 36 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 6 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 47 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 26 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 291 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 284 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 101 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
library(dplyr)
# 1. Categorical variables
cat_vars <- c(
"MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
"MHCR_NPC", # Smoking
"MHCR_NY1", # Hypertension
"MHCR_NY2", # Hypercholesterolemia
"MHCR_NY4", # CVA
"MHCR_NY10", # CABG
"MHCR_NY11", # PCI (other)
"MHCR_NY13", # Cardiovascular Family History
"diastolic_function", "diastolic_grade", "CMD",
"E_e_class", "criteria_positive", "strain_comment"
)
# 2. Variable label mapping
rename_map <- c(
MORT_ALL = "All-cause mortality",
REVASC = "Revascularization",
NEW_ONSET_HF = "New-onset heart failure",
HF_HOSP = "HF hospitalization",
DEMO_SEX = "Sex",
MHCR_NPC = "Smoking",
MHCR_NY1 = "Hypertension",
MHCR_NY2 = "Hypercholesterolemia",
MHCR_NY4 = "CVA (Stroke)",
MHCR_NY10 = "CABG history",
MHCR_NY11 = "PCI (other vessels)",
MHCR_NY13 = "Cardiovascular family history <65 years",
diastolic_function = "Diastolic function",
diastolic_grade = "Diastolic grade",
CMD = "Coronary microvascular dysfunction",
E_e_class = "E/e' classification",
criteria_positive = "diastolic dysfunction criteria positive",
strain_comment = "Strain interpretation"
)
# 3. Binary variable names to convert 0 = No, 1 = Yes
binary_vars <- c(
"All-cause mortality", "Revascularization", "New-onset heart failure",
"HF hospitalization", "Hypertension", "Hypercholesterolemia",
"CVA (Stroke)", "CABG history", "PCI (other vessels)",
"Cardiovascular family history <65 years", "Coronary microvascular dysfunction"
)
# 4. Initialize list to hold summaries
summary_list <- list()
# 5. Loop through variables
for (var in cat_vars) {
if (!var %in% colnames(data_final)) {
message(paste("Variable", var, "not found. Skipping..."))
next
}
label <- rename_map[[var]]
counts <- table(data_final[[var]], useNA = "ifany")
percentages <- prop.table(counts) * 100
df_summary <- data.frame(
Variable = label,
Category = names(counts),
Count = as.vector(counts),
Percentage = round(as.vector(percentages), 2),
stringsAsFactors = FALSE
)
# Recode categories
if (label %in% binary_vars) {
df_summary$Category[df_summary$Category == "0"] <- "No"
df_summary$Category[df_summary$Category == "1"] <- "Yes"
} else if (label == "Sex") {
df_summary$Category[df_summary$Category == "1"] <- "Male"
df_summary$Category[df_summary$Category == "2"] <- "Female"
} else if (label == "Smoking") {
df_summary$Category[df_summary$Category == "1"] <- "Never smoked"
df_summary$Category[df_summary$Category == "2"] <- "Former smoker"
df_summary$Category[df_summary$Category == "3"] <- "Current smoker"
}
summary_list[[label]] <- df_summary
}
# 6. Combine and clean table
summary_table <- bind_rows(summary_list)
summary_table <- summary_table %>%
group_by(Variable) %>%
mutate(Variable = ifelse(row_number() == 1, Variable, "")) %>%
ungroup()
# 7. View the final table
print(summary_table)
## # A tibble: 43 × 4
## Variable Category Count Percentage
## <chr> <chr> <int> <dbl>
## 1 "All-cause mortality" No 336 88.4
## 2 "" Yes 44 11.6
## 3 "Revascularization" No 337 88.7
## 4 "" Yes 43 11.3
## 5 "New-onset heart failure" No 355 93.4
## 6 "" Yes 25 6.58
## 7 "HF hospitalization" No 373 98.2
## 8 "" Yes 7 1.84
## 9 "Sex" Male 285 75
## 10 "" Female 95 25
## # ℹ 33 more rows
# Optional: Export
# write.csv(summary_table, "categorical_summary_final.csv", row.names = FALSE)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(dplyr)
library(knitr)
summary_table <- summary_table %>%
mutate(VarGroup = cumsum(Variable != "")) # mark groups
colors <- rep(c("#F0F8FF", "#FAFAD2"), length.out = length(unique(summary_table$VarGroup)))
row_colors <- colors[summary_table$VarGroup]
summary_table %>%
select(-VarGroup) %>%
kable("html", escape = FALSE, align = "lccc") %>%
kable_styling(full_width = FALSE, position = "left") %>%
row_spec(0, bold = TRUE, background = "#D3D3D3") %>%
row_spec(1:nrow(summary_table), background = row_colors)
| Variable | Category | Count | Percentage |
|---|---|---|---|
| All-cause mortality | No | 336 | 88.42 |
| Yes | 44 | 11.58 | |
| Revascularization | No | 337 | 88.68 |
| Yes | 43 | 11.32 | |
| New-onset heart failure | No | 355 | 93.42 |
| Yes | 25 | 6.58 | |
| HF hospitalization | No | 373 | 98.16 |
| Yes | 7 | 1.84 | |
| Sex | Male | 285 | 75.00 |
| Female | 95 | 25.00 | |
| Smoking | Never smoked | 106 | 27.89 |
| Former smoker | 65 | 17.11 | |
| Current smoker | 209 | 55.00 | |
| Hypertension | No | 267 | 70.26 |
| Yes | 113 | 29.74 | |
| Hypercholesterolemia | No | 141 | 37.11 |
| Yes | 239 | 62.89 | |
| CVA (Stroke) | No | 377 | 99.21 |
| Yes | 3 | 0.79 | |
| CABG history | No | 380 | 100.00 |
| PCI (other vessels) | No | 376 | 98.95 |
| Yes | 4 | 1.05 | |
| Cardiovascular family history | No | 253 | 66.58 |
| Yes | 127 | 33.42 | |
| Diastolic function | Diastolic Dysfunction Present | 127 | 33.42 |
| Indeterminate Diastolic Function | 114 | 30.00 | |
| Normal Diastolic Function | 139 | 36.58 | |
| Diastolic grade | Grade 0 (Normal) | 139 | 36.58 |
| Grade I (Impaired relaxation) | 50 | 13.16 | |
| Grade II (Pseudonormal) | 77 | 20.26 | |
| Indeterminate | 114 | 30.00 | |
| Coronary microvascular dysfunction | No | 79 | 20.79 |
| Yes | 170 | 44.74 | |
| NA | 131 | 34.47 | |
| E/e’ classification | Normal | 331 | 87.11 |
| NA | 49 | 12.89 | |
| diastolic dysfunction criteria positive | 0 | 139 | 36.58 |
| 1 | 114 | 30.00 | |
| 2 | 104 | 27.37 | |
| 3 | 23 | 6.05 | |
| Strain interpretation | Abnormal LA strain – supports dysfunction | 22 | 5.79 |
| Normal LA strain | 266 | 70.00 | |
| Reservoir strain missing | 92 | 24.21 |
library(dplyr)
normal_vars <- c(
"ECHO_VAL52", # Septal e'
"AGE_AT_INCLUSION",
"Complete_LVEF",
"VS_HGT1", # Height
"VS_meanBP1", # Pre mean blood pressure
"Pre_QFR",
"BSA"
)
for (var in normal_vars) {
# Select the variable and CMD
data <- data_final %>%
select(CMD, !!sym(var)) %>%
filter(!is.na(CMD) & !is.na(!!sym(var)))
# Separate into CMD groups
group0 <- data %>% filter(CMD == 0) %>% pull(!!sym(var))
group1 <- data %>% filter(CMD == 1) %>% pull(!!sym(var))
# ✅ CHECK: do both groups have at least 2 values?
if (length(group0) < 2 || length(group1) < 2) {
cat("\n⚠️ Skipping", var, "- not enough data in one of the groups\n")
next # Skip to next variable
}
# Run t-test
test <- t.test(group0, group1)
# Print result
cat("\nVariable:", var,
"\nTest: t-test",
"\nMean (CMD = 0):", round(mean(group0), 2),
"\nMean (CMD = 1):", round(mean(group1), 2),
"\nP-value:", round(test$p.value, 4), "\n")
}
##
## Variable: ECHO_VAL52
## Test: t-test
## Mean (CMD = 0): 7.82
## Mean (CMD = 1): 7.72
## P-value: 0.7463
##
## Variable: AGE_AT_INCLUSION
## Test: t-test
## Mean (CMD = 0): 59.8
## Mean (CMD = 1): 61.02
## P-value: 0.3944
##
## Variable: Complete_LVEF
## Test: t-test
## Mean (CMD = 0): 47.53
## Mean (CMD = 1): 48.27
## P-value: 0.5586
##
## Variable: VS_HGT1
## Test: t-test
## Mean (CMD = 0): 175.58
## Mean (CMD = 1): 177.5
## P-value: 0.1333
##
## Variable: VS_meanBP1
## Test: t-test
## Mean (CMD = 0): 99.84
## Mean (CMD = 1): 104.32
## P-value: 0.0601
##
## Variable: Pre_QFR
## Test: t-test
## Mean (CMD = 0): 0.56
## Mean (CMD = 1): 0.6
## P-value: 0.347
##
## Variable: BSA
## Test: t-test
## Mean (CMD = 0): 1.98
## Mean (CMD = 1): 2.01
## P-value: 0.2636
# Create a summary data frame with your t-test results
results_df <- data.frame(
Variable = c("ECHO_VAL52", "AGE_AT_INCLUSION", "Complete_LVEF", "VS_HGT1",
"VS_meanBP1", "Pre_QFR", "BSA"),
Mean_CMD_0 = c(7.82, 59.8, 47.53, 175.58, 99.84, 0.56, 1.98),
Mean_CMD_1 = c(7.72, 61.02, 48.27, 177.5, 104.32, 0.6, 2.01),
P_value = c(0.7463, 0.3944, 0.5586, 0.1333, 0.0601, 0.347, 0.2636)
)
# Print the table nicely
print(results_df)
## Variable Mean_CMD_0 Mean_CMD_1 P_value
## 1 ECHO_VAL52 7.82 7.72 0.7463
## 2 AGE_AT_INCLUSION 59.80 61.02 0.3944
## 3 Complete_LVEF 47.53 48.27 0.5586
## 4 VS_HGT1 175.58 177.50 0.1333
## 5 VS_meanBP1 99.84 104.32 0.0601
## 6 Pre_QFR 0.56 0.60 0.3470
## 7 BSA 1.98 2.01 0.2636
cat(
"- septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups for Septal e'\n",
"- Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant age difference between groups.\n",
"- Complete_LVEF:Means close, p > 0.05 → no significant difference in left ventricular ejection fraction between groups.\n",
"- Height: Slightly taller in CMD=1 group, but p > 0.05 → no significant height difference. \n",
"- Pre mean blood pressure: Mean blood pressure somewhat higher in CMD=1; p-value close to 0.05 but still not quite statistically significant. \n",
"- Pre QFR: Small mean difference, p > 0.05 → no significant difference in Pre_QFR between groups.\n",
"- BSA: Means very close; p > 0.05 → no significant difference in BSA.\n"
)
## - septal e': Means are very similar; high p-value (>0.05) means no statistically significant difference between CMD groups for Septal e'
## - Age: Slightly higher mean age in CMD=1, but p-value > 0.05 → no significant age difference between groups.
## - Complete_LVEF:Means close, p > 0.05 → no significant difference in left ventricular ejection fraction between groups.
## - Height: Slightly taller in CMD=1 group, but p > 0.05 → no significant height difference.
## - Pre mean blood pressure: Mean blood pressure somewhat higher in CMD=1; p-value close to 0.05 but still not quite statistically significant.
## - Pre QFR: Small mean difference, p > 0.05 → no significant difference in Pre_QFR between groups.
## - BSA: Means very close; p > 0.05 → no significant difference in BSA.
library(dplyr)
non_normal_vars <- c(
"Post_IMR", "MRI1_INF_SZ_LV", "CAP_NUM1",
"ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI",
"A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32", "VS_BMI1",
"HBA1C_22", "LDL2", "EGFR2", "VS_WGT1", "VS_meanBP2",
"Post_QFR", "Pre_IMR", "Pre_Length", "Pre_FlowVelocity",
"Post_Length", "Post_FlowVelocity"
)
rename_map <- c(
"ECHO_VAL52" = "Septal e'",
"AGE_AT_INCLUSION"= "Age at inclusion",
"ECHO_VAL62" = "Lateral e'",
"mean_e" = "Mean e'",
"E_e_ratio" = "E/e' ratio",
"ECHO_VAL92" = "LAV",
"LAVI" = "LAVI",
"A4C_R_RR1" = "Reservoir strain",
"A4C_CD_RR1" = "Conduit Strain",
"A4C_CT_RR1" = "Contraction Strain",
"ECHO_VAL32" = "E",
"CAP_NUM1" = "Total ischemic time",
"Complete_LVEF" = "Left ventricular ejection fraction",
"VS_WGT1" = "Weight",
"VS_HGT1" = "Height",
"VS_SBP1" = "Pre Systolic blood pressure",
"VS_DBP1" = "Pre Diastolic blood pressure",
"VS_meanBP1" = "Pre Mean blood pressure",
"VS_SBP2" = "Post Systolic blood pressure",
"VS_DBP2" = "Post Diastolic blood pressure",
"VS_meanBP2" = "Post Mean blood pressure",
"Pre_QFR" = "Pre QFR",
"Post_QFR" = "Post QFR",
"Pre_FlowVelocity" = "Pre flow velocity",
"Post_FlowVelocity" = "Post flow velocity",
"Pre_IMR" = "Pre IMR",
"Post_IMR" = "Post IMR",
"BSA" = "BSA",
"EGFR2" = "Renal function (eGFR)",
"MRI1_INF_SZ_LV" = "Infract size (%)",
"VS_BMI1" = "BMI",
"LDL2" = "LDL cholesterol",
"HBA1C_22" = "HbA1c",
"Pre_Length" = "Pre vessel length",
"Post_Length" = "Post vessel length"
)
results <- data.frame(
Variable = character(),
Median_CMD_0 = numeric(),
Median_CMD_1 = numeric(),
P_value = numeric(),
stringsAsFactors = FALSE
)
for (var in non_normal_vars) {
data <- data_final %>%
select(CMD, !!sym(var)) %>%
filter(!is.na(CMD) & !is.na(!!sym(var)))
group0 <- data %>% filter(CMD == 0) %>% pull(!!sym(var))
group1 <- data %>% filter(CMD == 1) %>% pull(!!sym(var))
if (length(group0) < 2 || length(group1) < 2) {
warning(paste("Skipping", var, "- not enough data"))
next
}
test <- wilcox.test(group0, group1, exact=FALSE)
results <- rbind(
results,
data.frame(
Variable = rename_map[var], # Rename variable here
Median_CMD_0 = round(median(group0), 2),
Median_CMD_1 = round(median(group1), 2),
P_value = round(test$p.value, 4)
)
)
}
print(results)
## Variable Median_CMD_0 Median_CMD_1 P_value
## Post_IMR Post IMR 18.00 41.50 0.0000
## MRI1_INF_SZ_LV Infract size (%) 8.12 8.32 0.9959
## CAP_NUM1 Total ischemic time 140.00 166.50 0.1062
## ECHO_VAL62 Lateral e' 9.40 9.35 0.5952
## mean_e Mean e' 8.50 8.45 0.6994
## E_e_ratio E/e' ratio 0.08 0.07 0.9503
## ECHO_VAL92 LAV 50.00 53.50 0.3664
## LAVI LAVI 26.23 25.97 0.6977
## A4C_R_RR1 Reservoir strain 26.00 27.00 0.2607
## A4C_CD_RR1 Conduit Strain -14.00 -15.00 0.2798
## A4C_CT_RR1 Contraction Strain -11.50 -12.00 0.4012
## ECHO_VAL32 E 0.68 0.64 0.4456
## VS_BMI1 BMI 26.50 26.20 0.7753
## HBA1C_22 HbA1c 40.00 39.00 0.8711
## LDL2 LDL cholesterol 3.80 3.80 0.5575
## EGFR2 Renal function (eGFR) 92.50 94.00 0.7939
## VS_WGT1 Weight 80.00 81.50 0.4290
## VS_meanBP2 Post Mean blood pressure 87.00 96.00 0.0000
## Post_QFR Post QFR 0.98 0.99 0.0035
## Pre_IMR Pre IMR 17.00 25.00 0.0857
## Pre_Length Pre vessel length 61.80 62.95 0.3676
## Pre_FlowVelocity Pre flow velocity 19.30 14.60 0.4604
## Post_Length Post vessel length 64.10 65.80 0.9683
## Post_FlowVelocity Post flow velocity 27.00 13.20 0.0000
Interpretation of results: 1. Post IMR: Significant difference; CMD=1 group has much higher Post IMR values.
Post Mean blood pressure: Significant difference; CMD=1 group has higher post mean blood pressure.
Post QFR: Significant different; CMD=1 group has higher post QFR
Post flow velocity: Significant difference; CMD=1 group has markedly lower post flow velocity.
Pre IMR: Trend toward higher Pre IMR in CMD=1 group, but not statistically significant.
Infract size:: No difference between groups; very similar infarct sizes.
Total ischemic time:: No statistically significant difference, though CMD=1 tends to have longer ischemic time.
all other cont. variables:: no statistically significant difference.
library(dplyr)
# 1. Variables
cat_vars <- c(
"MORT_ALL", "REVASC", "NEW_ONSET_HF", "HF_HOSP", "DEMO_SEX",
"MHCR_NPC", "MHCR_NY1", "MHCR_NY2", "MHCR_NY4",
"MHCR_NY10", "MHCR_NY11", "MHCR_NY13",
"diastolic_function", "diastolic_grade",
"E_e_class", "criteria_positive", "strain_comment"
)
# 2. Mapping readable labels
rename_map <- c(
MORT_ALL = "All-cause mortality",
REVASC = "Revascularization",
NEW_ONSET_HF = "New-onset heart failure",
HF_HOSP = "HF hospitalization",
DEMO_SEX = "Sex",
MHCR_NPC = "Smoking",
MHCR_NY1 = "Hypertension",
MHCR_NY2 = "Hypercholesterolemia",
MHCR_NY4 = "CVA (Stroke)",
MHCR_NY10 = "CABG history",
MHCR_NY11 = "PCI (other vessels)",
MHCR_NY13 = "Cardiovascular family history <65 years",
diastolic_function = "Diastolic function",
diastolic_grade = "Diastolic grade",
E_e_class = "E/e' classification",
criteria_positive = "Diastolic dysfunction criteria positive",
strain_comment = "Strain interpretation"
)
# 3. Run Chi-square or Fisher's test for each variable
results <- data.frame(
Variable = character(),
P_value = numeric(),
Test = character(),
stringsAsFactors = FALSE
)
for (var in cat_vars) {
tbl <- table(data_final[[var]], data_final$CMD)
# Skip if only 1 row level
if (nrow(tbl) < 2) {
warning(paste("Skipping", var, "- only one category present"))
next
}
# Attempt Chi-square, fallback to Fisher if needed
expected <- suppressWarnings(chisq.test(tbl)$expected)
if (any(expected < 5)) {
test <- fisher.test(tbl)
test_name <- "Fisher's exact"
} else {
test <- chisq.test(tbl)
test_name <- "Chi-square"
}
# Append results
results <- rbind(
results,
data.frame(
Variable = rename_map[[var]],
P_value = round(test$p.value, 4),
Test = test_name,
stringsAsFactors = FALSE
)
)
}
## Warning: Skipping MHCR_NY10 - only one category present
## Warning: Skipping E_e_class - only one category present
# 4. Print table
print(results)
## Variable P_value Test
## 1 All-cause mortality 0.8900 Chi-square
## 2 Revascularization 0.3971 Chi-square
## 3 New-onset heart failure 0.8140 Chi-square
## 4 HF hospitalization 1.0000 Fisher's exact
## 5 Sex 0.4337 Chi-square
## 6 Smoking 0.9920 Chi-square
## 7 Hypertension 0.9179 Chi-square
## 8 Hypercholesterolemia 0.3756 Chi-square
## 9 CVA (Stroke) 0.5348 Fisher's exact
## 10 PCI (other vessels) 0.5935 Fisher's exact
## 11 Cardiovascular family history <65 years 0.3128 Chi-square
## 12 Diastolic function 0.8834 Chi-square
## 13 Diastolic grade 0.9236 Chi-square
## 14 Diastolic dysfunction criteria positive 0.9591 Fisher's exact
## 15 Strain interpretation 0.8597 Fisher's exact
# Optional: Pretty table using kableExtra
# install.packages("kableExtra") # if not installed
library(kableExtra)
results %>%
kable("html", align = "lcc", caption = "Bivariate analysis of categorical variables by CMD status") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
| Variable | P_value | Test |
|---|---|---|
| All-cause mortality | 0.8900 | Chi-square |
| Revascularization | 0.3971 | Chi-square |
| New-onset heart failure | 0.8140 | Chi-square |
| HF hospitalization | 1.0000 | Fisher’s exact |
| Sex | 0.4337 | Chi-square |
| Smoking | 0.9920 | Chi-square |
| Hypertension | 0.9179 | Chi-square |
| Hypercholesterolemia | 0.3756 | Chi-square |
| CVA (Stroke) | 0.5348 | Fisher’s exact |
| PCI (other vessels) | 0.5935 | Fisher’s exact |
| Cardiovascular family history <65 years | 0.3128 | Chi-square |
| Diastolic function | 0.8834 | Chi-square |
| Diastolic grade | 0.9236 | Chi-square |
| Diastolic dysfunction criteria positive | 0.9591 | Fisher’s exact |
| Strain interpretation | 0.8597 | Fisher’s exact |
Interpretation: p value >0.05 which means there is no significant association between CMD status and baseline characteristics i.e. no difference in mortality between CMD groups, similar revascularization rates, no association of new onset heart failure with CMD, HF hospitalizations show identical distributions - P value is 1, CMD not linked to sex differences, Smoking patterns are similar across CMD groups. No difference in diastolic function between groups.
results <- data.frame(
Variable = character(),
CMD_1 = character(),
CMD_0 = character(),
P_value = numeric(),
Test = character(),
stringsAsFactors = FALSE
)
for (var in cat_vars) {
tbl <- table(data_final[[var]], data_final$CMD)
if (nrow(tbl) < 2) {
warning(paste("Skipping", var, "- only one category present"))
next
}
# Choose statistical test
expected <- suppressWarnings(chisq.test(tbl)$expected)
if (any(expected < 5)) {
test <- fisher.test(tbl)
test_name <- "Fisher's exact"
} else {
test <- chisq.test(tbl)
test_name <- "Chi-square"
}
# Determine the "event" row (usually coded as 1, or "Yes", etc.)
rownames_tbl <- rownames(tbl)
target_row <- if ("1" %in% rownames_tbl) "1" else rownames_tbl[1]
# Calculate percentages for CMD = 1 and CMD = 0
perc_1 <- if ("1" %in% colnames(tbl)) {
100 * tbl[target_row, "1"] / sum(tbl[, "1"])
} else {
100 * tbl[target_row, 2] / sum(tbl[, 2])
}
perc_0 <- if ("0" %in% colnames(tbl)) {
100 * tbl[target_row, "0"] / sum(tbl[, "0"])
} else {
100 * tbl[target_row, 1] / sum(tbl[, 1])
}
results <- rbind(
results,
data.frame(
Variable = rename_map[[var]],
CMD_1 = sprintf("%.1f%%", perc_1),
CMD_0 = sprintf("%.1f%%", perc_0),
P_value = round(test$p.value, 4),
Test = test_name,
stringsAsFactors = FALSE
)
)
}
## Warning: Skipping MHCR_NY10 - only one category present
## Warning: Skipping E_e_class - only one category present
# View result table
print(results)
## Variable CMD_1 CMD_0 P_value Test
## 1 All-cause mortality 12.9% 11.4% 0.8900 Chi-square
## 2 Revascularization 9.4% 13.9% 0.3971 Chi-square
## 3 New-onset heart failure 5.9% 7.6% 0.8140 Chi-square
## 4 HF hospitalization 2.4% 2.5% 1.0000 Fisher's exact
## 5 Sex 78.8% 73.4% 0.4337 Chi-square
## 6 Smoking 24.7% 25.3% 0.9920 Chi-square
## 7 Hypertension 29.4% 27.8% 0.9179 Chi-square
## 8 Hypercholesterolemia 62.9% 69.6% 0.3756 Chi-square
## 9 CVA (Stroke) 0.6% 1.3% 0.5348 Fisher's exact
## 10 PCI (other vessels) 1.2% 2.5% 0.5935 Fisher's exact
## 11 Cardiovascular family history <65 years 30.6% 38.0% 0.3128 Chi-square
## 12 Diastolic function 32.9% 35.4% 0.8834 Chi-square
## 13 Diastolic grade 41.2% 38.0% 0.9236 Chi-square
## 14 Diastolic dysfunction criteria positive 25.9% 26.6% 0.9591 Fisher's exact
## 15 Strain interpretation 5.3% 6.3% 0.8597 Fisher's exact
# Optional: Styled table
library(kableExtra)
results %>%
kable("html", align = "lcccl", caption = "Bivariate analysis by CMD status (CMD = 1 vs CMD = 0)") %>%
kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
| Variable | CMD_1 | CMD_0 | P_value | Test |
|---|---|---|---|---|
| All-cause mortality | 12.9% | 11.4% | 0.8900 | Chi-square |
| Revascularization | 9.4% | 13.9% | 0.3971 | Chi-square |
| New-onset heart failure | 5.9% | 7.6% | 0.8140 | Chi-square |
| HF hospitalization | 2.4% | 2.5% | 1.0000 | Fisher’s exact |
| Sex | 78.8% | 73.4% | 0.4337 | Chi-square |
| Smoking | 24.7% | 25.3% | 0.9920 | Chi-square |
| Hypertension | 29.4% | 27.8% | 0.9179 | Chi-square |
| Hypercholesterolemia | 62.9% | 69.6% | 0.3756 | Chi-square |
| CVA (Stroke) | 0.6% | 1.3% | 0.5348 | Fisher’s exact |
| PCI (other vessels) | 1.2% | 2.5% | 0.5935 | Fisher’s exact |
| Cardiovascular family history <65 years | 30.6% | 38.0% | 0.3128 | Chi-square |
| Diastolic function | 32.9% | 35.4% | 0.8834 | Chi-square |
| Diastolic grade | 41.2% | 38.0% | 0.9236 | Chi-square |
| Diastolic dysfunction criteria positive | 25.9% | 26.6% | 0.9591 | Fisher’s exact |
| Strain interpretation | 5.3% | 6.3% | 0.8597 | Fisher’s exact |
library(ggplot2)
library(dplyr)
# Step 1: Filter the dataset for the two groups of interest
df_strat <- data_final %>%
filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present")) %>%
select(Post_IMR, diastolic_function)
# Step 2: Create the boxplot with jittered individual data points
ggplot(df_strat, aes(x = diastolic_function, y = Post_IMR, fill = diastolic_function)) +
# Boxplot with narrower width and no outliers (to avoid duplicate points with jitter)
geom_boxplot(width = 0.4, position = position_dodge(width = 0.6), outlier.shape = NA) +
# Jittered points for individual data visibility
geom_jitter(width = 0.15, alpha = 0.4, color = "black") +
# Labels and appearance
labs(
title = "Post-IMR by Diastolic Function Status",
x = "Diastolic Function",
y = "Post-IMR"
) +
# Custom colors
scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
# Clean theme
theme_minimal(base_size = 14) +
theme(
legend.position = "none",
axis.text.x = element_text(size = 12, angle = 0, vjust = 0.5)
)
## Warning: Removed 82 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 82 rows containing missing values or values outside the scale range
## (`geom_point()`).
wilcox.test(Post_IMR ~ diastolic_function, data = df_strat)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Post_IMR by diastolic_function
## W = 4154, p-value = 0.8993
## alternative hypothesis: true location shift is not equal to 0
Interpretation: there is no signifficant differente in post IMR between patients wit diastolic dysfunction and patients with normal diastolic function.
library(dplyr)
library(tibble)
# Filter data to exclude 'Indeterminate'
filtered <- data_final %>%
filter(diastolic_function %in% c("Normal Diastolic Function", "Diastolic Dysfunction Present"))
# Create contingency table
tab <- table(filtered$CMD, filtered$diastolic_function)
colnames(tab) <- c("Diastolic Dysfunction Present", "Normal Diastolic Function")
rownames(tab) <- c("CMD Absent (Post_IMR < 25)", "CMD Present (Post_IMR ≥ 25)")
# Convert contingency table to data frame
summary_table <- as.data.frame.matrix(tab) %>%
rownames_to_column(var = "CMD Status") %>%
mutate(Total = `Diastolic Dysfunction Present` + `Normal Diastolic Function`)
# Run chi-square test with Yates continuity correction
chi_res <- chisq.test(tab, correct = TRUE)
# Print the table
print(summary_table)
## CMD Status Diastolic Dysfunction Present
## 1 CMD Absent (Post_IMR < 25) 28
## 2 CMD Present (Post_IMR ≥ 25) 56
## Normal Diastolic Function Total
## 1 30 58
## 2 70 126
# Create a data frame for test results
test_results <- data.frame(
Statistic = c("Chi-squared", "Degrees of freedom", "p-value"),
Value = c(round(chi_res$statistic, 3), chi_res$parameter, round(chi_res$p.value, 4))
)
# Print test results
cat("\nChi-square test results:\n")
##
## Chi-square test results:
print(test_results)
## Statistic Value
## X-squared Chi-squared 0.1060
## df Degrees of freedom 1.0000
## p-value 0.7448
Interpretation: CMD prevalence is not significantly higher among patients with diastolic dysfunction.
CMD (binary or continuous) does not differ significantly across diastolic function groups.
install.packages("corrplot", repos = "https://cloud.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/3g/ln4t2cvs3250dv7p12p13c_c0000gn/T//RtmpJI62xe/downloaded_packages
# Load required libraries
library(dplyr)
library(Hmisc) # for rcorr
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(tibble) # for rownames_to_column
# Define variables of interest
vars <- c(
"Post_IMR",
"ECHO_VAL52", # Septal e'
"ECHO_VAL62", # Lateral e'
"mean_e",
"E_e_ratio",
"ECHO_VAL92", # LAV
"LAVI",
"A4C_R_RR1", # Reservoir strain
"A4C_CD_RR1", # Conduit Strain
"A4C_CT_RR1", # Contraction Strain
"MRI1_INF_SZ_LV", # Infarct size
"Complete_LVEF" # Complete LVEF
)
# Select and filter complete cases only for these vars
corr_data <- data_final %>%
select(all_of(vars)) %>%
filter(complete.cases(.))
# Calculate Spearman correlation matrix and p-values
corr_res <- rcorr(as.matrix(corr_data), type = "spearman")
corr_mat <- corr_res$r
p_mat <- corr_res$P
# -- Debug prints (can remove after confirming correctness) --
cat("Dimensions of corr_mat:", dim(corr_mat), "\n")
## Dimensions of corr_mat: 12 12
cat("Dimensions of p_mat:", dim(p_mat), "\n")
## Dimensions of p_mat: 12 12
cat("Row names corr_mat:", rownames(corr_mat), "\n")
## Row names corr_mat: Post_IMR ECHO_VAL52 ECHO_VAL62 mean_e E_e_ratio ECHO_VAL92 LAVI A4C_R_RR1 A4C_CD_RR1 A4C_CT_RR1 MRI1_INF_SZ_LV Complete_LVEF
cat("Col names corr_mat:", colnames(corr_mat), "\n")
## Col names corr_mat: Post_IMR ECHO_VAL52 ECHO_VAL62 mean_e E_e_ratio ECHO_VAL92 LAVI A4C_R_RR1 A4C_CD_RR1 A4C_CT_RR1 MRI1_INF_SZ_LV Complete_LVEF
# Extract variable names except "Post_IMR", present in both corr_mat and p_mat
vars_except_post_imr <- intersect(
colnames(corr_mat)[colnames(corr_mat) != "Post_IMR"],
colnames(p_mat)[colnames(p_mat) != "Post_IMR"]
)
cat("Variables except Post_IMR:", vars_except_post_imr, "\n")
## Variables except Post_IMR: ECHO_VAL52 ECHO_VAL62 mean_e E_e_ratio ECHO_VAL92 LAVI A4C_R_RR1 A4C_CD_RR1 A4C_CT_RR1 MRI1_INF_SZ_LV Complete_LVEF
# Create data frame of Post_IMR correlations safely converting to vectors
post_imr_corr <- data.frame(
Variable = vars_except_post_imr,
Correlation = as.numeric(corr_mat["Post_IMR", vars_except_post_imr]),
P_value = as.numeric(p_mat["Post_IMR", vars_except_post_imr])
)
# Rename variables for readability
rename_map <- c(
"ECHO_VAL52" = "Septal e'",
"ECHO_VAL62" = "Lateral e'",
"mean_e" = "Mean e'",
"E_e_ratio" = "E/e' ratio",
"ECHO_VAL92" = "LAV",
"LAVI" = "LAVI",
"A4C_R_RR1" = "Reservoir strain",
"A4C_CD_RR1" = "Conduit Strain",
"A4C_CT_RR1" = "Contraction Strain",
"MRI1_INF_SZ_LV" = "Infarct size",
"Complete_LVEF" = "Complete LVEF"
)
# Rename variables in the post_imr_corr table for readability
post_imr_corr$Variable <- ifelse(
post_imr_corr$Variable %in% names(rename_map),
rename_map[post_imr_corr$Variable],
post_imr_corr$Variable
)
# Print correlation table for Post_IMR
print(post_imr_corr)
## Variable Correlation P_value
## 1 Septal e' -0.18833781 0.05434929
## 2 Lateral e' -0.16835404 0.08602644
## 3 Mean e' -0.20577300 0.03521325
## 4 E/e' ratio 0.04023148 0.68365531
## 5 LAV 0.09200763 0.35056504
## 6 LAVI 0.07281939 0.46037744
## 7 Reservoir strain -0.14628351 0.13647704
## 8 Conduit Strain 0.14597429 0.13731683
## 9 Contraction Strain 0.06510065 0.50938329
## 10 Infarct size -0.08258399 0.40229126
## 11 Complete LVEF -0.00803436 0.93516879
Interpretation: The Mean e’ (average early diastolic mitral annular velocity) shows a statistically significant weak negative correlation with Post_IMR, meaning higher microvascular resistance (Post_IMR) tends to be associated with worse diastolic function (lower Mean e’).
Septal e’ shows a similar negative trend but just misses statistical significance (p = 0.054).
Other diastolic function markers (like E/e’ ratio, LAVI) and infarct size or LVEF do not show significant correlations with Post_IMR in your data.
Overall, the associations are weak, indicating that Post_IMR may have a subtle relationship mainly with early diastolic function markers but not strongly related to infarct size or overall systolic function (LVEF).
# Create a copy of your dataset to preserve original
data_sensitivity <- data_final %>%
mutate(
CMD_20 = case_when(
is.na(Post_IMR) ~ NA_real_,
Post_IMR >= 20 ~ 1,
Post_IMR < 20 ~ 0
)
)
# Chi-squared test: CMD_20 vs diastolic_function
table_CMD20 <- table(data_sensitivity$CMD_20, data_sensitivity$diastolic_function)
chi_sq_result <- chisq.test(table_CMD20)
# Print results
print(table_CMD20)
##
## Diastolic Dysfunction Present Indeterminate Diastolic Function
## 0 14 15
## 1 70 50
##
## Normal Diastolic Function
## 0 21
## 1 79
print(chi_sq_result)
##
## Pearson's Chi-squared test
##
## data: table_CMD20
## X-squared = 1.0264, df = 2, p-value = 0.5986
Sensitivity analysis: The p-value (0.60) is not statistically significant.
This means there is no strong evidence of an association between CMD (using the more inclusive IMR ≥ 20) and diastolic dysfunction categories in your dataset.
This result is consistent with your primary analysis using IMR ≥ 25 (which also had a non-significant result).
Conclusion: Changing the CMD cutoff from 25 to 20 does not materially change the result — this suggests your original finding is robust to this threshold change.
# Calculate median ischemic time
median_time <- median(data_final$CAP_NUM1, na.rm = TRUE)
# Create stratified variable: early vs late
data_final <- data_final %>%
mutate(
ischemic_group = case_when(
is.na(CAP_NUM1) ~ NA_character_,
CAP_NUM1 <= median_time ~ "Early",
CAP_NUM1 > median_time ~ "Late"
)
)
# Boxplot of Post_IMR by ischemic time group
library(ggplot2)
ggplot(data_final, aes(x = ischemic_group, y = Post_IMR)) +
geom_boxplot(fill = "#69b3a2", alpha = 0.6) +
geom_jitter(width = 0.2, alpha = 0.4, color = "black") +
theme_minimal() +
labs(title = "Post_IMR by Ischemic Time Group", x = "Ischemic Time Group", y = "Post IMR")
## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 131 rows containing missing values or values outside the scale range
## (`geom_point()`).
wilcox.test(Post_IMR ~ ischemic_group, data = data_final)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Post_IMR by ischemic_group
## W = 6965, p-value = 0.1704
## alternative hypothesis: true location shift is not equal to 0
Sensitivity analysis using ischemic time: The p-value of 0.1704 indicates that there is no statistically significant difference in Post_IMR between patients who had early vs late revascularization (based on your median split of ischemic time).
ischemic time (as a binary early/late grouping) does not appear to significantly influence CMD severity (as measured by IMR) in this dataset.
# Convert to labeled factors
data_final <- data_final %>%
mutate(
Hypertension = factor(MHCR_NY1, levels = c(0, 1), labels = c("No", "Yes")),
Sex = factor(DEMO_SEX, levels = c(1, 2), labels = c("Male", "Female"))
)
# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1 + Hypertension + Sex, data = data_final)
# Show model summary
summary(model)
##
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1 + Hypertension +
## Sex, data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6308 -6.6552 -0.9912 4.7475 21.0493
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.536e+00 2.422e+00 3.112 0.0022 **
## Post_IMR -1.524e-03 6.143e-02 -0.025 0.9802
## CAP_NUM1 7.662e-03 9.828e-03 0.780 0.4368
## HypertensionYes 6.194e-01 1.403e+00 0.442 0.6594
## SexFemale 1.146e+00 1.549e+00 0.740 0.4605
## Post_IMR:CAP_NUM1 -2.971e-05 2.278e-04 -0.130 0.8964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.856 on 162 degrees of freedom
## (212 observations deleted due to missingness)
## Multiple R-squared: 0.02324, Adjusted R-squared: -0.006903
## F-statistic: 0.771 on 5 and 162 DF, p-value: 0.5719
None of the variables or interaction terms are statistically significant (all p-values > 0.05).
The interaction between Post_IMR and ischemic time is not significant → this suggests that the relationship between IMR and infarct size doesn’t change significantly depending on ischemic time.
Model Fit R-squared = 0.023 → the model explains only 2.3% of the variance in infarct size.
Adjusted R-squared = -0.007 → adjusting for predictors makes the model slightly worse.
F-test p = 0.5719 → the overall model is not statistically significant.
# Run regression model with interaction
model <- lm(MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)
# Show model summary
summary(model)
##
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR * CAP_NUM1, data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.066 -6.593 -1.186 4.954 22.096
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.845e+00 2.389e+00 3.284 0.00125 **
## Post_IMR -1.430e-04 6.082e-02 -0.002 0.99813
## CAP_NUM1 7.968e-03 9.789e-03 0.814 0.41681
## Post_IMR:CAP_NUM1 -3.372e-05 2.269e-04 -0.149 0.88208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.829 on 164 degrees of freedom
## (212 observations deleted due to missingness)
## Multiple R-squared: 0.01805, Adjusted R-squared: 8.288e-05
## F-statistic: 1.005 on 3 and 164 DF, p-value: 0.3923
ischemic time does not appear to modify the relationship between IMR and infarct size.
# Create log-transformed infarct size
data_final <- data_final %>%
mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV)) # log1p handles 0 values safely
# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1 +
factor(MHCR_NY1) + factor(DEMO_SEX), data = data_final)
summary(model_log)
##
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1 + factor(MHCR_NY1) +
## factor(DEMO_SEX), data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1839 -0.5855 0.2618 0.7926 1.6269
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.754e+00 3.180e-01 5.516 1.35e-07 ***
## Post_IMR -2.658e-03 8.066e-03 -0.330 0.742
## CAP_NUM1 1.039e-03 1.290e-03 0.806 0.422
## factor(MHCR_NY1)1 1.424e-01 1.841e-01 0.773 0.441
## factor(DEMO_SEX)2 -4.678e-02 2.033e-01 -0.230 0.818
## Post_IMR:CAP_NUM1 1.399e-06 2.990e-05 0.047 0.963
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.031 on 162 degrees of freedom
## (212 observations deleted due to missingness)
## Multiple R-squared: 0.03197, Adjusted R-squared: 0.002095
## F-statistic: 1.07 on 5 and 162 DF, p-value: 0.3789
# Create log-transformed infarct size
data_final <- data_final %>%
mutate(log_infarct_size = log1p(MRI1_INF_SZ_LV)) # log1p handles 0 values safely
# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_final)
summary(model_log)
##
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2197 -0.6001 0.2754 0.7704 1.5947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.757e+00 3.134e-01 5.606 8.59e-08 ***
## Post_IMR -1.949e-03 7.979e-03 -0.244 0.807
## CAP_NUM1 1.051e-03 1.284e-03 0.818 0.414
## Post_IMR:CAP_NUM1 1.240e-06 2.977e-05 0.042 0.967
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.027 on 164 degrees of freedom
## (212 observations deleted due to missingness)
## Multiple R-squared: 0.02832, Adjusted R-squared: 0.01055
## F-statistic: 1.593 on 3 and 164 DF, p-value: 0.193
# Identify extreme IMR values (e.g., > 99th percentile)
imr_cutoff <- quantile(data_final$Post_IMR, 0.99, na.rm = TRUE)
# Create filtered dataset
data_no_outliers <- data_final %>%
filter(Post_IMR <= imr_cutoff)
# Rerun correlation or regression
model_no_outliers <- lm(MRI1_INF_SZ_LV ~ Post_IMR, data = data_no_outliers)
summary(model_no_outliers)
##
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR, data = data_no_outliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.341 -6.330 -1.093 5.088 22.960
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.191247 1.431042 6.423 1.35e-09 ***
## Post_IMR 0.002176 0.037372 0.058 0.954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.853 on 166 degrees of freedom
## (79 observations deleted due to missingness)
## Multiple R-squared: 2.042e-05, Adjusted R-squared: -0.006004
## F-statistic: 0.00339 on 1 and 166 DF, p-value: 0.9536
# Rerun the model with transformed outcome
model_log <- lm(log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_no_outliers)
summary(model_log)
##
## Call:
## lm(formula = log_infarct_size ~ Post_IMR * CAP_NUM1, data = data_no_outliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2197 -0.6001 0.2754 0.7704 1.5947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.757e+00 3.134e-01 5.606 8.59e-08 ***
## Post_IMR -1.949e-03 7.979e-03 -0.244 0.807
## CAP_NUM1 1.051e-03 1.284e-03 0.818 0.414
## Post_IMR:CAP_NUM1 1.240e-06 2.977e-05 0.042 0.967
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.027 on 164 degrees of freedom
## (79 observations deleted due to missingness)
## Multiple R-squared: 0.02832, Adjusted R-squared: 0.01055
## F-statistic: 1.593 on 3 and 164 DF, p-value: 0.193
# Vector of diastolic dysfunction variables
diastolic_vars <- c("ECHO_VAL52", "ECHO_VAL62", "mean_e", "E_e_ratio", "ECHO_VAL92", "LAVI", "A4C_R_RR1", "A4C_CD_RR1", "A4C_CT_RR1", "ECHO_VAL32")
# Select all variables of interest including dependent, independent, and covariates
vars_for_model <- c(
diastolic_vars, # your dependent variables (e.g. ECHO_VAL52, etc.)
"Post_IMR",
"AGE_AT_INCLUSION",
"DEMO_SEX",
"VS_BMI1",
"MHCR_NY1",
"MHCR_NPC",
"HBA1C_22",
"LDL2",
"MHCR_NY2",
"MHCR_NY4",
"EGFR2",
"CAP_NUM1"
)
# Count how many complete cases you have in these vars
sum(complete.cases(data_final[, vars_for_model]))
## [1] 128
# Load necessary package
library(dplyr)
library(broom)
## Warning: package 'broom' was built under R version 4.3.3
# List of diastolic dysfunction dependent variables
diastolic_vars <- c(
"ECHO_VAL52", # Septal e'
"ECHO_VAL62", # Lateral e'
"mean_e",
"E_e_ratio",
"ECHO_VAL92", # LAV
"LAVI",
"A4C_R_RR1", # Reservoir strain
"A4C_CD_RR1", # Conduit Strain
"A4C_CT_RR1" # Contraction Strain
)
# Loop through each diastolic variable, run unadjusted regression, store results
results_list <- list()
for (var in diastolic_vars) {
formula <- as.formula(paste(var, "~ Post_IMR"))
model_data <- data_final %>%
select(all_of(c(var, "Post_IMR"))) %>%
filter(complete.cases(.))
model <- lm(formula, data = model_data)
tidy_res <- broom::tidy(model)
tidy_res$Dependent_Var <- var
results_list[[var]] <- tidy_res
}
all_results <- do.call(rbind, results_list)
all_results <- all_results[, c("Dependent_Var", names(all_results)[names(all_results) != "Dependent_Var"])]
print(all_results)
## # A tibble: 18 × 6
## Dependent_Var term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ECHO_VAL52 (Intercept) 8.21 0.343 24.0 6.39e-63
## 2 ECHO_VAL52 Post_IMR -0.0135 0.00907 -1.49 1.39e- 1
## 3 ECHO_VAL62 (Intercept) 10.2 0.508 20.1 2.84e-51
## 4 ECHO_VAL62 Post_IMR -0.0206 0.0134 -1.54 1.25e- 1
## 5 mean_e (Intercept) 9.17 0.384 23.9 7.16e-63
## 6 mean_e Post_IMR -0.0166 0.0101 -1.64 1.03e- 1
## 7 E_e_ratio (Intercept) 0.0781 0.00473 16.5 6.59e-40
## 8 E_e_ratio Post_IMR 0.000121 0.000124 0.973 3.31e- 1
## 9 ECHO_VAL92 (Intercept) 55.4 2.88 19.2 1.80e-47
## 10 ECHO_VAL92 Post_IMR 0.0126 0.0751 0.168 8.66e- 1
## 11 LAVI (Intercept) 27.9 1.38 20.3 1.90e-50
## 12 LAVI Post_IMR -0.00107 0.0359 -0.0299 9.76e- 1
## 13 A4C_R_RR1 (Intercept) 27.6 1.37 20.1 5.90e-49
## 14 A4C_R_RR1 Post_IMR 0.000211 0.0350 0.00602 9.95e- 1
## 15 A4C_CD_RR1 (Intercept) -15.8 1.01 -15.8 2.57e-36
## 16 A4C_CD_RR1 Post_IMR 0.0124 0.0256 0.484 6.29e- 1
## 17 A4C_CT_RR1 (Intercept) -11.7 0.948 -12.4 3.92e-26
## 18 A4C_CT_RR1 Post_IMR -0.0129 0.0242 -0.535 5.93e- 1
Interpretations None of the diastolic dysfunction parameters showed a statistically significant association with Post_IMR in these unadjusted models.
The direction of effect for septal, lateral, and mean e’ is negative, consistent with worse diastolic function with higher Post_IMR, but these are not statistically significant.
The E/e’ ratio and left atrial volume indices show small and non-significant effects.
Strain measures (reservoir, conduit, contraction) also show no significant relationship.
# Load necessary libraries
library(dplyr)
# Create a new dataset so original is not altered
data_final2 <- data_final %>%
mutate(
# Log-transform non-normal continuous dependent variables (excluding Septal e')
log_ECHO_VAL62 = ifelse(ECHO_VAL62 > 0, log(ECHO_VAL62), NA),
log_mean_e = ifelse(mean_e > 0, log(mean_e), NA),
log_E_e_ratio = ifelse(E_e_ratio > 0, log(E_e_ratio), NA),
log_LAVI = ifelse(LAVI > 0, log(LAVI), NA),
log_A4C_R_RR1 = ifelse(A4C_R_RR1 > 0, log(A4C_R_RR1), NA),
log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA),
log_A4C_CT_RR1 = ifelse(A4C_CT_RR1 > 0, log(A4C_CT_RR1), NA)
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `log_A4C_CD_RR1 = ifelse(A4C_CD_RR1 > 0, log(A4C_CD_RR1), NA)`.
## Caused by warning in `log()`:
## ! NaNs produced
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# List of dependent variables
dependent_vars <- list(
"ECHO_VAL52", # Septal e' (no transform)
"log_ECHO_VAL62", # Lateral e' (log)
"log_mean_e", # Mean e' (log)
"log_E_e_ratio", # E/e' ratio (log)
"log_LAVI", # LAVI (log)
"log_A4C_R_RR1", # Reservoir strain (log)
"log_A4C_CD_RR1", # Conduit strain (log)
"log_A4C_CT_RR1" # Contraction strain (log)
)
# Function to run a model for each dependent variable
run_model <- function(dep_var) {
model_formula <- as.formula(paste(dep_var, "~ Post_IMR + AGE_AT_INCLUSION + factor(DEMO_SEX) + factor(MHCR_NY1) + VS_BMI1 +
factor(MHCR_NY2) + factor(MHCR_NY4) + factor(MHCR_NY11) + factor(MHCR_NY13) +
factor(MHCR_NPC) + CAP_NUM1"))
# Filter only rows with complete data for this model
model_data <- data_final2 %>%
select(all.vars(model_formula)) %>%
filter(complete.cases(.))
# Only run the model if enough data is left
if (nrow(model_data) > 10) {
model <- lm(model_formula, data = model_data)
return(summary(model))
} else {
return(paste("Not enough data for:", dep_var))
}
}
# Run all models
results <- lapply(dependent_vars, run_model)
names(results) <- dependent_vars
# Print results
results
## $ECHO_VAL52
##
## Call:
## lm(formula = model_formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6749 -1.2902 -0.0376 1.1029 5.4068
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.302e+01 1.567e+00 8.310 1.27e-14 ***
## Post_IMR -9.664e-03 8.035e-03 -1.203 0.23043
## AGE_AT_INCLUSION -7.701e-02 1.410e-02 -5.461 1.36e-07 ***
## factor(DEMO_SEX)2 -1.878e-02 3.069e-01 -0.061 0.95126
## factor(MHCR_NY1)1 -7.721e-01 2.925e-01 -2.640 0.00893 **
## VS_BMI1 -2.115e-02 3.820e-02 -0.554 0.58048
## factor(MHCR_NY2)1 -1.727e-02 2.677e-01 -0.065 0.94862
## factor(MHCR_NY4)1 9.309e-01 1.906e+00 0.488 0.62573
## factor(MHCR_NY11)1 -1.286e+00 9.444e-01 -1.362 0.17462
## factor(MHCR_NY13)1 -1.434e-02 2.789e-01 -0.051 0.95905
## factor(MHCR_NPC)2 7.765e-01 3.791e-01 2.048 0.04182 *
## factor(MHCR_NPC)3 7.466e-01 3.278e-01 2.277 0.02379 *
## CAP_NUM1 -4.797e-05 8.370e-04 -0.057 0.95435
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.82 on 206 degrees of freedom
## Multiple R-squared: 0.2925, Adjusted R-squared: 0.2513
## F-statistic: 7.098 on 12 and 206 DF, p-value: 8.913e-11
##
##
## $log_ECHO_VAL62
##
## Call:
## lm(formula = model_formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.05876 -0.21812 0.04793 0.22229 0.73778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.0261402 0.2774712 10.906 < 2e-16 ***
## Post_IMR -0.0015792 0.0014137 -1.117 0.2653
## AGE_AT_INCLUSION -0.0100952 0.0024641 -4.097 6.03e-05 ***
## factor(DEMO_SEX)2 -0.0035316 0.0538089 -0.066 0.9477
## factor(MHCR_NY1)1 -0.1218967 0.0511617 -2.383 0.0181 *
## VS_BMI1 -0.0070952 0.0066448 -1.068 0.2869
## factor(MHCR_NY2)1 -0.0223579 0.0471547 -0.474 0.6359
## factor(MHCR_NY4)1 0.2081541 0.3342594 0.623 0.5342
## factor(MHCR_NY11)1 -0.1061490 0.1654612 -0.642 0.5219
## factor(MHCR_NY13)1 0.0382071 0.0491873 0.777 0.4382
## factor(MHCR_NPC)2 0.1572549 0.0668871 2.351 0.0197 *
## factor(MHCR_NPC)3 0.1090616 0.0582831 1.871 0.0627 .
## CAP_NUM1 -0.0001493 0.0001472 -1.014 0.3116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3192 on 205 degrees of freedom
## Multiple R-squared: 0.2166, Adjusted R-squared: 0.1707
## F-statistic: 4.722 on 12 and 205 DF, p-value: 8.512e-07
##
##
## $log_mean_e
##
## Call:
## lm(formula = model_formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6517 -0.1725 0.0000 0.1785 0.6115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.844e+00 2.135e-01 13.319 < 2e-16 ***
## Post_IMR -1.348e-03 1.092e-03 -1.235 0.21836
## AGE_AT_INCLUSION -9.863e-03 1.908e-03 -5.169 5.51e-07 ***
## factor(DEMO_SEX)2 -4.480e-03 4.202e-02 -0.107 0.91519
## factor(MHCR_NY1)1 -1.038e-01 3.991e-02 -2.602 0.00993 **
## VS_BMI1 -5.044e-03 5.181e-03 -0.973 0.33145
## factor(MHCR_NY2)1 -1.950e-02 3.655e-02 -0.533 0.59428
## factor(MHCR_NY4)1 1.810e-01 2.612e-01 0.693 0.48910
## factor(MHCR_NY11)1 -1.402e-01 1.293e-01 -1.084 0.27952
## factor(MHCR_NY13)1 2.500e-02 3.811e-02 0.656 0.51260
## factor(MHCR_NPC)2 1.548e-01 5.147e-02 3.007 0.00296 **
## factor(MHCR_NPC)3 1.273e-01 4.436e-02 2.870 0.00453 **
## CAP_NUM1 -7.967e-05 1.147e-04 -0.695 0.48804
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2495 on 208 degrees of freedom
## Multiple R-squared: 0.296, Adjusted R-squared: 0.2554
## F-statistic: 7.287 on 12 and 208 DF, p-value: 4.199e-11
##
##
## $log_E_e_ratio
##
## Call:
## lm(formula = model_formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.65266 -0.20691 -0.02816 0.17271 0.85327
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.3617119 0.2388125 -14.077 < 2e-16 ***
## Post_IMR 0.0003635 0.0012227 0.297 0.766581
## AGE_AT_INCLUSION 0.0082685 0.0021354 3.872 0.000146 ***
## factor(DEMO_SEX)2 0.1381846 0.0472004 2.928 0.003810 **
## factor(MHCR_NY1)1 0.1015971 0.0452403 2.246 0.025810 *
## VS_BMI1 0.0092503 0.0057972 1.596 0.112136
## factor(MHCR_NY2)1 0.0013379 0.0412303 0.032 0.974145
## factor(MHCR_NY4)1 0.1955202 0.2900993 0.674 0.501100
## factor(MHCR_NY11)1 -0.0733406 0.1438800 -0.510 0.610797
## factor(MHCR_NY13)1 -0.0265422 0.0437074 -0.607 0.544357
## factor(MHCR_NPC)2 -0.0552739 0.0582650 -0.949 0.343931
## factor(MHCR_NPC)3 -0.0518744 0.0502307 -1.033 0.302975
## CAP_NUM1 0.0002109 0.0001279 1.649 0.100660
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2768 on 201 degrees of freedom
## Multiple R-squared: 0.2251, Adjusted R-squared: 0.1789
## F-statistic: 4.866 on 12 and 201 DF, p-value: 5.047e-07
##
##
## $log_LAVI
##
## Call:
## lm(formula = model_formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1287 -0.1678 0.0117 0.1749 0.9616
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.202e+00 2.638e-01 12.138 <2e-16 ***
## Post_IMR -3.935e-04 1.304e-03 -0.302 0.7631
## AGE_AT_INCLUSION 2.012e-03 2.430e-03 0.828 0.4086
## factor(DEMO_SEX)2 1.079e-02 5.165e-02 0.209 0.8347
## factor(MHCR_NY1)1 8.972e-02 4.797e-02 1.870 0.0630 .
## VS_BMI1 1.092e-03 6.404e-03 0.170 0.8648
## factor(MHCR_NY2)1 -1.041e-01 4.522e-02 -2.302 0.0224 *
## factor(MHCR_NY4)1 2.256e-01 2.166e-01 1.042 0.2988
## factor(MHCR_NY11)1 2.405e-02 1.530e-01 0.157 0.8753
## factor(MHCR_NY13)1 -4.597e-02 4.700e-02 -0.978 0.3293
## factor(MHCR_NPC)2 6.123e-03 6.457e-02 0.095 0.9245
## factor(MHCR_NPC)3 1.783e-02 5.457e-02 0.327 0.7442
## CAP_NUM1 -7.802e-05 1.357e-04 -0.575 0.5661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2939 on 190 degrees of freedom
## Multiple R-squared: 0.08126, Adjusted R-squared: 0.02323
## F-statistic: 1.4 on 12 and 190 DF, p-value: 0.1686
##
##
## $log_A4C_R_RR1
##
## Call:
## lm(formula = model_formula, data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28726 -0.16347 -0.00903 0.23255 0.58418
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6204528 0.2662794 13.596 <2e-16 ***
## Post_IMR 0.0010891 0.0013722 0.794 0.4285
## AGE_AT_INCLUSION -0.0064413 0.0024951 -2.582 0.0106 *
## factor(DEMO_SEX)2 -0.0353346 0.0531805 -0.664 0.5073
## factor(MHCR_NY1)1 -0.0524587 0.0517049 -1.015 0.3117
## VS_BMI1 0.0023758 0.0062718 0.379 0.7053
## factor(MHCR_NY2)1 -0.1173030 0.0481184 -2.438 0.0158 *
## factor(MHCR_NY4)1 -0.3085844 0.2245066 -1.375 0.1710
## factor(MHCR_NY11)1 -0.2250985 0.1579848 -1.425 0.1560
## factor(MHCR_NY13)1 0.0778675 0.0482985 1.612 0.1087
## factor(MHCR_NPC)2 0.1179118 0.0699447 1.686 0.0936 .
## factor(MHCR_NPC)3 0.0574994 0.0566515 1.015 0.3115
## CAP_NUM1 -0.0001480 0.0001452 -1.020 0.3092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3035 on 179 degrees of freedom
## Multiple R-squared: 0.1469, Adjusted R-squared: 0.08968
## F-statistic: 2.568 on 12 and 179 DF, p-value: 0.003665
##
##
## $log_A4C_CD_RR1
## [1] "Not enough data for: log_A4C_CD_RR1"
##
## $log_A4C_CT_RR1
## [1] "Not enough data for: log_A4C_CT_RR1"
Septal E and mean e: - Age: Significant negative association. Older age is associated with lower - Septal e’ velocity (worse diastolic function). Hypertension: Significant. Hypertensive patients tend to have lower Septal e’. - Smoking: current and former smokers have a higher septal E’. Residual confousing or selection bias?
E/e ratio age, female, hypertension
LAVI: - hypercholesterolemia
# Check levels and counts for all your categorical vars
table(data_final2$MHCR_NY2)
##
## 0 1
## 141 239
table(data_final2$MHCR_NY4)
##
## 0 1
## 377 3
table(data_final2$MHCR_NY11)
##
## 0 1
## 376 4
table(data_final2$MHCR_NY13)
##
## 0 1
## 253 127
table(data_final2$MHCR_NY1)
##
## 0 1
## 267 113
table(data_final2$DEMO_SEX)
##
## 1 2
## 285 95
table(data_final2$MHCR_NPC)
##
## 1 2 3
## 106 65 209
table(data_final$HF_HOSP)
##
## 0 1
## 373 7
table(data_final$MORT_ALL)
##
## 0 1
## 336 44
table(data_final$REVASC)
##
## 0 1
## 337 43
table(data_final$NEW_ONSET_HF)
##
## 0 1
## 355 25
table(data_final$MHCR_NY2)
##
## 0 1
## 141 239
table(data_final$MHCR_NY4)
##
## 0 1
## 377 3
table(data_final$MHCR_NY10)
##
## 0
## 380
table(data_final$MHCR_NY11)
##
## 0 1
## 376 4
table(data_final$MHCR_NY13)
##
## 0 1
## 253 127
table(data_final$MHCR_NY1)
##
## 0 1
## 267 113
table(data_final$DEMO_SEX)
##
## 1 2
## 285 95
table(data_final$MHCR_NPC)
##
## 1 2 3
## 106 65 209
library(dplyr)
# Step 1: Prepare a clean dataset with proper factor levels
data_final3 <- data_final %>%
mutate(
DEMO_SEX = factor(DEMO_SEX, levels = c(1, 2), labels = c("Male", "Female")),
MHCR_NY1 = factor(MHCR_NY1, levels = c(0, 1), labels = c("No", "Yes")), # Hypertension
MHCR_NY11 = factor(MHCR_NY11, levels = c(0, 1), labels = c("No", "Yes")), # Prior PCI
MHCR_NPC = factor(MHCR_NPC, levels = c(1, 2, 3), labels = c("Never", "Former", "Current")) # Smoking
)
# Step 2: Run the multiple linear regression model
model_infarct <- lm(
MRI1_INF_SZ_LV ~ Post_IMR + AGE_AT_INCLUSION + DEMO_SEX +
MHCR_NPC + MHCR_NY1 + CAP_NUM1 + MHCR_NY11,
data = data_final3
)
# Step 3: View model summary
summary(model_infarct)
##
## Call:
## lm(formula = MRI1_INF_SZ_LV ~ Post_IMR + AGE_AT_INCLUSION + DEMO_SEX +
## MHCR_NPC + MHCR_NY1 + CAP_NUM1 + MHCR_NY11, data = data_final3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.3480 -6.4028 -0.8143 4.5715 22.2468
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.947851 4.497287 0.655 0.5131
## Post_IMR -0.007633 0.038205 -0.200 0.8419
## AGE_AT_INCLUSION 0.102202 0.064350 1.588 0.1142
## DEMO_SEXFemale 1.120838 1.556878 0.720 0.4726
## MHCR_NPCFormer -1.963378 1.818701 -1.080 0.2820
## MHCR_NPCCurrent -1.341415 1.525943 -0.879 0.3807
## MHCR_NY1Yes -0.237621 1.444234 -0.165 0.8695
## CAP_NUM1 0.006587 0.003839 1.716 0.0881 .
## MHCR_NY11Yes 3.919770 4.639052 0.845 0.3994
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.786 on 159 degrees of freedom
## (212 observations deleted due to missingness)
## Multiple R-squared: 0.05836, Adjusted R-squared: 0.01098
## F-statistic: 1.232 on 8 and 159 DF, p-value: 0.2838
no significant association between IMR en infract size
data_final <- data_final %>%
mutate(
MORT_ALL_DATE = format(mdy(MORT_ALL_DATE), "%d/%m/%Y"),
NEW_ONSET_HF_DATE = format(mdy(NEW_ONSET_HF_DATE), "%d/%m/%Y"),
HF_HOSP_DATE = format(mdy(HF_HOSP_DATE), "%d/%m/%Y"),
REVASC_DATE = format(mdy(REVASC_DATE), "%d/%m/%Y")
)
library(dplyr)
library(lubridate)
# Define your reference date
reference_date <- dmy("07/08/2015")
# Create a new dataset with calculated time differences in days
data_final_with_time_diff <- data_final %>%
mutate(
MORT_ALL_DATE = mdy(MORT_ALL_DATE),
NEW_ONSET_HF_DATE = mdy(NEW_ONSET_HF_DATE),
HF_HOSP_DATE = mdy(HF_HOSP_DATE),
REVASC_DATE = mdy(REVASC_DATE),
# Calculate difference in days
days_since_mortality = as.numeric(MORT_ALL_DATE - reference_date),
days_since_new_onset_HF = as.numeric(NEW_ONSET_HF_DATE - reference_date),
days_since_HF_hosp = as.numeric(HF_HOSP_DATE - reference_date),
days_since_revasc = as.numeric(REVASC_DATE - reference_date)
)