Assignment 3: Survival II

DSAN 6150

Author

Parsa Keyvani (PK760)

Introduction

In this homework, you are given a heart failure data set with 300 patients that includes several patient characteristics including blood markers, heart performance, and smoking. You will build a model to associate survival time to these factors, and evaluate how well your model fits the data.

(1) Draw separate stratified Kaplan-Meier curves, showing at least one binary/categorical predictor (not sex), and at least one continuous predictor (not age) where you subdivide the data into quintiles and then plot the survival curves by quintile.

To be clear, I expect at least 2 plots here, but show me what interesting effects are present in the data

Binary Univariate Analysis

Code
# Stratified Kaplan-Meier Curve for high_blood_pressure
km_high_blood_pressure <- survfit(Surv(time, DEATH_EVENT) ~ high_blood_pressure, data = heart_data)
km_high_blood_pressure_plot <- ggsurvplot(
  km_high_blood_pressure,
  data = heart_data,
  pval = TRUE,
  #conf.int = TRUE,
  risk.table = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by high_blood_pressure",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("No high_blood_pressure", "high_blood_pressure")
)
km_high_blood_pressure_plot

Code
# Stratified Kaplan-Meier Curve for sex
km_sex <- survfit(Surv(time, DEATH_EVENT) ~ sex, data = heart_data)
km_sex_plot <- ggsurvplot(
  km_sex,
  data = heart_data,
  pval = TRUE,
  #conf.int = TRUE,
  risk.table = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by sex",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("Female", "Male")
)
km_sex_plot

Code
# Stratified Kaplan-Meier Curve for smoking
km_smoking <- survfit(Surv(time, DEATH_EVENT) ~ smoking, data = heart_data)
km_smoking_plot <- ggsurvplot(
  km_smoking,
  data = heart_data,
  pval = TRUE,
  #conf.int = TRUE,
  risk.table = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by smoking",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("Not smoking", "smoking")
)
km_smoking_plot

Code
# Stratified Kaplan-Meier Curve for Anemia
km_anaemia <- survfit(Surv(time, DEATH_EVENT) ~ anaemia, data = heart_data)
km_anaemia_plot <- ggsurvplot(
  km_anaemia,
  data = heart_data,
  pval = TRUE,
  #conf.int = TRUE,
  risk.table = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by Anaemia",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("No Anaemia", "Anaemia")
)
km_anaemia_plot

Code
# Stratified Kaplan-Meier Curve for diabetes
km_diabetes <- survfit(Surv(time, DEATH_EVENT) ~ diabetes, data = heart_data)
km_diabetes_plot <- ggsurvplot(
  km_diabetes,
  data = heart_data,
  pval = TRUE,
  #conf.int = TRUE,
  risk.table = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by Diabetes",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("No Diabetes", "Diabetes")
)
km_diabetes_plot

From the binary univariate analysis plots above, only high blood pressure and Anemia exhibit association with survival (based on significant p-values from the log-rank test). Therefore, we know that these two would be candidates for inclusion in the multivariate Cox model. Here, we reduced the pool of variables from 5 binary variables to 2. Next, we do a similar analysis but for continuous variables instead.

Continous Univariate Analysis

Code
# Stratified Kaplan-Meier Curve for a continuous predictor subdivided into quintiles (e.g., serum_creatinine)
# Create quintiles for serum_creatinine
heart_data <- heart_data %>%
  mutate(serum_creatinine_quintile = ntile(serum_creatinine, 5))

km_creatinine <- survfit(Surv(time, DEATH_EVENT) ~ serum_creatinine_quintile, data = heart_data)
ggsurvplot(
  km_creatinine,
  data = heart_data,
  #conf.int = TRUE,
  risk.table = TRUE,
  pval = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by Serum Creatinine Quintiles",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("Q1 (Lowest)", "Q2", "Q3", "Q4", "Q5 (Highest)"),
  risk.table.height = 0.35,
  width = 60
)

Code
# Stratified Kaplan-Meier Curve for a continuous predictor: creatinine_phosphokinase
heart_data <- heart_data %>%
  mutate(creatinine_phosphokinase_quintile = ntile(creatinine_phosphokinase, 5))

km_creatinine_phosphokinase <- survfit(Surv(time, DEATH_EVENT) ~ creatinine_phosphokinase_quintile, data = heart_data)
ggsurvplot(
  km_creatinine_phosphokinase,
  data = heart_data,
  #conf.int = TRUE,
  risk.table = TRUE,
  pval = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by creatinine_phosphokinase Quintiles",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("Q1 (Lowest)", "Q2", "Q3", "Q4", "Q5 (Highest)"),
  risk.table.height = 0.35,
  width = 60
)

Code
# Stratified Kaplan-Meier Curve for a continuous predictor: platelets
heart_data <- heart_data %>%
  mutate(platelets_quintile = ntile(platelets, 5))

km_platelets <- survfit(Surv(time, DEATH_EVENT) ~ platelets_quintile, data = heart_data)
ggsurvplot(
  km_platelets,
  data = heart_data,
  #conf.int = TRUE,
  risk.table = TRUE,
  pval = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by platelets Quintiles",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("Q1 (Lowest)", "Q2", "Q3", "Q4", "Q5 (Highest)"),
  risk.table.height = 0.35,
  width = 60
)

Code
# Stratified Kaplan-Meier Curve for a continuous predictor: serum_sodium
heart_data <- heart_data %>%
  mutate(serum_sodium_quintile = ntile(serum_sodium, 5))

km_serum_sodium <- survfit(Surv(time, DEATH_EVENT) ~ serum_sodium_quintile, data = heart_data)
ggsurvplot(
  km_serum_sodium,
  data = heart_data,
  #conf.int = TRUE,
  risk.table = TRUE,
  pval = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by serum_sodium Quintiles",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("Q1 (Lowest)", "Q2", "Q3", "Q4", "Q5 (Highest)"),
  risk.table.height = 0.35,
  width = 60
)

Code
# Stratified Kaplan-Meier Curve for a continuous predictor: age
heart_data <- heart_data %>%
  mutate(age_quintile = ntile(age, 5))

km_age <- survfit(Surv(time, DEATH_EVENT) ~ age_quintile, data = heart_data)
ggsurvplot(
  km_age,
  data = heart_data,
  #conf.int = TRUE,
  risk.table = TRUE,
  pval = TRUE,
  ggtheme = theme_minimal(),
  title = "Kaplan-Meier Curve Stratified by age quintile",
  xlab = "Time (days)",
  ylab = "Survival Probability",
  legend.labs = c("Q1 (Lowest)", "Q2", "Q3", "Q4", "Q5 (Highest)"),
  risk.table.height = 0.35,
  width = 60
)

From the continous univariate analysis plots above, only serum creatinine, serum sodium and age exhibit association with survival (based on significant p-values from the log-rank test). Therefore, we know that these two would be candidates for inclusion in the multivariate Cox model. Here, we reduced the pool of variables from 5 binary variables to 2. Next, we do a similar analysis but for continuous variables instead.

Correlation Matrices and Chi-Square Tests

Code
continuous_vars <- heart_data %>% select(serum_creatinine, age, serum_sodium)

# correlation matrix
cor_matrix <- cor(continuous_vars, use = "complete.obs")
print(cor_matrix)
                 serum_creatinine         age serum_sodium
serum_creatinine        1.0000000  0.15918713  -0.18909521
age                     0.1591871  1.00000000  -0.04596584
serum_sodium           -0.1890952 -0.04596584   1.00000000
Code
# plot correlation matrix 
library(corrplot)
corrplot 0.95 loaded
Code
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 45)

When looking at the results of the correlations between continuous variables (serum_creatinine, age, and serum_sodium), we don’t observe a highly correlated variable >0.8. therefore, we do not remove any continuous variables here.

Code
# categorical variables
categorical_vars <- heart_data %>% select(high_blood_pressure, anaemia)

#  pairwise Chi-square tests for association
chi_square_results <- combn(names(categorical_vars), 2, function(pair) {
  var1 <- pair[1]
  var2 <- pair[2]
  test <- chisq.test(table(categorical_vars[[var1]], categorical_vars[[var2]]))
  c(var1, var2, test$statistic, test$p.value)
})

# Converting results to a data frame
chi_square_results <- data.frame(
  Var1 = chi_square_results[1, ],
  Var2 = chi_square_results[2, ],
  ChiSquareStatistic = as.numeric(chi_square_results[3, ]),
  PValue = as.numeric(chi_square_results[4, ])
)
print(chi_square_results)
                 Var1    Var2 ChiSquareStatistic    PValue
1 high_blood_pressure anaemia          0.2893564 0.5906333

The above code tests the association between categorical variables that showed significance in KM Curve plots (high_blood_pressure, anaemia) in the dataset. We used Chi-square test to evaluate this. Our method of evaluating association was to look for the PValues in the above table and look for significant p-values (value <0.05), which suggests association between categorical variables. Since none of them are significant, we will move on. Next we will fit a cox preliminary model to see if these predictors become insignificant or exhibit multicolinearity using the VIF score.

Code
# Fitting a preliminary Cox model with predictors
cox_model <- coxph(Surv(time, DEATH_EVENT) ~ age + serum_creatinine + serum_sodium + high_blood_pressure + anaemia, data = heart_data)

# Calculating VIF
vif_values <- vif(cox_model)
print(vif_values)
                age    serum_creatinine        serum_sodium high_blood_pressure 
           1.004613            1.061879            1.060048            1.037329 
            anaemia 
           1.055997 

The VIF values are all below 5, indicating no multicollinearity among the variables based on continuous and categorical predictors combined. However, we can see that Anemia gets insignificant in the model, so in the final model, we will remove this variable. Therefore, we conclude that we’ve successfully ran exploratory analysis to build a multivariate cox model.

(2) Based on your exploratory analysis, build a multivariate Cox proportional hazards model only including features that appear to have an effect on survival.

Code
# Fitting Final Cox model with optimal predictors
cox_model <- coxph(Surv(time, DEATH_EVENT) ~ age + serum_creatinine + serum_sodium + high_blood_pressure, data = heart_data)

summary(cox_model)
Call:
coxph(formula = Surv(time, DEATH_EVENT) ~ age + serum_creatinine + 
    serum_sodium + high_blood_pressure, data = heart_data)

  n= 299, number of events= 96 

                         coef exp(coef)  se(coef)      z Pr(>|z|)    
age                  0.042212  1.043116  0.008774  4.811 1.50e-06 ***
serum_creatinine     0.260583  1.297686  0.060135  4.333 1.47e-05 ***
serum_sodium        -0.061887  0.939989  0.020952 -2.954  0.00314 ** 
high_blood_pressure  0.449206  1.567068  0.213367  2.105  0.03526 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

                    exp(coef) exp(-coef) lower .95 upper .95
age                     1.043     0.9587    1.0253    1.0612
serum_creatinine        1.298     0.7706    1.1534    1.4600
serum_sodium            0.940     1.0638    0.9022    0.9794
high_blood_pressure     1.567     0.6381    1.0315    2.3807

Concordance= 0.693  (se = 0.028 )
Likelihood ratio test= 51.47  on 4 df,   p=2e-10
Wald test            = 56.64  on 4 df,   p=1e-11
Score (logrank) test = 61.89  on 4 df,   p=1e-12

The Cox proportional hazards model indicates that age, serum creatinine, serum sodium, and high blood pressure are significant predictors of survival (p < 0.05). The hazard ratios suggest that higher age, serum creatinine, and high blood pressure are associated with an increased risk of death, while higher serum sodium is associated with a reduced risk of death.

(3) Repeat with an accelerated failure time model

Code
aft_model <- survreg(Surv(time, DEATH_EVENT) ~ age + serum_sodium + serum_creatinine + high_blood_pressure, 
                     data = heart_data, 
                     dist = "weibull")  

summary(aft_model)

Call:
survreg(formula = Surv(time, DEATH_EVENT) ~ age + serum_sodium + 
    serum_creatinine + high_blood_pressure, data = heart_data, 
    dist = "weibull")
                      Value Std. Error     z       p
(Intercept)          0.5257     3.1651  0.17   0.868
age                 -0.0474     0.0099 -4.79 1.7e-06
serum_sodium         0.0671     0.0233  2.88   0.004
serum_creatinine    -0.2817     0.0680 -4.15 3.4e-05
high_blood_pressure -0.4994     0.2334 -2.14   0.032
Log(scale)           0.0928     0.0893  1.04   0.298

Scale= 1.1 

Weibull distribution
Loglik(model)= -644.1   Loglik(intercept only)= -670.4
    Chisq= 52.63 on 4 degrees of freedom, p= 1e-10 
Number of Newton-Raphson Iterations: 5 
n= 299 

The accelerated failure time (AFT) model using the Weibull distribution shows that age, serum sodium, serum creatinine, and high blood pressure are significant predictors of survival time (p < 0.05). Negative coefficients for age, serum creatinine, and high blood pressure suggest they are associated with shorter survival, while higher serum sodium is linked to longer survival.

(4) For the proportional hazards model, provide appropriate diagnostic measures to indicate that assumptions have been met and feature transformations applied as needed to improve model fit

Code
# Testing proportional hazards assumption
ph_test <- cox.zph(cox_model)
ph_test
                    chisq df    p
age                 0.160  1 0.69
serum_creatinine    0.225  1 0.64
serum_sodium        0.200  1 0.65
high_blood_pressure 0.303  1 0.58
GLOBAL              1.201  4 0.88
Code
# Plotting Schoenfeld residuals
ggcoxzph(ph_test, ggtheme = theme_minimal())

The proportional hazards assumption was tested using Schoenfeld residuals, and the results indicate that the PH assumption is not violated for any individual predictor or the global model. The p-values for all individual tests (age, serum creatinine, serum sodium, and high blood pressure) are above 0.05, as is the global test (p = 0.878), indicating no significant time-dependent effects. The Schoenfeld residual plots show no clear trends over time, further supporting the validity of the proportional hazards assumption for this model.

Code
# Deviance residuals
residuals_deviance <- residuals(cox_model, type = "deviance")
plot(residuals_deviance, 
     ylab = "Deviance Residuals", 
     main = "Deviance Residuals")

Code
# Identifying influential observations using Cook's distance
ggcoxdiagnostics(cox_model, type = "dfbeta", linear.predictions = FALSE, ggtheme = theme_minimal())

The deviance residuals show that most observations fit the Cox model well, with residuals centered around zero, though a few larger residuals that can indicate outliers. The DFBETA plots suggest minimal influence of most observations on predictor coefficients.

(5) Provide appropriate diagnostics for the AFT model as well

Code
# Comparing models using AIC 
aft_lognormal <- survreg(Surv(time, DEATH_EVENT) ~ serum_sodium + serum_creatinine + anaemia + high_blood_pressure, 
                         data = heart_data, dist = "lognormal")
AIC(aft_model, aft_lognormal) 
              df      AIC
aft_model      6 1300.249
aft_lognormal  6 1310.951
Code
# Cox-Snell residuals for goodness of fit
cox_snell_residuals <- residuals(aft_model, type = "response")
km_fit <- survfit(Surv(cox_snell_residuals, DEATH_EVENT) ~ 1, data = heart_data)
plot(km_fit, xlab = "Residuals", ylab = "Survival Probability", main = "Cox-Snell Residuals")
abline(0, 1, col = "red", lty = 2) 

Code
# Deviance residuals to detect outliers
deviance_residuals <- residuals(aft_model, type = "deviance")
plot(deviance_residuals, ylab = "Deviance Residuals", xlab = "Index", main = "Deviance Residuals")
abline(h = 0, col = "red", lty = 2) 

The AFT model diagnostics indicate that the Weibull distribution (AIC = 1300.25) provides a better fit compared to the Lognormal distribution (AIC = 1310.95), as it has a lower AIC. The Cox-Snell residuals plot shows the survival probabilities are reasonably close to the reference line, indicating a good fit for the model. The deviance residuals are mostly centered around zero, suggesting a well-fitted model, though a few points have higher residuals which can be potential outliers.

(6) Provide a plot of Cox-Snell residuals for both models to show that they fit the data adequately (see https://rpubs.com/kaz_yos/resid_cox for an example with coxph)

Code
#  Cox-Snell residuals for the Cox model
cox_snell_cox <- residuals(cox_model, type = "martingale") * -1
km_fit_cox <- survfit(Surv(cox_snell_cox, DEATH_EVENT) ~ 1, data = heart_data)

#  Cox-Snell residuals for the AFT model
cox_snell_aft <- residuals(aft_model, type = "response")
km_fit_aft <- survfit(Surv(cox_snell_aft, DEATH_EVENT) ~ 1, data = heart_data)

# Plotting Cox-Snell residuals for both models
par(mfrow = c(1, 2)) 

# Cox model plot
plot(km_fit_cox, xlab = "Residuals", ylab = "Survival Probability", 
     main = "Cox-Snell Residuals (Cox Model)")
abline(0, 1, col = "red", lty = 2)

# AFT model plot
plot(km_fit_aft, xlab = "Residuals", ylab = "Survival Probability", 
     main = "Cox-Snell Residuals (AFT Model)")
abline(0, 1, col = "red", lty = 2)

For the AFT model, the survival probabilities closely follow the reference line (red dashed line), indicating a good overall fit to the data. In contrast, the Cox model shows deviation from the reference line, especially for smaller residual values, suggesting that it does not fit the data as well as the AFT model. This shows that the AFT model with the Weibull distribution can better capture the survival patterns in this dataset.