Task


By using SURVIVAL package and other packages in R for plots.

Choose survival data of your choice in R Program

Write R codes for the following models, results and interpretation.

  1. Kaplan Meier

  2. Nelson Aalen

  3. Log rank test

  4. Cox proportion Hazard and its parameters results and test of hypothesis interpretations.


Load libraries


library(here) # to easy read data from folder path
library(tidyverse) # filter, read.csv, manipulate data
library(survival) # survival library
library(survminer) #
library(summarytools)
library(kableExtra)
library(patchwork)
library(gtsummary)
library(broom)

Load Data


heart_failure<-read.csv(here("data/heart_failure_clinical_records_dataset.csv"))
df<-heart_failure %>% 
  select(
    age,
         diabetes,
         smoking,
         anaemia,
         high_blood_pressure,
         sex,
         time,
         DEATH_EVENT
    ) %>% 
  as.data.frame() %>% 
  mutate(across(
    .cols = c(diabetes, smoking, anaemia,high_blood_pressure),  
    .fns = ~ifelse(. == 1, "Yes", "No")    
  )) %>% 
  mutate(
    sex = ifelse(sex == 1, "Male", "Female"),
  ) %>% 
  as.data.frame()

The dataset contains health information for heart failure patients, focusing on key health indicators (diabetes, smoking, anemia, high blood pressure) along with basic demographics (sex, age). It tracks survival time in days and records whether death occurred during the study period, allowing researchers to analyze how various patient characteristics might influence heart failure outcomes.


characteristics of the dataset


data_structure_summary <- function(df) {
  result <- data.frame(
    Seq = 1:length(names(df)),
    Variable = names(df),
    Class = sapply(df, class),
    First_Values = sapply(df, function(x) paste0(head(x, 3), collapse = ", ")),
    Missing = sapply(df, function(x) sum(is.na(x))),
    Unique_Values = sapply(df, function(x) length(unique(x))),
    row.names = NULL  # This ensures no row names are set
  )
  return(result)
}

data_structure_summary(df) %>% kable(
      caption = "Data Structure Summary",
      align = c("r","l", "l", "l", "r", "r"),
      row.names = FALSE,
      booktabs = T,
      escape = FALSE,
      format =  ifelse(knitr::is_html_output(), 'html', 'latex')
) %>% kable_styling(
  bootstrap_options = c("striped", "hover", "condensed"),
  latex_options = c("scale_down", "hold_position"),
  full_width = F  # Changed to False for better column sizing
) %>% 
  column_spec(1, bold = TRUE, width = "3em") %>%  # Format sequence number
  column_spec(2, bold = TRUE)               # Format variable name
Data Structure Summary
Seq Variable Class First_Values Missing Unique_Values
1 age numeric 75, 55, 65 0 47
2 diabetes character No, No, No 0 2
3 smoking character No, No, Yes 0 2
4 anaemia character No, No, No 0 2
5 high_blood_pressure character Yes, No, No 0 2
6 sex character Male, Male, Male 0 2
7 time integer 4, 6, 7 0 148
8 DEATH_EVENT integer 1, 1, 1 0 2

sample of dataset


fmt<-function(k) {
  res<-k %>% t() %>% kable(
    longtable = TRUE,
    booktabs = T,
      escape = FALSE,
      format =  ifelse(knitr::is_html_output(), 'html', 'latex')
  ) %>% kable_styling(
     bootstrap_options = c("striped","condensed","hover"),
     latex_options = c("repeat_header", "hold_position"),
     full_width = F
  ) %>% 
    column_spec(1, bold = TRUE)
  return (res)
}

df %>% sample_n(7) %>% fmt()
age 66 42 49 50 65 58 55
diabetes Yes No No No No Yes Yes
smoking No Yes No No No No No
anaemia Yes Yes Yes No Yes No No
high_blood_pressure Yes No No Yes Yes Yes No
sex Female Male Female Male Male Female Female
time 95 201 147 108 194 83 246
DEATH_EVENT 0 0 0 0 0 0 0

Data Description and Properties


freq(df$diabetes) %>% kable() %>% kable_classic() ## diabetes
Freq % Valid % Valid Cum. % Total % Total Cum.
No 174 58.19398 58.19398 58.19398 58.19398
Yes 125 41.80602 100.00000 41.80602 100.00000
<NA> 0 NA NA 0.00000 100.00000
Total 299 100.00000 100.00000 100.00000 100.00000
freq(df$smoking) %>% kable() %>% kable_classic() ## smoking
Freq % Valid % Valid Cum. % Total % Total Cum.
No 203 67.89298 67.89298 67.89298 67.89298
Yes 96 32.10702 100.00000 32.10702 100.00000
<NA> 0 NA NA 0.00000 100.00000
Total 299 100.00000 100.00000 100.00000 100.00000
freq(df$high_blood_pressure) %>% kable() %>% kable_classic() #high blood pressure
Freq % Valid % Valid Cum. % Total % Total Cum.
No 194 64.88294 64.88294 64.88294 64.88294
Yes 105 35.11706 100.00000 35.11706 100.00000
<NA> 0 NA NA 0.00000 100.00000
Total 299 100.00000 100.00000 100.00000 100.00000
freq(df$sex) %>% kable() %>% kable_classic() #sex
Freq % Valid % Valid Cum. % Total % Total Cum.
Female 105 35.11706 35.11706 35.11706 35.11706
Male 194 64.88294 100.00000 64.88294 100.00000
<NA> 0 NA NA 0.00000 100.00000
Total 299 100.00000 100.00000 100.00000 100.00000
descr(df$age) %>% kable() %>% kable_classic() #age
age
Mean 60.8338930
Std.Dev 11.8948091
Min 40.0000000
Q1 51.0000000
Median 60.0000000
Q3 70.0000000
Max 95.0000000
MAD 14.8260000
IQR 19.0000000
CV 0.1955293
Skewness 0.4188266
SE.Skewness 0.1409539
Kurtosis -0.2204793
N.Valid 299.0000000
Pct.Valid 100.0000000
# Create histogram with normal curve
age_dist<-df %>%
  ggplot(aes(x = age)) +
  # First, add the histogram
  geom_histogram(aes(y = after_stat(density)), 
                bins = 15,           # Adjust number of bins for smoothness
                fill = "lightblue",  # Make histogram slightly transparent
                alpha = 0.7) +
  # Add the normal distribution curve
  stat_function(fun = dnorm, 
                args = list(
                  mean = mean(df$age),
                  sd = sd(df$age)
                ),
                color = "orange",
                linewidth = .5) +
  # Add proper labels
  labs(
    title = "Age Distribution with Normal Curve",
    x = "Age",
    y = "Density"
  ) +
  theme_bw()+
  theme(
     legend.position = "bottom",
    legend.box = "horizontal"
  )
chrt <- function(df, var_name, t, labels = c("No", "Yes"), label_text = "(0=No, 1=Yes)") {
  df %>%
    count(.data[[var_name]]) %>%
    ggplot(aes(x = factor(.data[[var_name]]), y = n, 
               fill = factor(.data[[var_name]]))) +
    geom_col() +
    labs(title = paste0(t, " ", label_text), 
         x = "Status", 
         y = "No of Patients") +
    scale_fill_discrete(name = "Status", labels = labels) +
    scale_x_discrete(labels = labels) +
    theme_bw() +
    theme(
      legend.position = "bottom",
      legend.box = "horizontal"
    )
}


diab_dist<-df %>% chrt(var_name = "diabetes", t = "Diabetes Status")
anaemia_dist<-df %>% chrt(var_name = "anaemia",t="Anaemia Status")
hb_dist<-df %>% chrt(var_name = "high_blood_pressure",t="High Blood Pressure Status")
sex_dist<-df %>% chrt(var_name = "sex",t="Patient Distribution By Sex",labels = c("Male","Female"),label_text="")
smoking_dist<-df %>% chrt(var_name = "smoking",t="Smoking Status")


combined_plot <- (sex_dist + age_dist) / (diab_dist + smoking_dist) /(anaemia_dist + hb_dist) +
  plot_layout(
    heights = c(2,2,2)
  )
combined_plot + 
  
  plot_annotation(
    title = "Heart Failure Patient Visual Characteristics",
    theme = theme(plot.title = element_text(size = 8, hjust = 0.5))
  )

# this is the surv object representation of the data
s_ob<-Surv(df$time,df$DEATH_EVENT)
make_km_frame<-function(km) {
  
  strata_factor <- rep(names(km$strata), km$strata)
  frame<- data.frame(
    strata=strata_factor,
    time=km$time,
    risk=km$n.risk,
    event=km$n.event,
    survival=km$surv,
    lower=km$lower,
    upper=km$upper
  ) %>% filter(event!=0,time %in%c(5*(1:50)))
  
  return (frame %>% select(strata,everything()))
  
}
km_tbl_options<-function(df,caption="s") {
  return (kable(
    df,
    digits = 4,  # Round numeric columns to 3 decimal places
    caption = caption,
    col.names = c("Time", "At Risk", "Events", "Survival", "Lower CI", "Upper CI"),
    longtable = TRUE
  ) %>% 
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    latex_options = c("scale_down", "hold_position", "repeat_header", "striped"),
    font_size = 8,  # Reduce font size further
    full_width = F  # Don't force full width
  )
  )
}

Kaplan Meier (Age)


km_sex<-survfit(s_ob~sex,data = df)

sdf<-make_km_frame(km_sex)

sdf %>% filter(strata == "sex=Female") %>%
   select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Female Patients")
Survival Analysis - Female Patients
Time At Risk Events Survival Lower CI Upper CI
10 104 1 0.9810 0.9552 1.0000
15 102 2 0.9617 0.9256 0.9992
20 99 1 0.9520 0.9118 0.9940
30 94 2 0.9032 0.8480 0.9621
45 86 1 0.8345 0.7656 0.9096
60 85 1 0.8247 0.7543 0.9017
65 83 1 0.8147 0.7429 0.8936
90 69 1 0.7618 0.6827 0.8500
95 64 1 0.7499 0.6692 0.8403
100 59 1 0.7372 0.6546 0.8301
115 51 1 0.7227 0.6379 0.8188
130 47 1 0.6926 0.6032 0.7952
sdf %>% filter(strata == "sex=Male") %>%
  select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Male Patients")
Survival Analysis - Male Patients
Time At Risk Events Survival Lower CI Upper CI
10 189 5 0.9485 0.9178 0.9801
20 179 1 0.9175 0.8796 0.9571
30 170 2 0.8709 0.8249 0.9194
35 164 1 0.8552 0.8070 0.9063
40 162 1 0.8447 0.7951 0.8974
50 159 1 0.8290 0.7775 0.8839
55 156 1 0.8236 0.7715 0.8793
60 154 1 0.8130 0.7597 0.8700
65 150 1 0.7970 0.7420 0.8560
90 120 1 0.7630 0.7046 0.8261
135 86 1 0.7178 0.6537 0.7881
150 75 1 0.7082 0.6427 0.7804
170 72 1 0.6795 0.6100 0.7569
180 68 2 0.6502 0.5774 0.7322
235 23 1 0.5915 0.5022 0.6967
ggsurvplot(km_sex,
           data = df
           ,pval = T,
           tables.height = 0.3,
           censor=F,
            conf.int = F,   # Show confidence intervals
            risk.table = TRUE, # Add risk table below
            ggtheme = theme_bw(),
            xlab = "Time in Days",
            ylab = "Kaplan Meir Survival Estimates",
            title = "Kaplan Meir Survival Estimates by sex")


Kaplan Meier (Diabetes)


km_dia<-survfit(s_ob~diabetes,data = df)

sdf_dia<-make_km_frame(km_dia)

sdf_dia %>% filter(strata == "diabetes=No") %>%
   select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Patients with No Diabetes")
Survival Analysis - Patients with No Diabetes
Time At Risk Events Survival Lower CI Upper CI
10 169 5 0.9425 0.9086 0.9778
15 158 1 0.9079 0.8659 0.9519
20 156 1 0.9021 0.8589 0.9474
50 146 1 0.8439 0.7915 0.8998
55 143 1 0.8380 0.7848 0.8948
60 142 1 0.8321 0.7781 0.8898
65 137 1 0.8142 0.7581 0.8744
90 111 2 0.7628 0.7009 0.8301
95 106 1 0.7556 0.6929 0.8239
115 85 1 0.7151 0.6475 0.7897
170 64 1 0.6931 0.6219 0.7723
180 60 2 0.6593 0.5835 0.7450
sdf_dia %>% filter(strata == "diabetes=Yes") %>%
  select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Patients with Diabetes")
Survival Analysis - Patients with Diabetes
Time At Risk Events Survival Lower CI Upper CI
10 124 1 0.9840 0.9622 1.0000
15 123 1 0.9760 0.9495 1.0000
20 122 1 0.9680 0.9376 0.9994
30 112 4 0.8790 0.8234 0.9383
35 103 1 0.8460 0.7847 0.9122
40 102 1 0.8377 0.7751 0.9055
45 99 1 0.8129 0.7467 0.8849
60 97 1 0.7963 0.7280 0.8709
65 96 1 0.7880 0.7188 0.8638
100 72 1 0.7518 0.6782 0.8333
130 60 1 0.7154 0.6362 0.8044
135 59 1 0.7033 0.6225 0.7945
150 53 1 0.6900 0.6074 0.7839
235 21 1 0.6025 0.5027 0.7222
ggsurvplot(km_dia,
           data = df
           ,pval = T,
           tables.height = 0.3,
           censor=F,
            conf.int = F,   # Show confidence intervals
            risk.table = TRUE, # Add risk table below
            ggtheme = theme_bw(),
            xlab = "Time in Days",
            ylab = "Kaplan Meir Survival Estimates",
            title = "Kaplan Meir Survival Estimates by Diabetes Status"
           )


Nelson-Aalen (Smoking)


na_smoke <- survfit(
  s_ob ~ smoking,  # Proper survival object formula
  data = df,
  type = "fleming-harrington",        # This gives us Nelson-Aalen estimation
  conf.type = "log-log"               # Appropriate confidence interval type
)

sdf_smoke<-make_km_frame(na_smoke)

sdf_smoke %>% filter(strata == "smoking=No") %>%
   select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Patients who do not smoke")
Survival Analysis - Patients who do not smoke
Time At Risk Events Survival Lower CI Upper CI
10 199 2 0.9705 0.9356 0.9867
15 191 2 0.9361 0.8925 0.9624
20 188 1 0.9311 0.8865 0.9586
30 178 4 0.8763 0.8224 0.9147
50 164 1 0.8307 0.7712 0.8759
60 163 2 0.8206 0.7600 0.8671
65 159 1 0.8104 0.7488 0.8582
90 131 1 0.7783 0.7137 0.8301
95 124 1 0.7720 0.7067 0.8247
100 118 1 0.7592 0.6922 0.8135
115 102 1 0.7377 0.6679 0.7951
130 92 1 0.7143 0.6414 0.7750
135 91 1 0.7065 0.6327 0.7682
180 75 2 0.6554 0.5757 0.7238
235 25 1 0.5712 0.4728 0.6579
sdf_smoke %>% filter(strata == "smoking=Yes") %>%
  select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Patients who smoke")
Survival Analysis - Patients who smoke
Time At Risk Events Survival Lower CI Upper CI
10 94 4 0.9385 0.8682 0.9719
20 90 1 0.9281 0.8551 0.9651
35 83 1 0.8556 0.7684 0.9118
40 82 1 0.8453 0.7565 0.9037
45 81 1 0.8349 0.7447 0.8954
55 78 1 0.8243 0.7326 0.8869
65 74 1 0.7921 0.6964 0.8606
90 58 1 0.7336 0.6310 0.8118
150 34 1 0.6985 0.5878 0.7848
170 32 1 0.6568 0.5368 0.7527
ggsurvplot(
  na_smoke,
  fun = "cumhaz",    # This tells ggsurvplot to show cumulative hazard
  data = df,
  censor=F,
  conf.int = F,   # Show confidence intervals
  risk.table = TRUE, # Add risk table below
  ggtheme = theme_bw(),
  xlab = "Time in Days",
  ylab = "Cumulative Hazard",
  title = "Nelson-Aalen Cumulative Hazard Estimate by Smoking Status"
)


Nelson Aalen (HBP)


na_hbp <- survfit(
  s_ob ~ high_blood_pressure,  # Proper survival object formula
  data = df,
  type = "fleming-harrington",        # This gives us Nelson-Aalen estimation
  conf.type = "log-log"               # Appropriate confidence interval type
)

sdf_hbp<-make_km_frame(na_hbp)

sdf_hbp %>% filter(strata == "high_blood_pressure=No") %>%
   select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Patients who do not HBP")
Survival Analysis - Patients who do not HBP
Time At Risk Events Survival Lower CI Upper CI
10 190 3 0.9641 0.9262 0.9827
15 183 1 0.9384 0.8941 0.9646
30 176 3 0.9023 0.8511 0.9365
35 171 1 0.8918 0.8389 0.9281
45 168 1 0.8761 0.8209 0.9152
50 167 1 0.8709 0.8149 0.9109
65 160 2 0.8495 0.7907 0.8929
90 132 2 0.7990 0.7342 0.8496
95 126 1 0.7927 0.7271 0.8442
135 97 1 0.7492 0.6775 0.8072
150 86 1 0.7405 0.6675 0.8000
170 84 1 0.7232 0.6476 0.7853
180 77 1 0.6883 0.6084 0.7553
235 34 1 0.6324 0.5391 0.7119
sdf_hbp %>% filter(strata == "high_blood_pressure=Yes") %>%
  select(time,risk,event,survival,lower,upper) %>% 
  km_tbl_options("Survival Analysis - Patients who have HBP")
Survival Analysis - Patients who have HBP
Time At Risk Events Survival Lower CI Upper CI
10 103 3 0.9529 0.8905 0.9801
15 98 1 0.9338 0.8662 0.9679
20 96 2 0.9146 0.8422 0.9546
30 88 1 0.8468 0.7621 0.9032
40 82 1 0.7982 0.7074 0.8635
55 77 1 0.7494 0.6541 0.8219
60 75 2 0.7201 0.6228 0.7963
100 51 1 0.6842 0.5830 0.7658
115 43 1 0.6535 0.5474 0.7405
130 36 1 0.6355 0.5263 0.7261
180 29 1 0.5951 0.4784 0.6939
ggsurvplot(
  na_hbp,
  fun = "cumhaz",    # This tells ggsurvplot to show cumulative hazard
  data = df,
  censor=F,
  conf.int = F,   # Show confidence intervals
  risk.table = TRUE, # Add risk table below
  ggtheme = theme_bw(),
  xlab = "Time in Days",
  ylab = "Cumulative Hazard",
  title = "Nelson-Aalen Cumulative Hazard Estimate by HBP Status"
)


Log-Rank Test


using HBP Estimates


lr_table<-function(diff) {
  df<-data.frame(
  name=diff$n,
  obs=diff$obs,
  exp=diff$exp,
  pv=diff$pvalue,
  chq=diff$chisq
) %>%t() %>%  kable() %>% kable_styling(
  bootstrap_options = c("striped", "hover", "condensed"),
  latex_options = c("scale_down", "hold_position"),  # Added hold_position
  full_width = F
)
  return(df)
}

sdiff_hbp<-survdiff(s_ob~high_blood_pressure,data = df,rho = 0)
lr_table(sdiff_hbp)
name.groups high_blood_pressure=No high_blood_pressure=Yes
name.Freq 194 105
obs 57 39
exp 66.42245 29.57755
pv 0.03580752 0.03580752
chq 4.406248 4.406248
sdiff_smoke<-survdiff(s_ob~smoking,data=df,rho = 0)
lr_table(sdiff_smoke)
name.groups smoking=No smoking=Yes
name.Freq 203 96
obs 66 30
exp 65.79487 30.20513
pv 0.9639597 0.9639597
chq 0.002041704 0.002041704
sdiff_sex<-survdiff(s_ob~sex,data=df,rho = 0)
lr_table(sdiff_sex)
name.groups sex=Female sex=Male
name.Freq 105 194
obs 34 62
exp 34.2953 61.7047
pv 0.9497523 0.9497523
chq 0.003971242 0.003971242
sdiff_dia<-survdiff(s_ob~diabetes,data=df,rho = 0)
lr_table(sdiff_dia)
name.groups diabetes=No diabetes=Yes
name.Freq 174 125
obs 56 40
exp 55.02732 40.97268
pv 0.840452 0.840452
chq 0.04052788 0.04052788
sdiff_anae<-survdiff(s_ob~anaemia,data=df,rho = 0)
lr_table(sdiff_anae)
name.groups anaemia=No anaemia=Yes
name.Freq 170 129
obs 50 46
exp 57.8783 38.1217
pv 0.09869758 0.09869758
chq 2.726464 2.726464

Conclusion

Based on the log-rank test results, only high blood pressure demonstrated a significant difference in survival patterns among heart failure patients (p = 0.035). Other clinical variables examined showed no statistically significant impact on survival distributions at the 5% significance level.


Cox-Proportional Hazard


cox_sex<-coxph(s_ob~sex,data = df)
tidy(cox_sex) %>%t() %>%  kable() %>% kable_styling(
  bootstrap_options = c("striped", "condensed", "hover"),
  latex_options = c("scale_down", "hold_position"),
  font_size = 9,
  position = "center",  # Centers the table
  full_width = F  # Better for transposed tables
)
term sexMale
estimate 0.01356273
std.error 0.2134404
statistic 0.06354342
p.value 0.9493338
cox_smoking<-coxph(s_ob~smoking,data = df)
tidy(cox_smoking) %>% t() %>% kable() %>% kable_styling(
  bootstrap_options = c("striped", "condensed", "hover"),
  latex_options = c("scale_down", "hold_position"),
  font_size = 9,
  position = "center",  # Centers the table
  full_width = F  # Better for transposed tables
)
term smokingYes
estimate -0.009586323
std.error 0.2203002
statistic -0.04351483
p.value 0.9652911
cox_hbp<-coxph(s_ob~high_blood_pressure,data = df)
tidy(cox_hbp) %>% t() %>% kable() %>% kable_styling(
  bootstrap_options = c("striped", "condensed", "hover"),
  latex_options = c("scale_down", "hold_position"),
  font_size = 9,
  position = "center",  # Centers the table
  full_width = F  # Better for transposed tables
)
term high_blood_pressureYes
estimate 0.4359492
std.error 0.2093982
statistic 2.081915
p.value 0.03735024
cox_anae<-coxph(s_ob~anaemia,data = df)
tidy(cox_anae) %>% t() %>% kable() %>% kable_styling(
  bootstrap_options = c("striped", "condensed", "hover"),
  latex_options = c("scale_down", "hold_position"),
  font_size = 9,
  position = "center",  # Centers the table
  full_width = F  # Better for transposed tables
)
term anaemiaYes
estimate 0.3373673
std.error 0.2049819
statistic 1.64584
p.value 0.0997968
cox_dia<-coxph(s_ob~diabetes,data = df)
tidy(cox_dia) %>% t() %>% kable() %>% kable_styling(
  bootstrap_options = c("striped", "condensed", "hover"),
  latex_options = c("scale_down", "hold_position"),
  font_size = 9,
  position = "center",  # Centers the table
  full_width = F  # Better for transposed tables
)
term diabetesYes
estimate -0.04184191
std.error 0.2072847
statistic -0.2018572
p.value 0.8400284

Conclusion

Based on the Cox proportional hazards regression analysis, the results align with our log-rank test findings - high blood pressure remains the only significant predictor of survival (p = 0.037) at the 5% significance level, while other clinical variables show no statistically significant impact on survival outcomes.