Assignment 3

Task

Use the survival package, together with other R packages for plotting.

Choose a survival dataset of your choice in R.

Write R code for the following methods, and report and interpret the results.

Kaplan Meier
Nelson Aalen
Log rank test
Cox proportional hazards model: parameter estimates, hypothesis tests, and their interpretation.

Load libraries

library(here) # to easy read data from folder path
library(tidyverse) # filter, read.csv, manipulate data
library(survival) # survival library
library(survminer) #
library(summarytools)
library(kableExtra)
library(patchwork)
library(gtsummary)
library(broom)

Load Data

# Read the heart-failure clinical records from the project's data folder.
heart_failure <- read.csv(
  here("data/heart_failure_clinical_records_dataset.csv")
)

# Keep only the analysis variables, then recode the indicator columns
# (compared against 1) into readable text labels.
df <- heart_failure %>%
  select(
    age, diabetes, smoking, anaemia, high_blood_pressure,
    sex, time, DEATH_EVENT
  ) %>%
  mutate(
    across(
      c(diabetes, smoking, anaemia, high_blood_pressure),
      function(flag) ifelse(flag == 1, "Yes", "No")
    ),
    sex = ifelse(sex == 1, "Male", "Female")
  ) %>%
  as.data.frame()

The dataset contains health information for heart failure patients, focusing on key health indicators (diabetes, smoking, anemia, high blood pressure) along with basic demographics (sex, age). It tracks survival time in days and records whether death occurred during the study period, allowing researchers to analyze how various patient characteristics might influence heart failure outcomes.

characteristics of the dataset

# Summarise the structure of a data frame, one row per column.
#
# Arguments:
#   df  A data frame (it may have zero columns).
#
# Returns a data frame with integer row names and the columns
#   Seq            position of the column in `df`
#   Variable       column name
#   Class          class of the column; several classes are joined with "/"
#   First_Values   the first three values, separated by ", "
#   Missing        number of NA values
#   Unique_Values  number of distinct values
data_structure_summary <- function(df) {
  # vapply() fixes the result type of every summary, so a column with more
  # than one class (e.g. POSIXct) or a zero-column input cannot change the
  # shape of the pieces handed to data.frame().
  per_column <- function(fn, template) {
    vapply(df, fn, template, USE.NAMES = FALSE)
  }

  data.frame(
    Seq = seq_along(df),
    Variable = as.character(names(df)),
    Class = per_column(
      function(x) paste(class(x), collapse = "/"),
      character(1)
    ),
    First_Values = per_column(
      function(x) paste0(head(x, 3), collapse = ", "),
      character(1)
    ),
    Missing = per_column(function(x) sum(is.na(x)), integer(1)),
    Unique_Values = per_column(function(x) length(unique(x)), integer(1)),
    row.names = NULL, # keep plain integer row names
    stringsAsFactors = FALSE
  )
}

# Render the structure summary of `df` as a styled table.
# Cell text is escaped (kable's default) so that names containing
# underscores, such as `high_blood_pressure` or `First_Values`, stay valid
# when the output format is LaTeX.
data_structure_summary(df) %>%
  kable(
    caption = "Data Structure Summary",
    align = c("r", "l", "l", "l", "r", "r"),
    row.names = FALSE,
    booktabs = TRUE,
    escape = TRUE,
    # Scalar condition, so plain if/else rather than vectorised ifelse().
    format = if (knitr::is_html_output()) "html" else "latex"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    latex_options = c("scale_down", "hold_position"),
    full_width = FALSE # let the columns size to their content
  ) %>%
  column_spec(1, bold = TRUE, width = "3em") %>% # sequence number
  column_spec(2, bold = TRUE) # variable name

Data Structure Summary
Seq	Variable	Class	First_Values	Unique_Values
1	age	numeric	75, 55, 65	47
2	diabetes	character	No, No, No	2
3	smoking	character	No, No, Yes	2
4	anaemia	character	No, No, No	2
5	high_blood_pressure	character	Yes, No, No	2
6	sex	character	Male, Male, Male	2
7	time	integer	4, 6, 7	148
8	DEATH_EVENT	integer	1, 1, 1	2

sample of dataset

# Render a data frame transposed (one row per variable) as a styled table.
#
# Arguments:
#   k  A data frame or matrix; it is transposed with t() before rendering.
#
# Returns the styled kable object.
fmt <- function(k) {
  k %>%
    t() %>%
    kable(
      longtable = TRUE,
      booktabs = TRUE,
      # Escape cell text so row labels that contain underscores
      # (e.g. `high_blood_pressure`) do not break LaTeX output.
      escape = TRUE,
      # Scalar condition, so plain if/else rather than vectorised ifelse().
      format = if (knitr::is_html_output()) "html" else "latex"
    ) %>%
    kable_styling(
      bootstrap_options = c("striped", "condensed", "hover"),
      latex_options = c("repeat_header", "hold_position"),
      full_width = FALSE
    ) %>%
    column_spec(1, bold = TRUE) # first column holds the variable names
}

# Show seven randomly chosen patients. slice_sample() is the current dplyr
# verb for row sampling; sample_n() is superseded.
df %>%
  slice_sample(n = 7) %>%
  fmt()

age	66	42	49	50	65	58	55
diabetes	Yes	No	No	No	No	Yes	Yes
smoking	No	Yes	No	No	No	No	No
anaemia	Yes	Yes	Yes	No	Yes	No	No
high_blood_pressure	Yes	No	No	Yes	Yes	Yes	No
sex	Female	Male	Female	Male	Male	Female	Female
time	95	201	147	108	194	83	246
DEATH_EVENT	0	0	0	0	0	0	0

Data Description and Properties

# Frequency table of the diabetes column.
freq(df$diabetes) %>%
  kable() %>%
  kable_classic()

	Freq	% Valid	% Valid Cum.	% Total	% Total Cum.
No	174	58.19398	58.19398	58.19398	58.19398
Yes	125	41.80602	100.00000	41.80602	100.00000
<NA>	0	NA	NA	0.00000	100.00000
Total	299	100.00000	100.00000	100.00000	100.00000

# Frequency table of the smoking column.
freq(df$smoking) %>%
  kable() %>%
  kable_classic()

	Freq	% Valid	% Valid Cum.	% Total	% Total Cum.
No	203	67.89298	67.89298	67.89298	67.89298
Yes	96	32.10702	100.00000	32.10702	100.00000
<NA>	0	NA	NA	0.00000	100.00000
Total	299	100.00000	100.00000	100.00000	100.00000

# Frequency table of the high_blood_pressure column.
freq(df$high_blood_pressure) %>%
  kable() %>%
  kable_classic()

	Freq	% Valid	% Valid Cum.	% Total	% Total Cum.
No	194	64.88294	64.88294	64.88294	64.88294
Yes	105	35.11706	100.00000	35.11706	100.00000
<NA>	0	NA	NA	0.00000	100.00000
Total	299	100.00000	100.00000	100.00000	100.00000

# Frequency table of the sex column.
freq(df$sex) %>%
  kable() %>%
  kable_classic()

	Freq	% Valid	% Valid Cum.	% Total	% Total Cum.
Female	105	35.11706	35.11706	35.11706	35.11706
Male	194	64.88294	100.00000	64.88294	100.00000
<NA>	0	NA	NA	0.00000	100.00000
Total	299	100.00000	100.00000	100.00000	100.00000

# Descriptive statistics of the age column.
descr(df$age) %>%
  kable() %>%
  kable_classic()

	age
Mean	60.8338930
Std.Dev	11.8948091
Min	40.0000000
Q1	51.0000000
Median	60.0000000
Q3	70.0000000
Max	95.0000000
MAD	14.8260000
IQR	19.0000000
CV	0.1955293
Skewness	0.4188266
SE.Skewness	0.1409539
Kurtosis	-0.2204793
N.Valid	299.0000000
Pct.Valid	100.0000000

# Density-scaled histogram of age, overlaid with the normal curve that has
# the same mean and standard deviation as the observed ages.
age_dist <- ggplot(df, aes(x = age)) +
  geom_histogram(
    aes(y = after_stat(density)), # density scale, to match dnorm()
    bins = 15,
    fill = "lightblue",
    alpha = 0.7 # slightly transparent bars
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(df$age), sd = sd(df$age)),
    color = "orange",
    linewidth = .5
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.box = "horizontal") +
  labs(
    title = "Age Distribution with Normal Curve",
    x = "Age",
    y = "Density"
  )

# Bar chart of the number of rows for each value of one column.
#
# Arguments:
#   df          Data frame holding the column.
#   var_name    Name of the column to count, given as a string.
#   t           Leading part of the plot title.
#   labels      Axis and legend labels, passed unchanged to the x and fill
#               scales.
#   label_text  Text appended to the title after a single space.
#
# Returns a ggplot object.
chrt <- function(df, var_name, t, labels = c("No", "Yes"), label_text = "(0=No, 1=Yes)") {
  tally <- count(df, .data[[var_name]])
  bar_mapping <- aes(
    x = factor(.data[[var_name]]),
    y = n,
    fill = factor(.data[[var_name]])
  )

  ggplot(tally, bar_mapping) +
    geom_col() +
    scale_x_discrete(labels = labels) +
    scale_fill_discrete(name = "Status", labels = labels) +
    labs(
      title = paste0(t, " ", label_text),
      x = "Status",
      y = "No of Patients"
    ) +
    theme_bw() +
    theme(legend.position = "bottom", legend.box = "horizontal")
}


# One bar chart per categorical covariate.
diab_dist <- df %>% chrt(var_name = "diabetes", t = "Diabetes Status")
anaemia_dist <- df %>% chrt(var_name = "anaemia", t = "Anaemia Status")
hb_dist <- df %>%
  chrt(var_name = "high_blood_pressure", t = "High Blood Pressure Status")
# chrt() applies `labels` by position to the bars, which follow the sorted
# values of the column. "Female" sorts before "Male", so the labels must be
# given in that order or the two bars end up mislabelled.
sex_dist <- df %>%
  chrt(
    var_name = "sex",
    t = "Patient Distribution By Sex",
    labels = c("Female", "Male"),
    label_text = ""
  )
smoking_dist <- df %>% chrt(var_name = "smoking", t = "Smoking Status")

# Arrange the six panels in three rows of equal height.
combined_plot <- (sex_dist + age_dist) /
  (diab_dist + smoking_dist) /
  (anaemia_dist + hb_dist) +
  plot_layout(heights = c(2, 2, 2))

combined_plot +
  plot_annotation(
    title = "Heart Failure Patient Visual Characteristics",
    theme = theme(plot.title = element_text(size = 8, hjust = 0.5))
  )

# Survival response shared by every model below: follow-up time paired with
# the death indicator.
s_ob <- Surv(time = df$time, event = df$DEATH_EVENT)
# Flatten a fitted survival curve into a data frame for tabulation.
#
# Arguments:
#   km     Fitted curve object; the components read are `strata`, `time`,
#          `n.risk`, `n.event`, `surv`, `lower` and `upper`.
#   times  Time points to report. Defaults to every fifth value from 5 to 250.
#
# Returns a data frame with columns strata, time, risk, event, survival,
# lower and upper, restricted to rows that have at least one event and whose
# time is one of `times`. Row names are reset to 1..n.
make_km_frame <- function(km, times = 5 * (1:50)) {
  # A fit without grouping has no `strata` component; label every row "All"
  # so the `strata` column always exists.
  if (is.null(km$strata)) {
    strata_label <- rep("All", length(km$time))
  } else {
    strata_label <- rep(names(km$strata), km$strata)
  }

  frame <- data.frame(
    strata = strata_label,
    time = km$time,
    risk = km$n.risk,
    event = km$n.event,
    survival = km$surv,
    lower = km$lower,
    upper = km$upper,
    stringsAsFactors = FALSE
  )

  # which() drops rows whose condition is NA instead of returning NA rows.
  keep <- which(frame$event != 0 & frame$time %in% times)
  frame <- frame[keep, , drop = FALSE]
  rownames(frame) <- NULL
  frame
}
# Render a six-column survival table as a compact, styled kable.
#
# Arguments:
#   df       Table to render; its columns are relabelled as Time, At Risk,
#            Events, Survival, Lower CI and Upper CI, in that order.
#   caption  Table caption.
#
# Returns the styled kable object.
km_tbl_options <- function(df, caption = "s") {
  tbl <- kable(
    df,
    digits = 4, # numeric columns are rounded to 4 decimal places
    caption = caption,
    col.names = c("Time", "At Risk", "Events", "Survival", "Lower CI", "Upper CI"),
    longtable = TRUE
  )

  kable_styling(
    tbl,
    bootstrap_options = c("striped", "hover", "condensed"),
    latex_options = c("scale_down", "hold_position", "repeat_header", "striped"),
    font_size = 8, # small font keeps the table compact
    full_width = FALSE # size columns to their content
  )
}

Kaplan Meier (Sex)

# Survival curves estimated separately for each sex, then flattened into
# one row per reported time point.
km_sex <- survfit(s_ob ~ sex, data = df)
sdf <- make_km_frame(km_sex)

# Table for the female stratum.
sdf %>%
  filter(strata == "sex=Female") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Female Patients")

Survival Analysis - Female Patients
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	104	1	0.9810	0.9552	1.0000
15	102	2	0.9617	0.9256	0.9992
20	99	1	0.9520	0.9118	0.9940
30	94	2	0.9032	0.8480	0.9621
45	86	1	0.8345	0.7656	0.9096
60	85	1	0.8247	0.7543	0.9017
65	83	1	0.8147	0.7429	0.8936
90	69	1	0.7618	0.6827	0.8500
95	64	1	0.7499	0.6692	0.8403
100	59	1	0.7372	0.6546	0.8301
115	51	1	0.7227	0.6379	0.8188
130	47	1	0.6926	0.6032	0.7952

# Table for the male stratum.
sdf %>%
  filter(strata == "sex=Male") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Male Patients")

Survival Analysis - Male Patients
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	189	5	0.9485	0.9178	0.9801
20	179	1	0.9175	0.8796	0.9571
30	170	2	0.8709	0.8249	0.9194
35	164	1	0.8552	0.8070	0.9063
40	162	1	0.8447	0.7951	0.8974
50	159	1	0.8290	0.7775	0.8839
55	156	1	0.8236	0.7715	0.8793
60	154	1	0.8130	0.7597	0.8700
65	150	1	0.7970	0.7420	0.8560
90	120	1	0.7630	0.7046	0.8261
135	86	1	0.7178	0.6537	0.7881
150	75	1	0.7082	0.6427	0.7804
170	72	1	0.6795	0.6100	0.7569
180	68	2	0.6502	0.5774	0.7322
235	23	1	0.5915	0.5022	0.6967

# Survival curves by sex with a risk table underneath.
# The axis label and title previously misspelled "Meier" as "Meir".
ggsurvplot(
  km_sex,
  data = df,
  pval = TRUE, # print the p-value on the plot
  tables.height = 0.3,
  censor = FALSE, # do not draw censoring marks
  conf.int = FALSE, # confidence bands are switched off
  risk.table = TRUE, # add the risk table below the curves
  ggtheme = theme_bw(),
  xlab = "Time in Days",
  ylab = "Kaplan Meier Survival Estimates",
  title = "Kaplan Meier Survival Estimates by sex"
)

Kaplan Meier (Diabetes)

# Survival curves estimated separately by diabetes status, then flattened
# into one row per reported time point.
km_dia <- survfit(s_ob ~ diabetes, data = df)
sdf_dia <- make_km_frame(km_dia)

# Table for the stratum without diabetes.
sdf_dia %>%
  filter(strata == "diabetes=No") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Patients with No Diabetes")

Survival Analysis - Patients with No Diabetes
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	169	5	0.9425	0.9086	0.9778
15	158	1	0.9079	0.8659	0.9519
20	156	1	0.9021	0.8589	0.9474
50	146	1	0.8439	0.7915	0.8998
55	143	1	0.8380	0.7848	0.8948
60	142	1	0.8321	0.7781	0.8898
65	137	1	0.8142	0.7581	0.8744
90	111	2	0.7628	0.7009	0.8301
95	106	1	0.7556	0.6929	0.8239
115	85	1	0.7151	0.6475	0.7897
170	64	1	0.6931	0.6219	0.7723
180	60	2	0.6593	0.5835	0.7450

# Table for the stratum with diabetes.
sdf_dia %>%
  filter(strata == "diabetes=Yes") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Patients with Diabetes")

Survival Analysis - Patients with Diabetes
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	124	1	0.9840	0.9622	1.0000
15	123	1	0.9760	0.9495	1.0000
20	122	1	0.9680	0.9376	0.9994
30	112	4	0.8790	0.8234	0.9383
35	103	1	0.8460	0.7847	0.9122
40	102	1	0.8377	0.7751	0.9055
45	99	1	0.8129	0.7467	0.8849
60	97	1	0.7963	0.7280	0.8709
65	96	1	0.7880	0.7188	0.8638
100	72	1	0.7518	0.6782	0.8333
130	60	1	0.7154	0.6362	0.8044
135	59	1	0.7033	0.6225	0.7945
150	53	1	0.6900	0.6074	0.7839
235	21	1	0.6025	0.5027	0.7222

# Survival curves by diabetes status with a risk table underneath.
# The axis label and title previously misspelled "Meier" as "Meir".
ggsurvplot(
  km_dia,
  data = df,
  pval = TRUE, # print the p-value on the plot
  tables.height = 0.3,
  censor = FALSE, # do not draw censoring marks
  conf.int = FALSE, # confidence bands are switched off
  risk.table = TRUE, # add the risk table below the curves
  ggtheme = theme_bw(),
  xlab = "Time in Days",
  ylab = "Kaplan Meier Survival Estimates",
  title = "Kaplan Meier Survival Estimates by Diabetes Status"
)

Nelson-Aalen (Smoking)

# Curves by smoking status. The "fleming-harrington" type derives survival
# from the Nelson-Aalen cumulative hazard; confidence limits use the
# log-log transformation.
na_smoke <- survfit(
  s_ob ~ smoking,
  data = df,
  type = "fleming-harrington",
  conf.type = "log-log"
)
sdf_smoke <- make_km_frame(na_smoke)

# Table for the non-smoking stratum.
sdf_smoke %>%
  filter(strata == "smoking=No") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Patients who do not smoke")

Survival Analysis - Patients who do not smoke
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	199	2	0.9705	0.9356	0.9867
15	191	2	0.9361	0.8925	0.9624
20	188	1	0.9311	0.8865	0.9586
30	178	4	0.8763	0.8224	0.9147
50	164	1	0.8307	0.7712	0.8759
60	163	2	0.8206	0.7600	0.8671
65	159	1	0.8104	0.7488	0.8582
90	131	1	0.7783	0.7137	0.8301
95	124	1	0.7720	0.7067	0.8247
100	118	1	0.7592	0.6922	0.8135
115	102	1	0.7377	0.6679	0.7951
130	92	1	0.7143	0.6414	0.7750
135	91	1	0.7065	0.6327	0.7682
180	75	2	0.6554	0.5757	0.7238
235	25	1	0.5712	0.4728	0.6579

# Table for the smoking stratum.
sdf_smoke %>%
  filter(strata == "smoking=Yes") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Patients who smoke")

Survival Analysis - Patients who smoke
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	94	4	0.9385	0.8682	0.9719
20	90	1	0.9281	0.8551	0.9651
35	83	1	0.8556	0.7684	0.9118
40	82	1	0.8453	0.7565	0.9037
45	81	1	0.8349	0.7447	0.8954
55	78	1	0.8243	0.7326	0.8869
65	74	1	0.7921	0.6964	0.8606
90	58	1	0.7336	0.6310	0.8118
150	34	1	0.6985	0.5878	0.7848
170	32	1	0.6568	0.5368	0.7527

# Cumulative hazard curves by smoking status with a risk table underneath.
ggsurvplot(
  fit = na_smoke,
  data = df,
  fun = "cumhaz", # plot the cumulative hazard instead of survival
  censor = FALSE, # do not draw censoring marks
  conf.int = FALSE, # confidence bands are switched off
  risk.table = TRUE, # add the risk table below the curves
  ggtheme = theme_bw(),
  xlab = "Time in Days",
  ylab = "Cumulative Hazard",
  title = "Nelson-Aalen Cumulative Hazard Estimate by Smoking Status"
)

Nelson Aalen (HBP)

# Curves by high blood pressure status. The "fleming-harrington" type
# derives survival from the Nelson-Aalen cumulative hazard; confidence
# limits use the log-log transformation.
na_hbp <- survfit(
  s_ob ~ high_blood_pressure,
  data = df,
  type = "fleming-harrington",
  conf.type = "log-log"
)
sdf_hbp <- make_km_frame(na_hbp)

# Table for the stratum without high blood pressure.
sdf_hbp %>%
  filter(strata == "high_blood_pressure=No") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Patients who do not HBP")

Survival Analysis - Patients who do not HBP
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	190	3	0.9641	0.9262	0.9827
15	183	1	0.9384	0.8941	0.9646
30	176	3	0.9023	0.8511	0.9365
35	171	1	0.8918	0.8389	0.9281
45	168	1	0.8761	0.8209	0.9152
50	167	1	0.8709	0.8149	0.9109
65	160	2	0.8495	0.7907	0.8929
90	132	2	0.7990	0.7342	0.8496
95	126	1	0.7927	0.7271	0.8442
135	97	1	0.7492	0.6775	0.8072
150	86	1	0.7405	0.6675	0.8000
170	84	1	0.7232	0.6476	0.7853
180	77	1	0.6883	0.6084	0.7553
235	34	1	0.6324	0.5391	0.7119

# Table for the stratum with high blood pressure.
sdf_hbp %>%
  filter(strata == "high_blood_pressure=Yes") %>%
  select(time, risk, event, survival, lower, upper) %>%
  km_tbl_options(caption = "Survival Analysis - Patients who have HBP")

Survival Analysis - Patients who have HBP
Time	At Risk	Events	Survival	Lower CI	Upper CI
10	103	3	0.9529	0.8905	0.9801
15	98	1	0.9338	0.8662	0.9679
20	96	2	0.9146	0.8422	0.9546
30	88	1	0.8468	0.7621	0.9032
40	82	1	0.7982	0.7074	0.8635
55	77	1	0.7494	0.6541	0.8219
60	75	2	0.7201	0.6228	0.7963
100	51	1	0.6842	0.5830	0.7658
115	43	1	0.6535	0.5474	0.7405
130	36	1	0.6355	0.5263	0.7261
180	29	1	0.5951	0.4784	0.6939

# Cumulative hazard curves by high blood pressure status with a risk table
# underneath.
ggsurvplot(
  fit = na_hbp,
  data = df,
  fun = "cumhaz", # plot the cumulative hazard instead of survival
  censor = FALSE, # do not draw censoring marks
  conf.int = FALSE, # confidence bands are switched off
  risk.table = TRUE, # add the risk table below the curves
  ggtheme = theme_bw(),
  xlab = "Time in Days",
  ylab = "Cumulative Hazard",
  title = "Nelson-Aalen Cumulative Hazard Estimate by HBP Status"
)

Log-Rank Test

using HBP Estimates

# Render the main components of a survdiff() result as a transposed table.
#
# Arguments:
#   diff  Test result; the components read are `n`, `obs`, `exp`, `chisq`
#         and, when present, `pvalue`.
#
# Returns a styled kable whose rows are the group sizes, observed and
# expected event counts, the p-value and the chi-square statistic.
lr_table <- function(diff) {
  # data.frame() silently drops a NULL argument, so without this fallback the
  # `pv` row would disappear whenever `diff` carries no `pvalue` component.
  # In that case derive it from the chi-square statistic, using the number of
  # groups with a positive expected count minus one as degrees of freedom.
  p_value <- diff$pvalue
  if (is.null(p_value)) {
    expected <- if (is.matrix(diff$exp)) rowSums(diff$exp) else diff$exp
    p_value <- pchisq(
      diff$chisq,
      df = sum(expected > 0) - 1,
      lower.tail = FALSE
    )
  }

  data.frame(
    name = diff$n,
    obs = diff$obs,
    exp = diff$exp,
    pv = p_value,
    chq = diff$chisq
  ) %>%
    t() %>%
    kable() %>%
    kable_styling(
      bootstrap_options = c("striped", "hover", "condensed"),
      latex_options = c("scale_down", "hold_position"),
      full_width = FALSE
    )
}

# Log-rank test (rho = 0) of survival by high blood pressure status.
sdiff_hbp <- survdiff(
  s_ob ~ high_blood_pressure,
  data = df,
  rho = 0
)
lr_table(sdiff_hbp)


name.groups	high_blood_pressure=No	high_blood_pressure=Yes
name.Freq	194	105
obs	57	39
exp	66.42245	29.57755
pv	0.03580752	0.03580752
chq	4.406248	4.406248

# Log-rank test (rho = 0) of survival by smoking status.
sdiff_smoke <- survdiff(
  s_ob ~ smoking,
  data = df,
  rho = 0
)
lr_table(sdiff_smoke)


name.groups	smoking=No	smoking=Yes
name.Freq	203	96
obs	66	30
exp	65.79487	30.20513
pv	0.9639597	0.9639597
chq	0.002041704	0.002041704

# Log-rank test (rho = 0) of survival by sex.
sdiff_sex <- survdiff(
  s_ob ~ sex,
  data = df,
  rho = 0
)
lr_table(sdiff_sex)


name.groups	sex=Female	sex=Male
name.Freq	105	194
obs	34	62
exp	34.2953	61.7047
pv	0.9497523	0.9497523
chq	0.003971242	0.003971242

# Log-rank test (rho = 0) of survival by diabetes status.
sdiff_dia <- survdiff(
  s_ob ~ diabetes,
  data = df,
  rho = 0
)
lr_table(sdiff_dia)


name.groups	diabetes=No	diabetes=Yes
name.Freq	174	125
obs	56	40
exp	55.02732	40.97268
pv	0.840452	0.840452
chq	0.04052788	0.04052788

# Log-rank test (rho = 0) of survival by anaemia status.
sdiff_anae <- survdiff(
  s_ob ~ anaemia,
  data = df,
  rho = 0
)
lr_table(sdiff_anae)


name.groups	anaemia=No	anaemia=Yes
name.Freq	170	129
obs	50	46
exp	57.8783	38.1217
pv	0.09869758	0.09869758
chq	2.726464	2.726464

Conclusion

Based on the log-rank test results, only high blood pressure showed a statistically significant difference in survival among heart failure patients (p = 0.036). None of the other clinical variables examined (smoking, sex, diabetes and anaemia) showed a statistically significant difference in survival distributions at the 5% significance level.

Cox Proportional Hazards

# Cox model with sex as the only covariate; the tidied coefficient table is
# transposed so each statistic is shown on its own row.
cox_sex <- coxph(s_ob ~ sex, data = df)
cox_sex %>%
  tidy() %>%
  t() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "condensed", "hover"),
    latex_options = c("scale_down", "hold_position"),
    font_size = 9,
    position = "center",
    full_width = FALSE
  )


term	sexMale
estimate	0.01356273
std.error	0.2134404
statistic	0.06354342
p.value	0.9493338

# Cox model with smoking as the only covariate; the tidied coefficient table
# is transposed so each statistic is shown on its own row.
cox_smoking <- coxph(s_ob ~ smoking, data = df)
cox_smoking %>%
  tidy() %>%
  t() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "condensed", "hover"),
    latex_options = c("scale_down", "hold_position"),
    font_size = 9,
    position = "center",
    full_width = FALSE
  )


term	smokingYes
estimate	-0.009586323
std.error	0.2203002
statistic	-0.04351483
p.value	0.9652911

# Cox model with high blood pressure as the only covariate; the tidied
# coefficient table is transposed so each statistic is shown on its own row.
cox_hbp <- coxph(s_ob ~ high_blood_pressure, data = df)
cox_hbp %>%
  tidy() %>%
  t() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "condensed", "hover"),
    latex_options = c("scale_down", "hold_position"),
    font_size = 9,
    position = "center",
    full_width = FALSE
  )


term	high_blood_pressureYes
estimate	0.4359492
std.error	0.2093982
statistic	2.081915
p.value	0.03735024

# Cox model with anaemia as the only covariate; the tidied coefficient table
# is transposed so each statistic is shown on its own row.
cox_anae <- coxph(s_ob ~ anaemia, data = df)
cox_anae %>%
  tidy() %>%
  t() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "condensed", "hover"),
    latex_options = c("scale_down", "hold_position"),
    font_size = 9,
    position = "center",
    full_width = FALSE
  )


term	anaemiaYes
estimate	0.3373673
std.error	0.2049819
statistic	1.64584
p.value	0.0997968

# Cox model with diabetes as the only covariate; the tidied coefficient table
# is transposed so each statistic is shown on its own row.
cox_dia <- coxph(s_ob ~ diabetes, data = df)
cox_dia %>%
  tidy() %>%
  t() %>%
  kable() %>%
  kable_styling(
    bootstrap_options = c("striped", "condensed", "hover"),
    latex_options = c("scale_down", "hold_position"),
    font_size = 9,
    position = "center",
    full_width = FALSE
  )


term	diabetesYes
estimate	-0.04184191
std.error	0.2072847
statistic	-0.2018572
p.value	0.8400284