Homework - Survival Analysis
logo

Question: Use the dataframe nafld1 of dataset “ndfld” of the “survival” package:

1. Plot the overall survival curve, estimate median survival time, estimate survival at 1.5 year

2. Plot the survival curves of males and females, test the difference between the curves

3. Examine each of other covariates (age, weight, height, bmi) and check if any covariate significantly affect survival

4. Report and provide rationale for the best multivariate Cox model.

1 Exploratory Data Analysis

1.1 Import data

rm(list = ls()) # Clear all

# Package loading
library(survival)
library(survminer)
library(concordance)
library(dplyr)

df <- nafld1
glimpse(df)
## Rows: 17,549
## Columns: 9
## $ id      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...
## $ age     <dbl> 57, 67, 53, 56, 68, 39, 49, 30, 47, 79, 47, 59, 54, 41, 51,...
## $ male    <dbl> 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,...
## $ weight  <dbl> 60.0, 70.4, 105.8, 109.3, NA, 63.9, 66.2, NA, 110.8, 56.6, ...
## $ height  <dbl> 163, 168, 186, 170, NA, 155, 161, 180, 188, 155, NA, 182, 1...
## $ bmi     <dbl> 22.69094, 24.88403, 30.45354, 37.83010, NA, 26.61559, 25.51...
## $ case.id <int> 10630, 14817, 3, 6628, 1871, 7144, 7507, 13417, 9, 13518, 9...
## $ futime  <dbl> 6261, 624, 1783, 3143, 1836, 1581, 3109, 1339, 1671, 2239, ...
## $ status  <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
  • nafld1 is a data frame with 17,549 observations on the following 10 variables:
  • id- subject identifier
  • age - age at entry to the study
  • male - gender orientation, with 0 = female, 1 = male
  • weight - weight in kg
  • height - height in cm
  • bmi - body mass index
  • case.id - the id of the NAFLD case to whom this subject is matched
  • futime - time to death or last follow-up
  • status - status of patients, with 0 = alive at last follow-up, 1 = dead

1.2 Data summary

summary(df)
##        id             age             male            weight      
##  Min.   :    1   Min.   :18.00   Min.   :0.0000   Min.   : 33.40  
##  1st Qu.: 4393   1st Qu.:42.00   1st Qu.:0.0000   1st Qu.: 70.00  
##  Median : 8786   Median :53.00   Median :0.0000   Median : 83.90  
##  Mean   : 8784   Mean   :52.66   Mean   :0.4673   Mean   : 86.35  
##  3rd Qu.:13175   3rd Qu.:63.00   3rd Qu.:1.0000   3rd Qu.: 99.20  
##  Max.   :17566   Max.   :98.00   Max.   :1.0000   Max.   :181.70  
##                                                   NA's   :4786    
##      height           bmi            case.id          futime    
##  Min.   :123.0   Min.   : 9.207   Min.   :    3   Min.   :   7  
##  1st Qu.:162.0   1st Qu.:25.136   1st Qu.: 4598   1st Qu.:1132  
##  Median :169.0   Median :28.876   Median : 8781   Median :2148  
##  Mean   :169.4   Mean   :30.074   Mean   : 8841   Mean   :2411  
##  3rd Qu.:177.0   3rd Qu.:33.710   3rd Qu.:13249   3rd Qu.:3353  
##  Max.   :215.0   Max.   :84.396   Max.   :17563   Max.   :7268  
##  NA's   :3168    NA's   :4961     NA's   :31                    
##      status       
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.07773  
##  3rd Qu.:0.00000  
##  Max.   :1.00000  
## 

2 Question 1

2.1 Overall survival curve

fit <- survfit(Surv(futime, status)~1, data = df)
surv_table <- data.frame(time = fit$time,
                         n_risk = fit$n.risk,
                         n_event = fit$n.event,
                         n_censor = fit$n.censor,
                         sur_prob = fit$surv)

surv_table
ggsurvplot(
  fit = fit, 
  palette = "LIGHTSALMON", 
  ggtheme = theme_bw(),
  linetype = 1, 
  risk.table = TRUE,
  cumcensor = TRUE,
  cumevents = TRUE,
  tables.height = 0.15
)

2.2 Estimate survival time

# Median survival time
print(fit)
## Call: survfit(formula = Surv(futime, status) ~ 1, data = df)
## 
##       n  events  median 0.95LCL 0.95UCL 
##   17549    1364      NA      NA      NA
# Survival at 1.5 years
summary(fit, times = 548)
## Call: survfit(formula = Surv(futime, status) ~ 1, data = df)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##   548  16305     246    0.986 0.000909        0.984        0.987

3 Question 2

3.1 Test the difference between

fit2 <- survfit(Surv(futime, status) ~ male, data = df)
print(fit2)
## Call: survfit(formula = Surv(futime, status) ~ male, data = df)
## 
##           n events median 0.95LCL 0.95UCL
## male=0 9348    674     NA      NA      NA
## male=1 8201    690     NA      NA      NA
survdiff(Surv(futime, status) ~ male, data = df)
## Call:
## survdiff(formula = Surv(futime, status) ~ male, data = df)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## male=0 9348      674      736      5.20      11.3
## male=1 8201      690      628      6.09      11.3
## 
##  Chisq= 11.3  on 1 degrees of freedom, p= 8e-04

3.2 Survival curves by gender

ggsurvplot(
  fit = fit2, 
  palette = c("LIGHTSALMON", "dodgerblue"), 
  ggtheme = theme_bw(),
  linetype = 1, 
  risk.table = TRUE,
  cumcensor = TRUE,
  cumevents = TRUE,
  tables.height = 0.15
)

4 Question 3 Examine covariates

4.1 Age

model1 <- coxph(Surv(futime, status) ~ age,data=df)
print(model1, digits=3)
## Call:
## coxph(formula = Surv(futime, status) ~ age, data = df)
## 
##        coef exp(coef) se(coef)    z      p
## age 0.09827   1.10326  0.00223 44.1 <2e-16
## 
## Likelihood ratio test=2252  on 1 df, p=<2e-16
## n= 17549, number of events= 1364

4.2 Weight

model2 <- coxph(Surv(futime, status) ~ weight,data=df)
print(model2, digits=3)
## Call:
## coxph(formula = Surv(futime, status) ~ weight, data = df)
## 
##            coef exp(coef) se(coef)    z       p
## weight -0.00546   0.99456  0.00147 -3.7 0.00021
## 
## Likelihood ratio test=14.1  on 1 df, p=0.000173
## n= 12763, number of events= 1046 
##    (4786 observations deleted due to missingness)

4.3 Height

model3 <- coxph(Surv(futime, status) ~ height,data=df)
print(model3, digits=3)
## Call:
## coxph(formula = Surv(futime, status) ~ height, data = df)
## 
##            coef exp(coef) se(coef)     z       p
## height -0.02270   0.97756  0.00294 -7.72 1.2e-14
## 
## Likelihood ratio test=59.8  on 1 df, p=1.05e-14
## n= 14381, number of events= 1129 
##    (3168 observations deleted due to missingness)

4.4 BMI

model4 <- coxph(Surv(futime, status) ~ bmi,data=df)
print(model4, digits=3)
## Call:
## coxph(formula = Surv(futime, status) ~ bmi, data = df)
## 
##         coef exp(coef) se(coef)    z    p
## bmi 0.000518  1.000518 0.004492 0.12 0.91
## 
## Likelihood ratio test=0.01  on 1 df, p=0.908
## n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)

5 Question 4

  • With age: p-value = <2e-16 ==> select
  • With weight: p-value = 0.00021 ==> select
  • With height: p-value = 1.2e-14 ==> select
  • With bmi: p-value = 0.91 ==> non-select

The final model will include gender, age, weight and height

model <- coxph(Surv(futime, status) ~ male + age + weight + height,data=df)
print(model, digits=3)
## Call:
## coxph(formula = Surv(futime, status) ~ male + age + weight + 
##     height, data = df)
## 
##            coef exp(coef) se(coef)     z       p
## male    0.54590   1.72616  0.09039  6.04 1.5e-09
## age     0.09836   1.10336  0.00277 35.52 < 2e-16
## weight  0.00569   1.00570  0.00183  3.10  0.0019
## height -0.01795   0.98221  0.00460 -3.90 9.4e-05
## 
## Likelihood ratio test=1758  on 4 df, p=<2e-16
## n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
 




A work by by Luong Nguyen - Student ID: 17 - 16 June 2020