Survival Analysis for data nafld

1. Load Package

library(survival)
library(ggplot2)
library(ggpubr)
library(survminer)

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

library(concordance)
options(scipen=999)

2. Load Data

data(nafld)
head(nafld1)

##       id age male weight height      bmi case.id futime status
## 3631   1  57    0   60.0    163 22.69094   10630   6261      0
## 8458   2  67    0   70.4    168 24.88403   14817    624      0
## 6298   3  53    1  105.8    186 30.45354       3   1783      0
## 15398  4  56    1  109.3    170 37.83010    6628   3143      0
## 13261  5  68    1     NA     NA       NA    1871   1836      1
## 14423  6  39    0   63.9    155 26.61559    7144   1581      0

head(nafld2)

##        id days test value
## 135077  1 -459  hdl    75
## 313143  1 -459 chol    75
## 135078  1  183  hdl    64
## 313144  1  183 chol    64
## 135079  1 2030  hdl    74
## 313145  1 2030 chol    74

head(nafld3)

##   id  days        event
## 2  3     0        nafld
## 3  4 -1287          htn
## 4  4 -1287 dyslipidemia
## 5  4 -1226      ang/isc
## 7  4   -90      ang/isc
## 8  5   617          htn

df = nafld1
head(df)

##       id age male weight height      bmi case.id futime status
## 3631   1  57    0   60.0    163 22.69094   10630   6261      0
## 8458   2  67    0   70.4    168 24.88403   14817    624      0
## 6298   3  53    1  105.8    186 30.45354       3   1783      0
## 15398  4  56    1  109.3    170 37.83010    6628   3143      0
## 13261  5  68    1     NA     NA       NA    1871   1836      1
## 14423  6  39    0   63.9    155 26.61559    7144   1581      0

3. Survival Plot

3.1 Membuat Survive Table

Variabel tersensor adalah status. status bernilai 0 artinya dia tersensor.

fit <- survfit(Surv(futime, status)~1, data = df)
surv_table <- data.frame(time=fit$time,
                         n_risk=fit$n.risk,
                         n_event=fit$n.event,
                         n_censor=fit$n.censor,
                         sur_prob=fit$surv)
head(surv_table)

##   time n_risk n_event n_censor sur_prob
## 1    7  17549       0        3 1.000000
## 2    9  17546       0        3 1.000000
## 3   10  17543       1        4 0.999943
## 4   11  17538       1        2 0.999886
## 5   12  17535       0        1 0.999886
## 6   13  17534       0        2 0.999886

3.2 Membuat Kurva Survival

Kurva yang menunjukkan peluang survive di setiap waktunya. Berdasarkan Grafik di bawah ini, dapat dikatakan bahwa peluang survive penderita penyakit paru-paru menurun seiring waktu berjalan.

plot(x=surv_table$time,
     y=surv_table$sur_prob,
     type="l", 
     ylab="survival probability",
     xlab="time")

4. Kurva Kaplan Meyer

Jika variabel yang diteliti bertipe kategori, maka digunakan kurva kaplan Meyer. Fungsinya untuk mengetahui apakah terdapat perbedaan survival diantara kategori tersebut. Dalam penelitian ini, ingin diketahui apakah penyakit paru-paru tergantung dari jenis kelaminnya.

ggsurvplot(fit,
           surv.median.line = "hv",
           ggtheme = theme_bw(),
           palette="blue",
           legend="none",
           xlab = "Time in Days") +
ggtitle("Kaplan Meier Curve nafld")

## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, : Median
## survival not reached.

fit2 <- survfit(Surv(futime,status)~male,data=df)
surv_table <- data.frame(time=fit2$time,
                         n_risk=fit2$n.risk,
                         n_event = fit2$n.event,
                         n_censor = fit2$n.censor,
                         sur_prob=fit2$surv)
head(surv_table)

##   time n_risk n_event n_censor  sur_prob
## 1    7   9348       0        1 1.0000000
## 2    9   9347       0        2 1.0000000
## 3   10   9345       0        2 1.0000000
## 4   11   9343       1        0 0.9998930
## 5   15   9342       2        0 0.9996789
## 6   18   9340       0        1 0.9996789

fit2 <- survfit(Surv(futime, status)~male, data = df)

ggsurvplot(fit2,
           pval = TRUE,
           surv.median.line = "hv",
           ggtheme = theme_bw(),
           palette = c("red","blue"),
           legend="left",
           legend.labs = c("Male", "Female"),
           xlab = "Time in Days") +
  ggtitle("Kaplan Meier Curve Lung Dataset")

## Warning in .add_surv_median(p, fit, type = surv.median.line, fun = fun, : Median
## survival not reached.

5. Regresi Cox Proportional Hazard

Jika kaplan meyer hanya untuk variabel kategorik, maka model regresi cox proportional hazard bisa dari variabel numerik maupun kategorik (Lauaknya regresi linier berganda.).

options(scipen=999)
df$male <- as.factor(df$male) # convert to dummy variable
model_coxph <- coxph (Surv(futime, status) ~ age + male + weight +  height +
                        bmi,data=df)

summary(model_coxph)

## Call:
## coxph(formula = Surv(futime, status) ~ age + male + weight + 
##     height + bmi, data = df)
## 
##   n= 12588, number of events= 1018 
##    (4961 observations deleted due to missingness)
## 
##             coef exp(coef)  se(coef)      z             Pr(>|z|)    
## age     0.098330  1.103327  0.002771 35.483 < 0.0000000000000002 ***
## male1   0.550169  1.733547  0.091513  6.012        0.00000000183 ***
## weight  0.009734  1.009781  0.012206  0.797                0.425    
## height -0.022267  0.977979  0.013629 -1.634                0.102    
## bmi    -0.011134  0.988927  0.033234 -0.335                0.738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##        exp(coef) exp(-coef) lower .95 upper .95
## age       1.1033     0.9063    1.0974     1.109
## male1     1.7335     0.5769    1.4489     2.074
## weight    1.0098     0.9903    0.9859     1.034
## height    0.9780     1.0225    0.9522     1.004
## bmi       0.9889     1.0112    0.9266     1.055
## 
## Concordance= 0.828  (se = 0.007 )
## Likelihood ratio test= 1758  on 5 df,   p=<0.0000000000000002
## Wald test            = 1489  on 5 df,   p=<0.0000000000000002
## Score (logrank) test = 1729  on 5 df,   p=<0.0000000000000002

6. Model Survival Dengan Disribusi Exponensial

options(scipen=999)
df$male <- as.factor(df$male) # convert to dummy variable
model_exp <- survreg(Surv(futime, status) ~ age + male + weight +  height +
                       bmi,dist="exponential",data=df)

summary(model_exp)

## 
## Call:
## survreg(formula = Surv(futime, status) ~ age + male + weight + 
##     height + bmi, data = df, dist = "exponential")
##                Value Std. Error      z                    p
## (Intercept) 12.63031    2.22604   5.67         0.0000000140
## age         -0.09252    0.00265 -34.91 < 0.0000000000000002
## male1       -0.53499    0.09073  -5.90         0.0000000037
## weight      -0.01029    0.01193  -0.86                0.389
## height       0.02419    0.01340   1.81                0.071
## bmi          0.01422    0.03245   0.44                0.661
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -10669.3   Loglik(intercept only)= -11502.9
##  Chisq= 1667.3 on 5 degrees of freedom, p= 0 
## Number of Newton-Raphson Iterations: 7 
## n=12588 (4961 observations deleted due to missingness)

7. Metrik Evaluation

concordance(model_coxph)

## Call:
## concordance.coxph(object = model_coxph)
## 
## n= 12588 
## Concordance= 0.8281 se= 0.007028
## concordant discordant     tied.x     tied.y    tied.xy 
##    6415282    1331979          1        146          0

concordance(model_exp)

## Call:
## concordance.survreg(object = model_exp)
## 
## n= 12588 
## Concordance= 0.8281 se= 0.007029
## concordant discordant     tied.x     tied.y    tied.xy 
##    6415221    1332040          1        146          0