#install.packages("survminer")
# install.packages("survival")
library(survival)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Import the ovarian cancer dataset

data(ovarian)
glimpse(ovarian)

## Rows: 26
## Columns: 6
## $ futime   <dbl> 59, 115, 156, 421, 431, 448, 464, 475, 477, 563, 638, 744,...
## $ fustat   <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ age      <dbl> 72.3315, 74.4932, 66.4658, 53.3644, 50.3397, 56.4301, 56.9...
## $ resid.ds <dbl> 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2...
## $ rx       <dbl> 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2...
## $ ecog.ps  <dbl> 1, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1...

Read and attach data

help(ovarian)

## starting httpd help server ... done

Ovarian Cancer Survival Data

Description

Survival in a randomised trial comparing two treatments for ovarian cancer

Format

futime: survival or censoring time

fustat: censoring status

age: in years

resid.ds: residual disease present (1=no,2=yes)

rx: treatment group

ecog.ps: ECOG performance #### status (1 is better, see reference)

change age to groups.

first we check how many groups and at what should be the cut off value

hist(ovarian$age)

so it seems bimodal distribution,therefore dividing age to 2 groups

ovarian <- ovarian %>% mutate(age_group = ifelse(age >=50, "above50", "below50"))

head(ovarian$age_group)

## [1] "above50" "above50" "above50" "above50" "above50" "above50"

ovarian$age_group <- factor(ovarian$age_group)
head(ovarian$age_group)

## [1] above50 above50 above50 above50 above50 above50
## Levels: above50 below50

changing data labels for the following 3 features

rx : it contains 2 groups of patients

ecog: performance status where 1 is good and 2 is

bad

resid : residual disease present (1=no,2=yes)

ovarian$rx <- factor(ovarian$rx, 
                     levels = c("1", "2"), 
                     labels = c("treatment-A", "treatment-B"))
ovarian$resid.ds <- factor(ovarian$resid.ds, 
                           levels = c("1", "2"), 
                           labels = c("no_residual", "yes_residual"))
ovarian$ecog.ps <- factor(ovarian$ecog.ps, 
                          levels = c("1", "2"), 
                          labels = c("good", "bad"))

hypothetical censoring variable would = 1 - fustat

glimpse(ovarian)

## Rows: 26
## Columns: 7
## $ futime    <dbl> 59, 115, 156, 421, 431, 448, 464, 475, 477, 563, 638, 744...
## $ fustat    <dbl> 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ age       <dbl> 72.3315, 74.4932, 66.4658, 53.3644, 50.3397, 56.4301, 56....
## $ resid.ds  <fct> yes_residual, yes_residual, yes_residual, yes_residual, y...
## $ rx        <fct> treatment-A, treatment-A, treatment-A, treatment-B, treat...
## $ ecog.ps   <fct> good, good, bad, good, good, bad, bad, bad, good, bad, ba...
## $ age_group <fct> above50, above50, above50, above50, above50, above50, abo...

Defining Variables

time<-ovarian$futime
#event<-ovarian$resid.ds
event<-(1-ovarian$fustat)
group1<-ovarian$age_group
group2<-ovarian$rx

x=cbind(ovarian$age,ovarian$rx)

Non Parametric Survival Analysis

Steps taken for Non Parametric Survival Analysis

1-Sort the observations based on duration from smallest to largest t1 =<t2 =<t3…..=<tn

2-For each duration, determine the number of observations at risk n_j (those still in the sample),d_j (the number of events) and m_j (the number of censored observations) .

3-Calculate the hazard function as the number of events as a proportion of the number of observations at risk

Hazard Function

Hazard Function

Nelson-Aalen estimator of the cumulative hazard function – calculated by summing up hazard functions over time:

Nelson-Aalen estimator of the cumulative hazard function

Kaplan-Meier non-parametric analysis

The Kaplan-Meier estimator of the survival function – take the ratios of those without events over those at risk and multiply that over time

Kaplan-Meier non-parametric analysis

Where n are the total time slots and d is death or event.

Important Facts about Kaplan-Meier survival function.

It is a decreasing step function with a jump at each discrete event time.

Without censoring, the Kaplan-Meier estimator is just the empirical distribution of the data.

kmsurvival <- survfit(Surv(time,event) ~ 1)
summary(kmsurvival)

## Call: survfit(formula = Surv(time, event) ~ 1)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   377     19       1   0.9474  0.0512       0.8521        1.000
##   421     18       1   0.8947  0.0704       0.7669        1.000
##   448     16       1   0.8388  0.0854       0.6871        1.000
##   477     13       1   0.7743  0.1003       0.6007        0.998
##   744     10       1   0.6969  0.1164       0.5024        0.967
##   769      9       1   0.6194  0.1266       0.4150        0.925
##   770      8       1   0.5420  0.1323       0.3359        0.875
##   803      7       1   0.4646  0.1342       0.2637        0.818
##   855      6       1   0.3871  0.1323       0.1982        0.756
##  1040      5       1   0.3097  0.1265       0.1391        0.690
##  1106      4       1   0.2323  0.1162       0.0872        0.619
##  1129      3       1   0.1549  0.1000       0.0437        0.549
##  1206      2       1   0.0774  0.0741       0.0119        0.506
##  1227      1       1   0.0000     NaN           NA           NA

plot(kmsurvival, xlab="Time", ylab="Survival Probability",col=c("blue","red","green"))

Kaplan-Meier non-parametric analysis by group

Here we will check the KM estimation of survival function for group1 and group2

Note :group1=ovarian$age_group and group2=ovarian$rx

kmsurvival1 <- survfit(Surv(time, event) ~ group1)
summary(kmsurvival1)

## Call: survfit(formula = Surv(time, event) ~ group1)
## 
##                 group1=above50 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   377     14       1    0.929  0.0688       0.8030        1.000
##   421     13       1    0.857  0.0935       0.6921        1.000
##   448     11       1    0.779  0.1129       0.5866        1.000
##   477      8       1    0.682  0.1344       0.4633        1.000
##   744      5       1    0.545  0.1626       0.3041        0.978
##   769      4       1    0.409  0.1698       0.1814        0.923
##   770      3       1    0.273  0.1588       0.0871        0.854
##  1129      2       1    0.136  0.1249       0.0227        0.821
##  1227      1       1    0.000     NaN           NA           NA
## 
##                 group1=below50 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   803      5       1      0.8   0.179       0.5161            1
##   855      4       1      0.6   0.219       0.2933            1
##  1040      3       1      0.4   0.219       0.1367            1
##  1106      2       1      0.2   0.179       0.0346            1
##  1206      1       1      0.0     NaN           NA           NA

Plotting the KM non-parametric analysis by group1 curve

plot(kmsurvival1,xlab="Time", ylab="Survival Probability",col=c("blue","red"))

legend("left",
c("Above 50","Below 50"),
fill=c("red","blue"))

## Result of Non parametric Survival Analysis by age groups:

Its clear that the hazard of event (i.e death) is for the age group above 50 than that of below 50

KM estimation of survivalf group2

Note : group2=ovarian$rx

kmsurvival2 <- survfit(Surv(time, event) ~ group2)

plot(kmsurvival2,xlab="Time", ylab="Survival Probability",col=c("blue","red"))

legend("left",
c("A","B"),
fill=c("red","blue"))

### Result of Non parametric Survival Analysis by rx groups #### The above result shows that at some time intervals,group A has more surival prbaibility than group B and also at some other stime intervals its the vice versa situation

Nelson-Aalen non-parametric analysis

nasurvival <- survfit(coxph(Surv(time,event)~1), type="aalen")
summary(nasurvival)

## Call: survfit(formula = coxph(Surv(time, event) ~ 1), type = "aalen")
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   377     19       1   0.9487  0.0499      0.85574        1.000
##   421     18       1   0.8975  0.0687      0.77246        1.000
##   448     16       1   0.8431  0.0833      0.69465        1.000
##   477     13       1   0.7807  0.0978      0.61077        0.998
##   744     10       1   0.7064  0.1132      0.51598        0.967
##   769      9       1   0.6321  0.1233      0.43131        0.926
##   770      8       1   0.5578  0.1292      0.35427        0.878
##   803      7       1   0.4836  0.1316      0.28367        0.824
##   855      6       1   0.4093  0.1306      0.21900        0.765
##  1040      5       1   0.3351  0.1262      0.16019        0.701
##  1106      4       1   0.2610  0.1180      0.10761        0.633
##  1129      3       1   0.1870  0.1050      0.06220        0.562
##  1206      2       1   0.1134  0.0853      0.02598        0.495
##  1227      1       1   0.0417  0.0522      0.00359        0.485

plot(nasurvival, xlab="Time", ylab="Survival Probability")

Resemblance of both Non Parametric Survival Analysis:

Both non parametric survival analysis have almost same results

#Semi Perimetric Model

Cox proportional hazard model (Semi Parametric Model)

note:x=cbind(ovarian$age,ovarian$rx)

Cox semi-parametric analysis model

coxph <- coxph(Surv(time,event) ~ x, method="breslow")
summary(coxph)

## Call:
## coxph(formula = Surv(time, event) ~ x, method = "breslow")
## 
##   n= 26, number of events= 14 
## 
##        coef exp(coef) se(coef)      z Pr(>|z|)
## x1  0.04580   1.04686  0.04454  1.028    0.304
## x2 -0.86227   0.42220  0.73753 -1.169    0.242
## 
##    exp(coef) exp(-coef) lower .95 upper .95
## x1    1.0469     0.9552   0.95934     1.142
## x2    0.4222     2.3685   0.09948     1.792
## 
## Concordance= 0.542  (se = 0.106 )
## Likelihood ratio test= 1.56  on 2 df,   p=0.5
## Wald test            = 1.58  on 2 df,   p=0.5
## Score (logrank) test = 1.6  on 2 df,   p=0.4

Estimation of Paremetric Model

Results of Cox parametric model

We have positive co-efficient for age which shows with increase of age increae the Hazard.

The 1.0469 value of age shows that the unit inrease in the “age” is associated with 4.7% increase in the hazard rate Note : The calculation is shown here =>(1.0469-1 => 0.0469*100 => 4.7% )

Other Perametric Models.

The following are widely used Parametric models.

parametric survival models

1-Exponential parametric model coefficients

exponential <- survreg(Surv(time,event) ~ x, dist="exponential")
summary(exponential)

## 
## Call:
## survreg(formula = Surv(time, event) ~ x, dist = "exponential")
##               Value Std. Error     z       p
## (Intercept)  6.3719     1.6513  3.86 0.00011
## x1           0.0161     0.0349  0.46 0.64529
## x2          -0.1234     0.6000 -0.21 0.83699
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -112.1   Loglik(intercept only)= -112.2
##  Chisq= 0.22 on 2 degrees of freedom, p= 0.9 
## Number of Newton-Raphson Iterations: 5 
## n= 26

2-Weibulll parametric model coefficients

weibull <- survreg(Surv(time,event) ~ x, dist="weibull")
summary(weibull)

## 
## Call:
## survreg(formula = Surv(time, event) ~ x, dist = "weibull")
##               Value Std. Error     z       p
## (Intercept)  7.3442     0.4450 16.50 < 2e-16
## x1          -0.0183     0.0105 -1.73   0.083
## x2           0.2682     0.1736  1.55   0.122
## Log(scale)  -1.3652     0.1918 -7.12 1.1e-12
## 
## Scale= 0.255 
## 
## Weibull distribution
## Loglik(model)= -98.4   Loglik(intercept only)= -99.7
##  Chisq= 2.78 on 2 degrees of freedom, p= 0.25 
## Number of Newton-Raphson Iterations: 7 
## n= 26

3-Loglogistic parametric model

loglogistic <- survreg(Surv(time,event) ~ x, dist="loglogistic")
summary(loglogistic)

## 
## Call:
## survreg(formula = Surv(time, event) ~ x, dist = "loglogistic")
##                Value Std. Error     z       p
## (Intercept)  7.61771    0.47180 16.15 < 2e-16
## x1          -0.02461    0.00928 -2.65   0.008
## x2           0.21703    0.16385  1.32   0.185
## Log(scale)  -1.71981    0.21375 -8.05 8.6e-16
## 
## Scale= 0.179 
## 
## Log logistic distribution
## Loglik(model)= -98.8   Loglik(intercept only)= -101.1
##  Chisq= 4.53 on 2 degrees of freedom, p= 0.1 
## Number of Newton-Raphson Iterations: 6 
## n= 26

Results of above 3 Parametric Models

Only the exponential model is giving result which is consistant with the Kaplan-Meier survival function ,Nelson-Aalen estimator and COX proportional hazard model showing that older age have more hazard rate (having positive co-efficient values)

Survival Analysis of Cancer Patients