DEM 7223 - Event History Analysis - Kaplan-Meier survival

Published

September 12, 2022

library(haven)
library(survival)
library(car)

Loading required package: carData

library(survey)

Loading required package: grid

Loading required package: Matrix


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart

library(survminer)

Loading required package: ggplot2

Loading required package: ggpubr


Attaching package: 'survminer'

The following object is masked from 'package:survival':

    myeloma

library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)


Attaching package: 'dplyr'

The following object is masked from 'package:car':

    recode

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(tidycensus)
library(ipumsr)
library(questionr)
library(forcats)
library(srvyr)


Attaching package: 'srvyr'

The following object is masked from 'package:stats':

    filter

library(caret)

Loading required package: lattice


Attaching package: 'caret'

The following object is masked from 'package:survival':

    cluster

library(muhaz)

For this assignment, I am using Demographic and Health Survey (DHS) Nepal, 2011 data.

Event: Infant mortality

Time variable: If the child is alive during interview then the age at death is censored.

Censoring indicator: Child is alive by 1 is censored and dead by 1 is the event.

dat<- read_dta("/Users/jyotinepal/Downloads/NPKR61DT/NPKR61FL.DTA")

dat$death.age<-ifelse(dat$b5==1,
                          ((((dat$v008))+1900)-(((dat$b3))+1900)) 
                          ,dat$b7)

#censoring indicator for death by age 1, in months (12 months)
dat$d.event<-ifelse(is.na(dat$b7)==T|dat$b7>12,0,1)
dat$d.eventfac<-factor(dat$d.event); levels(dat$d.eventfac)<-c("Alive at 1", "Dead by 1")
table(dat$d.eventfac)


Alive at 1  Dead by 1 
      5067        239

Estimating Survival Time Functions

#Here we see the data
head(dat[,c("death.age","d.event")], n=20)

# A tibble: 20 × 2
   death.age d.event
       <dbl>   <dbl>
 1        30       0
 2         3       0
 3         1       0
 4        20       0
 5        37       0
 6        19       0
 7        16       0
 8        18       0
 9        32       0
10        44       0
11        43       0
12        49       0
13        27       0
14        40       0
15        26       0
16        58       0
17         4       0
18        46       0
19        50       0
20         5       0

#The Surv() object
head(Surv(dat$death.age, dat$d.event), n=20)

 [1] 30+  3+  1+ 20+ 37+ 19+ 16+ 18+ 32+ 44+ 43+ 49+ 27+ 40+ 26+ 58+  4+ 46+ 50+
[20]  5+

mort<-survfit(Surv(death.age, d.event)~1, data=dat,conf.type="none")
plot(mort, ylim=c(.9,1), xlim=c(0,12), main="Survival Function for Infant Mortality")

summary(mort)

Call: survfit(formula = Surv(death.age, d.event) ~ 1, data = dat, conf.type = "none")

 time n.risk n.event survival std.err
    0   5306     164    0.969 0.00238
    1   5097      19    0.965 0.00251
    2   4997      11    0.963 0.00258
    3   4894       8    0.962 0.00264
    4   4786      10    0.960 0.00271
    5   4687       6    0.959 0.00275
    6   4589       2    0.958 0.00276
    7   4507       4    0.957 0.00279
    8   4414       3    0.957 0.00282
    9   4302       4    0.956 0.00285
   10   4210       3    0.955 0.00288
   11   4128       3    0.954 0.00290
   12   4059       2    0.954 0.00292

Most of the children died between 0 to 1 month; the number of survival increased after five months.

Estimating the hazard function using the Kaplan-Meier method

haz<-kphaz.fit(time=dat$death.age, status=dat$d.event, method = "product-limit")
kphaz.plot(haz, main="Hazard Function Plot")

data.frame(haz)

   time          haz          var
1   0.5 0.0037747355 7.499564e-07
2   1.5 0.0022187469 4.475407e-07
3   2.5 0.0016551719 3.424698e-07
4   3.5 0.0021147384 4.472385e-07
5   4.5 0.0012926908 2.785194e-07
6   5.5 0.0004387909 9.626912e-08
7   6.5 0.0008996092 2.023298e-07
8   7.5 0.0006924995 1.598531e-07
9   8.5 0.0009418839 2.217962e-07
10  9.5 0.0007226365 1.740693e-07
11 10.5 0.0007349784 1.800680e-07
12 11.5 0.0004958583 1.229404e-07

Cumulative Hazard Function

plot(cumsum(haz$haz)~haz$time, 
     main = "Cumulative Hazard function",
     ylab="H(t)",xlab="Time in Months", 
     type="l",xlim=c(0,12), lwd=2,col=3)

This figure indicates censored ages at death for children under age 1.

Kaplan-Meier survival analysis of the outcome

The grouping variable is place of birth (Urban and Rural) which is a categorical in nature. My research question is: What would be chances of survival of the infants in Nepal based on the place of birth (urban and rural area)? My null hypothesis (H0) states that infant survivorship based on rural and urban area of Nepal are not different. My H1 states that infant survivorship based on rural and urban area of Nepal are different.

table(dat$v025)


   1    2 
1091 4215

# dat%>%
#   mutate(rural = Recode(dat$V025, recodes ="2 = '0rural'; 1='1urban'; else=NA", as.factor = T))

#dat$rural <- Recode(dat$v025, recodes ="2='Rural'; 1='Urban'", as.factor=F)

dat$rural <- ifelse(dat$v025==1,1,0)

fit2<-survfit(Surv(death.age, d.event)~rural, data=dat, conf.type = "log")
fit2

Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat, 
    conf.type = "log")

           n events median 0.95LCL 0.95UCL
rural=0 4215    199     NA      NA      NA
rural=1 1091     40     NA      NA      NA

summary(fit2)

Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat, 
    conf.type = "log")

                rural=0 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0   4215     139    0.967 0.00275        0.962        0.972
    1   4041      16    0.963 0.00290        0.958        0.969
    2   3961       9    0.961 0.00299        0.955        0.967
    3   3879       7    0.959 0.00305        0.953        0.965
    4   3791       6    0.958 0.00311        0.952        0.964
    5   3714       4    0.957 0.00315        0.951        0.963
    6   3633       1    0.956 0.00316        0.950        0.963
    7   3568       3    0.956 0.00319        0.949        0.962
    8   3492       2    0.955 0.00321        0.949        0.961
    9   3405       4    0.954 0.00326        0.948        0.960
   10   3330       3    0.953 0.00329        0.947        0.960
   11   3260       3    0.952 0.00333        0.946        0.959
   12   3205       2    0.952 0.00335        0.945        0.958

                rural=1 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0   1091      25    0.977 0.00453        0.968        0.986
    1   1056       3    0.974 0.00479        0.965        0.984
    2   1036       2    0.972 0.00496        0.963        0.982
    3   1015       1    0.971 0.00505        0.962        0.981
    4    995       4    0.968 0.00539        0.957        0.978
    5    973       2    0.966 0.00556        0.955        0.977
    6    956       1    0.965 0.00565        0.954        0.976
    7    939       1    0.964 0.00574        0.952        0.975
    8    922       1    0.962 0.00582        0.951        0.974

ggsurvplot(fit2, xlim=c(0,12), ylim=c(.9, 1), conf.int=T, title="Survival Function for Infant mortality - Rural vs Urban")

prop.table(table(dat$d.event, dat$rural), margin = 2)

   
             0          1
  0 0.95278766 0.96333639
  1 0.04721234 0.03666361

chisq.test(table(dat$d.event, dat$rural))


    Pearson's Chi-squared test with Yates' continuity correction

data:  table(dat$d.event, dat$rural)
X-squared = 2.0035, df = 1, p-value = 0.1569

survdiff(Surv(death.age, d.event)~rural, data=dat)

Call:
survdiff(formula = Surv(death.age, d.event) ~ rural, data = dat)

           N Observed Expected (O-E)^2/E (O-E)^2/V
rural=0 4215      199    189.7     0.457      2.26
rural=1 1091       40     49.3     1.758      2.26

 Chisq= 2.3  on 1 degrees of freedom, p= 0.1

We fail to reject null hypothesis as P value is greater than .05, indicating that children born at urban and rural parts of Nepal have similar survivor-ship.

Plot the hazard function

haz2<-kphaz.fit(dat$death.age, dat$d.event, dat$rural)
haz2

$time
 [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5  0.5  1.5  2.5
[16]  3.5  4.5  5.5  6.5  7.5

$haz
 [1] 0.0040133440 0.0022903931 0.0018288256 0.0015959140 0.0010879093
 [6] 0.0002767017 0.0008529513 0.0005847134 0.0011899079 0.0009144571
[11] 0.0009308261 0.0006279556 0.0028563755 0.0019426911 0.0009930487
[16] 0.0040973206 0.0020779361 0.0010515247 0.0010775862 0.0010989011

$var
 [1] 1.006711e-06 5.828926e-07 4.778333e-07 4.245166e-07 2.959046e-07
 [6] 7.656384e-08 2.425179e-07 1.709460e-07 3.539854e-07 2.787466e-07
[11] 2.888174e-07 1.971679e-07 2.719760e-06 1.887025e-06 9.861456e-07
[16] 4.197019e-06 2.158924e-06 1.105704e-06 1.161192e-06 1.207584e-06

$strata
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1

plot(y=haz2$haz[1:12], x=haz2$time[1:12], col=1, lty=1, type="s")
lines(y=haz2$haz[13:24], x=haz2$time[13:24], col=2, lty=1, type="s")

Hence, it is evident that rural infant death is higher at the age of 0-1 months, however, the noticeable difference is prevalent at the age of 3-5 months.