DEM 7223 - Event History Analysis - Kaplan-Meier survival

Published

September 12, 2022

library(haven)
library(survival)
library(car)
Loading required package: carData
library(survey)
Loading required package: grid
Loading required package: Matrix

Attaching package: 'survey'
The following object is masked from 'package:graphics':

    dotchart
library(survminer)
Loading required package: ggplot2
Loading required package: ggpubr

Attaching package: 'survminer'
The following object is masked from 'package:survival':

    myeloma
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)

Attaching package: 'dplyr'
The following object is masked from 'package:car':

    recode
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidycensus)
library(ipumsr)
library(questionr)
library(forcats)
library(srvyr)

Attaching package: 'srvyr'
The following object is masked from 'package:stats':

    filter
library(caret)
Loading required package: lattice

Attaching package: 'caret'
The following object is masked from 'package:survival':

    cluster
library(muhaz)

For this assignment, I am using Demographic and Health Survey (DHS) Nepal, 2011 data.

Event: Infant mortality

Time variable: If the child is alive during interview then the age at death is censored.

Censoring indicator: Child is alive by 1 is censored and dead by 1 is the event.

dat<- read_dta("/Users/jyotinepal/Downloads/NPKR61DT/NPKR61FL.DTA")
dat$death.age<-ifelse(dat$b5==1,
                          ((((dat$v008))+1900)-(((dat$b3))+1900)) 
                          ,dat$b7)

#censoring indicator for death by age 1, in months (12 months)
dat$d.event<-ifelse(is.na(dat$b7)==T|dat$b7>12,0,1)
dat$d.eventfac<-factor(dat$d.event); levels(dat$d.eventfac)<-c("Alive at 1", "Dead by 1")
table(dat$d.eventfac)

Alive at 1  Dead by 1 
      5067        239 

Estimating Survival Time Functions

#Here we see the data
head(dat[,c("death.age","d.event")], n=20)
# A tibble: 20 × 2
   death.age d.event
       <dbl>   <dbl>
 1        30       0
 2         3       0
 3         1       0
 4        20       0
 5        37       0
 6        19       0
 7        16       0
 8        18       0
 9        32       0
10        44       0
11        43       0
12        49       0
13        27       0
14        40       0
15        26       0
16        58       0
17         4       0
18        46       0
19        50       0
20         5       0
#The Surv() object
head(Surv(dat$death.age, dat$d.event), n=20)
 [1] 30+  3+  1+ 20+ 37+ 19+ 16+ 18+ 32+ 44+ 43+ 49+ 27+ 40+ 26+ 58+  4+ 46+ 50+
[20]  5+
mort<-survfit(Surv(death.age, d.event)~1, data=dat,conf.type="none")
plot(mort, ylim=c(.9,1), xlim=c(0,12), main="Survival Function for Infant Mortality")

summary(mort)
Call: survfit(formula = Surv(death.age, d.event) ~ 1, data = dat, conf.type = "none")

 time n.risk n.event survival std.err
    0   5306     164    0.969 0.00238
    1   5097      19    0.965 0.00251
    2   4997      11    0.963 0.00258
    3   4894       8    0.962 0.00264
    4   4786      10    0.960 0.00271
    5   4687       6    0.959 0.00275
    6   4589       2    0.958 0.00276
    7   4507       4    0.957 0.00279
    8   4414       3    0.957 0.00282
    9   4302       4    0.956 0.00285
   10   4210       3    0.955 0.00288
   11   4128       3    0.954 0.00290
   12   4059       2    0.954 0.00292

Most of the children died between 0 to 1 month; the number of survival increased after five months.

Estimating the hazard function using the Kaplan-Meier method

haz<-kphaz.fit(time=dat$death.age, status=dat$d.event, method = "product-limit")
kphaz.plot(haz, main="Hazard Function Plot")

data.frame(haz)
   time          haz          var
1   0.5 0.0037747355 7.499564e-07
2   1.5 0.0022187469 4.475407e-07
3   2.5 0.0016551719 3.424698e-07
4   3.5 0.0021147384 4.472385e-07
5   4.5 0.0012926908 2.785194e-07
6   5.5 0.0004387909 9.626912e-08
7   6.5 0.0008996092 2.023298e-07
8   7.5 0.0006924995 1.598531e-07
9   8.5 0.0009418839 2.217962e-07
10  9.5 0.0007226365 1.740693e-07
11 10.5 0.0007349784 1.800680e-07
12 11.5 0.0004958583 1.229404e-07

Cumulative Hazard Function

plot(cumsum(haz$haz)~haz$time, 
     main = "Cumulative Hazard function",
     ylab="H(t)",xlab="Time in Months", 
     type="l",xlim=c(0,12), lwd=2,col=3)

This figure indicates censored ages at death for children under age 1.

Kaplan-Meier survival analysis of the outcome

The grouping variable is place of birth (Urban and Rural) which is a categorical in nature. My research question is: What would be chances of survival of the infants in Nepal based on the place of birth (urban and rural area)? My null hypothesis (H0) states that infant survivorship based on rural and urban area of Nepal are not different. My H1 states that infant survivorship based on rural and urban area of Nepal are different.

table(dat$v025)

   1    2 
1091 4215 
# dat%>%
#   mutate(rural = Recode(dat$V025, recodes ="2 = '0rural'; 1='1urban'; else=NA", as.factor = T))

#dat$rural <- Recode(dat$v025, recodes ="2='Rural'; 1='Urban'", as.factor=F)

dat$rural <- ifelse(dat$v025==1,1,0)

fit2<-survfit(Surv(death.age, d.event)~rural, data=dat, conf.type = "log")
fit2
Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat, 
    conf.type = "log")

           n events median 0.95LCL 0.95UCL
rural=0 4215    199     NA      NA      NA
rural=1 1091     40     NA      NA      NA
summary(fit2)
Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat, 
    conf.type = "log")

                rural=0 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0   4215     139    0.967 0.00275        0.962        0.972
    1   4041      16    0.963 0.00290        0.958        0.969
    2   3961       9    0.961 0.00299        0.955        0.967
    3   3879       7    0.959 0.00305        0.953        0.965
    4   3791       6    0.958 0.00311        0.952        0.964
    5   3714       4    0.957 0.00315        0.951        0.963
    6   3633       1    0.956 0.00316        0.950        0.963
    7   3568       3    0.956 0.00319        0.949        0.962
    8   3492       2    0.955 0.00321        0.949        0.961
    9   3405       4    0.954 0.00326        0.948        0.960
   10   3330       3    0.953 0.00329        0.947        0.960
   11   3260       3    0.952 0.00333        0.946        0.959
   12   3205       2    0.952 0.00335        0.945        0.958

                rural=1 
 time n.risk n.event survival std.err lower 95% CI upper 95% CI
    0   1091      25    0.977 0.00453        0.968        0.986
    1   1056       3    0.974 0.00479        0.965        0.984
    2   1036       2    0.972 0.00496        0.963        0.982
    3   1015       1    0.971 0.00505        0.962        0.981
    4    995       4    0.968 0.00539        0.957        0.978
    5    973       2    0.966 0.00556        0.955        0.977
    6    956       1    0.965 0.00565        0.954        0.976
    7    939       1    0.964 0.00574        0.952        0.975
    8    922       1    0.962 0.00582        0.951        0.974
ggsurvplot(fit2, xlim=c(0,12), ylim=c(.9, 1), conf.int=T, title="Survival Function for Infant mortality - Rural vs Urban")

prop.table(table(dat$d.event, dat$rural), margin = 2)
   
             0          1
  0 0.95278766 0.96333639
  1 0.04721234 0.03666361
chisq.test(table(dat$d.event, dat$rural))

    Pearson's Chi-squared test with Yates' continuity correction

data:  table(dat$d.event, dat$rural)
X-squared = 2.0035, df = 1, p-value = 0.1569
survdiff(Surv(death.age, d.event)~rural, data=dat)
Call:
survdiff(formula = Surv(death.age, d.event) ~ rural, data = dat)

           N Observed Expected (O-E)^2/E (O-E)^2/V
rural=0 4215      199    189.7     0.457      2.26
rural=1 1091       40     49.3     1.758      2.26

 Chisq= 2.3  on 1 degrees of freedom, p= 0.1 

We fail to reject null hypothesis as P value is greater than .05, indicating that children born at urban and rural parts of Nepal have similar survivor-ship.

Plot the hazard function

haz2<-kphaz.fit(dat$death.age, dat$d.event, dat$rural)
haz2
$time
 [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5  0.5  1.5  2.5
[16]  3.5  4.5  5.5  6.5  7.5

$haz
 [1] 0.0040133440 0.0022903931 0.0018288256 0.0015959140 0.0010879093
 [6] 0.0002767017 0.0008529513 0.0005847134 0.0011899079 0.0009144571
[11] 0.0009308261 0.0006279556 0.0028563755 0.0019426911 0.0009930487
[16] 0.0040973206 0.0020779361 0.0010515247 0.0010775862 0.0010989011

$var
 [1] 1.006711e-06 5.828926e-07 4.778333e-07 4.245166e-07 2.959046e-07
 [6] 7.656384e-08 2.425179e-07 1.709460e-07 3.539854e-07 2.787466e-07
[11] 2.888174e-07 1.971679e-07 2.719760e-06 1.887025e-06 9.861456e-07
[16] 4.197019e-06 2.158924e-06 1.105704e-06 1.161192e-06 1.207584e-06

$strata
 [1] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
plot(y=haz2$haz[1:12], x=haz2$time[1:12], col=1, lty=1, type="s")
lines(y=haz2$haz[13:24], x=haz2$time[13:24], col=2, lty=1, type="s")

Hence, it is evident that rural infant death is higher at the age of 0-1 months, however, the noticeable difference is prevalent at the age of 3-5 months.