library(haven)
library(survival)
library(car)
library(survey)
library(survminer)
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)
dat <- read_dta ("C:/Users/nahin/Google Drive/MSc Demography/Fall 2020/Event History/Week3/BDKR72FL.dta")

Event - Infant mortality

Time variable: if the child is alive during interviw then the age at death is censored.

Censoring indicator: Child is alive by 1 is censored and dead by 1 is the event.

dat$death.age<-ifelse(dat$b5==1,
                          ((((dat$v008))+1900)-(((dat$b3))+1900)) 
                          ,dat$b7)

#censoring indicator for death by age 1, in months (12 months)
dat$d.event<-ifelse(is.na(dat$b7)==T|dat$b7>12,0,1)
dat$d.eventfac<-factor(dat$d.event); levels(dat$d.eventfac)<-c("Alive at 1", "Dead by 1")
table(dat$d.eventfac)
## 
## Alive at 1  Dead by 1 
##       7592        294

Estimating survival time functions

#Here we see the data
head(dat[,c("death.age","d.event")], n=20)
## # A tibble: 20 x 2
##    death.age d.event
##        <dbl>   <dbl>
##  1        13       0
##  2         0       1
##  3         0       1
##  4        47       0
##  5        23       0
##  6        11       0
##  7        51       0
##  8         5       0
##  9        37       0
## 10        30       0
## 11         4       0
## 12        44       0
## 13        45       0
## 14         2       0
## 15        55       0
## 16        11       0
## 17         6       0
## 18        35       0
## 19         6       0
## 20        54       0
#The Surv() object
head(Surv(dat$death.age, dat$d.event), n=20)
##  [1] 13+  0   0  47+ 23+ 11+ 51+  5+ 37+ 30+  4+ 44+ 45+  2+ 55+ 11+  6+ 35+  6+
## [20] 54+

In the first 20 cases from the data, we can see that except two all the deaths are censored. censored cases are shown with a ‘+’ sign.

mort<-survfit(Surv(death.age, d.event)~1, data=dat,conf.type="none")
plot(mort, ylim=c(.9,1), xlim=c(0,12), main="Survival Function for Infant Mortality")

summary(mort)
## Call: survfit(formula = Surv(death.age, d.event) ~ 1, data = dat, conf.type = "none")
## 
##  time n.risk n.event survival std.err
##     0   7886     222    0.972 0.00186
##     1   7594      18    0.970 0.00194
##     2   7465      12    0.968 0.00198
##     3   7337      11    0.967 0.00203
##     4   7217       6    0.966 0.00205
##     5   7102       5    0.965 0.00207
##     6   6972       4    0.964 0.00209
##     7   6836       5    0.964 0.00211
##     8   6701       1    0.964 0.00212
##     9   6556       2    0.963 0.00213
##    10   6418       1    0.963 0.00213
##    11   6271       3    0.963 0.00215
##    12   6134       4    0.962 0.00217

This is the so-called Kaplan-Meier estimate of the survival function. We can see that most of the children died between 0 to 1 month. And the number of survived increased after 7 month of age.

Estimating the hazard function using the Kaplan-Meier method

library(muhaz)
haz<-kphaz.fit(time=dat$death.age, status=dat$d.event, method = "product-limit")
kphaz.plot(haz, main="Hazard function plot")

data.frame(haz)
##    time          haz          var
## 1   0.5 0.0023911261 3.176450e-07
## 2   1.5 0.0016242218 2.198467e-07
## 3   2.5 0.0015135495 2.082626e-07
## 4   3.5 0.0008380988 1.170700e-07
## 5   4.5 0.0007111660 1.011542e-07
## 6   5.5 0.0005801322 8.414165e-08
## 7   6.5 0.0007386055 1.091106e-07
## 8   7.5 0.0001522186 2.317050e-08
## 9   8.5 0.0003090431 4.775683e-08
## 10  9.5 0.0001579405 2.494519e-08
## 11 10.5 0.0004866885 7.895599e-08
## 12 11.5 0.0006622013 1.096374e-07

The hazard plot indicates that deaths increased in the first month and gradually decreased after 2nd month. As we saw that the survival rate increased since the months of 7, where the hazard rate decreases at the lowest on that point.

Cumulative hazard

plot(cumsum(haz$haz)~haz$time, 
     main = "Cumulative Hazard function",
     ylab="H(t)",xlab="Time in Months", 
     type="l",xlim=c(0,12), lwd=2,col=3)

Here we calculated the censored ages at death for children under age 1.

Kaplan-Meier survival analysis of the outcome

Grouping variable: place of birth of the infants. Research question: Based on the place of birth what would be chances of survival of the infants in Bangladesh,2017? HO: rural and urban survival of the infants are not different. H1: rural and urban survival of the infants survival of the infants are different. Comparison of Kaplan-Meier survival across grouping variable in your data. Interpret your results.

table(dat$v025)
## 
##    1    2 
## 2488 5398
# dat%>%
#   mutate(rural = Recode(dat$V025, recodes ="2 = '0rural'; 1='1urban'; else=NA", as.factor = T))

#dat$rural <- Recode(dat$v025, recodes ="2='Rural'; 1='Urban'", as.factor=F)

dat$rural <- ifelse(dat$v025==1,1,0)

fit2<-survfit(Surv(death.age, d.event)~rural, data=dat, conf.type = "log")
fit2
## Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat, 
##     conf.type = "log")
## 
##            n events median 0.95LCL 0.95UCL
## rural=0 5398    207     NA      NA      NA
## rural=1 2488     87     NA      NA      NA
summary(fit2)
## Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat, 
##     conf.type = "log")
## 
##                 rural=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0   5398     158    0.971 0.00229        0.966        0.975
##     1   5189      12    0.968 0.00238        0.964        0.973
##     2   5100      10    0.967 0.00245        0.962        0.971
##     3   5002       7    0.965 0.00250        0.960        0.970
##     4   4928       4    0.964 0.00253        0.960        0.969
##     5   4852       3    0.964 0.00255        0.959        0.969
##     6   4770       3    0.963 0.00257        0.958        0.968
##     7   4674       2    0.963 0.00259        0.958        0.968
##     8   4590       1    0.963 0.00259        0.958        0.968
##     9   4500       1    0.962 0.00260        0.957        0.968
##    10   4407       1    0.962 0.00261        0.957        0.967
##    11   4309       2    0.962 0.00263        0.957        0.967
##    12   4208       3    0.961 0.00266        0.956        0.966
## 
##                 rural=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0   2488      64    0.974 0.00317        0.968        0.981
##     1   2405       6    0.972 0.00332        0.965        0.978
##     2   2365       2    0.971 0.00337        0.964        0.978
##     3   2335       4    0.969 0.00346        0.963        0.976
##     4   2289       2    0.969 0.00351        0.962        0.975
##     5   2250       2    0.968 0.00356        0.961        0.975
##     6   2202       1    0.967 0.00358        0.960        0.974
##     7   2162       3    0.966 0.00366        0.959        0.973
##     9   2056       1    0.965 0.00369        0.958        0.973
##    11   1962       1    0.965 0.00372        0.958        0.972
##    12   1926       1    0.964 0.00375        0.957        0.972
ggsurvplot(fit2, xlim=c(0,12), ylim=c(.9, 1), conf.int=T, title="Survival Function for Infant mortality - Rural vs Urban")

prop.table(table(dat$d.event, dat$rural), margin = 2)
##    
##              0          1
##   0 0.96165246 0.96503215
##   1 0.03834754 0.03496785
chisq.test(table(dat$d.event, dat$rural))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(dat$d.event, dat$rural)
## X-squared = 0.45191, df = 1, p-value = 0.5014
survdiff(Surv(death.age, d.event)~rural, data=dat)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ rural, data = dat)
## 
##            N Observed Expected (O-E)^2/E (O-E)^2/V
## rural=0 5398      207    201.2     0.169     0.546
## rural=1 2488       87     92.8     0.366     0.546
## 
##  Chisq= 0.5  on 1 degrees of freedom, p= 0.5

We can not reject null hypothesis that rural and urban child survivorship is different because the p value is not less than .05.

Plot the hazard function

haz2<-kphaz.fit(dat$death.age, dat$d.event, dat$rural)
haz2
## $time
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5  0.5  1.5  2.5
## [16]  3.5  4.5  5.5  6.5  8.0 10.0 11.5
## 
## $haz
##  [1] 0.0023331448 0.0019833940 0.0014126992 0.0008196832 0.0006258291
##  [6] 0.0006377360 0.0004311287 0.0002218279 0.0002268088 0.0002297266
## [11] 0.0004724279 0.0007227172 0.0025152256 0.0008523472 0.0017277384
## [16] 0.0008768091 0.0008930640 0.0004551661 0.0014037122 0.0002437835
## [21] 0.0002590674 0.0005288207
## 
## $var
##  [1] 4.536431e-07 3.933959e-07 2.851079e-07 1.679724e-07 1.305547e-07
##  [6] 1.355746e-07 9.293625e-08 4.920760e-08 5.144223e-08 5.277432e-08
## [11] 1.115954e-07 1.741208e-07 1.054406e-06 3.632539e-07 7.462972e-07
## [16] 3.843974e-07 3.987850e-07 2.071762e-07 6.568468e-07 5.943040e-08
## [21] 6.711590e-08 2.796514e-07
## 
## $strata
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
plot(y=haz2$haz[1:12], x=haz2$time[1:12], col=1, lty=1, type="s")
lines(y=haz2$haz[13:24], x=haz2$time[13:24], col=2, lty=1, type="s")

Interpretation: Rural infant death is higher at the age of 0-1 months. The striking difference between urbna and rural is seen at the age of 7-8 months otherwise they more or less followed the similar pattern.