library(haven)
library(survival)
library(car)
library(survey)
library(survminer)
library(ggplot2)
library(ggpubr)
library(muhaz)
library(car)
library(dplyr)
dat <- read_dta ("C:/Users/nahin/Google Drive/MSc Demography/Fall 2020/Event History/Week3/BDKR72FL.dta")
Event - Infant mortality
Time variable: if the child is alive during interviw then the age at death is censored.
Censoring indicator: Child is alive by 1 is censored and dead by 1 is the event.
dat$death.age<-ifelse(dat$b5==1,
((((dat$v008))+1900)-(((dat$b3))+1900))
,dat$b7)
#censoring indicator for death by age 1, in months (12 months)
dat$d.event<-ifelse(is.na(dat$b7)==T|dat$b7>12,0,1)
dat$d.eventfac<-factor(dat$d.event); levels(dat$d.eventfac)<-c("Alive at 1", "Dead by 1")
table(dat$d.eventfac)
##
## Alive at 1 Dead by 1
## 7592 294
#Here we see the data
head(dat[,c("death.age","d.event")], n=20)
## # A tibble: 20 x 2
## death.age d.event
## <dbl> <dbl>
## 1 13 0
## 2 0 1
## 3 0 1
## 4 47 0
## 5 23 0
## 6 11 0
## 7 51 0
## 8 5 0
## 9 37 0
## 10 30 0
## 11 4 0
## 12 44 0
## 13 45 0
## 14 2 0
## 15 55 0
## 16 11 0
## 17 6 0
## 18 35 0
## 19 6 0
## 20 54 0
#The Surv() object
head(Surv(dat$death.age, dat$d.event), n=20)
## [1] 13+ 0 0 47+ 23+ 11+ 51+ 5+ 37+ 30+ 4+ 44+ 45+ 2+ 55+ 11+ 6+ 35+ 6+
## [20] 54+
In the first 20 cases from the data, we can see that except two all the deaths are censored. censored cases are shown with a ‘+’ sign.
mort<-survfit(Surv(death.age, d.event)~1, data=dat,conf.type="none")
plot(mort, ylim=c(.9,1), xlim=c(0,12), main="Survival Function for Infant Mortality")
summary(mort)
## Call: survfit(formula = Surv(death.age, d.event) ~ 1, data = dat, conf.type = "none")
##
## time n.risk n.event survival std.err
## 0 7886 222 0.972 0.00186
## 1 7594 18 0.970 0.00194
## 2 7465 12 0.968 0.00198
## 3 7337 11 0.967 0.00203
## 4 7217 6 0.966 0.00205
## 5 7102 5 0.965 0.00207
## 6 6972 4 0.964 0.00209
## 7 6836 5 0.964 0.00211
## 8 6701 1 0.964 0.00212
## 9 6556 2 0.963 0.00213
## 10 6418 1 0.963 0.00213
## 11 6271 3 0.963 0.00215
## 12 6134 4 0.962 0.00217
This is the so-called Kaplan-Meier estimate of the survival function. We can see that most of the children died between 0 to 1 month. And the number of survived increased after 7 month of age.
library(muhaz)
haz<-kphaz.fit(time=dat$death.age, status=dat$d.event, method = "product-limit")
kphaz.plot(haz, main="Hazard function plot")
data.frame(haz)
## time haz var
## 1 0.5 0.0023911261 3.176450e-07
## 2 1.5 0.0016242218 2.198467e-07
## 3 2.5 0.0015135495 2.082626e-07
## 4 3.5 0.0008380988 1.170700e-07
## 5 4.5 0.0007111660 1.011542e-07
## 6 5.5 0.0005801322 8.414165e-08
## 7 6.5 0.0007386055 1.091106e-07
## 8 7.5 0.0001522186 2.317050e-08
## 9 8.5 0.0003090431 4.775683e-08
## 10 9.5 0.0001579405 2.494519e-08
## 11 10.5 0.0004866885 7.895599e-08
## 12 11.5 0.0006622013 1.096374e-07
The hazard plot indicates that deaths increased in the first month and gradually decreased after 2nd month. As we saw that the survival rate increased since the months of 7, where the hazard rate decreases at the lowest on that point.
plot(cumsum(haz$haz)~haz$time,
main = "Cumulative Hazard function",
ylab="H(t)",xlab="Time in Months",
type="l",xlim=c(0,12), lwd=2,col=3)
Here we calculated the censored ages at death for children under age 1.
Grouping variable: place of birth of the infants. Research question: Based on the place of birth what would be chances of survival of the infants in Bangladesh,2017? HO: rural and urban survival of the infants are not different. H1: rural and urban survival of the infants survival of the infants are different. Comparison of Kaplan-Meier survival across grouping variable in your data. Interpret your results.
table(dat$v025)
##
## 1 2
## 2488 5398
# dat%>%
# mutate(rural = Recode(dat$V025, recodes ="2 = '0rural'; 1='1urban'; else=NA", as.factor = T))
#dat$rural <- Recode(dat$v025, recodes ="2='Rural'; 1='Urban'", as.factor=F)
dat$rural <- ifelse(dat$v025==1,1,0)
fit2<-survfit(Surv(death.age, d.event)~rural, data=dat, conf.type = "log")
fit2
## Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat,
## conf.type = "log")
##
## n events median 0.95LCL 0.95UCL
## rural=0 5398 207 NA NA NA
## rural=1 2488 87 NA NA NA
summary(fit2)
## Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = dat,
## conf.type = "log")
##
## rural=0
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 5398 158 0.971 0.00229 0.966 0.975
## 1 5189 12 0.968 0.00238 0.964 0.973
## 2 5100 10 0.967 0.00245 0.962 0.971
## 3 5002 7 0.965 0.00250 0.960 0.970
## 4 4928 4 0.964 0.00253 0.960 0.969
## 5 4852 3 0.964 0.00255 0.959 0.969
## 6 4770 3 0.963 0.00257 0.958 0.968
## 7 4674 2 0.963 0.00259 0.958 0.968
## 8 4590 1 0.963 0.00259 0.958 0.968
## 9 4500 1 0.962 0.00260 0.957 0.968
## 10 4407 1 0.962 0.00261 0.957 0.967
## 11 4309 2 0.962 0.00263 0.957 0.967
## 12 4208 3 0.961 0.00266 0.956 0.966
##
## rural=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 2488 64 0.974 0.00317 0.968 0.981
## 1 2405 6 0.972 0.00332 0.965 0.978
## 2 2365 2 0.971 0.00337 0.964 0.978
## 3 2335 4 0.969 0.00346 0.963 0.976
## 4 2289 2 0.969 0.00351 0.962 0.975
## 5 2250 2 0.968 0.00356 0.961 0.975
## 6 2202 1 0.967 0.00358 0.960 0.974
## 7 2162 3 0.966 0.00366 0.959 0.973
## 9 2056 1 0.965 0.00369 0.958 0.973
## 11 1962 1 0.965 0.00372 0.958 0.972
## 12 1926 1 0.964 0.00375 0.957 0.972
ggsurvplot(fit2, xlim=c(0,12), ylim=c(.9, 1), conf.int=T, title="Survival Function for Infant mortality - Rural vs Urban")
prop.table(table(dat$d.event, dat$rural), margin = 2)
##
## 0 1
## 0 0.96165246 0.96503215
## 1 0.03834754 0.03496785
chisq.test(table(dat$d.event, dat$rural))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(dat$d.event, dat$rural)
## X-squared = 0.45191, df = 1, p-value = 0.5014
survdiff(Surv(death.age, d.event)~rural, data=dat)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ rural, data = dat)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## rural=0 5398 207 201.2 0.169 0.546
## rural=1 2488 87 92.8 0.366 0.546
##
## Chisq= 0.5 on 1 degrees of freedom, p= 0.5
We can not reject null hypothesis that rural and urban child survivorship is different because the p value is not less than .05.
haz2<-kphaz.fit(dat$death.age, dat$d.event, dat$rural)
haz2
## $time
## [1] 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 0.5 1.5 2.5
## [16] 3.5 4.5 5.5 6.5 8.0 10.0 11.5
##
## $haz
## [1] 0.0023331448 0.0019833940 0.0014126992 0.0008196832 0.0006258291
## [6] 0.0006377360 0.0004311287 0.0002218279 0.0002268088 0.0002297266
## [11] 0.0004724279 0.0007227172 0.0025152256 0.0008523472 0.0017277384
## [16] 0.0008768091 0.0008930640 0.0004551661 0.0014037122 0.0002437835
## [21] 0.0002590674 0.0005288207
##
## $var
## [1] 4.536431e-07 3.933959e-07 2.851079e-07 1.679724e-07 1.305547e-07
## [6] 1.355746e-07 9.293625e-08 4.920760e-08 5.144223e-08 5.277432e-08
## [11] 1.115954e-07 1.741208e-07 1.054406e-06 3.632539e-07 7.462972e-07
## [16] 3.843974e-07 3.987850e-07 2.071762e-07 6.568468e-07 5.943040e-08
## [21] 6.711590e-08 2.796514e-07
##
## $strata
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
plot(y=haz2$haz[1:12], x=haz2$time[1:12], col=1, lty=1, type="s")
lines(y=haz2$haz[13:24], x=haz2$time[13:24], col=2, lty=1, type="s")
Interpretation: Rural infant death is higher at the age of 0-1 months. The striking difference between urbna and rural is seen at the age of 7-8 months otherwise they more or less followed the similar pattern.