library(foreign)
library(haven)
## Warning: package 'haven' was built under R version 3.4.1
library(survival)
## Warning: package 'survival' was built under R version 3.4.1
library(car)
## Warning: package 'car' was built under R version 3.4.1
library(survey)
## Warning: package 'survey' was built under R version 3.4.1
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(muhaz)
## Warning: package 'muhaz' was built under R version 3.4.1
library(eha)
## Warning: package 'eha' was built under R version 3.4.2
library(survminer)
## Warning: package 'survminer' was built under R version 3.4.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.1
## Loading required package: ggpubr
## Warning: package 'ggpubr' was built under R version 3.4.1
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 3.4.1
dhs_2016<-read.dta("C:/Users/atq765/Google Drive/Dem7283/DHS Resources/ET_2016_DHS_08152017_2025_103714/etir70dt/ETIR70FL.dta", convert.factors = F)

edhs16<-subset(dhs_2016,select=c( "caseid","v445","b0_01", "b1_01", "b2_01", "b3_01", "b4_01", "b5_01","b6_01","b7_01","v002", "v003", "v005", "v007", "v008",
                                  "v011","v012", "v013","v022", "v024", "v025","v102", "v106", "v212","v113", "v115","v116", "v119", "v120", "bidx_01","bidx_02","bidx_03",
                                 "v121", "v122", "v123", "v124", "v125", "v127", "v129", "v153", "v190" , "v201","v211", "v213", "v217", "v302", "v501",
                                  "v511","v525", "v623", "v701","v705", "v717"))

PART A

Event Variable - Under 5 Mortality 1) Under 5 Mortality: Probability of dying between birth and exactly 5 years of age, expressed per 1,000 live births

  1. If the child is dead at the time of interview, then the variable B5_01!=1, then the age at death in months is the variable B7_01.

  2. If the child is alive at the time of interview, then the variable B5_01==1, and the age at death is censored. If the age at death is censored, then the age at the date of interview (censored age at death) is the date of the interview - date of birth (in months).

edhs16<-edhs16[edhs16$v201!=0,]

edhs16$death.age<-ifelse(edhs16$b5_01==1,((((edhs16$v008))+1900)-(((edhs16$b3_01))+1900)),edhs16$b7_01)

#censoring indicator for death by age 5, in months (60 months)

edhs16$d.event<-ifelse(is.na(edhs16$b7_01)==T|edhs16$b7_01>60,0,1)

edhs16$d.eventfac<-factor(edhs16$d.event); levels(edhs16$d.eventfac)<-c("Alive at 1", "Dead by 1")
table(edhs16$d.eventfac)
## 
## Alive at 1  Dead by 1 
##       9741        533
  1. Estimating the Four Functions of Survival time

a) Estimating Survival Time Functions from Ethiopian DHS 2016

#Here we see the data
head(edhs16[,c("death.age","d.event")], n=20)
##    death.age d.event
## 1         31       0
## 3          0       1
## 4         44       0
## 5          7       0
## 6         30       0
## 7         28       0
## 8          1       0
## 9         14       0
## 11       105       0
## 13         1       0
## 14        38       0
## 15         9       1
## 16        92       0
## 18        25       0
## 20        12       0
## 21        22       0
## 23         0       0
## 26         0       1
## 27        19       0
## 28         5       0
#The Surv() object
head(Surv(edhs16$death.age, edhs16$d.event), n=20)
##  [1]  31+   0   44+   7+  30+  28+   1+  14+ 105+   1+  38+   9   92+  25+
## [15]  12+  22+   0+   0   19+   5+
mort<-survfit(Surv(death.age, d.event)~1, data=edhs16,conf.type="none")
plot(mort, ylim=c(.9,1), xlim=c(0,60), main="Survival Function for Under 5 Child Mortality", xlab="Child Age in Months")

summary(mort) 
## Call: survfit(formula = Surv(death.age, d.event) ~ 1, data = edhs16, 
##     conf.type = "none")
## 
##  time n.risk n.event survival std.err
##     0  10274     273    0.973 0.00159
##     1   9903      26    0.971 0.00166
##     2   9689      16    0.969 0.00170
##     3   9503      18    0.967 0.00176
##     4   9300       9    0.966 0.00178
##     5   9119      12    0.965 0.00182
##     6   8917      21    0.963 0.00188
##     7   8695      12    0.962 0.00191
##     8   8496      16    0.960 0.00196
##     9   8311      14    0.958 0.00201
##    10   8117       7    0.957 0.00203
##    11   7972       8    0.956 0.00206
##    12   7830      22    0.954 0.00213
##    13   7587       2    0.953 0.00214
##    14   7377       3    0.953 0.00215
##    15   7189       2    0.953 0.00215
##    16   7026       3    0.952 0.00217
##    17   6852       1    0.952 0.00217
##    18   6689       7    0.951 0.00220
##    19   6538       2    0.951 0.00221
##    20   6395       2    0.951 0.00222
##    23   6030       2    0.950 0.00223
##    24   5916      32    0.945 0.00240
##    36   4476      12    0.943 0.00250
##    48   3563       5    0.941 0.00256
##    60   2845       6    0.939 0.00268
1000*(1-summary(mort)$surv[26])
## [1] 60.61247

The Overall under-five mortality rate is 60.6 deaths per 1000 live births s

b) Estimating Hazard Functions from Ethiopian DHS 2016

library(muhaz)

haz<-kphaz.fit(time=edhs16$death.age, status=edhs16$d.event, method = "product-limit")
kphaz.plot(haz, main="Hazard function plot")

data.frame(haz)
##    time          haz          var
## 1   0.5 0.0026471490 2.695226e-07
## 2   1.5 0.0016672234 1.737320e-07
## 3   2.5 0.0019159952 2.039577e-07
## 4   3.5 0.0009794258 1.065886e-07
## 5   4.5 0.0013300522 1.474248e-07
## 6   5.5 0.0023830811 2.704482e-07
## 7   6.5 0.0013960534 1.624202e-07
## 8   7.5 0.0019068440 2.272614e-07
## 9   8.5 0.0016990693 2.062093e-07
## 10  9.5 0.0008678057 1.075862e-07
## 11 10.5 0.0010124706 1.281397e-07
## 12 11.5 0.0028574081 3.711564e-07
## 13 12.5 0.0002673581 3.574208e-08
## 14 13.5 0.0004103285 5.612792e-08
## 15 14.5 0.0002793132 3.900838e-08
## 16 15.5 0.0004336751 6.269631e-08
## 17 16.5 0.0001470480 2.162312e-08
## 18 17.5 0.0010575851 1.597890e-07
## 19 18.5 0.0003075286 4.728720e-08
## 20 19.5 0.0003160352 4.993984e-08
## 21 21.5 0.0001119138 6.262389e-09
## 22 23.5 0.0054789456 9.381428e-07
## 23 30.0 0.0002268511 4.288603e-09
## 24 42.0 0.0001179906 2.784468e-09
## 25 54.0 0.0001769402 5.218012e-09

This illustrates, that while the largest drop in survivorship occurred between 0 and 5 years, the hazard is actually higher in the 1-12 month and 25-30 months range, illustrating the conditionality of that probability.

c) Estimating Cummulative Hazard Functions from Ethiopian DHS 2016

#cumulative hazard
plot(cumsum(haz$haz)~haz$time, main = "Cumulative Hazard function", ylab="H(t)",xlab="Time in Months", type="l",xlim=c(0,60), lwd=2,col=3)

#Survival function, I just store this in an object so I can use it
surv<-mort

d) Estimating the PDF from Ethiopian DHS 2016

#here is a cheap version of the pdf
ft<- -diff(mort$surv)
plot(ft, xlim=c(.5,59.5), type="s", ylab="f(t)",xlab="Time in Months",main="Probability Density Function")

#here is the cumulative distribution function
Ft<-cumsum(ft)
plot(Ft, xlim=c(0.5,60), type="s", ylab="F(t)",xlab="Time in Months", main="Cumulative Distribution Function")

Hypothesis:

Under-five mortality is higher in rural areas and among poorer and less educated communities. The risk of Under five child mortality also varies from one region to another.

edhs16<-edhs16[edhs16$v201!=0,]
edhs16$age_fb<-recode(edhs16$v212, recodes="0:19='<20'; 20:34='20-34'; 35:49='35-49'; else='NA'", as.factor.result=F)

fit8<-survfit(Surv(death.age, d.event)~age_fb, data=edhs16)
plot(fit8, ylim=c(0.9,1), xlim=c(0,60), col=c(1,3), conf.int=T)
title(main="Survival Function for Child Mortality", sub="Low vs. High SES Households")
legend("topright", legend = c('<20','20-29','30-39','40-49'), col=c(1,4), lty=1)

ggsurvplot(fit8,xlab = "Time in Months", xlim=c(0,60), ylim=c(.9, 1), conf.int = F, censor.shape=120,risk.table = F, title = "Survivorship Function for Child Mortality")

#two group compairison
survdiff(Surv(death.age, d.event)~age_fb, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ age_fb, data = edhs16)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## age_fb=<20   6430      350    335.8     0.602      1.65
## age_fb=20-34 3823      183    196.1     0.876      1.41
## age_fb=35-49   21        0      1.1     1.105      1.12
## 
##  Chisq= 2.6  on 2 degrees of freedom, p= 0.27
edhs16$age<-recode(edhs16$v013, recodes="1='15-19'; 2:3='20-29'; 4:5='30-39'; 6:7='40-49'; else='NA'", as.factor.result=T)

fit6<-survfit(Surv(death.age, d.event)~age, data=edhs16)
plot(fit6, ylim=c(0.9,1), xlim=c(0,60), col=c(1,3), conf.int=T)
title(main="Survival Function for Child Mortality", sub="Low vs. High SES Households")
legend("topright", legend = c('15-19','20-29', '30-39','40-49'), col=c(1,3), lty=1)

ggsurvplot(fit6,xlab = "Time in Months", xlim=c(0,60), ylim=c(.9, 1), conf.int = F, censor.shape=120,risk.table = F, title = "Survivorship Function for Child Mortality")
## Warning: Removed 7 rows containing missing values (geom_path).
## Warning: Removed 6 rows containing missing values (geom_point).

## Warning: Removed 7 rows containing missing values (geom_path).

## Warning: Removed 6 rows containing missing values (geom_point).

#two group compairison
survdiff(Surv(death.age, d.event)~age, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ age, data = edhs16)
## 
##              N Observed Expected (O-E)^2/E (O-E)^2/V
## age=15-19  359       22     15.6      2.65      2.79
## age=20-29 3874      151    188.8      7.55     11.99
## age=30-39 3861      175    203.4      3.97      6.52
## age=40-49 2180      185    125.2     28.51     38.28
## 
##  Chisq= 43.7  on 3 degrees of freedom, p= 1.72e-09
edhs16$mar_stat<-recode(edhs16$v501, recodes='0="1nevermarried"; 1:2="2married"; 3="3widowed"; 4:5="4divorced"; else=NA', as.factor.result = T)

fit5<-survfit(Surv(death.age, d.event)~mar_stat, data=edhs16)
plot(fit5, ylim=c(0.9,1), xlim=c(0,60), col=c(1,3), conf.int=T)
title(main="Survival Function for Child Mortality", sub="Low vs. High SES Households")
legend("topright", legend = c("1nevermarried", "2married", "3widowed", "4divorced"), col=c(1,3), lty=1)

ggsurvplot(fit5,xlab = "Time in Months", xlim=c(0,60), ylim=c(.9, 1), conf.int = F, censor.shape=120,risk.table = F, title = "Survivorship Function for Child Mortality")

#two group compairison
survdiff(Surv(death.age, d.event)~mar_stat, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ mar_stat, data = edhs16)
## 
##                           N Observed Expected (O-E)^2/E (O-E)^2/V
## mar_stat=1nevermarried  120        5     6.62     0.398     0.409
## mar_stat=2married      8839      422   451.50     1.928    12.856
## mar_stat=3widowed       443       37    25.74     4.922     5.255
## mar_stat=4divorced      872       69    49.13     8.038     8.999
## 
##  Chisq= 15.6  on 3 degrees of freedom, p= 0.00139
edhs16$wealth_index<-recode(edhs16$v190, recodes ='1:2 ="Poor"; 3="Middle"; 4:5="Rich"; else=NA')
fit1<-survfit(Surv(death.age, d.event)~wealth_index, data=edhs16)
plot(fit1, ylim=c(0.9,1), xlim=c(0,60), col=c(1,3), conf.int=T)
title(main="Survival Function for Child Mortality", sub="Low vs. High SES Households")
legend("topright", legend = c("Poor","Middle","Rich"), col=c(1,3), lty=1)

ggsurvplot(fit1,xlab = "Time in Months", ylim=c(.9, 1), conf.int = F, censor.shape=120,risk.table = F, title = "Survivorship Function for Child Mortality")

edhs16$partner_educ<-recode(edhs16$v701, recodes="0='0Noeducation'; 1='1Primary'; 2='2Secodary'; 3='3Higher'; else=NA", as.factor.result=T)
fit9<-survfit(Surv(death.age, d.event)~partner_educ, data=edhs16)
plot(fit9, ylim=c(0.9,1), xlim=c(0,60), col=c(1,4), conf.int=T)
title(main="Survival Function for Child Mortality", sub="Low vs. High SES Households")
legend("topright", legend = c('Noeducation','Primary','Secodary','Higher'), col=c(1,3), lty=1)

ggsurvplot(fit9,xlab = "Time in Months", ylim=c(.9, 1), conf.int = F, censor.shape=120,risk.table = F, title = "Survivorship Function for Child Mortality")

#two group compairison
survdiff(Surv(death.age, d.event)~partner_educ, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ partner_educ, data = edhs16)
## 
## n=8762, 1512 observations deleted due to missingness.
## 
##                              N Observed Expected (O-E)^2/E (O-E)^2/V
## partner_educ=0Noeducation 4162      235    198.7   6.64868  12.84932
## partner_educ=1Primary     2774      130    130.9   0.00654   0.00965
## partner_educ=2Secodary    1019       30     49.2   7.50466   8.62544
## partner_educ=3Higher       807       23     39.2   6.69410   7.49044
## 
##  Chisq= 21.1  on 3 degrees of freedom, p= 9.81e-05
edhs16$mother_educ<-recode(edhs16$v106, recodes="0='0Noeducation'; 1='1Primary'; 2:3='2Secodary and Higher'; else=NA", as.factor.result=T)
fit9<-survfit(Surv(death.age, d.event)~mother_educ, data=edhs16)
plot(fit9, ylim=c(0.9,1), xlim=c(0,60), col=c(1,3), conf.int=T)
title(main="Survival Function for Child Mortality", sub="Low vs. High SES Households")
legend("topright", legend = c('Noeducation','Primary','Secodary&Higher'), col=c(1,3), lty=1)

ggsurvplot(fit9,xlab = "Time in Months", ylim=c(.9, 1), conf.int = F, censor.shape=120,risk.table = F, title = "Survivorship Function for Child Mortality")

#two group compairison
survdiff(Surv(death.age, d.event)~mother_educ, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ mother_educ, data = edhs16)
## 
##                                     N Observed Expected (O-E)^2/E
## mother_educ=0Noeducation         6190      370    320.6      7.62
## mother_educ=1Primary             2731      119    141.3      3.51
## mother_educ=2Secodary and Higher 1353       44     71.2     10.38
##                                  (O-E)^2/V
## mother_educ=0Noeducation             19.41
## mother_educ=1Primary                  4.84
## mother_educ=2Secodary and Higher     12.15
## 
##  Chisq= 21.8  on 2 degrees of freedom, p= 1.82e-05
edhs16$mother_occup<-recode(edhs16$v106, recodes="0 ='1Unemployed'; 1:3='2Non-manual/Professional'; 4:9='3Agricultural/Manual(skilled/unskilled)'; else=NA", as.factor.result=T)
fit10<-survfit(Surv(death.age, d.event)~mother_occup, data=edhs16)
plot(fit10, ylim=c(0.9,1), xlim=c(0,60), col=c(1,3), conf.int=T)
title(main="Survival Function for Child Mortality", sub="Low vs. High SES Households")
legend("topright", legend = c('Noeducation','Primary','Secodary','Higher'), col=c(1,3), lty=1)

ggsurvplot(fit10,xlab = "Time in Months", ylim=c(.9, 1), conf.int = F, censor.shape=120,risk.table = F, title = "Survivorship Function for Child Mortality")

#two group compairison
survdiff(Surv(death.age, d.event)~mother_occup, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ mother_occup, data = edhs16)
## 
##                                          N Observed Expected (O-E)^2/E
## mother_occup=1Unemployed              6190      370      321      7.62
## mother_occup=2Non-manual/Professional 4084      163      212     11.50
##                                       (O-E)^2/V
## mother_occup=1Unemployed                   19.4
## mother_occup=2Non-manual/Professional      19.4
## 
##  Chisq= 19.4  on 1 degrees of freedom, p= 1.05e-05
table(edhs16$v025)
## 
##    1    2 
## 2732 7542
edhs16$rural<-recode(edhs16$v025, recodes ="2 = '0rural'; 1='1urban'; else=NA", as.factor.result = T)

fit2<-survfit(Surv(death.age, d.event)~rural, data=edhs16, conf.type = "log")

plot(fit2, xlim=c(0,60), col=c(1,2), conf.int=T)

title(main="Survival Function for Child Mortality", sub="Rural vs Urban Residence")

legend("topright", legend = c("Urban","Rural" ), col=c(1,2), lty=1)
fit2
## Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = edhs16, 
##     conf.type = "log")
## 
##                 n events median 0.95LCL 0.95UCL
## rural=0rural 7542    417     NA      NA      NA
## rural=1urban 2732    116     NA      NA      NA
summary(fit2)
## Call: survfit(formula = Surv(death.age, d.event) ~ rural, data = edhs16, 
##     conf.type = "log")
## 
##                 rural=0rural 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0   7542     219    0.971 0.00193        0.967        0.975
##     1   7236      22    0.968 0.00203        0.964        0.972
##     2   7054      13    0.966 0.00208        0.962        0.970
##     3   6904      14    0.964 0.00214        0.960        0.968
##     4   6753       8    0.963 0.00218        0.959        0.967
##     5   6609       9    0.962 0.00222        0.957        0.966
##     6   6452      15    0.960 0.00229        0.955        0.964
##     7   6275       8    0.958 0.00233        0.954        0.963
##     8   6111      12    0.956 0.00238        0.952        0.961
##     9   5961      10    0.955 0.00243        0.950        0.960
##    10   5820       6    0.954 0.00246        0.949        0.959
##    11   5707       6    0.953 0.00249        0.948        0.958
##    12   5596      19    0.950 0.00259        0.945        0.955
##    13   5399       2    0.949 0.00261        0.944        0.954
##    14   5219       3    0.949 0.00262        0.944        0.954
##    15   5063       1    0.949 0.00263        0.943        0.954
##    16   4938       2    0.948 0.00264        0.943        0.953
##    18   4678       5    0.947 0.00268        0.942        0.952
##    19   4570       1    0.947 0.00269        0.942        0.952
##    20   4461       1    0.947 0.00269        0.941        0.952
##    23   4166       2    0.946 0.00271        0.941        0.952
##    24   4088      22    0.941 0.00291        0.936        0.947
##    36   2934       8    0.939 0.00304        0.933        0.945
##    48   2223       4    0.937 0.00315        0.931        0.943
##    60   1700       5    0.934 0.00337        0.928        0.941
## 
##                 rural=1urban 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0   2732      54    0.980 0.00266        0.975        0.985
##     1   2667       4    0.979 0.00276        0.973        0.984
##     2   2635       3    0.978 0.00283        0.972        0.983
##     3   2599       4    0.976 0.00292        0.970        0.982
##     4   2547       1    0.976 0.00295        0.970        0.982
##     5   2510       3    0.975 0.00302        0.969        0.981
##     6   2465       6    0.972 0.00316        0.966        0.978
##     7   2420       4    0.971 0.00326        0.964        0.977
##     8   2385       4    0.969 0.00335        0.962        0.976
##     9   2350       4    0.967 0.00345        0.961        0.974
##    10   2297       1    0.967 0.00347        0.960        0.974
##    11   2265       2    0.966 0.00352        0.959        0.973
##    12   2234       3    0.965 0.00360        0.958        0.972
##    15   2126       1    0.964 0.00362        0.957        0.971
##    16   2088       1    0.964 0.00365        0.957        0.971
##    17   2048       1    0.963 0.00368        0.956        0.971
##    18   2011       2    0.962 0.00374        0.955        0.970
##    19   1968       1    0.962 0.00377        0.955        0.969
##    20   1934       1    0.961 0.00380        0.954        0.969
##    24   1828      10    0.956 0.00412        0.948        0.964
##    36   1542       4    0.954 0.00430        0.945        0.962
##    48   1340       1    0.953 0.00435        0.944        0.962
##    60   1145       1    0.952 0.00443        0.944        0.961
#Two- sample test
survdiff(Surv(death.age, d.event)~rural, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ rural, data = edhs16)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## rural=0rural 7542      417      385      2.70      9.88
## rural=1urban 2732      116      148      7.01      9.88
## 
##  Chisq= 9.9  on 1 degrees of freedom, p= 0.00167
library(survminer)

ggsurvplot(fit2,conf.int = F, risk.table = F, title = "Survivorship Function for Child Mortality", xlab = "Child Age in Months", xlim = c(0,60), ylim=c(.9, 1))

prop.table(table(edhs16$d.event, edhs16$rural), margin = 2)
##    
##         0rural     1urban
##   0 0.94470963 0.95754026
##   1 0.05529037 0.04245974
chisq.test(table(edhs16$d.event, edhs16$rural))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(edhs16$d.event, edhs16$rural)
## X-squared = 6.454, df = 1, p-value = 0.01107
edhs16$region<- recode(edhs16$v024, recodes='1="Tigray"; 2="Afar"; 3= "Amhara"; 4= "Oromia"; 5="Somali"; 6="Ben-Gumuz"; 7="SNNP"; 8="Gambella"; 9="Harari"; 10="Addis Ababa"; 11="Dire Dawa";else=NA', as.factor.result = F )


fit3<-survfit(Surv(death.age, d.event)~region, data=edhs16, conf.type = "log")

#Two- sample test
survdiff(Surv(death.age, d.event)~region, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ region, data = edhs16)
## 
##                       N Observed Expected (O-E)^2/E (O-E)^2/V
## region=Addis Ababa  760       20     42.1    11.637    12.830
## region=Afar         835       55     41.8     4.139     4.562
## region=Amhara      1160       51     62.0     1.950     2.239
## region=Ben-Gumuz    804       35     41.7     1.063     1.170
## region=Dire Dawa    654       47     34.8     4.238     4.601
## region=Gambella     756       44     39.9     0.419     0.459
## region=Harari       605       24     31.7     1.885     2.034
## region=Oromia      1366       78     69.0     1.182     1.379
## region=SNNP        1225       55     63.8     1.215     1.401
## region=Somali      1002       81     48.4    22.042    24.652
## region=Tigray      1107       43     57.7     3.764     4.284
## 
##  Chisq= 54.4  on 10 degrees of freedom, p= 4.01e-08
library(survminer)

ggsurvplot(fit3,conf.int = F, risk.table = F, title = "Survivorship Function for Child Mortality", xlab = "Time in Months", ylim=c(.9, 1))
## Warning: Removed 86 rows containing missing values (geom_path).
## Warning: Removed 86 rows containing missing values (geom_point).
## Warning: Removed 86 rows containing missing values (geom_path).
## Warning: Removed 86 rows containing missing values (geom_point).

prop.table(table(edhs16$d.event, edhs16$region), margin = 1)
##    
##     Addis Ababa       Afar     Amhara  Ben-Gumuz  Dire Dawa   Gambella
##   0  0.07596756 0.08007391 0.11384868 0.07894467 0.06231393 0.07309311
##   1  0.03752345 0.10318949 0.09568480 0.06566604 0.08818011 0.08255159
##    
##         Harari     Oromia       SNNP     Somali     Tigray
##   0 0.05964480 0.13222462 0.12011087 0.09454881 0.10922903
##   1 0.04502814 0.14634146 0.10318949 0.15196998 0.08067542
chisq.test(table(edhs16$d.event, edhs16$region))
## 
##  Pearson's Chi-squared test
## 
## data:  table(edhs16$d.event, edhs16$region)
## X-squared = 46.673, df = 10, p-value = 1.083e-06

Interpretation of the results

The primary goal of the study was to assess the risk factors of under- five mortality in Ethiopian by applying an Kaplan-Meier non-parametric method to provide valid estimates for correct statistical inference needed for policy decision making.

The overall under-five mortality rate in Ethiopia is found to be 61 deaths per 1000 live births. We found that region has the most significant risk factor for child mortality. Somali had higher risk of dying before age five, followed by children born in Dire Dawa, Ben-Gumuz, Afar, Gambela. Addis Ababa has the lowest under five mortality. This indicates that there is a differential of child mortality among the regions of Ethiopia.

Next, the mothers’ age emerge as a powerful background covariates of under-five mortality in the Ethiopia states. Children from mothers aged (15-19) and (40-49) have a higher risk of dying than mothers’ age (20-39).

Where there is a strong tradition for young women to be married at a very early age in rural parts of the country the relative risk of their under five children is high. Strong policy shall be implemented to protect early marriage in the country.

Mother’s education emerge as powerful background covariates of under-five mortality in the Ethiopia states, for the reason that both are known to be associated with better child care practices. Thus, policy makers shall focus on educating illiterate mothers about the child care and maternal education.

As expected children of mothers from wealthy households are found to be at a lower risk of dying than mothers’ from poor households. This study therefore urges for policy makers improving the status of women through education and creating more employment opportunities, by giving more rural women.

edhs16$secedu<-recode(edhs16$v106, recodes ="2:3 = 1; 0:1=0; else=NA")
table(edhs16$secedu, edhs16$d.eventfac)
##    
##     Alive at 1 Dead by 1
##   0       8432       489
##   1       1309        44
fit4<-survfit(Surv(death.age, d.event)~rural+secedu, data=edhs16)
summary(fit4)
## Call: survfit(formula = Surv(death.age, d.event) ~ rural + secedu, 
##     data = edhs16)
## 
##                 rural=0rural, secedu=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0   7217     210    0.971 0.00198        0.967        0.975
##     1   6925      21    0.968 0.00207        0.964        0.972
##     2   6757      13    0.966 0.00213        0.962        0.970
##     3   6620      14    0.964 0.00220        0.960        0.968
##     4   6477       8    0.963 0.00224        0.958        0.967
##     5   6340       8    0.962 0.00227        0.957        0.966
##     6   6195      15    0.959 0.00235        0.955        0.964
##     7   6027       8    0.958 0.00239        0.953        0.963
##     8   5871      10    0.956 0.00244        0.952        0.961
##     9   5729      10    0.955 0.00249        0.950        0.960
##    10   5596       5    0.954 0.00252        0.949        0.959
##    11   5488       6    0.953 0.00255        0.948        0.958
##    12   5382      19    0.949 0.00265        0.944        0.955
##    13   5191       2    0.949 0.00267        0.944        0.954
##    14   5020       3    0.949 0.00268        0.943        0.954
##    15   4870       1    0.948 0.00269        0.943        0.954
##    16   4753       2    0.948 0.00270        0.943        0.953
##    18   4502       5    0.947 0.00274        0.942        0.952
##    19   4399       1    0.947 0.00275        0.941        0.952
##    20   4293       1    0.946 0.00276        0.941        0.952
##    23   4017       2    0.946 0.00278        0.941        0.951
##    24   3940      22    0.941 0.00298        0.935        0.947
##    36   2838       8    0.938 0.00312        0.932        0.944
##    48   2162       4    0.936 0.00323        0.930        0.943
##    60   1652       5    0.933 0.00346        0.927        0.940
## 
##                 rural=0rural, secedu=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0    325       9    0.972 0.00910        0.955        0.990
##     1    311       1    0.969 0.00959        0.951        0.988
##     5    269       1    0.966 0.01021        0.946        0.986
##     8    240       2    0.958 0.01161        0.935        0.981
##    10    224       1    0.953 0.01232        0.929        0.978
## 
##                 rural=1urban, secedu=0 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0   1704      37    0.978 0.00353        0.971        0.985
##     1   1661       4    0.976 0.00371        0.969        0.983
##     2   1641       2    0.975 0.00380        0.967        0.982
##     3   1620       3    0.973 0.00394        0.965        0.981
##     4   1586       1    0.972 0.00398        0.965        0.980
##     5   1564       2    0.971 0.00407        0.963        0.979
##     6   1539       4    0.969 0.00425        0.960        0.977
##     7   1511       4    0.966 0.00443        0.957        0.975
##     8   1488       4    0.963 0.00460        0.954        0.972
##     9   1467       4    0.961 0.00478        0.951        0.970
##    11   1413       2    0.959 0.00486        0.950        0.969
##    12   1396       2    0.958 0.00495        0.948        0.968
##    16   1312       1    0.957 0.00500        0.948        0.967
##    17   1291       1    0.957 0.00505        0.947        0.967
##    18   1270       2    0.955 0.00516        0.945        0.965
##    19   1247       1    0.954 0.00521        0.944        0.965
##    20   1225       1    0.954 0.00526        0.943        0.964
##    24   1166       6    0.949 0.00560        0.938        0.960
##    36    992       4    0.945 0.00590        0.933        0.956
##    48    869       1    0.944 0.00599        0.932        0.956
## 
##                 rural=1urban, secedu=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     0   1028      17    0.983 0.00398        0.976        0.991
##     2    994       1    0.982 0.00409        0.974        0.991
##     3    979       1    0.981 0.00421        0.973        0.990
##     5    946       1    0.980 0.00433        0.972        0.989
##     6    926       2    0.978 0.00458        0.969        0.987
##    10    866       1    0.977 0.00471        0.968        0.986
##    12    838       1    0.976 0.00484        0.967        0.986
##    15    794       1    0.975 0.00499        0.965        0.985
##    24    662       4    0.969 0.00576        0.958        0.980
##    60    394       1    0.966 0.00625        0.954        0.979
ggsurvplot(fit4,conf.int = F, risk.table = F, title = "Survivorship Function for child Mortality", xlab = "Time in Months", xlim = c(0,60), ylim=c(.9, 1))

plot(fit4, ylim=c(.85,1), xlim=c(0,60), col=c(1,1,2,2),lty=c(1,2,1,2), conf.int=F)
title(main="Survival Function for child Mortality", sub="Rural/Urban * Mother's Education")
legend("topright", legend = c("Rural, Low Edu","Urban, Low Edu", "Rural, High Edu","Urban, High Edu" ), col=c(1,1,2,2),lty=c(1,2,1,2))

# test
survdiff(Surv(death.age, d.event)~rural+secedu, data=edhs16)
## Call:
## survdiff(formula = Surv(death.age, d.event) ~ rural + secedu, 
##     data = edhs16)
## 
##                           N Observed Expected (O-E)^2/E (O-E)^2/V
## rural=0rural, secedu=0 7217      403    369.0     3.139    10.368
## rural=0rural, secedu=1  325       14     15.8     0.202     0.211
## rural=1urban, secedu=0 1704       86     92.9     0.506     0.623
## rural=1urban, secedu=1 1028       30     55.4    11.641    13.185
## 
##  Chisq= 15.7  on 3 degrees of freedom, p= 0.00129

We can also calculate the hazard function for each group using the kphaz.fit function in the muhaz library.

haz2<-kphaz.fit(edhs16$death.age, edhs16$d.event, edhs16$rural)
haz2
## $time
##  [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5
## [15] 14.5 15.5 17.0 18.5 19.5 21.5 23.5 30.0 42.0 54.0  0.5  1.5  2.5  3.5
## [29]  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 13.5 15.5 16.5 17.5 18.5 19.5
## [43] 22.0 30.0 42.0 54.0
## 
## $haz
##  [1] 3.071240e-03 1.864027e-03 2.051801e-03 1.199911e-03 1.379496e-03
##  [6] 2.358900e-03 1.293841e-03 1.991030e-03 1.692530e-03 1.037762e-03
## [11] 1.060348e-03 3.458828e-03 3.765343e-04 5.809352e-04 1.976675e-04
## [16] 4.143857e-04 5.407287e-04 2.194908e-04 2.275831e-04 1.619312e-04
## [21] 5.465815e-03 2.307431e-04 1.509894e-04 2.469013e-04 1.504190e-03
## [26] 1.143160e-03 1.555152e-03 3.966680e-04 1.200656e-03 2.448860e-03
## [31] 1.661832e-03 1.691527e-03 1.715326e-03 4.374453e-04 8.908688e-04
## [36] 1.358936e-03 1.579030e-04 4.803074e-04 4.909180e-04 1.002256e-03
## [41] 5.112474e-04 5.200208e-04 1.378284e-03 2.190115e-04 6.303580e-05
## [46] 7.322789e-05
## 
## $var
##  [1] 4.287669e-07 2.672856e-07 3.007223e-07 1.799790e-07 2.114522e-07
##  [6] 3.709899e-07 2.092653e-07 3.303665e-07 2.864749e-07 1.794967e-07
## [11] 1.873946e-07 6.297257e-07 7.089434e-08 1.125079e-07 3.907245e-08
## [16] 8.585916e-08 5.848014e-08 4.817620e-08 5.179405e-08 1.311093e-08
## [21] 1.358062e-06 6.655660e-09 5.699677e-09 1.219216e-08 5.656494e-07
## [26] 4.356093e-07 6.046610e-07 1.573455e-07 4.805313e-07 9.995054e-07
## [31] 6.904264e-07 7.153229e-07 7.356116e-07 1.913584e-07 3.968237e-07
## [36] 6.155854e-07 2.493337e-08 2.306952e-07 2.410005e-07 5.022585e-07
## [41] 2.613739e-07 2.704216e-07 1.899718e-07 1.199159e-08 3.973513e-09
## [46] 5.362323e-09
## 
## $strata
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2
plot(y=haz2$haz[1:60], x=haz2$time[1:60], col=1, lty=1, type="s")
lines(y=haz2$haz[0:24], x=haz2$time[0:24], col=2, lty=1, type="s")

haz3<-kphaz.fit(edhs16$death.age, edhs16$d.event, edhs16$wealth_index)
haz3
## $time
##  [1]  0.5  1.5  2.5  4.5  6.5  7.5  9.5 11.5 12.5 13.5 15.0 17.0 19.0 22.0
## [15] 30.0  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5
## [29] 13.5 14.5 16.5 18.5 21.0 23.5 30.0 42.0 54.0  0.5  1.5  2.5  3.5  4.5
## [43]  5.5  6.5  7.5  8.5  9.5 10.5 11.5 13.5 15.5 16.5 17.5 18.5 19.5 21.5
## [57] 23.5 30.0 42.0 54.0
## 
## $haz
##  [1] 2.239388e-03 1.515752e-03 2.318394e-03 8.134606e-04 2.505010e-03
##  [6] 1.717590e-03 3.052503e-04 3.791981e-03 9.852217e-04 1.023541e-03
## [11] 5.359057e-04 1.670401e-03 5.834306e-04 2.584993e-03 2.822631e-04
## [16] 3.928962e-03 2.623533e-03 2.439841e-03 1.504130e-03 1.539074e-03
## [21] 2.379323e-03 1.092733e-03 1.680283e-03 1.720435e-03 1.471415e-03
## [26] 1.203384e-03 3.094886e-03 3.170577e-04 6.534894e-04 3.376097e-04
## [31] 1.243317e-04 3.815338e-04 1.052632e-04 5.616394e-03 2.707156e-04
## [36] 2.951872e-04 3.859166e-04 1.443126e-03 7.338104e-04 1.248325e-03
## [41] 7.645332e-04 1.550318e-03 2.370707e-03 1.339856e-03 2.184930e-03
## [46] 2.216435e-03 5.633825e-04 8.600312e-04 2.343843e-03 1.032311e-04
## [51] 6.330561e-04 3.212335e-04 9.800783e-04 3.334445e-04 3.394433e-04
## [56] 1.193033e-04 3.990242e-03 1.826718e-04 4.311088e-05 1.027855e-04
## 
## $var
##  [1] 1.671639e-06 1.148772e-06 1.791650e-06 2.205759e-07 2.091808e-06
##  [6] 1.475155e-06 9.317775e-08 3.595261e-06 9.706617e-07 1.047637e-06
## [11] 2.871949e-07 9.300918e-07 3.403912e-07 8.353318e-07 3.983854e-08
## [16] 9.081016e-07 6.257471e-07 5.953224e-07 3.770816e-07 3.947968e-07
## [21] 6.290882e-07 2.985332e-07 4.705817e-07 4.933409e-07 4.330264e-07
## [26] 3.620379e-07 9.579097e-07 1.005256e-07 2.135247e-07 1.139803e-07
## [31] 1.545838e-08 1.455680e-07 1.108033e-08 2.426656e-06 1.465805e-08
## [36] 2.178432e-08 3.723313e-08 3.471046e-07 1.794962e-07 3.116810e-07
## [41] 1.948388e-07 4.005973e-07 6.244899e-07 3.590484e-07 5.967487e-07
## [46] 6.140899e-07 1.587005e-07 2.465574e-07 6.867354e-07 1.065667e-08
## [51] 2.003941e-07 1.031910e-07 3.201866e-07 1.111852e-07 1.152218e-07
## [56] 1.423327e-08 1.447506e-06 6.673925e-09 1.858548e-09 5.282430e-09
## 
## $strata
##  [1] "Middle" "Middle" "Middle" "Middle" "Middle" "Middle" "Middle"
##  [8] "Middle" "Middle" "Middle" "Middle" "Middle" "Middle" "Middle"
## [15] "Middle" "Poor"   "Poor"   "Poor"   "Poor"   "Poor"   "Poor"  
## [22] "Poor"   "Poor"   "Poor"   "Poor"   "Poor"   "Poor"   "Poor"  
## [29] "Poor"   "Poor"   "Poor"   "Poor"   "Poor"   "Poor"   "Poor"  
## [36] "Poor"   "Poor"   "Rich"   "Rich"   "Rich"   "Rich"   "Rich"  
## [43] "Rich"   "Rich"   "Rich"   "Rich"   "Rich"   "Rich"   "Rich"  
## [50] "Rich"   "Rich"   "Rich"   "Rich"   "Rich"   "Rich"   "Rich"  
## [57] "Rich"   "Rich"   "Rich"   "Rich"
plot(y=haz3$haz[1:60], x=haz3$time[1:60], col=1, lty=1, type="s")
lines(y=haz3$haz[13:24], x=haz3$time[13:24], col=2, lty=1, type="s")

haz4<-kphaz.fit(edhs16$death.age, edhs16$d.event, edhs16$region)
haz4
## $time
##   [1]  0.5  2.0  5.0  7.5  8.5  9.5 11.0 13.5 16.5 21.0 36.0  0.5  2.0  3.5
##  [15]  5.0  6.5  7.5  8.5  9.5 10.5 11.5 13.0 17.0 22.0 30.0 42.0  0.5  1.5
##  [29]  2.5  4.5  6.5  7.5  8.5 10.0 11.5 14.0 20.0 30.0  0.5  1.5  3.0  4.5
##  [43]  5.5  6.5  8.0  9.5 11.0 15.5 21.0 23.5 42.0  0.5  1.5  3.0  4.5  5.5
##  [57]  6.5  7.5  8.5 10.0 12.0 14.5 17.0 21.0 30.0  0.5  1.5  2.5  3.5  4.5
##  [71]  5.5  7.0 11.0 14.5 16.5 21.0 30.0 42.0  0.5  3.0  5.5  6.5  7.5  8.5
##  [85] 16.5 42.0  0.5  1.5  2.5  3.5  5.0  6.5  7.5  8.5 10.5 13.0 16.0 20.5
##  [99] 23.5 30.0 42.0 54.0  0.5  1.5  2.5  3.5  5.0  7.0  8.5 10.5 12.5 15.5
## [113] 19.0 22.0 30.0  1.0  2.5  4.0  5.5  6.5  7.5  9.0 10.5 11.5 15.0 21.0
## [127] 30.0 42.0  1.5  4.0  6.0  7.5  9.5 14.0 17.5 18.5 21.5 30.0 48.0
## 
## $haz
##   [1] 1.342282e-03 6.906077e-04 3.628447e-04 1.472754e-03 1.483680e-03
##   [6] 1.517451e-03 7.824726e-04 5.473454e-04 5.707763e-04 3.092146e-04
##  [11] 9.850276e-05 5.014218e-03 2.008384e-03 1.356852e-03 2.878006e-03
##  [16] 2.965174e-03 1.572327e-03 4.818558e-03 3.278724e-03 1.661130e-03
##  [21] 6.862117e-03 9.074410e-04 3.745318e-04 1.777278e-03 3.063725e-04
##  [26] 3.858025e-04 2.689804e-03 9.074410e-04 9.337068e-04 3.242542e-04
##  [31] 1.975348e-03 1.010101e-03 2.040835e-03 1.588446e-03 1.089325e-03
##  [36] 5.875473e-04 3.344663e-04 1.377410e-04 2.595902e-03 2.691912e-03
##  [41] 6.993007e-04 1.422475e-03 1.440922e-03 1.479290e-03 7.692308e-04
##  [46] 1.552795e-03 1.614231e-03 2.834467e-04 5.307856e-04 6.593808e-03
##  [51] 1.316482e-04 4.769483e-03 3.246762e-03 8.445946e-04 6.826038e-03
##  [56] 5.220548e-03 1.779359e-03 1.798561e-03 1.831502e-03 9.541985e-04
##  [61] 9.803922e-04 6.872852e-04 1.084599e-03 2.036911e-03 7.676769e-04
##  [66] 1.366120e-03 4.207689e-03 1.416431e-03 2.890179e-03 1.455604e-03
##  [71] 1.499250e-03 2.359744e-03 2.888504e-04 1.769912e-03 6.422608e-04
##  [76] 1.805344e-03 4.688464e-04 2.986858e-04 1.730104e-03 9.320061e-04
##  [81] 1.890359e-03 1.980198e-03 3.980103e-03 2.016129e-03 1.851852e-04
##  [86] 1.501502e-04 5.415120e-03 2.394270e-03 8.058018e-04 1.692067e-03
##  [91] 2.681971e-03 9.149131e-04 1.874613e-03 9.496676e-04 1.719340e-03
##  [96] 5.543237e-04 3.037667e-04 2.828854e-04 4.343936e-03 1.637197e-04
## [101] 2.131287e-04 5.473469e-04 3.378404e-03 2.583302e-03 3.541438e-03
## [106] 1.796139e-03 1.410660e-03 9.843292e-04 4.026232e-03 2.153880e-03
## [111] 1.122334e-03 2.496879e-04 6.535948e-04 7.097321e-04 1.633987e-04
## [116] 1.109290e-03 6.830655e-03 1.217308e-03 1.246883e-03 1.322751e-03
## [121] 2.762478e-03 2.217278e-03 3.091197e-03 4.777126e-03 3.346720e-04
## [126] 1.994718e-03 6.172924e-04 3.987241e-04 3.267974e-04 1.023050e-03
## [131] 1.064483e-03 1.094092e-03 3.875969e-04 2.204586e-04 1.367989e-03
## [136] 1.406470e-03 6.250015e-04 1.711157e-04 2.675234e-04
## 
## $var
##   [1] 1.801721e-06 4.769390e-07 1.316563e-07 2.169004e-06 2.201305e-06
##   [6] 2.302657e-06 6.122634e-07 2.995870e-07 3.257855e-07 9.561367e-08
##  [11] 9.702793e-09 6.285741e-06 1.344769e-06 1.841048e-06 2.070952e-06
##  [16] 4.396150e-06 2.472212e-06 7.740414e-06 5.375073e-06 2.759351e-06
##  [21] 1.177396e-05 8.234492e-07 1.402741e-07 1.052921e-06 9.386414e-08
##  [26] 1.488435e-07 2.411704e-06 8.234492e-07 8.718084e-07 1.051408e-07
##  [31] 1.951038e-06 1.020304e-06 2.082524e-06 8.410667e-07 1.186628e-06
##  [36] 1.726069e-07 5.593687e-08 1.897260e-08 3.369593e-06 3.623359e-06
##  [41] 4.890215e-07 2.023435e-06 2.076257e-06 2.188299e-06 5.917160e-07
##  [46] 2.411172e-06 1.302893e-06 8.034204e-08 2.817333e-07 1.449365e-05
##  [51] 1.733126e-08 7.582670e-06 5.270745e-06 7.133400e-07 1.164887e-05
##  [56] 9.084933e-06 3.166120e-06 3.234822e-06 3.354399e-06 9.104947e-07
##  [61] 9.611688e-07 4.723610e-07 1.176354e-06 8.299722e-07 1.964480e-07
##  [66] 1.866284e-06 5.901712e-06 2.006276e-06 4.176577e-06 2.118783e-06
##  [71] 2.247752e-06 1.856144e-06 8.343454e-08 3.132587e-06 4.124989e-07
##  [76] 6.518639e-07 1.099139e-07 8.921320e-08 2.993259e-06 4.343362e-07
##  [81] 3.573458e-06 3.921184e-06 7.920620e-06 4.064776e-06 3.429355e-08
##  [86] 2.254507e-08 4.189197e-06 1.910856e-06 6.493165e-07 1.431561e-06
##  [91] 1.198919e-06 8.370659e-07 1.757275e-06 9.018686e-07 5.912478e-07
##  [96] 3.072748e-07 9.227421e-08 8.002417e-08 6.290371e-06 2.680414e-08
## [101] 4.542386e-08 1.497947e-07 2.853424e-06 2.224538e-06 3.135488e-06
## [106] 1.613057e-06 6.633217e-07 4.844900e-07 4.052698e-06 7.732914e-07
## [111] 1.259635e-06 6.234404e-08 4.271861e-07 2.518630e-07 2.669913e-08
## [116] 6.152775e-07 7.777255e-06 7.409323e-07 1.554717e-06 1.749671e-06
## [121] 3.815709e-06 1.639170e-06 4.777762e-06 7.607069e-06 1.120054e-07
## [126] 7.958373e-07 1.905276e-07 1.589809e-07 1.067965e-07 5.233325e-07
## [131] 5.666085e-07 1.197037e-06 1.502314e-07 4.860197e-08 1.871394e-06
## [136] 1.978157e-06 1.953139e-07 2.928057e-08 3.578448e-08
## 
## $strata
##   [1] "Addis Ababa" "Addis Ababa" "Addis Ababa" "Addis Ababa" "Addis Ababa"
##   [6] "Addis Ababa" "Addis Ababa" "Addis Ababa" "Addis Ababa" "Addis Ababa"
##  [11] "Addis Ababa" "Afar"        "Afar"        "Afar"        "Afar"       
##  [16] "Afar"        "Afar"        "Afar"        "Afar"        "Afar"       
##  [21] "Afar"        "Afar"        "Afar"        "Afar"        "Afar"       
##  [26] "Afar"        "Amhara"      "Amhara"      "Amhara"      "Amhara"     
##  [31] "Amhara"      "Amhara"      "Amhara"      "Amhara"      "Amhara"     
##  [36] "Amhara"      "Amhara"      "Amhara"      "Ben-Gumuz"   "Ben-Gumuz"  
##  [41] "Ben-Gumuz"   "Ben-Gumuz"   "Ben-Gumuz"   "Ben-Gumuz"   "Ben-Gumuz"  
##  [46] "Ben-Gumuz"   "Ben-Gumuz"   "Ben-Gumuz"   "Ben-Gumuz"   "Ben-Gumuz"  
##  [51] "Ben-Gumuz"   "Dire Dawa"   "Dire Dawa"   "Dire Dawa"   "Dire Dawa"  
##  [56] "Dire Dawa"   "Dire Dawa"   "Dire Dawa"   "Dire Dawa"   "Dire Dawa"  
##  [61] "Dire Dawa"   "Dire Dawa"   "Dire Dawa"   "Dire Dawa"   "Dire Dawa"  
##  [66] "Gambella"    "Gambella"    "Gambella"    "Gambella"    "Gambella"   
##  [71] "Gambella"    "Gambella"    "Gambella"    "Gambella"    "Gambella"   
##  [76] "Gambella"    "Gambella"    "Gambella"    "Harari"      "Harari"     
##  [81] "Harari"      "Harari"      "Harari"      "Harari"      "Harari"     
##  [86] "Harari"      "Oromia"      "Oromia"      "Oromia"      "Oromia"     
##  [91] "Oromia"      "Oromia"      "Oromia"      "Oromia"      "Oromia"     
##  [96] "Oromia"      "Oromia"      "Oromia"      "Oromia"      "Oromia"     
## [101] "Oromia"      "Oromia"      "SNNP"        "SNNP"        "SNNP"       
## [106] "SNNP"        "SNNP"        "SNNP"        "SNNP"        "SNNP"       
## [111] "SNNP"        "SNNP"        "SNNP"        "SNNP"        "SNNP"       
## [116] "Somali"      "Somali"      "Somali"      "Somali"      "Somali"     
## [121] "Somali"      "Somali"      "Somali"      "Somali"      "Somali"     
## [126] "Somali"      "Somali"      "Somali"      "Tigray"      "Tigray"     
## [131] "Tigray"      "Tigray"      "Tigray"      "Tigray"      "Tigray"     
## [136] "Tigray"      "Tigray"      "Tigray"      "Tigray"
plot(y=haz4$haz[1:60], x=haz4$time[1:60], col=1, lty=1, type="s")
lines(y=haz4$haz[13:24], x=haz4$time[13:24], col=2, lty=1, type="s")

Part B: Parametric Models

My outcome variable: Age at first marriage

Age at First Marriage: Median age in years when women ages 15 to 49 first married or lived with a consensual partner.

sub3<-data.frame(CASEID=dhs_2016$caseid, 
                 int.cmc=dhs_2016$v008,
                 fbir.cmc=dhs_2016$b3_01,
                 sbir.cmc=dhs_2016$b3_02,
                 rural=dhs_2016$v025,
                 region=dhs_2016$v024,
                 wealth.index=dhs_2016$v190,
                 marr.cmc=dhs_2016$v509,
                 mar.stat=dhs_2016$v501,
                 age.fm=dhs_2016$v511,
                 mar.dur=dhs_2016$v513,
                 educ=dhs_2016$v106,
                 age=dhs_2016$v012,
                 age_r=dhs_2016$v013,
                 weight=dhs_2016$v005/1000000,
                 psu=dhs_2016$v021, strata=dhs_2016$v022)

Estimating Parametric Survival Model

## Parametric Model for my outcome varaible "Age at First Marriage"
sub3$agefm<-ifelse(sub3$mar.stat==0, 0,sub3$age.fm)
sub3$fmevent<-ifelse(sub3$mar.stat==0,0,1) 
fit<-survfit(Surv(agefm, fmevent)~1, sub3)
fit 
## Call: survfit(formula = Surv(agefm, fmevent) ~ 1, data = sub3)
## 
##       n  events  median 0.95LCL 0.95UCL 
##   15683   11405      17      16      17
plot(fit, conf.int=T, ylab="S(t)", xlab="Age in Years")
title(main="Survival Function for Age at First Marriage, EDHS - 2016")

###Estimating Parametric Hazard Model

fit.haz.km<-kphaz.fit(sub3$agefm, sub3$fmevent )

#this is a version of the hazard that is smoothed using a kernel-density method
fit.haz.sm<-muhaz(sub3$agefm[sub3$agefm>0], sub3$fmevent[sub3$agefm>0] )

#Empirical hazard function
kphaz.plot(fit.haz.km, main="Hazard Function for Age at First Marriage, EDHS - 2016")
lines(fit.haz.sm, col=2, lwd=3)
legend("topleft", legend = c("KM hazard", "Smoothed Hazard"), col=c(1,2), lty=c(1,1))

Create covariates

predictor variables: Woman’s education (secondary +, vs < secondary), Woman’s age^2, residence(rural, urban), wealth_index(poor, not poor);

sub3$educ.high<-ifelse(sub3$educ %in% c(2,3), 1, 0)

sub3$mother_educ<-recode(sub3$educ, recodes="0='0Noeducation'; 1='1Primary'; 2:3='2Secodary and Higher'; else=NA", as.factor.result=T)

sub3$wealth_index<-recode(sub3$wealth.index, recodes ='1:2 ="Poor"; 3="Middle"; 4:5="Rich"; else=NA')

sub3$age_new<-recode(sub3$age_r, recodes="1='15-19'; 2:3='20-29'; 4:5='30-39'; 6:7='40-49'; else='NA'", as.factor.result=T)

sub3$rural_c<-recode(sub3$rural, recodes ="2 = '0rural'; 1='1urban'; else=NA", as.factor.result = T)

sub3$age2<-(sub3$age/5)^2
sub3$rural2<-recode(sub3$rural, recodes='2=1;else=0')
sub3$poor<-recode(sub3$wealth.index, recodes='1:2=1;else=0')
sub3$educ.high<-ifelse(sub3$educ %in% c(2,3), 1, 0)

sub3$region<- recode(sub3$v024, recodes='1="Tigray"; 2="Afar"; 3= "Amhara"; 4= "Oromia"; 5="Somali"; 6="Ben-Gumuz"; 7="SNNP"; 8="Gambella"; 9="Harari"; 10="Addis Ababa"; 11="Dire Dawa";else=NA', as.factor.result = F )
 
options(survey.lonely.psu = "adjust")
des<-svydesign(ids=~psu, strata=~strata, data=sub3[sub3$agefm>0,], weight=~weight )

rep.des<-as.svrepdesign(des, type="bootstrap" )

Exponential Distribution

fit.1<-phreg(Surv(agefm, fmevent)~educ.high+rural2+poor+I(age/5)+age2, data=sub3[sub3$agefm>0,],dist = "weibull", shape = 1)

summary(fit.1)
## Call:
## phreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + poor + 
##     I(age/5) + age2, data = sub3[sub3$agefm > 0, ], dist = "weibull", 
##     shape = 1)
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## educ.high           0.172    -0.147     0.863     0.030     0.000 
## rural2              0.697     0.058     1.060     0.026     0.024 
## poor                0.420    -0.003     0.997     0.022     0.879 
## I(age/5)            6.235    -0.100     0.904     0.042     0.016 
## age2               41.621     0.007     1.007     0.003     0.041 
## 
## log(scale)                    2.513               0.132     0.000 
## 
##  Shape is fixed at  1 
## 
## Events                    11405 
## Total time at risk        196750 
## Max. log. likelihood      -43852 
## LR test statistic         65.35 
## Degrees of freedom        5 
## Overall p-value           9.46465e-13
plot(fit.1)

Weibul Distribution

fit.2<-phreg(Surv(agefm, fmevent)~educ.high+rural2+poor+I(age/5)+age2, data=sub3[sub3$agefm>0,],dist = "weibull")

summary(fit.2)
## Call:
## phreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + poor + 
##     I(age/5) + age2, data = sub3[sub3$agefm > 0, ], dist = "weibull")
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## educ.high           0.172    -0.651     0.522     0.030     0.000 
## rural2              0.697     0.322     1.380     0.026     0.000 
## poor                0.420     0.025     1.025     0.022     0.265 
## I(age/5)            6.235    -0.634     0.530     0.043     0.000 
## age2               41.621     0.035     1.036     0.003     0.000 
## 
## log(scale)                    2.430               0.029     0.000 
## log(shape)                    1.538               0.007     0.000 
## 
## Events                    11405 
## Total time at risk        196750 
## Max. log. likelihood      -31675 
## LR test statistic         2173.13 
## Degrees of freedom        5 
## Overall p-value           0
plot(fit.2)

AFT model specification

fit.1.aft<-survreg(Surv(agefm, fmevent)~educ.high+rural2+poor+I(age/5)+age2, data=sub3[sub3$agefm>0,],dist = "exponential")

summary(fit.1.aft)
## 
## Call:
## survreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + 
##     poor + I(age/5) + age2, data = sub3[sub3$agefm > 0, ], dist = "exponential")
##                Value Std. Error      z        p
## (Intercept)  2.51303    0.13245 18.974 2.81e-80
## educ.high    0.14737    0.02973  4.957 7.18e-07
## rural2      -0.05846    0.02586 -2.261 2.38e-02
## poor         0.00332    0.02190  0.152 8.79e-01
## I(age/5)     0.10045    0.04186  2.400 1.64e-02
## age2        -0.00659    0.00322 -2.046 4.08e-02
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -43852.4   Loglik(intercept only)= -43885.1
##  Chisq= 65.35 on 5 degrees of freedom, p= 9.5e-13 
## Number of Newton-Raphson Iterations: 3 
## n= 11405

Exponential and Weibull models AFT vs PH parameterization

#re-scaled beta's
(betaHat <- -coef(fit.1.aft) / fit.1.aft$scale)
##  (Intercept)    educ.high       rural2         poor     I(age/5) 
## -2.513034921 -0.147370824  0.058459730 -0.003324933 -0.100450148 
##         age2 
##  0.006594256
#beta's from the PH model
coef(fit.1)
##    educ.high       rural2         poor     I(age/5)         age2 
## -0.147370824  0.058459730 -0.003324933 -0.100450148  0.006594256 
##   log(scale) 
##  2.513034921

a. The two models are similar, so you can go back and forth.

b. Model Diagnostics

Log-Normal Model

#log-normal distribution for hazard
fit.3<-phreg(Surv(agefm, fmevent)~educ.high+rural2+poor+I(age/5)+age2, data=sub3[sub3$agefm>0,], dist="lognormal", center=T)
summary(fit.3)
## Call:
## phreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + poor + 
##     I(age/5) + age2, data = sub3[sub3$agefm > 0, ], dist = "lognormal", 
##     center = T)
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## (Intercept)                   1.544               0.178     0.000 
## educ.high           0.172    -0.585     0.557     0.030     0.000 
## rural2              0.697     0.268     1.307     0.026     0.000 
## poor                0.420     0.008     1.008     0.022     0.725 
## I(age/5)            6.235    -0.651     0.522     0.043     0.000 
## age2               41.621     0.042     1.043     0.003     0.000 
## 
## log(scale)                    2.710               0.015     0.000 
## log(shape)                    1.816               0.033     0.000 
## 
## Events                    11405 
## Total time at risk        196750 
## Max. log. likelihood      -30158 
## LR test statistic         1453.17 
## Degrees of freedom        5 
## Overall p-value           0
plot(fit.3)

#plot the hazard from the log normal vs the empirical hazard
plot(fit.3, fn="haz")
lines(fit.haz.sm, col=2)

We now see the age effect completely gone from the model.

So, the log-normal model fits the empirical hazard pretty well up to ~20 years, where the empirical rate increase at a lower rate after 20 years.

Log-logistic Model

#log-normal distribution for hazard
fit.4<-phreg(Surv(agefm, fmevent)~educ.high+rural2+I(age/5)+age2, data=sub3[sub3$agefm>0,], dist="loglogistic", center=T)
summary(fit.4)
## Call:
## phreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + I(age/5) + 
##     age2, data = sub3[sub3$agefm > 0, ], dist = "loglogistic", 
##     center = T)
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## (Intercept)                   1.987               0.149     0.000 
## educ.high           0.172    -0.562     0.570     0.030     0.000 
## rural2              0.697     0.253     1.288     0.023     0.000 
## I(age/5)            6.235    -0.650     0.522     0.044     0.000 
## age2               41.621     0.043     1.044     0.003     0.000 
## 
## log(scale)                    2.784               0.006     0.000 
## log(shape)                    2.246               0.016     0.000 
## 
## Events                    11405 
## Total time at risk        196750 
## Max. log. likelihood      -30227 
## LR test statistic         1245.91 
## Degrees of freedom        4 
## Overall p-value           0
plot(fit.4)

#plot the hazard from the log normal vs the empirical hazard
plot(fit.4, fn="haz")
lines(fit.haz.sm, col=2)

Piecewise constant exponential model

# here I must supply the times for the "pieces" where I expect the  hazard to be constant
fit.5<-phreg(Surv(agefm, fmevent)~educ.high+rural2+I(age/5)+age2, data=sub3[sub3$agefm>0,], dist="pch", cuts=c(7, 10,13,15,20,23))
summary(fit.5)
## Call:
## phreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + I(age/5) + 
##     age2, data = sub3[sub3$agefm > 0, ], dist = "pch", cuts = c(7, 
##     10, 13, 15, 20, 23))
## 
## Covariate          W.mean      Coef Exp(Coef)  se(Coef)    Wald p
## educ.high           0.172    -0.511     0.600     0.030     0.000 
## rural2              0.697     0.229     1.257     0.024     0.000 
## I(age/5)            6.235    -0.510     0.601     0.043     0.000 
## age2               41.621     0.033     1.033     0.003     0.000 
## 
## 
## Events                    11405 
## Total time at risk        196750 
## Max. log. likelihood      -31828 
## LR test statistic         996.13 
## Degrees of freedom        4 
## Overall p-value           0
plot(fit.5)

plot(fit.5, fn="haz", ylim=c(0, .5))
lines(fit.haz.sm, col=2)

We may want to compare the models to one another based off AIC values. the eha package doesn’t give this to you, so we must calculate it:

AIC1<--2*fit.1$loglik[2]+2*length(fit.1$coefficients); AIC1
## [1] 87716.84
AIC2<--2*fit.2$loglik[2]+2*length(fit.2$coefficients); AIC2
## [1] 63364.98
AIC3<--2*fit.3$loglik[2]+2*length(fit.3$coefficients); AIC3
## [1] 60331.95
AIC4<--2*fit.4$loglik[2]+2*length(fit.4$coefficients); AIC4
## [1] 60468.6
AIC5<--2*fit.5$loglik[2]+2*length(fit.5$coefficients); AIC5
## [1] 63664.75

b) Based on minimum AIC criteria, the log-Normal model best fits the data

Which looks like it actually fits the data pretty good. The AIC’s show the log-normal is still fitting better.

Graphical checks on the model fit

emp<-coxreg(Surv(agefm, fmevent)~educ.high+rural2+I(age/5)+age2, data=sub3[sub3$agefm>0,])

check.dist(sp=emp,pp=fit.1, main = "Empirical vs. Exponential")

check.dist(sp=emp,pp=fit.2, main = "Empirical vs. Weibull")

check.dist(sp=emp,pp=fit.3, main = "Empirical vs. Log-Normal")

check.dist(sp=emp,pp=fit.4, main = "Empirical vs. Log-Logistic")

check.dist(sp=emp,pp=fit.5, main = "Empirical vs. PCH")

We see that the PCH model and the log-Normal models both appear to fit the empirical hazard function better than the other parametric models.

Using Survey design

survey.fit <- withReplicates(rep.des, quote(coef(survreg(Surv(agefm, fmevent)~educ.high+rural2+poor +I(age/5)+age2,dist="lognormal", weights = .weights+.0001))))
survey.est<-as.data.frame(survey.fit)
survey.test<-data.frame(beta = rownames(survey.est), estimate=survey.est$theta, se.est= survey.est$SE)
survey.test$t<-survey.test$estimate/survey.test$se.est
survey.test$pval<-2*pnorm(survey.test$t,lower.tail = F )
survey.test
##          beta     estimate       se.est          t         pval
## 1 (Intercept)  2.586382764 0.0413285887 62.5809602 0.000000e+00
## 2   educ.high  0.171339975 0.0102406561 16.7313474 7.746582e-63
## 3      rural2 -0.025472394 0.0114195679 -2.2305918 1.974292e+00
## 4        poor -0.003263082 0.0088316658 -0.3694753 1.288226e+00
## 5    I(age/5)  0.066791982 0.0124048276  5.3843539 7.270534e-08
## 6        age2 -0.004552940 0.0009821052 -4.6358988 1.999996e+00
fit.2.aft<-survreg(Surv(agefm, fmevent)~educ.high+rural2+poor+I(age/5)+age2 + age2, data=sub3[sub3$agefm>0,],dist = "lognormal")

fit.2.aft.sum<-summary(fit.2.aft)
fit.2.aft.sum
## 
## Call:
## survreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + 
##     poor + I(age/5) + age2 + age2, data = sub3[sub3$agefm > 0, 
##     ], dist = "lognormal")
##                Value Std. Error       z         p
## (Intercept)  2.54000   0.027215   93.33  0.00e+00
## educ.high    0.14540   0.006128   23.73 1.89e-124
## rural2      -0.05396   0.005322  -10.14  3.70e-24
## poor         0.00466   0.004513    1.03  3.01e-01
## I(age/5)     0.08778   0.008612   10.19  2.14e-24
## age2        -0.00595   0.000664   -8.97  3.05e-19
## Log(scale)  -1.57754   0.006621 -238.26  0.00e+00
## 
## Scale= 0.206 
## 
## Log Normal distribution
## Loglik(model)= -30391.6   Loglik(intercept only)= -31003.5
##  Chisq= 1223.84 on 5 degrees of freedom, p= 0 
## Number of Newton-Raphson Iterations: 3 
## n= 11405
summary(fit.2.aft)
## 
## Call:
## survreg(formula = Surv(agefm, fmevent) ~ educ.high + rural2 + 
##     poor + I(age/5) + age2 + age2, data = sub3[sub3$agefm > 0, 
##     ], dist = "lognormal")
##                Value Std. Error       z         p
## (Intercept)  2.54000   0.027215   93.33  0.00e+00
## educ.high    0.14540   0.006128   23.73 1.89e-124
## rural2      -0.05396   0.005322  -10.14  3.70e-24
## poor         0.00466   0.004513    1.03  3.01e-01
## I(age/5)     0.08778   0.008612   10.19  2.14e-24
## age2        -0.00595   0.000664   -8.97  3.05e-19
## Log(scale)  -1.57754   0.006621 -238.26  0.00e+00
## 
## Scale= 0.206 
## 
## Log Normal distribution
## Loglik(model)= -30391.6   Loglik(intercept only)= -31003.5
##  Chisq= 1223.84 on 5 degrees of freedom, p= 0 
## Number of Newton-Raphson Iterations: 3 
## n= 11405
#Compare the se's of the parameters
survey.test$se.est/sqrt(diag(fit.2.aft.sum$var[-6, -6]))
## (Intercept)   educ.high      rural2        poor    I(age/5)  Log(scale) 
##   1.5186110   1.6711439   2.1458376   1.9569308   1.4403422   0.1483272
#survey based errors are larger, as they should be.

e) check correlation betweeen predictor variables

cor.test(sub3$rural2, sub3$educ.high, method = c("pearson", "kendall", "spearman"))
## 
##  Pearson's product-moment correlation
## 
## data:  sub3$rural2 and sub3$educ.high
## t = -62.19, df = 15681, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.4572681 -0.4321581
## sample estimates:
##        cor 
## -0.4448005

This shows that secondary education and residence rural are correlated to eachother with (cor :-0.4448005) and p-value:0.000

f)Interpretaion of the results

The statistical distribution and the associated factors was investigated. Five parametric survival models were fitted with the covariates, to determine the best parametric survival model that best described the waiting time to first marriage of the women. The results showed that, the time to first marriage is best modeled by the log-normal model, the median age at first marriage among Ethiopian womnen is 18.

An investigation into the impact of the prognostic factors on the waiting time by the Log-normal model evidenced that, the place of residence of a woman, her educational level, her wealth status and region were the statistically significant factors that determined her time to first marrriage.

Older women in this sample are more likely to be involved in first marriage than younger women.Women residing in rural areas are 6% (OR: 1.06) times more likely to be involved in first marriage than women residing in urban areas. Women with higher education are 13.7% (OR:0.836) less likely to be involved in early first marriage than women with no or lower education.