##1. Load and Clean datasets
library(readxl)
library(survival)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
mhas0121dummies <- read_excel("MHAS0121 NO LABELS NO NEGATIVE TIMES.xlsx")
mhas0121noNA<-mhas0121dummies
mhas0121noNA<-mhas0121noNA %>%
mutate(
dead = died0121,
ageint_tv = ageint,
agecensor_tv = agecensor
)
mhas0121noNA
## # A tibble: 9,636 × 52
## id locsize01 perwght01 died18 died21 mobirth yrbirth female moint yrint
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 300 NA NA 10 1941 0 8 2001
## 2 2 1 253 0 0 6 1931 0 7 2001
## 3 3 1 454 0 0 6 1950 0 7 2001
## 4 4 1 337 NA NA 10 1936 0 7 2001
## 5 5 1 295 0 0 3 1944 1 7 2001
## 6 6 1 253 0 0 5 1948 1 7 2001
## 7 7 1 490 NA NA 8 1927 0 8 2001
## 8 8 1 537 0 0 2 1945 0 7 2001
## 9 9 1 370 1 NA 9 1946 1 7 2001
## 10 10 1 237 0 NA 1 1948 0 7 2001
## # ℹ 9,626 more rows
## # ℹ 42 more variables: schooling <dbl>, died0103 <dbl>, refused0103 <dbl>,
## # lost0103 <dbl>, followup0103 <dbl>, died0312 <dbl>, refused0312 <dbl>,
## # lost0312 <dbl>, followup0312 <dbl>, died1215 <dbl>, refused1215 <dbl>,
## # lost1215 <dbl>, followup1215 <dbl>, died1518 <dbl>, refused1518 <dbl>,
## # lost1518 <dbl>, followup1518 <dbl>, died1821 <dbl>, refused1821 <dbl>,
## # lost1821 <dbl>, followup1821 <dbl>, died0121 <dbl>, mocensor <dbl>, …
# Code adapted from Mills (2011): section 3.5
# Expands data into person-months, up to maximum number age in months observed in data
mhas_splitpoisson<-survSplit(mhas0121noNA, cut=c(50,55,60,65,70,75,80,85), start="ageint_tv", end="agecensor_tv", event="dead")
# Sorting data by ID
mhas_splitpoisson<-mhas_splitpoisson[order (mhas_splitpoisson$id, mhas_splitpoisson$ageint_tv),]
### I. Set up the data for semi-parametric continuous models via Poisson regression by splitting the experience of each person in different episodes, each episode being each 5-year age group. In addition to splitting episodes, make sure the procedure calculates the exposure to the risk of dying in each episode (i.e., age group) for that particular person, and whether that person died during the episode/interval. Otherwise, calculate these variables.
mhas_splitpoisson <- survSplit(
mhas0121noNA,
cut = c(50, 55, 60, 65, 70, 75, 80, 85),
start = "ageint_tv",
end = "agecensor_tv",
event = "dead"
)
mhas_splitpoisson <- mhas_splitpoisson[order(mhas_splitpoisson$id, mhas_splitpoisson$ageint_tv),]
mhas_splitpoisson <- mhas_splitpoisson %>%
mutate(
age = trunc(ageint_tv), # Age at the start of the interval
exposure = agecensor_tv - ageint_tv, # Time spent in the interval
age5 = dplyr::case_when(
age >= 50 & age < 55 ~ "50-54",
age >= 55 & age < 60 ~ "55-59",
age >= 60 & age < 65 ~ "60-64",
age >= 65 & age < 70 ~ "65-69",
age >= 70 & age < 75 ~ "70-74",
age >= 75 & age < 80 ~ "75-79",
age >= 80 & age < 85 ~ "80-84",
age >= 85 ~ "85+"
),
age5 = factor(age5, levels = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+"))
)
###II. On this “split” data, estimate a Poisson model using the log of exposure as an offset and controlling for age group (as dummies) as well as by FEMALE, EDUCLEVEL (or SCHOOLING, whatever you have been using) and LOCSIZE01 (all as factors, except for SCHOOLING if you are using years thereof).
fit.poisson <- glm(
dead ~ offset(log(exposure)) + as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01),
data = mhas_splitpoisson,
family = poisson(link = log)
)
summary(fit.poisson)
##
## Call:
## glm(formula = dead ~ offset(log(exposure)) + as.factor(age5) +
## female + as.factor(educlevel) + as.factor(locsize01), family = poisson(link = log),
## data = mhas_splitpoisson)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.75489 0.18347 -25.917 < 2e-16 ***
## as.factor(age5)55-59 0.07273 0.20391 0.357 0.721323
## as.factor(age5)60-64 0.78169 0.18804 4.157 3.22e-05 ***
## as.factor(age5)65-69 1.30473 0.18392 7.094 1.30e-12 ***
## as.factor(age5)70-74 1.51745 0.18334 8.277 < 2e-16 ***
## as.factor(age5)75-79 1.93465 0.18314 10.564 < 2e-16 ***
## as.factor(age5)80-84 2.37316 0.18334 12.944 < 2e-16 ***
## as.factor(age5)85+ 3.07802 0.18230 16.884 < 2e-16 ***
## female -0.25105 0.02962 -8.477 < 2e-16 ***
## as.factor(educlevel)15 -0.03017 0.03585 -0.841 0.400093
## as.factor(educlevel)68 -0.08432 0.04432 -1.902 0.057126 .
## as.factor(educlevel)919 -0.20891 0.05228 -3.996 6.45e-05 ***
## as.factor(locsize01)2 -0.01968 0.04183 -0.470 0.638045
## as.factor(locsize01)3 -0.08353 0.05129 -1.629 0.103416
## as.factor(locsize01)4 -0.15223 0.04192 -3.632 0.000281 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 24070 on 32737 degrees of freedom
## Residual deviance: 20851 on 32723 degrees of freedom
## (1390 observations deleted due to missingness)
## AIC: 30279
##
## Number of Fisher Scoring iterations: 6
exp_results <- exp(cbind(coef(fit.poisson), confint(fit.poisson)))
## Waiting for profiling to be done...
print(exp_results)
## 2.5 % 97.5 %
## (Intercept) 0.008609531 0.005882798 0.01210936
## as.factor(age5)55-59 1.075441927 0.730672767 1.63023345
## as.factor(age5)60-64 2.185161949 1.537919554 3.22321902
## as.factor(age5)65-69 3.686703737 2.618404246 5.39948313
## as.factor(age5)70-74 4.560571478 3.243161092 6.67269457
## as.factor(age5)75-79 6.921612815 4.924327116 10.12374388
## as.factor(age5)80-84 10.731224891 7.631384631 15.70112814
## as.factor(age5)85+ 21.715337842 15.477972873 31.71518595
## female 0.777980529 0.734122847 0.82450238
## as.factor(educlevel)15 0.970284327 0.904495768 1.04098420
## as.factor(educlevel)68 0.919136796 0.842440722 1.00232316
## as.factor(educlevel)919 0.811471953 0.732002402 0.89853988
## as.factor(locsize01)2 0.980512691 0.902769081 1.06366539
## as.factor(locsize01)3 0.919860302 0.830960046 1.01607317
## as.factor(locsize01)4 0.858787648 0.790677919 0.93189998
###III. Test for the proportionality of hazards for FEMALE, SCHOOLING/EDUCLEVEL, and LOCSIZE01.
library(survival)
fit.cox <- coxph(Surv(ageint_tv, agecensor_tv, dead) ~ female + as.factor(educlevel) + as.factor(locsize01),
data = mhas_splitpoisson)
summary(fit.cox)
## Call:
## coxph(formula = Surv(ageint_tv, agecensor_tv, dead) ~ female +
## as.factor(educlevel) + as.factor(locsize01), data = mhas_splitpoisson)
##
## n= 32738, number of events= 4699
## (1390 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## female -0.25617 0.77401 0.02963 -8.645 < 2e-16 ***
## as.factor(educlevel)15 -0.01078 0.98928 0.03598 -0.299 0.764595
## as.factor(educlevel)68 -0.05064 0.95062 0.04456 -1.136 0.255776
## as.factor(educlevel)919 -0.17997 0.83529 0.05245 -3.431 0.000601 ***
## as.factor(locsize01)2 -0.01730 0.98285 0.04183 -0.414 0.679153
## as.factor(locsize01)3 -0.08111 0.92209 0.05131 -1.581 0.113913
## as.factor(locsize01)4 -0.15418 0.85712 0.04196 -3.674 0.000239 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## female 0.7740 1.292 0.7303 0.8203
## as.factor(educlevel)15 0.9893 1.011 0.9219 1.0616
## as.factor(educlevel)68 0.9506 1.052 0.8711 1.0374
## as.factor(educlevel)919 0.8353 1.197 0.7537 0.9257
## as.factor(locsize01)2 0.9828 1.017 0.9055 1.0668
## as.factor(locsize01)3 0.9221 1.084 0.8339 1.0196
## as.factor(locsize01)4 0.8571 1.167 0.7894 0.9306
##
## Concordance= 0.548 (se = 0.005 )
## Likelihood ratio test= 88.77 on 7 df, p=<2e-16
## Wald test = 89 on 7 df, p=<2e-16
## Score (logrank) test = 89.32 on 7 df, p=<2e-16
cox.zph(fit.cox)
## chisq df p
## female 8.028 1 0.0046
## as.factor(educlevel) 8.189 3 0.0423
## as.factor(locsize01) 0.166 3 0.9829
## GLOBAL 18.901 7 0.0085
###IV. Exponentiation the coefficients, add them to the table with all other HRs.
hr_table <- exp(cbind(
Coefficient = coef(fit.cox), # Exponentiate coefficients
`2.5%` = confint(fit.cox)[, 1], # Lower bound of CI
`97.5%` = confint(fit.cox)[, 2] # Upper bound of CI
))
hr_table <- as.data.frame(hr_table)
hr_table <- tibble::rownames_to_column(hr_table, "Variable") # Add variable names
colnames(hr_table) <- c("Variable", "HR", "Lower CI", "Upper CI")
hr_table
## Variable HR Lower CI Upper CI
## 1 female 0.7740112 0.7303368 0.8202972
## 2 as.factor(educlevel)15 0.9892823 0.9219138 1.0615737
## 3 as.factor(educlevel)68 0.9506206 0.8711182 1.0373788
## 4 as.factor(educlevel)919 0.8352948 0.7536909 0.9257343
## 5 as.factor(locsize01)2 0.9828462 0.9054780 1.0668252
## 6 as.factor(locsize01)3 0.9220878 0.8338659 1.0196434
## 7 as.factor(locsize01)4 0.8571165 0.7894428 0.9305913
print(hr_table)
## Variable HR Lower CI Upper CI
## 1 female 0.7740112 0.7303368 0.8202972
## 2 as.factor(educlevel)15 0.9892823 0.9219138 1.0615737
## 3 as.factor(educlevel)68 0.9506206 0.8711182 1.0373788
## 4 as.factor(educlevel)919 0.8352948 0.7536909 0.9257343
## 5 as.factor(locsize01)2 0.9828462 0.9054780 1.0668252
## 6 as.factor(locsize01)3 0.9220878 0.8338659 1.0196434
## 7 as.factor(locsize01)4 0.8571165 0.7894428 0.9305913
### V. Analyze and compare these different estimates, and conclude on which one would you use of them all and why.
AIC(fit.poisson)
## [1] 30278.77
AIC(fit.cox)
## [1] 72185.42
BIC(fit.poisson)
## [1] 30404.71
BIC(fit.cox)
## [1] 72230.61