##1. Load and Clean datasets

library(readxl)
library(survival)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mhas0121dummies <- read_excel("MHAS0121 NO LABELS NO NEGATIVE TIMES.xlsx")

mhas0121noNA<-mhas0121dummies

mhas0121noNA<-mhas0121noNA %>%
  mutate(
        dead = died0121,
        ageint_tv = ageint,
        agecensor_tv = agecensor
       )
mhas0121noNA
## # A tibble: 9,636 × 52
##       id locsize01 perwght01 died18 died21 mobirth yrbirth female moint yrint
##    <dbl>     <dbl>     <dbl>  <dbl>  <dbl>   <dbl>   <dbl>  <dbl> <dbl> <dbl>
##  1     1         1       300     NA     NA      10    1941      0     8  2001
##  2     2         1       253      0      0       6    1931      0     7  2001
##  3     3         1       454      0      0       6    1950      0     7  2001
##  4     4         1       337     NA     NA      10    1936      0     7  2001
##  5     5         1       295      0      0       3    1944      1     7  2001
##  6     6         1       253      0      0       5    1948      1     7  2001
##  7     7         1       490     NA     NA       8    1927      0     8  2001
##  8     8         1       537      0      0       2    1945      0     7  2001
##  9     9         1       370      1     NA       9    1946      1     7  2001
## 10    10         1       237      0     NA       1    1948      0     7  2001
## # ℹ 9,626 more rows
## # ℹ 42 more variables: schooling <dbl>, died0103 <dbl>, refused0103 <dbl>,
## #   lost0103 <dbl>, followup0103 <dbl>, died0312 <dbl>, refused0312 <dbl>,
## #   lost0312 <dbl>, followup0312 <dbl>, died1215 <dbl>, refused1215 <dbl>,
## #   lost1215 <dbl>, followup1215 <dbl>, died1518 <dbl>, refused1518 <dbl>,
## #   lost1518 <dbl>, followup1518 <dbl>, died1821 <dbl>, refused1821 <dbl>,
## #   lost1821 <dbl>, followup1821 <dbl>, died0121 <dbl>, mocensor <dbl>, …
# Code adapted from Mills (2011): section 3.5
# Expands data into person-months, up to maximum number age in months observed in data
mhas_splitpoisson<-survSplit(mhas0121noNA, cut=c(50,55,60,65,70,75,80,85), start="ageint_tv", end="agecensor_tv", event="dead")

# Sorting data by ID
mhas_splitpoisson<-mhas_splitpoisson[order (mhas_splitpoisson$id, mhas_splitpoisson$ageint_tv),]

2. Solution of Problem Set 6

### I.  Set up the data for semi-parametric continuous models via Poisson regression by splitting the experience of each person in different episodes, each episode being each 5-year age group. In addition to splitting episodes, make sure the procedure calculates the exposure to the risk of dying in each episode (i.e., age group) for that particular person, and whether that person died during the episode/interval. Otherwise, calculate these variables.


mhas_splitpoisson <- survSplit(
  mhas0121noNA, 
  cut = c(50, 55, 60, 65, 70, 75, 80, 85), 
  start = "ageint_tv", 
  end = "agecensor_tv", 
  event = "dead"
)

mhas_splitpoisson <- mhas_splitpoisson[order(mhas_splitpoisson$id, mhas_splitpoisson$ageint_tv),]

mhas_splitpoisson <- mhas_splitpoisson %>%
  mutate(
    age = trunc(ageint_tv), # Age at the start of the interval
    exposure = agecensor_tv - ageint_tv, # Time spent in the interval
    age5 = dplyr::case_when(
      age >= 50 & age < 55 ~ "50-54",
      age >= 55 & age < 60 ~ "55-59",
      age >= 60 & age < 65 ~ "60-64",
      age >= 65 & age < 70 ~ "65-69",
      age >= 70 & age < 75 ~ "70-74",
      age >= 75 & age < 80 ~ "75-79",
      age >= 80 & age < 85 ~ "80-84",
      age >= 85             ~ "85+"
    ),
    age5 = factor(age5, levels = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+"))
  )


###II.  On this “split” data, estimate a Poisson model using the log of exposure as an offset and controlling for age group (as dummies) as well as by FEMALE, EDUCLEVEL (or SCHOOLING, whatever you have been using) and LOCSIZE01 (all as factors, except for SCHOOLING if you are using years thereof).


fit.poisson <- glm(
  dead ~ offset(log(exposure)) + as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01), 
  data = mhas_splitpoisson, 
  family = poisson(link = log)
)

summary(fit.poisson)
## 
## Call:
## glm(formula = dead ~ offset(log(exposure)) + as.factor(age5) + 
##     female + as.factor(educlevel) + as.factor(locsize01), family = poisson(link = log), 
##     data = mhas_splitpoisson)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -4.75489    0.18347 -25.917  < 2e-16 ***
## as.factor(age5)55-59     0.07273    0.20391   0.357 0.721323    
## as.factor(age5)60-64     0.78169    0.18804   4.157 3.22e-05 ***
## as.factor(age5)65-69     1.30473    0.18392   7.094 1.30e-12 ***
## as.factor(age5)70-74     1.51745    0.18334   8.277  < 2e-16 ***
## as.factor(age5)75-79     1.93465    0.18314  10.564  < 2e-16 ***
## as.factor(age5)80-84     2.37316    0.18334  12.944  < 2e-16 ***
## as.factor(age5)85+       3.07802    0.18230  16.884  < 2e-16 ***
## female                  -0.25105    0.02962  -8.477  < 2e-16 ***
## as.factor(educlevel)15  -0.03017    0.03585  -0.841 0.400093    
## as.factor(educlevel)68  -0.08432    0.04432  -1.902 0.057126 .  
## as.factor(educlevel)919 -0.20891    0.05228  -3.996 6.45e-05 ***
## as.factor(locsize01)2   -0.01968    0.04183  -0.470 0.638045    
## as.factor(locsize01)3   -0.08353    0.05129  -1.629 0.103416    
## as.factor(locsize01)4   -0.15223    0.04192  -3.632 0.000281 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 24070  on 32737  degrees of freedom
## Residual deviance: 20851  on 32723  degrees of freedom
##   (1390 observations deleted due to missingness)
## AIC: 30279
## 
## Number of Fisher Scoring iterations: 6
exp_results <- exp(cbind(coef(fit.poisson), confint(fit.poisson)))
## Waiting for profiling to be done...
print(exp_results)
##                                             2.5 %      97.5 %
## (Intercept)              0.008609531  0.005882798  0.01210936
## as.factor(age5)55-59     1.075441927  0.730672767  1.63023345
## as.factor(age5)60-64     2.185161949  1.537919554  3.22321902
## as.factor(age5)65-69     3.686703737  2.618404246  5.39948313
## as.factor(age5)70-74     4.560571478  3.243161092  6.67269457
## as.factor(age5)75-79     6.921612815  4.924327116 10.12374388
## as.factor(age5)80-84    10.731224891  7.631384631 15.70112814
## as.factor(age5)85+      21.715337842 15.477972873 31.71518595
## female                   0.777980529  0.734122847  0.82450238
## as.factor(educlevel)15   0.970284327  0.904495768  1.04098420
## as.factor(educlevel)68   0.919136796  0.842440722  1.00232316
## as.factor(educlevel)919  0.811471953  0.732002402  0.89853988
## as.factor(locsize01)2    0.980512691  0.902769081  1.06366539
## as.factor(locsize01)3    0.919860302  0.830960046  1.01607317
## as.factor(locsize01)4    0.858787648  0.790677919  0.93189998
###III. Test for the proportionality of hazards for FEMALE, SCHOOLING/EDUCLEVEL, and LOCSIZE01.

library(survival)
fit.cox <- coxph(Surv(ageint_tv, agecensor_tv, dead) ~ female + as.factor(educlevel) + as.factor(locsize01), 
                 data = mhas_splitpoisson)

summary(fit.cox)
## Call:
## coxph(formula = Surv(ageint_tv, agecensor_tv, dead) ~ female + 
##     as.factor(educlevel) + as.factor(locsize01), data = mhas_splitpoisson)
## 
##   n= 32738, number of events= 4699 
##    (1390 observations deleted due to missingness)
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)    
## female                  -0.25617   0.77401  0.02963 -8.645  < 2e-16 ***
## as.factor(educlevel)15  -0.01078   0.98928  0.03598 -0.299 0.764595    
## as.factor(educlevel)68  -0.05064   0.95062  0.04456 -1.136 0.255776    
## as.factor(educlevel)919 -0.17997   0.83529  0.05245 -3.431 0.000601 ***
## as.factor(locsize01)2   -0.01730   0.98285  0.04183 -0.414 0.679153    
## as.factor(locsize01)3   -0.08111   0.92209  0.05131 -1.581 0.113913    
## as.factor(locsize01)4   -0.15418   0.85712  0.04196 -3.674 0.000239 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## female                     0.7740      1.292    0.7303    0.8203
## as.factor(educlevel)15     0.9893      1.011    0.9219    1.0616
## as.factor(educlevel)68     0.9506      1.052    0.8711    1.0374
## as.factor(educlevel)919    0.8353      1.197    0.7537    0.9257
## as.factor(locsize01)2      0.9828      1.017    0.9055    1.0668
## as.factor(locsize01)3      0.9221      1.084    0.8339    1.0196
## as.factor(locsize01)4      0.8571      1.167    0.7894    0.9306
## 
## Concordance= 0.548  (se = 0.005 )
## Likelihood ratio test= 88.77  on 7 df,   p=<2e-16
## Wald test            = 89  on 7 df,   p=<2e-16
## Score (logrank) test = 89.32  on 7 df,   p=<2e-16
cox.zph(fit.cox)
##                       chisq df      p
## female                8.028  1 0.0046
## as.factor(educlevel)  8.189  3 0.0423
## as.factor(locsize01)  0.166  3 0.9829
## GLOBAL               18.901  7 0.0085
###IV.  Exponentiation the coefficients, add them to the table with all other HRs.


hr_table <- exp(cbind(
  Coefficient = coef(fit.cox),         # Exponentiate coefficients
  `2.5%` = confint(fit.cox)[, 1],     # Lower bound of CI
  `97.5%` = confint(fit.cox)[, 2]     # Upper bound of CI
))


hr_table <- as.data.frame(hr_table)
hr_table <- tibble::rownames_to_column(hr_table, "Variable")  # Add variable names
colnames(hr_table) <- c("Variable", "HR", "Lower CI", "Upper CI")

hr_table
##                  Variable        HR  Lower CI  Upper CI
## 1                  female 0.7740112 0.7303368 0.8202972
## 2  as.factor(educlevel)15 0.9892823 0.9219138 1.0615737
## 3  as.factor(educlevel)68 0.9506206 0.8711182 1.0373788
## 4 as.factor(educlevel)919 0.8352948 0.7536909 0.9257343
## 5   as.factor(locsize01)2 0.9828462 0.9054780 1.0668252
## 6   as.factor(locsize01)3 0.9220878 0.8338659 1.0196434
## 7   as.factor(locsize01)4 0.8571165 0.7894428 0.9305913
print(hr_table)
##                  Variable        HR  Lower CI  Upper CI
## 1                  female 0.7740112 0.7303368 0.8202972
## 2  as.factor(educlevel)15 0.9892823 0.9219138 1.0615737
## 3  as.factor(educlevel)68 0.9506206 0.8711182 1.0373788
## 4 as.factor(educlevel)919 0.8352948 0.7536909 0.9257343
## 5   as.factor(locsize01)2 0.9828462 0.9054780 1.0668252
## 6   as.factor(locsize01)3 0.9220878 0.8338659 1.0196434
## 7   as.factor(locsize01)4 0.8571165 0.7894428 0.9305913
### V.  Analyze and compare these different estimates, and conclude on which one would you use of them all and why.


AIC(fit.poisson)
## [1] 30278.77
AIC(fit.cox)
## [1] 72185.42
BIC(fit.poisson)
## [1] 30404.71
BIC(fit.cox)
## [1] 72230.61