setwd("C:\\Users\\anami\\OneDrive\\Documents\\EHA\\Assignment 5 Revised")
data1 <-read.csv("C:\\Users\\anami\\OneDrive\\Documents\\EHA\\Assignment 5 Revised\\MHAS0121 NO LABELS NO NEGATIVE TIMES.csv")
data2<-na.omit(data1)

1.

library(survival)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data2<-data2 %>%
  mutate(
        startmo = ageint*12,
        endmo = agecensor*12,
        dead = died0121
       )
summary(data2)
##        id          locsize01       perwght01         died18      died21      
##  Min.   :    2   Min.   :1.000   Min.   :   17   Min.   :0   Min.   :0.0000  
##  1st Qu.: 2770   1st Qu.:1.000   1st Qu.:  204   1st Qu.:0   1st Qu.:0.0000  
##  Median : 5425   Median :1.000   Median :  466   Median :0   Median :0.0000  
##  Mean   : 5487   Mean   :1.898   Mean   : 1100   Mean   :0   Mean   :0.1896  
##  3rd Qu.: 8308   3rd Qu.:3.000   3rd Qu.: 1193   3rd Qu.:0   3rd Qu.:0.0000  
##  Max.   :10992   Max.   :4.000   Max.   :44177   Max.   :0   Max.   :1.0000  
##     mobirth          yrbirth         female           moint       
##  Min.   : 1.000   Min.   :1907   Min.   :0.0000   Min.   : 3.000  
##  1st Qu.: 4.000   1st Qu.:1937   1st Qu.:0.0000   1st Qu.: 7.000  
##  Median : 6.000   Median :1943   Median :1.0000   Median : 7.000  
##  Mean   : 6.544   Mean   :1942   Mean   :0.5586   Mean   : 7.516  
##  3rd Qu.:10.000   3rd Qu.:1947   3rd Qu.:1.0000   3rd Qu.: 8.000  
##  Max.   :12.000   Max.   :1950   Max.   :1.0000   Max.   :11.000  
##      yrint        schooling        died0103  refused0103         lost0103
##  Min.   :2001   Min.   : 0.00   Min.   :0   Min.   :0.00000   Min.   :0  
##  1st Qu.:2001   1st Qu.: 1.00   1st Qu.:0   1st Qu.:0.00000   1st Qu.:0  
##  Median :2001   Median : 4.00   Median :0   Median :0.00000   Median :0  
##  Mean   :2001   Mean   : 4.61   Mean   :0   Mean   :0.01174   Mean   :0  
##  3rd Qu.:2001   3rd Qu.: 6.00   3rd Qu.:0   3rd Qu.:0.00000   3rd Qu.:0  
##  Max.   :2001   Max.   :19.00   Max.   :0   Max.   :1.00000   Max.   :0  
##   followup0103       died0312          refused0312         lost0312       
##  Min.   :0.0000   Min.   :0.0000000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.:1.0000   1st Qu.:0.0000000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median :1.0000   Median :0.0000000   Median :0.00000   Median :0.000000  
##  Mean   :0.9883   Mean   :0.0002498   Mean   :0.01474   Mean   :0.005246  
##  3rd Qu.:1.0000   3rd Qu.:0.0000000   3rd Qu.:0.00000   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :1.0000000   Max.   :1.00000   Max.   :1.000000  
##   followup0312       died1215  refused1215          lost1215       
##  Min.   :0.0000   Min.   :0   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:1.0000   1st Qu.:0   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :1.0000   Median :0   Median :0.000000   Median :0.000000  
##  Mean   :0.9798   Mean   :0   Mean   :0.007494   Mean   :0.006495  
##  3rd Qu.:1.0000   3rd Qu.:0   3rd Qu.:0.000000   3rd Qu.:0.000000  
##  Max.   :1.0000   Max.   :0   Max.   :1.000000   Max.   :1.000000  
##   followup1215      died1518          refused1518          lost1518        
##  Min.   :0.000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.0000000  
##  1st Qu.:1.000   1st Qu.:0.0000000   1st Qu.:0.000000   1st Qu.:0.0000000  
##  Median :1.000   Median :0.0000000   Median :0.000000   Median :0.0000000  
##  Mean   :0.986   Mean   :0.0002498   Mean   :0.004746   Mean   :0.0002498  
##  3rd Qu.:1.000   3rd Qu.:0.0000000   3rd Qu.:0.000000   3rd Qu.:0.0000000  
##  Max.   :1.000   Max.   :1.0000000   Max.   :1.000000   Max.   :1.0000000  
##   followup1518       died1821       refused1821          lost1821        
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000000   Min.   :0.0000000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.000000   1st Qu.:0.0000000  
##  Median :1.0000   Median :0.0000   Median :0.000000   Median :0.0000000  
##  Mean   :0.9948   Mean   :0.2051   Mean   :0.001499   Mean   :0.0002498  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.000000   3rd Qu.:0.0000000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.000000   Max.   :1.0000000  
##   followup1821       died0121         mocensor         yrcensor   
##  Min.   :0.0000   Min.   :0.0000   Min.   : 1.000   Min.   :2008  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.: 3.000   1st Qu.:2021  
##  Median :1.0000   Median :0.0000   Median :12.000   Median :2021  
##  Mean   :0.7932   Mean   :0.2051   Mean   : 8.628   Mean   :2021  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:12.000   3rd Qu.:2021  
##  Max.   :1.0000   Max.   :1.0000   Max.   :99.000   Max.   :2022  
##      ageint        agecensor       ageintexact    exposure_months
##  Min.   :50.50   Min.   : 67.42   Min.   :50.50   Min.   : 78.0  
##  1st Qu.:54.00   1st Qu.: 74.00   1st Qu.:54.00   1st Qu.:243.0  
##  Median :57.92   Median : 77.92   Median :57.92   Median :244.0  
##  Mean   :59.49   Mean   : 79.18   Mean   :59.49   Mean   :236.3  
##  3rd Qu.:63.75   3rd Qu.: 83.17   3rd Qu.:63.75   3rd Qu.:245.0  
##  Max.   :94.17   Max.   :114.42   Max.   :94.17   Max.   :296.0  
##    educlevel        ageintmo       agecensormo         educ1       
##  Min.   :  0.0   Min.   : 606.0   Min.   : 809.0   Min.   :0.0000  
##  1st Qu.: 15.0   1st Qu.: 648.0   1st Qu.: 888.0   1st Qu.:0.0000  
##  Median : 15.0   Median : 695.0   Median : 935.0   Median :0.0000  
##  Mean   :187.7   Mean   : 713.8   Mean   : 950.1   Mean   :0.2323  
##  3rd Qu.: 68.0   3rd Qu.: 765.0   3rd Qu.: 998.0   3rd Qu.:0.0000  
##  Max.   :919.0   Max.   :1130.0   Max.   :1373.0   Max.   :1.0000  
##      educ2            educ3            educ4          locsize011    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.3507   Mean   :0.2358   Mean   :0.1811   Mean   :0.5656  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##    locsize012       locsize013        locsize014        startmo      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   : 606.0  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 648.0  
##  Median :0.0000   Median :0.00000   Median :0.0000   Median : 695.0  
##  Mean   :0.1549   Mean   :0.09518   Mean   :0.1844   Mean   : 713.8  
##  3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.: 765.0  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1130.0  
##      endmo             dead       
##  Min.   : 809.0   Min.   :0.0000  
##  1st Qu.: 888.0   1st Qu.:0.0000  
##  Median : 935.0   Median :0.0000  
##  Mean   : 950.1   Mean   :0.2051  
##  3rd Qu.: 998.0   3rd Qu.:0.0000  
##  Max.   :1373.0   Max.   :1.0000
max(data2$exposure_months)
## [1] 296
# Expands data into person-months, up to maximum number observed in 
mhas_personmonth<-survSplit(data2, cut=c(1:1000), start="startmo", end="endmo", event="dead")
# Sorting data by ID
mhas_personmonth<-mhas_personmonth[order (mhas_personmonth$id, mhas_personmonth$startmo),]
mhas_personmonth<-mhas_personmonth %>%
  mutate(
  age=startmo/12, 
  agesq=age^2,
  .after = "died0121"
  )
library(dplyr)
library(ggplot2)
#install.packages("AMR")
mhas_personmonth <- mhas_personmonth %>% 
  mutate(
  # Create age groups
  age5 = dplyr::case_when(
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 & age < 70 ~ "65-69",
    age >= 70 & age < 75 ~ "70-74",
    age >= 75 & age < 80 ~ "75-79",
    age >= 80 & age < 85 ~ "80-84",
    age >= 85             ~ "85+"
  ),
  # Convert to factor
  age5 = factor(
    age5,
    level = c("50-54", "55-59","60-64","65-69","70-74","75-79","80-84","85+")
)
)
testPMdata1 <- subset(mhas_personmonth, id < 50, select=c(id,ageint,agecensor,startmo,endmo,age,age5,died0121,dead))

# Load the writexl library
library(writexl)
# Write the data frame to an Excel file
write_xlsx(testPMdata1, "testPMdata.xlsx")

1.a.

# i.    Not controlling for age.
fit.constant<-glm(dead ~ female + as.factor(educlevel) + as.factor(locsize01), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.constant)
## 
## Call:
## glm(formula = dead ~ female + as.factor(educlevel) + as.factor(locsize01), 
##     family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -6.56988    0.09168 -71.665  < 2e-16 ***
## female                  -0.13260    0.07111  -1.865 0.062210 .  
## as.factor(educlevel)15  -0.25956    0.08869  -2.926 0.003428 ** 
## as.factor(educlevel)68  -0.36804    0.10111  -3.640 0.000273 ***
## as.factor(educlevel)919 -0.66249    0.11996  -5.523 3.34e-08 ***
## as.factor(locsize01)2   -0.04905    0.09983  -0.491 0.623155    
## as.factor(locsize01)3   -0.12125    0.12449  -0.974 0.330074    
## as.factor(locsize01)4   -0.21389    0.10013  -2.136 0.032661 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 13080  on 888728  degrees of freedom
## AIC: 13096
## 
## Number of Fisher Scoring iterations: 10
exp(coef(fit.constant))
##             (Intercept)                  female  as.factor(educlevel)15 
##             0.001401965             0.875815926             0.771393552 
##  as.factor(educlevel)68 as.factor(educlevel)919   as.factor(locsize01)2 
##             0.692090427             0.515565108             0.952129561 
##   as.factor(locsize01)3   as.factor(locsize01)4 
##             0.885811648             0.807433892
#ii.    Controlling for age linearly.
fit.linear<-glm(dead ~ age + female + as.factor(educlevel) + as.factor(locsize01), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.linear)
## 
## Call:
## glm(formula = dead ~ age + female + as.factor(educlevel) + as.factor(locsize01), 
##     family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -25.967008   0.622183 -41.735  < 2e-16 ***
## age                       0.260193   0.007847  33.158  < 2e-16 ***
## female                   -0.317962   0.071264  -4.462 8.13e-06 ***
## as.factor(educlevel)15   -0.093447   0.089087  -1.049    0.294    
## as.factor(educlevel)68    0.071873   0.101798   0.706    0.480    
## as.factor(educlevel)919  -0.014564   0.120864  -0.120    0.904    
## as.factor(locsize01)2     0.018055   0.100613   0.179    0.858    
## as.factor(locsize01)3    -0.176475   0.125201  -1.410    0.159    
## as.factor(locsize01)4    -0.135233   0.100748  -1.342    0.180    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11322  on 888727  degrees of freedom
## AIC: 11340
## 
## Number of Fisher Scoring iterations: 11
exp(coef(fit.linear))
##             (Intercept)                     age                  female 
##            5.280460e-12            1.297181e+00            7.276301e-01 
##  as.factor(educlevel)15  as.factor(educlevel)68 as.factor(educlevel)919 
##            9.107866e-01            1.074519e+00            9.855414e-01 
##   as.factor(locsize01)2   as.factor(locsize01)3   as.factor(locsize01)4 
##            1.018219e+00            8.382194e-01            8.735121e-01
#iii.   Controlling for age and age-squared .
fit.quadratic<-glm(dead ~ age + agesq + female + as.factor(educlevel) + as.factor(locsize01), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.quadratic)
## 
## Call:
## glm(formula = dead ~ age + agesq + female + as.factor(educlevel) + 
##     as.factor(locsize01), family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             10.0588558  4.3248237   2.326    0.020 *  
## age                     -0.7130010  0.1180136  -6.042 1.53e-09 ***
## agesq                    0.0065284  0.0008011   8.149 3.66e-16 ***
## female                  -0.3174340  0.0713422  -4.449 8.61e-06 ***
## as.factor(educlevel)15  -0.0863476  0.0891911  -0.968    0.333    
## as.factor(educlevel)68   0.0827675  0.1019707   0.812    0.417    
## as.factor(educlevel)919  0.0003919  0.1211784   0.003    0.997    
## as.factor(locsize01)2    0.0190345  0.1006706   0.189    0.850    
## as.factor(locsize01)3   -0.1765767  0.1252922  -1.409    0.159    
## as.factor(locsize01)4   -0.1283252  0.1008291  -1.273    0.203    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11271  on 888726  degrees of freedom
## AIC: 11291
## 
## Number of Fisher Scoring iterations: 12
exp(coef(fit.quadratic))
##             (Intercept)                     age                   agesq 
##            2.336176e+04            4.901710e-01            1.006550e+00 
##                  female  as.factor(educlevel)15  as.factor(educlevel)68 
##            7.280147e-01            9.172753e-01            1.086289e+00 
## as.factor(educlevel)919   as.factor(locsize01)2   as.factor(locsize01)3 
##            1.000392e+00            1.019217e+00            8.381345e-01 
##   as.factor(locsize01)4 
##            8.795673e-01
#iv.    Controlling for 5-year age groups (top-coded at 85+).
fit.pch5<-glm(dead ~ as.factor(age5) + as.factor(female) + as.factor(educlevel) + as.factor(locsize01), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.pch5)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + as.factor(female) + as.factor(educlevel) + 
##     as.factor(locsize01), family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -23.352208 428.235696  -0.055    0.957    
## as.factor(age5)55-59      0.007880 488.897896   0.000    1.000    
## as.factor(age5)60-64      0.016416 469.715144   0.000    1.000    
## as.factor(age5)65-69     15.841814 428.235699   0.037    0.970    
## as.factor(age5)70-74     16.600548 428.235693   0.039    0.969    
## as.factor(age5)75-79     16.983793 428.235694   0.040    0.968    
## as.factor(age5)80-84     18.751888 428.235690   0.044    0.965    
## as.factor(age5)85+       23.232129 428.236669   0.054    0.957    
## as.factor(female)1       -0.299973   0.071326  -4.206  2.6e-05 ***
## as.factor(educlevel)15   -0.101122   0.089217  -1.133    0.257    
## as.factor(educlevel)68    0.033160   0.101993   0.325    0.745    
## as.factor(educlevel)919  -0.082844   0.121098  -0.684    0.494    
## as.factor(locsize01)2     0.008283   0.100552   0.082    0.934    
## as.factor(locsize01)3    -0.166632   0.125166  -1.331    0.183    
## as.factor(locsize01)4    -0.144852   0.100738  -1.438    0.150    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11433  on 888721  degrees of freedom
## AIC: 11463
## 
## Number of Fisher Scoring iterations: 22
exp(coef(fit.pch5))
##             (Intercept)    as.factor(age5)55-59    as.factor(age5)60-64 
##            7.215477e-11            1.007911e+00            1.016552e+00 
##    as.factor(age5)65-69    as.factor(age5)70-74    as.factor(age5)75-79 
##            7.585989e+06            1.620043e+07            2.376663e+07 
##    as.factor(age5)80-84      as.factor(age5)85+      as.factor(female)1 
##            1.392648e+08            1.229095e+10            7.408379e-01 
##  as.factor(educlevel)15  as.factor(educlevel)68 as.factor(educlevel)919 
##            9.038225e-01            1.033716e+00            9.204943e-01 
##   as.factor(locsize01)2   as.factor(locsize01)3   as.factor(locsize01)4 
##            1.008317e+00            8.465109e-01            8.651504e-01
#v. Any other(s) you may want to try, if any at all.

fit.interaction<-glm(dead ~ age*female + as.factor(educlevel) + as.factor(locsize01), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.interaction)
## 
## Call:
## glm(formula = dead ~ age * female + as.factor(educlevel) + as.factor(locsize01), 
##     family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -23.84624    0.80707 -29.547  < 2e-16 ***
## age                       0.23278    0.01035  22.489  < 2e-16 ***
## female                   -4.91331    1.23472  -3.979 6.91e-05 ***
## as.factor(educlevel)15   -0.08695    0.08912  -0.976 0.329241    
## as.factor(educlevel)68    0.07055    0.10186   0.693 0.488560    
## as.factor(educlevel)919  -0.02418    0.12104  -0.200 0.841662    
## as.factor(locsize01)2     0.01767    0.10058   0.176 0.860543    
## as.factor(locsize01)3    -0.17015    0.12520  -1.359 0.174145    
## as.factor(locsize01)4    -0.13444    0.10075  -1.334 0.182102    
## age:female                0.05891    0.01579   3.731 0.000191 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11308  on 888726  degrees of freedom
## AIC: 11328
## 
## Number of Fisher Scoring iterations: 11
exp(coef(fit.interaction))
##             (Intercept)                     age                  female 
##            4.402609e-11            1.262104e+00            7.348116e-03 
##  as.factor(educlevel)15  as.factor(educlevel)68 as.factor(educlevel)919 
##            9.167204e-01            1.073100e+00            9.761110e-01 
##   as.factor(locsize01)2   as.factor(locsize01)3   as.factor(locsize01)4 
##            1.017828e+00            8.435395e-01            8.742068e-01 
##              age:female 
##            1.060676e+00

1.b.

Each model specification makes different assumptions about how age influences the baseline hazard:

No Age Control (model_no_age): By excluding age, this model assumes that the baseline hazard is constant across ages, implying that age does not affect mortality risk. This is a simplistic and unrealistic assumption, as mortality risk generally increases with age.

Linear Age Control (model_linear_age): This model assumes a linear relationship between age and the hazard rate, capturing a general increase in risk as age rises. While it reflects the increasing mortality risk with age, it may not fully capture the accelerating risk seen in older age groups, where mortality tends to rise at a faster rate.

Quadratic Age Control (age and age-squared): Adding an age-squared term allows the model to capture a non-linear, accelerating increase in hazard with age, aligning more closely with mortality trends. This specification accounts for the rapid increase in mortality risk in older age groups.

Categorical Age Groups (5-year age groups): By treating age in 5-year intervals, this model does not impose a specific functional form on the age-hazard relationship. Each age group has its own baseline hazard, allowing for more flexibility to capture variations in risk across age groups. This approach models hazard changes in a stepwise manner rather than as a continuous function.

These distinctions highlight the importance of choosing an age specification that aligns with the mortality patterns in the data.

1.c.

# Calculate AIC and BIC for each model
model_comparison <- data.frame(
  Model = c("Without Age Control", "Linear Age Control", "Age + Age^2", "5-Year Age Groups"),
  AIC = c(AIC(fit.constant), AIC(fit.linear), AIC(fit.quadratic),AIC(fit.pch5)),
  BIC = c(BIC(fit.constant), BIC(fit.linear), BIC(fit.quadratic),BIC(fit.pch5))
)
print(model_comparison)
##                 Model      AIC      BIC
## 1 Without Age Control 13095.50 13189.09
## 2  Linear Age Control 11339.99 11445.27
## 3         Age + Age^2 11290.54 11407.51
## 4   5-Year Age Groups 11463.44 11638.90

While age + age² captures a nonlinear trend, interpreting the exact impact of age on mortality is straightforward. In contrast, 5-year age groups provide age-specific estimates but may be harder to analyze for trend analyses. If specific age thresholds are clinically meaningful, 5-year age groups might offer practical insights for age-related mortality patterns despite a slightly worse fit.

1.d.

# Load necessary libraries
library(ggplot2)
library(scales)

# Fit the logistic and complementary log-log models
fit.pch5_noctrls1 <- glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01), 
                         data = mhas_personmonth, family = binomial(link = "logit"))
fit.pch5_noctrls2 <- glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01), 
                         data = mhas_personmonth, family = binomial(link = "cloglog"))

# Summarize the models
summary(fit.pch5_noctrls1)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01), family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -23.352208 428.235696  -0.055    0.957    
## as.factor(age5)55-59      0.007880 488.897896   0.000    1.000    
## as.factor(age5)60-64      0.016416 469.715144   0.000    1.000    
## as.factor(age5)65-69     15.841814 428.235699   0.037    0.970    
## as.factor(age5)70-74     16.600548 428.235693   0.039    0.969    
## as.factor(age5)75-79     16.983793 428.235694   0.040    0.968    
## as.factor(age5)80-84     18.751888 428.235690   0.044    0.965    
## as.factor(age5)85+       23.232129 428.236669   0.054    0.957    
## female                   -0.299973   0.071326  -4.206  2.6e-05 ***
## as.factor(educlevel)15   -0.101122   0.089217  -1.133    0.257    
## as.factor(educlevel)68    0.033160   0.101993   0.325    0.745    
## as.factor(educlevel)919  -0.082844   0.121098  -0.684    0.494    
## as.factor(locsize01)2     0.008283   0.100552   0.082    0.934    
## as.factor(locsize01)3    -0.166632   0.125166  -1.331    0.183    
## as.factor(locsize01)4    -0.144852   0.100738  -1.438    0.150    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11433  on 888721  degrees of freedom
## AIC: 11463
## 
## Number of Fisher Scoring iterations: 22
summary(fit.pch5_noctrls2)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01), family = binomial(link = "cloglog"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -23.264571 409.439593  -0.057    0.955    
## as.factor(age5)55-59      0.007863 467.439263   0.000    1.000    
## as.factor(age5)60-64      0.016368 449.098475   0.000    1.000    
## as.factor(age5)65-69     15.751740 409.439596   0.038    0.969    
## as.factor(age5)70-74     16.510187 409.439589   0.040    0.968    
## as.factor(age5)75-79     16.893154 409.439590   0.041    0.967    
## as.factor(age5)80-84     18.657862 409.439586   0.046    0.964    
## as.factor(age5)85+       22.845754 409.440224   0.056    0.956    
## female                   -0.297978   0.071138  -4.189 2.81e-05 ***
## as.factor(educlevel)15   -0.099404   0.088984  -1.117    0.264    
## as.factor(educlevel)68    0.033667   0.101769   0.331    0.741    
## as.factor(educlevel)919  -0.081931   0.120869  -0.678    0.498    
## as.factor(locsize01)2     0.010016   0.100222   0.100    0.920    
## as.factor(locsize01)3    -0.165974   0.124907  -1.329    0.184    
## as.factor(locsize01)4    -0.144493   0.100499  -1.438    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11434  on 888721  degrees of freedom
## AIC: 11464
## 
## Number of Fisher Scoring iterations: 22
# Convert coefficients to regular numbers for logistic model
logit_coefficients <- coef(fit.pch5_noctrls1)
logit_coefficients_formatted <- format(logit_coefficients, scientific = FALSE)

# Convert coefficients to regular numbers for complementary log-log model
cloglog_coefficients <- coef(fit.pch5_noctrls2)
cloglog_coefficients_formatted <- format(cloglog_coefficients, scientific = FALSE)

# Print formatted coefficients for both models
print("Logistic Model Coefficients (Formatted):")
## [1] "Logistic Model Coefficients (Formatted):"
print(logit_coefficients_formatted)
##             (Intercept)    as.factor(age5)55-59    as.factor(age5)60-64 
##         "-23.352207664"         "  0.007879531"         "  0.016416364" 
##    as.factor(age5)65-69    as.factor(age5)70-74    as.factor(age5)75-79 
##         " 15.841813539"         " 16.600548153"         " 16.983792970" 
##    as.factor(age5)80-84      as.factor(age5)85+                  female 
##         " 18.751887931"         " 23.232129211"         " -0.299973388" 
##  as.factor(educlevel)15  as.factor(educlevel)68 as.factor(educlevel)919 
##         " -0.101122272"         "  0.033160230"         " -0.082844479" 
##   as.factor(locsize01)2   as.factor(locsize01)3   as.factor(locsize01)4 
##         "  0.008282517"         " -0.166632208"         " -0.144851967"
print("Complementary Log-Log Model Coefficients (Formatted):")
## [1] "Complementary Log-Log Model Coefficients (Formatted):"
print(cloglog_coefficients_formatted)
##             (Intercept)    as.factor(age5)55-59    as.factor(age5)60-64 
##         "-23.264571436"         "  0.007862525"         "  0.016367777" 
##    as.factor(age5)65-69    as.factor(age5)70-74    as.factor(age5)75-79 
##         " 15.751739841"         " 16.510186636"         " 16.893153669" 
##    as.factor(age5)80-84      as.factor(age5)85+                  female 
##         " 18.657862048"         " 22.845754485"         " -0.297977860" 
##  as.factor(educlevel)15  as.factor(educlevel)68 as.factor(educlevel)919 
##         " -0.099403506"         "  0.033666597"         " -0.081931344" 
##   as.factor(locsize01)2   as.factor(locsize01)3   as.factor(locsize01)4 
##         "  0.010016181"         " -0.165974487"         " -0.144492938"
# Exponentiate coefficients for female, education levels, and locality size for both models
logit_exp_coefficients <- exp(logit_coefficients[c("female", "as.factor(educlevel)15", "as.factor(educlevel)68", 
                                                   "as.factor(educlevel)919", "as.factor(locsize01)2", 
                                                   "as.factor(locsize01)3", "as.factor(locsize01)4")])

cloglog_exp_coefficients <- exp(cloglog_coefficients[c("female", "as.factor(educlevel)15", "as.factor(educlevel)68", 
                                                       "as.factor(educlevel)919", "as.factor(locsize01)2", 
                                                       "as.factor(locsize01)3", "as.factor(locsize01)4")])

# Print exponentiated coefficients for both models
print("Exponentiated Coefficients (Odds Ratios) - Logistic Model:")
## [1] "Exponentiated Coefficients (Odds Ratios) - Logistic Model:"
print(logit_exp_coefficients)
##                  female  as.factor(educlevel)15  as.factor(educlevel)68 
##               0.7408379               0.9038225               1.0337162 
## as.factor(educlevel)919   as.factor(locsize01)2   as.factor(locsize01)3 
##               0.9204943               1.0083169               0.8465109 
##   as.factor(locsize01)4 
##               0.8651504
print("Exponentiated Coefficients (Odds Ratios) - Complementary Log-Log Model:")
## [1] "Exponentiated Coefficients (Odds Ratios) - Complementary Log-Log Model:"
print(cloglog_exp_coefficients)
##                  female  as.factor(educlevel)15  as.factor(educlevel)68 
##               0.7423178               0.9053773               1.0342397 
## as.factor(educlevel)919   as.factor(locsize01)2   as.factor(locsize01)3 
##               0.9213352               1.0100665               0.8470678 
##   as.factor(locsize01)4 
##               0.8654610
# Obtain predicted values for each age group in both models
predicted_logit <- predict(fit.pch5_noctrls1, type = "response")
predicted_cloglog <- predict(fit.pch5_noctrls2, type = "response")

# Combine the predictions with age groups for comparison
predictions <- data.frame(
  Age_Group = mhas_personmonth$age5,
  Logistic_Prediction = predicted_logit,
  Complementary_LogLog_Prediction = predicted_cloglog
)

# Aggregate predicted values by age group for comparison
predictions_summary <- aggregate(. ~ Age_Group, data = predictions, mean)

# Plotting the predictions with readable y-axis labels
ggplot(predictions_summary, aes(x = Age_Group)) +
  geom_line(aes(y = Logistic_Prediction, color = "Logistic Model", group = 1), size = 1) +
  geom_line(aes(y = Complementary_LogLog_Prediction, color = "Complementary Log-Log Model", group = 1), size = 1) +
  geom_point(aes(y = Logistic_Prediction, color = "Logistic Model"), size = 2) +
  geom_point(aes(y = Complementary_LogLog_Prediction, color = "Complementary Log-Log Model"), size = 2) +
  scale_y_log10(labels = label_number()) +  # Use label_number() for readable labels
  labs(
    title = "Model Comparison of Predicted Probabilities by Age Group",
    x = "Age Group",
    y = "Predicted Probability (Log Scale)",
    color = "Model"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

1.e.

# Fit the model
fit.pch5_noctrlsa <- glm(dead ~ as.factor(age5), data = mhas_personmonth, family = binomial(link = "logit"))
summary(fit.pch5_noctrlsa)
## 
## Call:
## glm(formula = dead ~ as.factor(age5), family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -2.357e+01  4.289e+02  -0.055    0.956
## as.factor(age5)55-59  2.622e-07  4.897e+02   0.000    1.000
## as.factor(age5)60-64  2.601e-07  4.705e+02   0.000    1.000
## as.factor(age5)65-69  1.582e+01  4.289e+02   0.037    0.971
## as.factor(age5)70-74  1.657e+01  4.289e+02   0.039    0.969
## as.factor(age5)75-79  1.694e+01  4.289e+02   0.039    0.968
## as.factor(age5)80-84  1.870e+01  4.289e+02   0.044    0.965
## as.factor(age5)85+    2.316e+01  4.289e+02   0.054    0.957
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11456  on 888728  degrees of freedom
## AIC: 11472
## 
## Number of Fisher Scoring iterations: 22
AIC(fit.pch5_noctrlsa)
## [1] 11472.07
BIC(fit.pch5_noctrlsa)
## [1] 11565.65
# Create a new data frame with only the relevant age groups for prediction
xvalues <- data.frame(age5 = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+"))
xvalues$phat <- predict(fit.pch5_noctrlsa, xvalues, type = "response")

# Plotting the predicted probabilities
library(ggplot2)

ggplot(xvalues, aes(x = age5, y = phat, group = 1)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Predicted Probability of Death", title = "Predicted Death Probabilities for 5-Year Categories") +
  theme_minimal()

# Fit the model
fit.pch5_noctrlsb <- glm(dead ~ as.factor(age5), data = mhas_personmonth, family = binomial(link = "cloglog"))
summary(fit.pch5_noctrlsb)
## 
## Call:
## glm(formula = dead ~ as.factor(age5), family = binomial(link = "cloglog"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)
## (Intercept)          -2.348e+01  4.101e+02  -0.057    0.954
## as.factor(age5)55-59  5.306e-07  4.682e+02   0.000    1.000
## as.factor(age5)60-64  5.283e-07  4.498e+02   0.000    1.000
## as.factor(age5)65-69  1.573e+01  4.101e+02   0.038    0.969
## as.factor(age5)70-74  1.648e+01  4.101e+02   0.040    0.968
## as.factor(age5)75-79  1.685e+01  4.101e+02   0.041    0.967
## as.factor(age5)80-84  1.861e+01  4.101e+02   0.045    0.964
## as.factor(age5)85+    2.280e+01  4.101e+02   0.056    0.956
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11456  on 888728  degrees of freedom
## AIC: 11472
## 
## Number of Fisher Scoring iterations: 22
AIC(fit.pch5_noctrlsb)
## [1] 11472.07
BIC(fit.pch5_noctrlsb)
## [1] 11565.65
# Create a new data frame with only the relevant age groups for prediction
xvalues <- data.frame(age5 = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+"))
xvalues$phat <- predict(fit.pch5_noctrlsb, xvalues, type = "response")

# Plotting the predicted probabilities
library(ggplot2)

ggplot(xvalues, aes(x = age5, y = phat, group = 1)) +
  geom_point() +
  geom_line() +
  labs(x = "Age", y = "Predicted Probability of Death", title = "Predicted Death Probabilities for 5-Year Categories") +
  theme_minimal()

# Load knitr package for displaying tables
library(knitr)

# Create the data frame
odds_ratios_table <- data.frame(
  Variable = c("Female", "Schooling 1-5 years", "Schooling 6-8 years", 
               "Schooling 9 years or more", "locsize2", 
               "locsize3", "locsize4"),
  `Logistic Model (Odds Ratio)` = c(0.7408, 0.9038, 1.0337, 0.9205, 1.0083, 0.8465, 0.8652),
  `Complementary Log-Log Model (Odds Ratio)` = c(0.7424, 0.9054, 1.0342, 0.9213, 1.0101, 0.8471, 0.8655)
)

# Display the table
kable(odds_ratios_table, caption = "Exponentiated Coefficients (Odds Ratios) for Gender, Education Levels, and Locality Size")
Exponentiated Coefficients (Odds Ratios) for Gender, Education Levels, and Locality Size
Variable Logistic.Model..Odds.Ratio. Complementary.Log.Log.Model..Odds.Ratio.
Female 0.7408 0.7424
Schooling 1-5 years 0.9038 0.9054
Schooling 6-8 years 1.0337 1.0342
Schooling 9 years or more 0.9205 0.9213
locsize2 1.0083 1.0101
locsize3 0.8465 0.8471
locsize4 0.8652 0.8655

1.e.i.

Logit Model Interpretation Intercept (-23.35, p = 0.957) is not significant; represents the baseline log-odds of mortality for the reference categories. None of the age groups are substantial (p > 0.05), indicating no significant difference in mortality across age groups relative to the 50-54 reference group. Females have significantly lower odds of mortality than males. The odds ratio is approximately 0.74, meaning females have a 26% lower risk of death than males, holding other factors constant. No education levels are significant. This suggests that educational attainment does not significantly influence mortality in this model. None of the locality size categories are significant, indicating that locality size does not meaningfully impact mortality risk in this model. Complementary Log-Log Model Interpretation Intercept (-23.26, p = 0.955) is not significant; represents the baseline hazard of mortality for the reference categories. None of the age groups have a significant effect (p > 0.05), meaning mortality risk does not differ significantly by age relative to the 50-54 reference group. Females have significantly lower mortality risk than males. The hazard ratio suggests that being female is associated with a roughly 26% reduction in the risk of death compared to males. Similar to the logit model, no education levels are significant, implying no strong association between educational attainment and mortality in this dataset. None of the locality size categories are significant, indicating that locality size does not significantly affect mortality risk in this model. Both models consistently find that being female is associated with a lower mortality risk, while age, education, and locality size do not have significant impacts on mortality within this dataset.

1.e.ii.

The odds ratios show that gender has a significant effect on mortality, with females having a 26% lower odds of death than males in both models, indicating a consistent protective effect. Education levels provide some protective effect, particularly for those with 9 or more years of schooling, who show around an 8% lower mortality risk compared to those with no schooling. However, the impact of education is generally modest, with odds ratios close to 1, indicating minimal variation in mortality risk across different schooling levels.

For locality size, larger communities (locsize3 and locsize4) are associated with slightly lower mortality odds compared to the smallest locality (locsize1), suggesting a modest protective effect of residing in larger areas. Specifically, residents in locsize3 and locsize4 have approximately 15% and 13.5% lower odds of mortality, respectively. Overall, gender and locality size show consistent protective associations with mortality, while education has a more nuanced, modest effect.

The odds ratios are very similar across both the logistic and complementary log-log models, indicating that the relationship between gender, education levels, and locality size with mortality is consistent regardless of the model used.

1.f.

fit.pch5phtest1<-glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5):female, data=mhas_personmonth, family=binomial(link=logit))
summary(fit.pch5phtest1)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01) + as.factor(age5):female, family = binomial(link = logit), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)
## (Intercept)                 -2.348e+01  6.008e+02  -0.039    0.969
## as.factor(age5)55-59         1.596e-03  6.887e+02   0.000    1.000
## as.factor(age5)60-64         3.720e-03  6.636e+02   0.000    1.000
## as.factor(age5)65-69         1.612e+01  6.008e+02   0.027    0.979
## as.factor(age5)70-74         1.678e+01  6.008e+02   0.028    0.978
## as.factor(age5)75-79         1.720e+01  6.008e+02   0.029    0.977
## as.factor(age5)80-84         1.877e+01  6.008e+02   0.031    0.975
## as.factor(age5)85+          -8.300e-02  7.946e+04   0.000    1.000
## female                      -1.160e-02  8.576e+02   0.000    1.000
## as.factor(educlevel)15      -9.778e-02  8.926e-02  -1.096    0.273
## as.factor(educlevel)68       3.028e-02  1.020e-01   0.297    0.767
## as.factor(educlevel)919     -9.155e-02  1.212e-01  -0.755    0.450
## as.factor(locsize01)2        6.124e-03  1.006e-01   0.061    0.951
## as.factor(locsize01)3       -1.632e-01  1.252e-01  -1.304    0.192
## as.factor(locsize01)4       -1.463e-01  1.008e-01  -1.452    0.147
## as.factor(age5)55-59:female  2.135e-03  9.791e+02   0.000    1.000
## as.factor(age5)60-64:female  1.698e-03  9.410e+02   0.000    1.000
## as.factor(age5)65-69:female -6.323e-01  8.576e+02  -0.001    0.999
## as.factor(age5)70-74:female -3.844e-01  8.576e+02   0.000    1.000
## as.factor(age5)75-79:female -4.561e-01  8.576e+02  -0.001    1.000
## as.factor(age5)80-84:female -9.534e-02  8.576e+02   0.000    1.000
## as.factor(age5)85+:female    2.364e+01  7.947e+04   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11424  on 888714  degrees of freedom
## AIC: 11468
## 
## Number of Fisher Scoring iterations: 22
AIC(fit.pch5phtest1)
## [1] 11468.38
BIC(fit.pch5phtest1)
## [1] 11725.72
fit.pch5phtest2<-glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5):as.factor(educlevel), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.pch5phtest2)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01) + as.factor(age5):as.factor(educlevel), 
##     family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients: (2 not defined because of singularities)
##                                                Estimate Std. Error z value
## (Intercept)                                  -2.335e+01  1.160e+03  -0.020
## as.factor(age5)55-59                          7.619e-03  1.293e+03   0.000
## as.factor(age5)60-64                          1.272e-02  1.241e+03   0.000
## as.factor(age5)65-69                          1.532e+01  1.160e+03   0.013
## as.factor(age5)70-74                          1.636e+01  1.160e+03   0.014
## as.factor(age5)75-79                          1.716e+01  1.160e+03   0.015
## as.factor(age5)80-84                          1.880e+01  1.160e+03   0.016
## as.factor(age5)85+                            2.250e+01  1.160e+03   0.019
## female                                       -2.908e-01  7.145e-02  -4.070
## as.factor(educlevel)15                       -1.405e-02  1.392e+03   0.000
## as.factor(educlevel)68                       -5.774e-02  1.405e+03   0.000
## as.factor(educlevel)919                      -1.061e-01  1.431e+03   0.000
## as.factor(locsize01)2                         1.728e-03  1.005e-01   0.017
## as.factor(locsize01)3                        -1.716e-01  1.252e-01  -1.371
## as.factor(locsize01)4                        -1.417e-01  1.007e-01  -1.408
## as.factor(age5)55-59:as.factor(educlevel)15  -6.591e-03  1.561e+03   0.000
## as.factor(age5)60-64:as.factor(educlevel)15  -7.156e-03  1.498e+03   0.000
## as.factor(age5)65-69:as.factor(educlevel)15   2.540e-01  1.392e+03   0.000
## as.factor(age5)70-74:as.factor(educlevel)15   4.641e-02  1.392e+03   0.000
## as.factor(age5)75-79:as.factor(educlevel)15  -3.459e-01  1.392e+03   0.000
## as.factor(age5)80-84:as.factor(educlevel)15  -5.425e-02  1.392e+03   0.000
## as.factor(age5)85+:as.factor(educlevel)15     2.472e+01  7.947e+04   0.000
## as.factor(age5)55-59:as.factor(educlevel)68  -6.752e-03  1.583e+03   0.000
## as.factor(age5)60-64:as.factor(educlevel)68  -9.084e-03  1.521e+03   0.000
## as.factor(age5)65-69:as.factor(educlevel)68   9.307e-01  1.405e+03   0.001
## as.factor(age5)70-74:as.factor(educlevel)68   4.017e-01  1.405e+03   0.000
## as.factor(age5)75-79:as.factor(educlevel)68  -4.424e-02  1.405e+03   0.000
## as.factor(age5)80-84:as.factor(educlevel)68  -1.732e-01  1.405e+03   0.000
## as.factor(age5)85+:as.factor(educlevel)68            NA         NA      NA
## as.factor(age5)55-59:as.factor(educlevel)919  1.966e-03  1.616e+03   0.000
## as.factor(age5)60-64:as.factor(educlevel)919  7.540e-03  1.554e+03   0.000
## as.factor(age5)65-69:as.factor(educlevel)919  6.467e-01  1.431e+03   0.000
## as.factor(age5)70-74:as.factor(educlevel)919  5.560e-01  1.431e+03   0.000
## as.factor(age5)75-79:as.factor(educlevel)919 -4.876e-01  1.431e+03   0.000
## as.factor(age5)80-84:as.factor(educlevel)919 -2.137e-01  1.431e+03   0.000
## as.factor(age5)85+:as.factor(educlevel)919           NA         NA      NA
##                                              Pr(>|z|)    
## (Intercept)                                     0.984    
## as.factor(age5)55-59                            1.000    
## as.factor(age5)60-64                            1.000    
## as.factor(age5)65-69                            0.989    
## as.factor(age5)70-74                            0.989    
## as.factor(age5)75-79                            0.988    
## as.factor(age5)80-84                            0.987    
## as.factor(age5)85+                              0.985    
## female                                        4.7e-05 ***
## as.factor(educlevel)15                          1.000    
## as.factor(educlevel)68                          1.000    
## as.factor(educlevel)919                         1.000    
## as.factor(locsize01)2                           0.986    
## as.factor(locsize01)3                           0.170    
## as.factor(locsize01)4                           0.159    
## as.factor(age5)55-59:as.factor(educlevel)15     1.000    
## as.factor(age5)60-64:as.factor(educlevel)15     1.000    
## as.factor(age5)65-69:as.factor(educlevel)15     1.000    
## as.factor(age5)70-74:as.factor(educlevel)15     1.000    
## as.factor(age5)75-79:as.factor(educlevel)15     1.000    
## as.factor(age5)80-84:as.factor(educlevel)15     1.000    
## as.factor(age5)85+:as.factor(educlevel)15       1.000    
## as.factor(age5)55-59:as.factor(educlevel)68     1.000    
## as.factor(age5)60-64:as.factor(educlevel)68     1.000    
## as.factor(age5)65-69:as.factor(educlevel)68     0.999    
## as.factor(age5)70-74:as.factor(educlevel)68     1.000    
## as.factor(age5)75-79:as.factor(educlevel)68     1.000    
## as.factor(age5)80-84:as.factor(educlevel)68     1.000    
## as.factor(age5)85+:as.factor(educlevel)68          NA    
## as.factor(age5)55-59:as.factor(educlevel)919    1.000    
## as.factor(age5)60-64:as.factor(educlevel)919    1.000    
## as.factor(age5)65-69:as.factor(educlevel)919    1.000    
## as.factor(age5)70-74:as.factor(educlevel)919    1.000    
## as.factor(age5)75-79:as.factor(educlevel)919    1.000    
## as.factor(age5)80-84:as.factor(educlevel)919    1.000    
## as.factor(age5)85+:as.factor(educlevel)919         NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11407  on 888702  degrees of freedom
## AIC: 11475
## 
## Number of Fisher Scoring iterations: 22
AIC(fit.pch5phtest2)
## [1] 11475.28
BIC(fit.pch5phtest2)
## [1] 11873
fit.pch5phtest3<-glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5):as.factor(locsize01), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.pch5phtest3)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01) + as.factor(age5):as.factor(locsize01), 
##     family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients: (1 not defined because of singularities)
##                                              Estimate Std. Error z value
## (Intercept)                                -2.338e+01  5.564e+02  -0.042
## as.factor(age5)55-59                        6.353e-03  6.367e+02   0.000
## as.factor(age5)60-64                        1.462e-02  6.118e+02   0.000
## as.factor(age5)65-69                        1.582e+01  5.564e+02   0.028
## as.factor(age5)70-74                        1.669e+01  5.564e+02   0.030
## as.factor(age5)75-79                        1.701e+01  5.564e+02   0.031
## as.factor(age5)80-84                        1.876e+01  5.564e+02   0.034
## as.factor(age5)85+                          2.291e+01  5.564e+02   0.041
## female                                     -3.005e-01  7.135e-02  -4.212
## as.factor(educlevel)15                     -9.915e-02  8.926e-02  -1.111
## as.factor(educlevel)68                      3.458e-02  1.021e-01   0.339
## as.factor(educlevel)919                    -8.435e-02  1.212e-01  -0.696
## as.factor(locsize01)2                      -1.699e-02  1.176e+03   0.000
## as.factor(locsize01)3                      -1.850e-02  1.633e+03   0.000
## as.factor(locsize01)4                      -1.899e-02  1.210e+03   0.000
## as.factor(age5)55-59:as.factor(locsize01)2 -2.791e-03  1.350e+03   0.000
## as.factor(age5)60-64:as.factor(locsize01)2 -4.289e-03  1.299e+03   0.000
## as.factor(age5)65-69:as.factor(locsize01)2  3.790e-01  1.176e+03   0.000
## as.factor(age5)70-74:as.factor(locsize01)2 -3.034e-01  1.176e+03   0.000
## as.factor(age5)75-79:as.factor(locsize01)2 -1.741e-01  1.176e+03   0.000
## as.factor(age5)80-84:as.factor(locsize01)2  1.285e-01  1.176e+03   0.000
## as.factor(age5)85+:as.factor(locsize01)2    2.435e+01  7.947e+04   0.000
## as.factor(age5)55-59:as.factor(locsize01)3  1.847e-02  1.853e+03   0.000
## as.factor(age5)60-64:as.factor(locsize01)3  1.146e-02  1.775e+03   0.000
## as.factor(age5)65-69:as.factor(locsize01)3  1.179e-01  1.633e+03   0.000
## as.factor(age5)70-74:as.factor(locsize01)3 -2.234e-01  1.633e+03   0.000
## as.factor(age5)75-79:as.factor(locsize01)3 -2.590e-01  1.633e+03   0.000
## as.factor(age5)80-84:as.factor(locsize01)3 -1.291e-01  1.633e+03   0.000
## as.factor(age5)85+:as.factor(locsize01)3           NA         NA      NA
## as.factor(age5)55-59:as.factor(locsize01)4 -1.320e-02  1.372e+03   0.000
## as.factor(age5)60-64:as.factor(locsize01)4 -1.831e-02  1.318e+03   0.000
## as.factor(age5)65-69:as.factor(locsize01)4 -4.757e-01  1.210e+03   0.000
## as.factor(age5)70-74:as.factor(locsize01)4 -1.693e-01  1.210e+03   0.000
## as.factor(age5)75-79:as.factor(locsize01)4  6.390e-02  1.210e+03   0.000
## as.factor(age5)80-84:as.factor(locsize01)4 -1.287e-01  1.210e+03   0.000
## as.factor(age5)85+:as.factor(locsize01)4   -2.278e+01  7.947e+04   0.000
##                                            Pr(>|z|)    
## (Intercept)                                   0.966    
## as.factor(age5)55-59                          1.000    
## as.factor(age5)60-64                          1.000    
## as.factor(age5)65-69                          0.977    
## as.factor(age5)70-74                          0.976    
## as.factor(age5)75-79                          0.976    
## as.factor(age5)80-84                          0.973    
## as.factor(age5)85+                            0.967    
## female                                     2.54e-05 ***
## as.factor(educlevel)15                        0.267    
## as.factor(educlevel)68                        0.735    
## as.factor(educlevel)919                       0.487    
## as.factor(locsize01)2                         1.000    
## as.factor(locsize01)3                         1.000    
## as.factor(locsize01)4                         1.000    
## as.factor(age5)55-59:as.factor(locsize01)2    1.000    
## as.factor(age5)60-64:as.factor(locsize01)2    1.000    
## as.factor(age5)65-69:as.factor(locsize01)2    1.000    
## as.factor(age5)70-74:as.factor(locsize01)2    1.000    
## as.factor(age5)75-79:as.factor(locsize01)2    1.000    
## as.factor(age5)80-84:as.factor(locsize01)2    1.000    
## as.factor(age5)85+:as.factor(locsize01)2      1.000    
## as.factor(age5)55-59:as.factor(locsize01)3    1.000    
## as.factor(age5)60-64:as.factor(locsize01)3    1.000    
## as.factor(age5)65-69:as.factor(locsize01)3    1.000    
## as.factor(age5)70-74:as.factor(locsize01)3    1.000    
## as.factor(age5)75-79:as.factor(locsize01)3    1.000    
## as.factor(age5)80-84:as.factor(locsize01)3    1.000    
## as.factor(age5)85+:as.factor(locsize01)3         NA    
## as.factor(age5)55-59:as.factor(locsize01)4    1.000    
## as.factor(age5)60-64:as.factor(locsize01)4    1.000    
## as.factor(age5)65-69:as.factor(locsize01)4    1.000    
## as.factor(age5)70-74:as.factor(locsize01)4    1.000    
## as.factor(age5)75-79:as.factor(locsize01)4    1.000    
## as.factor(age5)80-84:as.factor(locsize01)4    1.000    
## as.factor(age5)85+:as.factor(locsize01)4      1.000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13114  on 888735  degrees of freedom
## Residual deviance: 11422  on 888701  degrees of freedom
## AIC: 11492
## 
## Number of Fisher Scoring iterations: 22
AIC(fit.pch5phtest3)
## [1] 11491.64
BIC(fit.pch5phtest3)
## [1] 11901.05
#install.packages("lmtest")
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Add your more complex model second
lrtest(fit.pch5, fit.pch5phtest1)
lrtest(fit.pch5, fit.pch5phtest2)
lrtest(fit.pch5, fit.pch5phtest3)

Answer:

The likelihood ratio tests indicate that the proportional hazards assumption holds for gender, education levels, and locality size, as none of the interactions with age were statistically significant: Gender: The interaction test between age and gender yielded a chi-square of 9.0593 (p = 0.2484), suggesting no significant interaction. Education Level: The age and education level interaction test produced a chi-square of 26.153 (p = 0.126), indicating no significant effect. Locality Size: The test for interaction between age and locality size showed a chi-square of 11.796 (p = 0.9229), also non-significant. Since the proportional hazards assumption holds, no further adjustments are needed. If it had not, we could address non-proportionality by using Cox models with time-varying coefficients or stratified Cox models, which allow for different baseline hazards across strata (e.g., by gender or education level).

1.g.

Females have a significantly lower risk of mortality than males. The interaction between age and females is significant, showing that the protective effect of being female decreases with age. Higher education levels are generally associated with lower mortality risk, though this effect diminishes when adjusting for age as a continuous variable, suggesting age may mediate this association. No significant association with mortality was found for locality size, and the proportional hazards assumption holds for this variable. Gender and education levels are associated with mortality risk, with gender effects varying by age. Locality size does not significantly impact mortality in this dataset. The proportional hazards assumption largely holds, but if it did not, time-varying coefficients or stratified models could address non-proportionality.