getwd()
[1] "C:/Users/jacob/University of Texas at San Antonio/TEAM - Cossman Crew - General"
setwd("C:/Users/jacob/Downloads")mhas0121dummies<-read.csv("MHAS0121 NO LABELS NO NEGATIVE TIMES (2).csv")mhas0121noNA<-mhas0121dummies#update.packages()#Split data by person-monthlibrary(survival)library(dplyr)
# Possible max time is 296 months, but using an even higher threshold in case things change
#rm(mhas_personmonth)

# Code adapted from Mills (2011): section 3.5
# Expands data into person-months, up to the maximum age in months observed in the data
mhas_personmonth <- survSplit(mhas0121noNA, cut = c(1:2000),
                              start = "startmo", end = "endmo", event = "dead")

# Sorting data by ID
mhas_personmonth <- mhas_personmonth[order(mhas_personmonth$id, mhas_personmonth$ageintmo), ]

# Creating time-varying age variables for different model specifications
mhas_personmonth <- mhas_personmonth %>%
  mutate(age = startmo/12,  # could truncate if one wanted to, but I don't
         agesq = age^2,
         agectr = age - 50,
         ageln = logb(age),
         .after = "died0121")

library(dplyr)
library(ggplot2)
#install.packages("AMR")library(AMR)
mhas_personmonth <- mhas_personmonth %>%
  mutate(
    # Create 5-year age groups, top-coded at 85+
    age5 = dplyr::case_when(
      age >= 50 & age < 55 ~ "50-54",
      age >= 55 & age < 60 ~ "55-59",
      age >= 60 & age < 65 ~ "60-64",
      age >= 65 & age < 70 ~ "65-69",
      age >= 70 & age < 75 ~ "70-74",
      age >= 75 & age < 80 ~ "75-79",
      age >= 80 & age < 85 ~ "80-84",
      age >= 85 ~ "85+"
    ),
    # Convert to factor
    age5 = factor(age5, levels = c("50-54", "55-59", "60-64", "65-69",
                                   "70-74", "75-79", "80-84", "85+"))
  )

mhas_personmonth <- mhas_personmonth %>%
  mutate(
    age5b = case_when(
      age >= 50 & age < 55 ~ "50-54",
      age >= 55 & age < 60 ~ "55-59",
      age >= 60 & age < 65 ~ "60-64",
      age >= 65 & age < 70 ~ "65-69",
      age >= 70 & age < 75 ~ "70-74",
      age >= 75 & age < 80 ~ "75-79",
      age >= 80 & age < 85 ~ "80-84",
      age >= 85 ~ "85+"
    ),
    age5b = factor(age5b, levels = c("50-54", "55-59", "60-64", "65-69",
                                     "70-74", "75-79", "80-84", "85+"))
  )

#write.csv(mhas_personmonth, "C:\\Users\\eay621\\OneDrive - University of Texas at San Antonio\\3. TEACHING\\EVENT HISTORY ANALYSIS\\EHA Fall 2024\\4. Problem sets\\MHAS0121_PERSONMONTHS.csv", row.names = TRUE)

# Export to CSV to explore more easily (at least for me)
#testPMdata <- mhas_personmonth[which(mhas_personmonth$id < 20), ]
#testPMdata <- subset(mhas_personmonth, id < 50, select = c(id, ageint, agecensor, startmo, endmo, age, age5, died0121, dead))
#write.csv(testPMdata, "C:\\Users\\eay621\\OneDrive - University of Texas at San Antonio\\3. TEACHING\\EVENT HISTORY ANALYSIS\\EHA Fall 2024\\4. Problem sets\\MHAS0121_PERSONMONTHS_TEST.csv", row.names = TRUE)
A. Models
i. Not controlling for age.
model_i <- glm(dead ~ female + schooling + locsize01, family = binomial, data = mhas_personmonth)
summary(model_i)
Call:
glm(formula = dead ~ female + schooling + locsize01, family = binomial,
data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.341879 0.040457 -132.038 < 2e-16 ***
female -0.169605 0.029420 -5.765 8.16e-09 ***
schooling -0.063895 0.003981 -16.050 < 2e-16 ***
locsize01 -0.044814 0.013057 -3.432 0.000599 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 64671 on 1548755 degrees of freedom
Residual deviance: 64370 on 1548752 degrees of freedom
(2761 observations deleted due to missingness)
AIC: 64378
Number of Fisher Scoring iterations: 9
ii. Controlling for age linearly.
model_ii <- glm(dead ~ female + schooling + locsize01 + age, family = binomial, data = mhas_personmonth)
summary(model_ii)
Call:
glm(formula = dead ~ female + schooling + locsize01 + age, family = binomial,
data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -11.590878 0.124633 -93.000 < 2e-16 ***
female -0.249253 0.029513 -8.445 < 2e-16 ***
schooling -0.016573 0.003927 -4.220 2.44e-05 ***
locsize01 -0.043019 0.013166 -3.267 0.00109 **
age 0.082480 0.001475 55.907 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 64671 on 1548755 degrees of freedom
Residual deviance: 61321 on 1548751 degrees of freedom
(2761 observations deleted due to missingness)
AIC: 61331
Number of Fisher Scoring iterations: 9
iii. Controlling for age and age-squared.
mhas_personmonth <- mhas_personmonth %>% mutate(age_squared = age^2)
model_iii <- glm(dead ~ female + schooling + locsize01 + age + age_squared, family = binomial, data = mhas_personmonth)
summary(model_iii)
Call:
glm(formula = dead ~ female + schooling + locsize01 + age + age_squared,
family = binomial, data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -12.950667 0.670706 -19.309 < 2e-16 ***
female -0.251018 0.029522 -8.503 < 2e-16 ***
schooling -0.016543 0.003926 -4.213 2.51e-05 ***
locsize01 -0.043204 0.013167 -3.281 0.00103 **
age 0.117773 0.017160 6.863 6.73e-12 ***
age_squared -0.000225 0.000109 -2.064 0.03902 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 64671 on 1548755 degrees of freedom
Residual deviance: 61317 on 1548750 degrees of freedom
(2761 observations deleted due to missingness)
AIC: 61329
Number of Fisher Scoring iterations: 9
iv. Controlling for 5-year age groups (top-coded at 85+).
mhas_personmonth <- mhas_personmonth %>%
  mutate(age_group = cut(age, breaks = seq(0, 90, by = 5), right = FALSE))
# Note: these breaks yield half-open intervals ending at [85,90), not an open-ended
# 85+ group, so person-months at ages 90+ are coded NA and dropped (hence the larger
# count of observations deleted due to missingness in this model).
model_iv <- glm(dead ~ female + schooling + locsize01 + age + age_group, family = binomial, data = mhas_personmonth)
summary(model_iv)
Call:
glm(formula = dead ~ female + schooling + locsize01 + age + age_group,
family = binomial, data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -12.212054 0.609769 -20.027 < 2e-16 ***
female -0.275696 0.031532 -8.743 < 2e-16 ***
schooling -0.019635 0.004127 -4.758 1.95e-06 ***
locsize01 -0.048479 0.014189 -3.417 0.000634 ***
age 0.096076 0.010892 8.821 < 2e-16 ***
age_group[55,60) -0.378618 0.199139 -1.901 0.057266 .
age_group[60,65) -0.177407 0.204161 -0.869 0.384869
age_group[65,70) -0.136494 0.231969 -0.588 0.556253
age_group[70,75) -0.383645 0.269430 -1.424 0.154471
age_group[75,80) -0.438829 0.312462 -1.404 0.160192
age_group[80,85) -0.485129 0.358462 -1.353 0.175940
age_group[85,90) -0.488751 0.406437 -1.203 0.229160
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 57413 on 1512938 degrees of freedom
Residual deviance: 55138 on 1512927 degrees of freedom
(38578 observations deleted due to missingness)
AIC: 55162
Number of Fisher Scoring iterations: 10
v. Any other(s) you may want to try, if any at all.
va. Cubic age term.
mhas_personmonth <- mhas_personmonth %>% mutate(age_cubed = age^3)
model_va <- glm(dead ~ female + schooling + locsize01 + age + age_cubed, family = binomial, data = mhas_personmonth)
summary(model_va)
Call:
glm(formula = dead ~ female + schooling + locsize01 + age + age_cubed,
family = binomial, data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.258e+01 4.584e-01 -27.448 < 2e-16 ***
female -2.513e-01 2.952e-02 -8.511 < 2e-16 ***
schooling -1.654e-02 3.926e-03 -4.213 2.52e-05 ***
locsize01 -4.325e-02 1.317e-02 -3.285 0.00102 **
age 1.018e-01 8.715e-03 11.680 < 2e-16 ***
age_cubed -1.031e-06 4.590e-07 -2.246 0.02468 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 64671 on 1548755 degrees of freedom
Residual deviance: 61316 on 1548750 degrees of freedom
(2761 observations deleted due to missingness)
AIC: 61328
Number of Fisher Scoring iterations: 9
vb. Female × age interaction.
model_vb <- glm(dead ~ female * age + schooling + locsize01, family = binomial, data = mhas_personmonth)
summary(model_vb)
Call:
glm(formula = dead ~ female * age + schooling + locsize01, family = binomial,
data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -11.267276 0.168743 -66.772 < 2e-16 ***
female -0.881373 0.225373 -3.911 9.20e-05 ***
age 0.078291 0.002093 37.408 < 2e-16 ***
schooling -0.017041 0.003931 -4.335 1.46e-05 ***
locsize01 -0.042486 0.013166 -3.227 0.00125 **
female:age 0.008112 0.002868 2.829 0.00467 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 64671 on 1548755 degrees of freedom
Residual deviance: 61313 on 1548750 degrees of freedom
(2761 observations deleted due to missingness)
AIC: 61325
Number of Fisher Scoring iterations: 9
B. Assumptions of Baseline Hazard
Model I. No Age Control:
The baseline hazard is assumed constant with respect to age: the monthly probability of dying is the same at every age, conditional on the covariates. This assumption is unrealistic and runs counter to the mortality literature, so it is important to control for age.
Model II. Controlling for Age Linearly:
The baseline hazard changes linearly with age on the log-odds scale: each additional year of age has a fixed, additive effect on the log-odds of death. The model assumes mortality risk rises (or falls) at a constant rate per year of age, which may not capture more complex risk patterns that vary over the life course.
Model III. Controlling for Age and Age-Squared:
In this model, the baseline hazard has a quadratic relationship with age, allowing for non-linear effects. This can capture a U-shaped or inverted U-shaped relationship, where mortality risk changes at different rates across ages: for example, risk might accelerate as people age, or rise more slowly at the oldest ages. This approach is more flexible than the linear term alone and can better capture curvilinear patterns.
Model IV. Controlling for 5-Year Age Groups (Top-Coded at 85+):
The baseline hazard changes non-linearly, with a distinct level of risk for each age group. This approach allows discrete shifts in risk between age groups without imposing a continuous functional form. It is flexible enough to capture sudden increases or plateaus in risk within specific age brackets and is especially useful when mortality risk differs markedly at certain ages. (As noted in the code above, the cut() breaks used here end at [85,90) rather than forming an open-ended 85+ group, so ages 90+ are dropped from this model.)
Models Va and Vb. Other Specifications (Cubic Age Term, Interactions):
A cubic age term (Model Va) permits additional curvature, letting the hazard accelerate or decelerate at different points in the life course. Including interactions (Model Vb, age × sex) means the baseline hazard is not only age-dependent but also varies by other covariates: for example, it can capture differences between men and women in how the risk of death changes with age. The sketch below plots the baseline shapes these specifications imply.
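To make these assumptions concrete, here is a minimal sketch (assuming the fitted models above are still in the workspace) that plots the implied log-odds of death by age for a reference profile with female, schooling, and locsize01 all set to zero:

ages <- seq(50, 95, by = 1)
ref <- data.frame(female = 0, schooling = 0, locsize01 = 0,
                  age = ages, age_squared = ages^2, age_cubed = ages^3)
# default predict() type is "link", i.e., the log-odds scale
plot(ages, predict(model_ii, newdata = ref), type = "l",
     xlab = "Age", ylab = "Log-odds of death (reference profile)")
lines(ages, predict(model_iii, newdata = ref), lty = 2)  # quadratic (Model III)
lines(ages, predict(model_va, newdata = ref), lty = 3)   # cubic (Model Va)
abline(h = coef(model_i)[1], lty = 4)                    # constant (Model I)
legend("topleft", legend = c("linear", "quadratic", "cubic", "constant"), lty = 1:4)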
C. Model Comparison
Model AIC BIC
1 Model I 64377.68 64426.69
2 Model II 61331.44 61392.70
3 Model III 61329.07 61402.59
4 Model IV 55162.42 55309.18
5 Model V_a 61328.25 61401.77
6 Model V_b 61325.42 61398.94
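The chunk that produced this table is not shown above; a minimal way to reproduce it, assuming the six fitted models are in the workspace, is:

models_tbl <- list("Model I" = model_i, "Model II" = model_ii, "Model III" = model_iii,
                   "Model IV" = model_iv, "Model V_a" = model_va, "Model V_b" = model_vb)
data.frame(Model = names(models_tbl),
           AIC = sapply(models_tbl, AIC),
           BIC = sapply(models_tbl, BIC),
           row.names = NULL)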
library(tidyverse)
library(broom)  # needed for tidy()

# Create a list of models
models <- list(model_i, model_ii, model_iii, model_iv, model_va, model_vb)

# Extract coefficients, standard errors, and odds ratios from each model
model_summaries <- lapply(seq_along(models), function(i) {
  model <- models[[i]]
  tidy(model) %>%
    mutate(Odds_Ratio = exp(estimate),
           Model = paste("Model", i))
})

# Combine summaries into a single table
comparison_table <- bind_rows(model_summaries) %>%
  select(Model, term, estimate, std.error, Odds_Ratio, p.value)

# Pivot the data wider
df_wide <- comparison_table %>%
  pivot_wider(names_from = term,
              values_from = c(estimate, std.error, Odds_Ratio, p.value),
              names_glue = "{term}_{.value}")
df_wide
# A tibble: 6 × 61
Model `(Intercept)_estimate` female_estimate schooling_estimate
<chr> <dbl> <dbl> <dbl>
1 Model 1 -5.34 -0.170 -0.0639
2 Model 2 -11.6 -0.249 -0.0166
3 Model 3 -13.0 -0.251 -0.0165
4 Model 4 -12.2 -0.276 -0.0196
5 Model 5 -12.6 -0.251 -0.0165
6 Model 6 -11.3 -0.881 -0.0170
# ℹ 57 more variables: locsize01_estimate <dbl>, age_estimate <dbl>,
# age_squared_estimate <dbl>, `age_group[55,60)_estimate` <dbl>,
# `age_group[60,65)_estimate` <dbl>, `age_group[65,70)_estimate` <dbl>,
# `age_group[70,75)_estimate` <dbl>, `age_group[75,80)_estimate` <dbl>,
# `age_group[80,85)_estimate` <dbl>, `age_group[85,90)_estimate` <dbl>,
# age_cubed_estimate <dbl>, `female:age_estimate` <dbl>,
# `(Intercept)_std.error` <dbl>, female_std.error <dbl>, …
Most models produce consistent estimates for female, schooling, and locsize01. Model IV has the lowest AIC (55162.42) and BIC (55309.18), suggesting the best fit among the models listed, with an appropriate balance of fit and complexity; one caveat is that Model IV is estimated on a smaller sample (ages 90+ fall outside its age groups), so its AIC and BIC are not strictly comparable to the others. Model III and Model V_b have similar AIC values (61329.07 and 61325.42, respectively) and slightly different BIC values, indicating comparable fits with subtle differences in complexity penalties. Model I has the highest AIC and BIC (64377.68 and 64426.69), making it the least favorable model in terms of fit and complexity.

In Model IV, being female decreases the odds of death, more schooling reduces the odds, and smaller or more rural localities are associated with reduced odds. The odds of death increase with age. The age-group dummies capture deviations from the linear age trend, but most are not statistically significant; only the [55,60) group shows a marginally significant reduction.
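To read the Model IV estimates on the odds scale, a minimal sketch (assuming model_iv is still in the workspace) is to exponentiate the coefficients alongside Wald confidence intervals:

# Odds ratios and 95% Wald CIs for Model IV
exp(cbind(OR = coef(model_iv), confint.default(model_iv)))

For example, exp(-0.276) ≈ 0.76: roughly 24% lower odds of death in a given person-month for women, holding the other covariates constant.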
D. Logistic and Complementary Log-Log Models
Da. Logistic Model
logit_model <- glm(dead ~ age_group, data = mhas_personmonth, family = binomial(link = "logit"))
summary(logit_model)
Call:
glm(formula = dead ~ age_group, family = binomial(link = "logit"),
data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.40036 0.16908 -43.768 < 2e-16 ***
age_group[55,60) 0.03536 0.19350 0.183 0.855
age_group[60,65) 0.71102 0.17793 3.996 6.44e-05 ***
age_group[65,70) 1.22275 0.17364 7.042 1.90e-12 ***
age_group[70,75) 1.44536 0.17293 8.358 < 2e-16 ***
age_group[75,80) 1.86759 0.17262 10.819 < 2e-16 ***
age_group[80,85) 2.29911 0.17272 13.311 < 2e-16 ***
age_group[85,90) 2.77515 0.17355 15.991 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 57551 on 1514413 degrees of freedom
Residual deviance: 55439 on 1514406 degrees of freedom
(37103 observations deleted due to missingness)
AIC: 55455
Number of Fisher Scoring iterations: 10
Db. Clog-Log Model
cloglog_model <- glm(dead ~ age_group, data = mhas_personmonth, family = binomial(link = "cloglog"))
summary(cloglog_model)
Call:
glm(formula = dead ~ age_group, family = binomial(link = "cloglog"),
data = mhas_personmonth)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.40066 0.16903 -43.783 < 2e-16 ***
age_group[55,60) 0.03535 0.19344 0.183 0.855
age_group[60,65) 0.71070 0.17787 3.996 6.45e-05 ***
age_group[65,70) 1.22202 0.17358 7.040 1.92e-12 ***
age_group[70,75) 1.44437 0.17287 8.355 < 2e-16 ***
age_group[75,80) 1.86592 0.17256 10.813 < 2e-16 ***
age_group[80,85) 2.29638 0.17265 13.301 < 2e-16 ***
age_group[85,90) 2.77057 0.17345 15.973 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 57551 on 1514413 degrees of freedom
Residual deviance: 55439 on 1514406 degrees of freedom
(37103 observations deleted due to missingness)
AIC: 55455
Number of Fisher Scoring iterations: 10
# Predicted probabilities for logistic regression
logit_preds <- predict(logit_model, type = "response")

# Predicted probabilities for complementary log-log regression
cloglog_preds <- predict(cloglog_model, type = "response")
# # Combine the predicted values with the age group variable for comparison
# comparison_df <- data.frame(age_group = mhas_personmonth$age_group,
#                             logit_preds = logit_preds,
#                             cloglog_preds = cloglog_preds)
#
# # View the comparison dataframe
# head(comparison_df)
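The commented code above would fail on a length mismatch, since the fitted values cover only the estimation sample (rows with a non-missing age_group). A working sketch of the intended comparison, averaging predicted monthly death probabilities by age group under each link:

# Restrict to the estimation sample, then compare mean predictions by age group
est <- subset(mhas_personmonth, !is.na(age_group))
comparison_df <- data.frame(age_group = est$age_group,
                            logit_preds = predict(logit_model, newdata = est, type = "response"),
                            cloglog_preds = predict(cloglog_model, newdata = est, type = "response"))
aggregate(cbind(logit_preds, cloglog_preds) ~ age_group, data = comparison_df, FUN = mean)

Because monthly death probabilities are small, the logit and cloglog links should produce nearly identical predictions here, consistent with the two models' identical deviance and AIC above.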