Problem Set 5

Author

Jacob M. Souch

Import Data

library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.3.2
Warning: package 'readr' was built under R version 4.3.2
Warning: package 'purrr' was built under R version 4.3.2
Warning: package 'dplyr' was built under R version 4.3.2
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(AER)
Warning: package 'AER' was built under R version 4.3.3
Loading required package: car
Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

Loading required package: lmtest
Warning: package 'lmtest' was built under R version 4.3.3
Loading required package: zoo
Warning: package 'zoo' was built under R version 4.3.2

Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

Loading required package: sandwich
Warning: package 'sandwich' was built under R version 4.3.2
Loading required package: survival
setwd("C:/Users/jacob/Downloads")

mhas0121dummies<-read.csv("MHAS0121 NO LABELS NO NEGATIVE TIMES.csv")

mhas0121noNA<-na.omit(mhas0121dummies)
getwd()
[1] "C:/Users/jacob/Downloads"
mhas0121noNA<-mhas0121noNA %>%
  mutate(
        startmo = ageint*12,
        endmo = agecensor*12,
        dead = died0121,
        ageintmo = startmo - mobirth  # Calculate age in months
       )

max(mhas0121noNA$exposure_months)
[1] 296
#Possible max time is 296 months, but using an even higher threshold in case things change

#rm(mhas_personmonth)
# Code adapted from Mills (2011): section 3.5
# Expands data into person-months, up to maximum number observed in 
mhas_personmonth<-survSplit(mhas0121noNA, cut=c(1:1000), start="startmo", end="endmo", event="dead")

# Sorting data by ID
mhas_personmonth<-mhas_personmonth[order (mhas_personmonth$id, mhas_personmonth$ageintmo),]

#Creating time-varying age variables for different model specifications0
mhas_personmonth<-mhas_personmonth %>%
  mutate(
  age=startmo/12, #could truncate if one wanted to, but I don't 
  agesq=age^2,
  .after = "died0121"
  )


mhas0121noNA <- mhas0121noNA %>%
  mutate(
    ageintmo = startmo - mobirth  # Calculate age in months
  )

# Create person-months data
mhas_personmonth <- survSplit(mhas0121noNA, cut = c(1:1000), start = "startmo", end = "endmo", event = "dead")




# Sorting data by ID and age
mhas_personmonth <- mhas_personmonth %>%
  arrange(id, ageintmo)

# Creating time-varying age variables
mhas_personmonth <- mhas_personmonth %>%
  mutate(
    age = startmo / 12,
    agesq = age^2,
    .after = died0121  # Ensure died0121 is a column in your data
  )

# Create age groups
mhas_personmonth <- mhas_personmonth %>%
  mutate(
    age5 = case_when(
      age >= 50 & age < 55 ~ "50-54",
      age >= 55 & age < 60 ~ "55-59",
      age >= 60 & age < 65 ~ "60-64",
      age >= 65 & age < 70 ~ "65-69",
      age >= 70 & age < 75 ~ "70-74",
      age >= 75 & age < 80 ~ "75-79",
      age >= 80 & age < 85 ~ "80-84",
      age >= 85             ~ "85+",
      TRUE                  ~ NA_character_ # Handle cases outside of these ranges
    ),
    age5 = factor(
      age5,
      levels = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+")
    )
  )

# Check the structure of the resulting data frame
model1 <- glm(dead ~ female + schooling + locsize01, family = binomial, data = mhas_personmonth)
summary(model1)

Call:
glm(formula = dead ~ female + schooling + locsize01, family = binomial, 
    data = mhas_personmonth)

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -6.545591   0.100200 -65.325  < 2e-16 ***
female      -0.139351   0.071153  -1.958   0.0502 .  
schooling   -0.052074   0.009389  -5.546 2.92e-08 ***
locsize01   -0.072584   0.031632  -2.295   0.0218 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 13114  on 888735  degrees of freedom
Residual deviance: 13080  on 888732  degrees of freedom
AIC: 13088

Number of Fisher Scoring iterations: 10
model2 <- glm(dead ~ female + schooling + locsize01 + age, family = binomial, data = mhas_personmonth)
summary(model2)

Call:
glm(formula = dead ~ female + schooling + locsize01 + age, family = binomial, 
    data = mhas_personmonth)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) -25.904271   0.623359 -41.556  < 2e-16 ***
female       -0.317068   0.071144  -4.457 8.32e-06 ***
schooling     0.003073   0.009162   0.335   0.7374    
locsize01    -0.055694   0.031897  -1.746   0.0808 .  
age           0.259811   0.007846  33.114  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 13114  on 888735  degrees of freedom
Residual deviance: 11326  on 888731  degrees of freedom
AIC: 11336

Number of Fisher Scoring iterations: 11
mhas_personmonth <- mhas_personmonth %>%
  mutate(age_group = cut(age, breaks = seq(0, 90, by = 5), right = FALSE))
         
model4 <- glm(dead ~ female  + schooling + locsize01 + age + age_group, family = binomial, data = mhas_personmonth)
summary(model4)

Call:
glm(formula = dead ~ female + schooling + locsize01 + age + age_group, 
    family = binomial, data = mhas_personmonth)

Coefficients:
                   Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -55.046798 421.380640  -0.131   0.8961    
female            -0.316194   0.071299  -4.435 9.22e-06 ***
schooling          0.003498   0.009183   0.381   0.7033    
locsize01         -0.054457   0.031983  -1.703   0.0886 .  
age                0.590257   0.031888  18.511  < 2e-16 ***
age_group[55,60)  -2.604669 479.442519  -0.005   0.9957    
age_group[60,65)  -5.468869 460.888238  -0.012   0.9905    
age_group[65,70)   7.366106 421.377371   0.017   0.9861    
age_group[70,75)   5.285063 421.377563   0.013   0.9900    
age_group[75,80)   2.761722 421.377824   0.007   0.9948    
age_group[80,85)   2.171142 421.378055   0.005   0.9959    
age_group[85,90)   4.610681 421.379545   0.011   0.9913    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 13114  on 888734  degrees of freedom
Residual deviance: 11045  on 888723  degrees of freedom
  (1 observation deleted due to missingness)
AIC: 11069

Number of Fisher Scoring iterations: 22
mhas_personmonth <- mhas_personmonth %>%
  mutate(age_squared = age^2)

model3 <- glm(dead ~ female + schooling + locsize01 + age + age_squared, family = binomial, data = mhas_personmonth)
summary(model3)

Call:
glm(formula = dead ~ female + schooling + locsize01 + age + age_squared, 
    family = binomial, data = mhas_personmonth)

Coefficients:
              Estimate Std. Error z value Pr(>|z|)    
(Intercept) 10.1093398  4.3238314   2.338   0.0194 *  
female      -0.3164928  0.0712216  -4.444 8.84e-06 ***
schooling    0.0041593  0.0091741   0.453   0.6503    
locsize01   -0.0539427  0.0319334  -1.689   0.0912 .  
age         -0.7130371  0.1179780  -6.044 1.51e-09 ***
age_squared  0.0065261  0.0008009   8.149 3.67e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 13114  on 888735  degrees of freedom
Residual deviance: 11275  on 888730  degrees of freedom
AIC: 11287

Number of Fisher Scoring iterations: 12