Load and Clean Data

library(readr)
mhas0121noNA <- read_csv("MHAS0121 NO LABELS NO NEGATIVE TIMES.csv")
## Rows: 9636 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (39): id, locsize01, perwght01, died18, died21, mobirth, yrbirth, female...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(survival)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
mhas0121noNA<-mhas0121noNA %>%
  mutate(
        startmo = round(ageint*12),
        endmo = round(agecensor*12),
        dead = died0121
       )
mhas0121noNA
## # A tibble: 9,636 × 42
##       id locsize01 perwght01 died18 died21 mobirth yrbirth female moint yrint
##    <dbl>     <dbl>     <dbl>  <dbl>  <dbl>   <dbl>   <dbl>  <dbl> <dbl> <dbl>
##  1     1         1       300     NA     NA      10    1941      0     8  2001
##  2     2         1       253      0      0       6    1931      0     7  2001
##  3     3         1       454      0      0       6    1950      0     7  2001
##  4     4         1       337     NA     NA      10    1936      0     7  2001
##  5     5         1       295      0      0       3    1944      1     7  2001
##  6     6         1       253      0      0       5    1948      1     7  2001
##  7     7         1       490     NA     NA       8    1927      0     8  2001
##  8     8         1       537      0      0       2    1945      0     7  2001
##  9     9         1       370      1     NA       9    1946      1     7  2001
## 10    10         1       237      0     NA       1    1948      0     7  2001
## # ℹ 9,626 more rows
## # ℹ 32 more variables: schooling <dbl>, died0103 <dbl>, refused0103 <dbl>,
## #   lost0103 <dbl>, followup0103 <dbl>, died0312 <dbl>, refused0312 <dbl>,
## #   lost0312 <dbl>, followup0312 <dbl>, died1215 <dbl>, refused1215 <dbl>,
## #   lost1215 <dbl>, followup1215 <dbl>, died1518 <dbl>, refused1518 <dbl>,
## #   lost1518 <dbl>, followup1518 <dbl>, died1821 <dbl>, refused1821 <dbl>,
## #   lost1821 <dbl>, followup1821 <dbl>, died0121 <dbl>, mocensor <dbl>, …
max(mhas0121noNA$exposure_months)
## [1] NA
# Returns NA because exposure_months contains missing values; the possible maximum
# is 296 months, but the survSplit() cut points below use a much higher ceiling
# (2000) in case that changes.
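# A quick sanity check before expanding to person-months (a sketch; exposure_months,
# startmo, and endmo are assumed to be defined as above):
max(mhas0121noNA$exposure_months, na.rm = TRUE)   # maximum exposure, ignoring NAs
# Rows with a missing or non-positive interval; these are the cases behind the
# "Stop time must be > start time" warning from survSplit() below.
mhas0121noNA %>%
  filter(is.na(startmo) | is.na(endmo) | endmo <= startmo) %>%
  nrow()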

#rm(mhas_personmonth)
# Code adapted from Mills (2011): section 3.5
# Expands the data into person-months, up to the maximum age in months observed in the data
mhas_personmonth<-survSplit(mhas0121noNA, cut=c(1:2000), start="startmo", end="endmo", event="dead")
## Warning in Surv(startmo, endmo, dead): Stop time must be > start time, NA
## created
# Sorting data by ID
mhas_personmonth <- mhas_personmonth[order(mhas_personmonth$id), ]
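# Quick check on the expansion (a sketch): survSplit() should preserve the total
# number of deaths while multiplying rows into person-months, apart from the
# records flagged by the warning above.
sum(mhas0121noNA$dead, na.rm = TRUE)      # deaths in the person-level file
sum(mhas_personmonth$dead, na.rm = TRUE)  # deaths in the person-month file
nrow(mhas_personmonth)                    # total person-months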

# Creating time-varying age variables for the different model specifications
mhas_personmonth<-mhas_personmonth %>%
  mutate(
  age=startmo/12, #could truncate if one wanted to, but I don't 
  agesq=age^2,
  agectr=age-50,
  ageln=logb(age),
  .after = "died0121"
  )

library(dplyr)
library(ggplot2)
#install.packages("AMR")
library(AMR)
mhas_personmonth <- mhas_personmonth %>% 
  mutate(
  # Create age groups
  age5 = dplyr::case_when(
    age >= 50 & age < 55 ~ "50-54",
    age >= 55 & age < 60 ~ "55-59",
    age >= 60 & age < 65 ~ "60-64",
    age >= 65 & age < 70 ~ "65-69",
    age >= 70 & age < 75 ~ "70-74",
    age >= 75 & age < 80 ~ "75-79",
    age >= 80 & age < 85 ~ "80-84",
    age >= 85             ~ "85+"
  ),
  # Convert to factor
  age5 = factor(
    age5,
    levels = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+")
  )
)

mhas_personmonth <- mhas_personmonth %>%
  mutate(age5b = case_when(age >= 50 & age < 55 ~ "50-54",
                          age >= 55 & age < 60 ~ "55-59",
                          age >= 60 & age < 65 ~ "60-64",
                          age >= 65 & age < 70 ~ "65-69",
                          age >= 70 & age < 75 ~ "70-74",
                          age >= 75 & age < 80 ~ "75-79",
                          age >= 80 & age < 85 ~ "80-84",
                          age >= 85 ~ "85+"),
         age5b = factor(age5b, levels = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+")))
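# An equivalent, more compact construction of the age groups with base cut()
# (a sketch; age5c is a new, illustrative name and should reproduce age5 exactly;
# right = FALSE makes each interval closed on the left, e.g. [50, 55)).
mhas_personmonth <- mhas_personmonth %>%
  mutate(age5c = cut(age,
                     breaks = c(50, 55, 60, 65, 70, 75, 80, 85, Inf),
                     labels = c("50-54", "55-59", "60-64", "65-69",
                                "70-74", "75-79", "80-84", "85+"),
                     right = FALSE))
# table(mhas_personmonth$age5, mhas_personmonth$age5c, useNA = "ifany")  # should agree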

Problem Set 5

###a.   Estimate a logistic regression controlling for gender, education levels, and locality size. Estimate various models with different (implicit or explicit) functional forms for age, including:

# Basic model: No age control
model_i <- glm(dead ~ as.factor(female) + as.factor(educlevel) + as.factor(locsize01), 
               family = binomial(link = "logit"), 
               data = mhas_personmonth)
model_i
## 
## Call:  glm(formula = dead ~ as.factor(female) + as.factor(educlevel) + 
##     as.factor(locsize01), family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##             (Intercept)       as.factor(female)1   as.factor(educlevel)15  
##                -5.24387                 -0.17345                 -0.26067  
##  as.factor(educlevel)68  as.factor(educlevel)919    as.factor(locsize01)2  
##                -0.53374                 -0.82140                 -0.02740  
##   as.factor(locsize01)3    as.factor(locsize01)4  
##                -0.08669                 -0.14551  
## 
## Degrees of Freedom: 1442770 Total (i.e. Null);  1442763 Residual
##   (2761 observations deleted due to missingness)
## Null Deviance:       63990 
## Residual Deviance: 63650     AIC: 63660
# Model ii: Linear control for age
model_ii <- glm(dead ~ as.factor(female) + as.factor(educlevel) + as.factor(locsize01) + age, 
                family = binomial(link = "logit"), 
                data = mhas_personmonth)
model_ii
## 
## Call:  glm(formula = dead ~ as.factor(female) + as.factor(educlevel) + 
##     as.factor(locsize01) + age, family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##             (Intercept)       as.factor(female)1   as.factor(educlevel)15  
##              -10.294650                -0.237838                -0.069851  
##  as.factor(educlevel)68  as.factor(educlevel)919    as.factor(locsize01)2  
##               -0.180576                -0.375226                 0.007083  
##   as.factor(locsize01)3    as.factor(locsize01)4                      age  
##               -0.117427                -0.128182                 0.066754  
## 
## Degrees of Freedom: 1442750 Total (i.e. Null);  1442742 Residual
##   (2781 observations deleted due to missingness)
## Null Deviance:       63770 
## Residual Deviance: 61490     AIC: 61510
# Model iii: Age and age-squared
model_iii <- glm(dead ~ as.factor(female) + as.factor(educlevel) + as.factor(locsize01) + age + agesq, 
                 family = binomial(link = "logit"), 
                 data = mhas_personmonth)
model_iii
## 
## Call:  glm(formula = dead ~ as.factor(female) + as.factor(educlevel) + 
##     as.factor(locsize01) + age + agesq, family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##             (Intercept)       as.factor(female)1   as.factor(educlevel)15  
##              -7.0118283               -0.2338445               -0.0544349  
##  as.factor(educlevel)68  as.factor(educlevel)919    as.factor(locsize01)2  
##              -0.1679436               -0.3703414                0.0112869  
##   as.factor(locsize01)3    as.factor(locsize01)4                      age  
##              -0.1165693               -0.1260846               -0.0204215  
##                   agesq  
##               0.0005661  
## 
## Degrees of Freedom: 1442750 Total (i.e. Null);  1442741 Residual
##   (2781 observations deleted due to missingness)
## Null Deviance:       63770 
## Residual Deviance: 61460     AIC: 61480
# Model iv: Age groups (5-year intervals, top-coded at 85+)
model_iv <- glm(dead ~ as.factor(female) + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5), 
                family = binomial(link = "logit"), 
                data = mhas_personmonth)
model_iv
## 
## Call:  glm(formula = dead ~ as.factor(female) + as.factor(educlevel) + 
##     as.factor(locsize01) + as.factor(age5), family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##             (Intercept)       as.factor(female)1   as.factor(educlevel)15  
##              -5.7336221               -0.2351751               -0.0855654  
##  as.factor(educlevel)68  as.factor(educlevel)919    as.factor(locsize01)2  
##              -0.2036471               -0.4017135                0.0001421  
##   as.factor(locsize01)3    as.factor(locsize01)4     as.factor(age5)55-59  
##              -0.1107726               -0.1285291               -0.7127648  
##    as.factor(age5)60-64     as.factor(age5)65-69     as.factor(age5)70-74  
##              -0.4170856               -0.0410303                0.1937194  
##    as.factor(age5)75-79     as.factor(age5)80-84       as.factor(age5)85+  
##               0.5424598                0.9355486                1.5505841  
## 
## Degrees of Freedom: 1442750 Total (i.e. Null);  1442736 Residual
##   (2781 observations deleted due to missingness)
## Null Deviance:       63770 
## Residual Deviance: 61490     AIC: 61520
# Model v: Interaction of age and gender
model_v <- glm(dead ~ female + age + female * age + as.factor(educlevel) + as.factor(locsize01), 
               family = binomial(link = "logit"), 
               data = mhas_personmonth)
model_v
## 
## Call:  glm(formula = dead ~ female + age + female * age + as.factor(educlevel) + 
##     as.factor(locsize01), family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##             (Intercept)                   female                      age  
##              -10.020330                -0.766298                 0.063108  
##  as.factor(educlevel)15   as.factor(educlevel)68  as.factor(educlevel)919  
##               -0.069529                -0.181084                -0.379082  
##   as.factor(locsize01)2    as.factor(locsize01)3    as.factor(locsize01)4  
##                0.006281                -0.115226                -0.126381  
##              female:age  
##                0.006948  
## 
## Degrees of Freedom: 1442750 Total (i.e. Null);  1442741 Residual
##   (2781 observations deleted due to missingness)
## Null Deviance:       63770 
## Residual Deviance: 61490     AIC: 61510
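# Optional: collect the coefficients and fit statistics of models i-v in one table
# (a sketch; it assumes the broom and purrr packages, which are not loaded above).
# install.packages(c("broom", "purrr"))
library(broom)
library(purrr)
models_av <- list(i = model_i, ii = model_ii, iii = model_iii, iv = model_iv, v = model_v)
map_dfr(models_av, glance, .id = "model")   # deviance, AIC, BIC, nobs per model
# map_dfr(models_av, tidy, .id = "model")   # coefficient-level comparison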
###c.   Compare the goodness of fit of all of these models (e.g., using AIC or BIC). What would you conclude from these in terms of which specification may be more adequate for this phenomenon and data? Any other issues to consider besides AIC/BIC?


# Calculate AIC for each model
AIC(model_i, model_ii, model_iii, model_iv, model_v)
## Warning in AIC.default(model_i, model_ii, model_iii, model_iv, model_v): models
## are not all fitted to the same number of observations
##           df      AIC
## model_i    8 63664.01
## model_ii   9 61510.25
## model_iii 10 61481.76
## model_iv  15 61516.28
## model_v   10 61506.50
# Calculate BIC for each model
BIC(model_i, model_ii, model_iii, model_iv, model_v)
## Warning in BIC.default(model_i, model_ii, model_iii, model_iv, model_v): models
## are not all fitted to the same number of observations
##           df      BIC
## model_i    8 63761.46
## model_ii   9 61619.88
## model_iii 10 61603.58
## model_iv  15 61699.02
## model_v   10 61628.32
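# The warnings above matter: model_i is fit on a slightly larger estimation sample
# than the age-based models, so its AIC/BIC are not strictly comparable. A sketch of
# a fix (new object names are illustrative): refit everything on the common
# complete-case sample before comparing.
common_pm <- mhas_personmonth %>%
  filter(!is.na(dead), !is.na(female), !is.na(educlevel), !is.na(locsize01),
         !is.na(age), !is.na(agesq), !is.na(age5))
model_i_c  <- update(model_i,  data = common_pm)
model_ii_c <- update(model_ii, data = common_pm)
# ...likewise for models iii-v, then rerun AIC()/BIC() on the refitted models.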
##d.    Estimate logistic and complementary-log-log models without covariates other than age, in 5-year age groups. Obtain the predicted values for all age groups in each model. Compare these estimates with each other and with those obtained in section I.a of PS3.


# Logistic model with age groups only
logit_model <- glm(dead ~ as.factor(age5), 
                   family = binomial(link = "logit"), 
                   data = mhas_personmonth)

summary(logit_model)
## 
## Call:
## glm(formula = dead ~ as.factor(age5), family = binomial(link = "logit"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -6.06349    0.08884 -68.253  < 2e-16 ***
## as.factor(age5)55-59 -0.70339    0.11402  -6.169 6.87e-10 ***
## as.factor(age5)60-64 -0.39706    0.10231  -3.881 0.000104 ***
## as.factor(age5)65-69 -0.01912    0.09690  -0.197 0.843567    
## as.factor(age5)70-74  0.22476    0.09562   2.351 0.018746 *  
## as.factor(age5)75-79  0.58377    0.09566   6.103 1.04e-09 ***
## as.factor(age5)80-84  0.98805    0.09627  10.264  < 2e-16 ***
## as.factor(age5)85+    1.62190    0.09412  17.232  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63912  on 1444226  degrees of freedom
## Residual deviance: 61745  on 1444219  degrees of freedom
##   (1305 observations deleted due to missingness)
## AIC: 61761
## 
## Number of Fisher Scoring iterations: 9
AIC(logit_model)
## [1] 61760.85
BIC(logit_model)
## [1] 61858.32
# Create xvalues with only unique values of age5 (no dead variable)
xvalues <- data.frame(age5 = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+"))

# Generate predictions
xvalues$phat <- predict(logit_model, newdata = xvalues, type = "response")

summary(xvalues$phat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001150 0.002098 0.002612 0.004027 0.004667 0.011640
# group = 1 lets geom_line() connect the points across the factor levels on the x-axis
ggplot(xvalues, aes(x = age5, y = phat, group = 1)) +
  geom_point() +
  geom_line() +
  labs(x = "Age group", y = "Predicted probability of death",
       title = "Predicted Death Probabilities for 5-Year Age Groups, Logistic Model")

# Complementary log-log model with age groups only
cloglog_model <- glm(dead ~ as.factor(age5), 
                     family = binomial(link = "cloglog"), 
                     data = mhas_personmonth)

summary(cloglog_model)
## 
## Call:
## glm(formula = dead ~ as.factor(age5), family = binomial(link = "cloglog"), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -6.06465    0.08874 -68.345  < 2e-16 ***
## as.factor(age5)55-59 -0.70280    0.11391  -6.170 6.84e-10 ***
## as.factor(age5)60-64 -0.39668    0.10220  -3.881 0.000104 ***
## as.factor(age5)65-69 -0.01910    0.09679  -0.197 0.843565    
## as.factor(age5)70-74  0.22446    0.09550   2.350 0.018758 *  
## as.factor(age5)75-79  0.58285    0.09554   6.101 1.05e-09 ***
## as.factor(age5)80-84  0.98610    0.09613  10.258  < 2e-16 ***
## as.factor(age5)85+    1.61720    0.09396  17.211  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63912  on 1444226  degrees of freedom
## Residual deviance: 61745  on 1444219  degrees of freedom
##   (1305 observations deleted due to missingness)
## AIC: 61761
## 
## Number of Fisher Scoring iterations: 9
AIC(cloglog_model)
## [1] 61760.85
BIC(cloglog_model)
## [1] 61858.32
# Create xvalues with only unique values of age5 (no dead variable)
xvalues2 <- data.frame(age5 = c("50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+"))

# Generate predictions
xvalues2$phat <- predict(cloglog_model, newdata = xvalues2, type = "response")

summary(xvalues2$phat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.001150 0.002098 0.002612 0.004027 0.004667 0.011640
# group = 1 lets geom_line() connect the points across the factor levels on the x-axis
ggplot(xvalues2, aes(x = age5, y = phat, group = 1)) +
  geom_point() +
  geom_line() +
  labs(x = "Age group", y = "Predicted probability of death",
       title = "Predicted Death Probabilities for 5-Year Age Groups, Cloglog Model")

# Assuming xvalues2 is for the cloglog model and xvalues for the logistic model
xvalues$model <- "Logistic Model"
xvalues2$model <- "Cloglog Model"

# Combine the data frames
combined_data <- rbind(xvalues, xvalues2)

# Plot with different colors and trend lines for each model
library(ggplot2)

ggplot(combined_data, aes(x = age5, y = phat, color = model, group = model)) +
  geom_point() +
  geom_line() +
  geom_smooth(method = "loess", se = FALSE) +  # Add trend line for each model
  labs(x = "Age Group", 
       y = "Predicted Probability of Death", 
       title = "Predicted Death Probabilities for 5-Year Age Groups by Model") +
  scale_color_manual(values = c("Logistic Model" = "blue", "Cloglog Model" = "red")) +  # Custom colors
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
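# A side-by-side table makes the comparison in (d) explicit (a sketch; it assumes
# the tidyr package, which is not loaded above).
library(tidyr)
combined_data %>%
  select(age5, model, phat) %>%
  pivot_wider(names_from = model, values_from = phat) %>%
  mutate(diff = `Logistic Model` - `Cloglog Model`)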

##e.    Using the same model estimated in section I.a.iv above (i.e., controlling for sex/gender, schooling, locality size, and 5-year age groups [using 50-54 as the reference category]), estimate another set of logistic and complementary log-log regressions.
##i.    Interpret the coefficients of each model.


# Logistic regression model
logit_model_iv <- glm(dead ~ female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5), 
                      family = binomial(link = "logit"), 
                      data = mhas_personmonth)

logit_model_iv
## 
## Call:  glm(formula = dead ~ female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5), family = binomial(link = "logit"), data = mhas_personmonth)
## 
## Coefficients:
##             (Intercept)                   female   as.factor(educlevel)15  
##              -5.7336221               -0.2351751               -0.0855654  
##  as.factor(educlevel)68  as.factor(educlevel)919    as.factor(locsize01)2  
##              -0.2036471               -0.4017135                0.0001421  
##   as.factor(locsize01)3    as.factor(locsize01)4     as.factor(age5)55-59  
##              -0.1107726               -0.1285291               -0.7127648  
##    as.factor(age5)60-64     as.factor(age5)65-69     as.factor(age5)70-74  
##              -0.4170856               -0.0410303                0.1937194  
##    as.factor(age5)75-79     as.factor(age5)80-84       as.factor(age5)85+  
##               0.5424598                0.9355486                1.5505841  
## 
## Degrees of Freedom: 1442750 Total (i.e. Null);  1442736 Residual
##   (2781 observations deleted due to missingness)
## Null Deviance:       63770 
## Residual Deviance: 61490     AIC: 61520
# Complementary log-log regression model
cloglog_model_iv <- glm(dead ~ female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5), 
                        family = binomial(link = "cloglog"), 
                        data = mhas_personmonth)

cloglog_model_iv
## 
## Call:  glm(formula = dead ~ female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5), family = binomial(link = "cloglog"), data = mhas_personmonth)
## 
## Coefficients:
##             (Intercept)                   female   as.factor(educlevel)15  
##              -5.7358835               -0.2343832               -0.0851568  
##  as.factor(educlevel)68  as.factor(educlevel)919    as.factor(locsize01)2  
##              -0.2029052               -0.4005623                0.0001388  
##   as.factor(locsize01)3    as.factor(locsize01)4     as.factor(age5)55-59  
##              -0.1104154               -0.1280903               -0.7121247  
##    as.factor(age5)60-64     as.factor(age5)65-69     as.factor(age5)70-74  
##              -0.4166502               -0.0409435                0.1935057  
##    as.factor(age5)75-79     as.factor(age5)80-84       as.factor(age5)85+  
##               0.5416345                0.9336704                1.5459108  
## 
## Degrees of Freedom: 1442750 Total (i.e. Null);  1442736 Residual
##   (2781 observations deleted due to missingness)
## Null Deviance:       63770 
## Residual Deviance: 61490     AIC: 61520
##ii.   Exponentiate the coefficients of female, education levels, and locality sizes in both models and add them to the table you first made in (d). What additional conclusions or insights do these additions give you?


# Exponentiate coefficients for both models

exp_logit <- exp(coef(logit_model_iv))
exp_cloglog <- exp(coef(cloglog_model_iv))

# Combine results into a data frame for easy comparison
exponentiated_results <- data.frame(
  Logit_Odds_Ratio = exp_logit,
  Cloglog_Hazard_Ratio = exp_cloglog
)

# Display the exponentiated results table
print(exponentiated_results)
##                         Logit_Odds_Ratio Cloglog_Hazard_Ratio
## (Intercept)                  0.003235337          0.003228029
## female                       0.790432458          0.791058596
## as.factor(educlevel)15       0.917993088          0.918368308
## as.factor(educlevel)68       0.815750224          0.816355614
## as.factor(educlevel)919      0.669172434          0.669943202
## as.factor(locsize01)2        1.000142156          1.000138801
## as.factor(locsize01)3        0.895142252          0.895462121
## as.factor(locsize01)4        0.879387975          0.879773914
## as.factor(age5)55-59         0.490286761          0.490600697
## as.factor(age5)60-64         0.658964496          0.659251479
## as.factor(age5)65-69         0.959800037          0.959883332
## as.factor(age5)70-74         1.213755628          1.213496247
## as.factor(age5)75-79         1.720233021          1.718814008
## as.factor(age5)80-84         2.548611246          2.543828914
## as.factor(age5)85+           4.714223062          4.692243484
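# Optional: 95% Wald confidence intervals on the ratio scale help judge which of
# these ratios are distinguishable from 1 (a sketch using confint.default(), i.e.
# normal-approximation rather than profile-likelihood intervals).
exp(confint.default(logit_model_iv))     # odds-ratio CIs
exp(confint.default(cloglog_model_iv))   # exponentiated cloglog coefficient CIs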
##f.    Test whether the proportionality of hazards assumption holds for gender/sex and education levels (separate tests, but you can also do a joint one). Do they? Otherwise, what would you do if the hazards are not proportional?



fit.pch5phtest1<-glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5):female, data=mhas_personmonth, family=binomial(link=logit))
summary(fit.pch5phtest1)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01) + as.factor(age5):female, family = binomial(link = logit), 
##     data = mhas_personmonth)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -5.710e+00  1.224e-01 -46.651  < 2e-16 ***
## as.factor(age5)55-59        -7.675e-01  1.529e-01  -5.021 5.14e-07 ***
## as.factor(age5)60-64        -4.765e-01  1.373e-01  -3.469 0.000522 ***
## as.factor(age5)65-69        -3.357e-03  1.287e-01  -0.026 0.979186    
## as.factor(age5)70-74         1.737e-01  1.279e-01   1.358 0.174453    
## as.factor(age5)75-79         5.826e-01  1.280e-01   4.551 5.34e-06 ***
## as.factor(age5)80-84         9.048e-01  1.305e-01   6.933 4.12e-12 ***
## as.factor(age5)85+           1.443e+00  1.280e-01  11.266  < 2e-16 ***
## female                      -2.891e-01  1.798e-01  -1.608 0.107831    
## as.factor(educlevel)15      -8.573e-02  3.581e-02  -2.394 0.016671 *  
## as.factor(educlevel)68      -2.047e-01  4.421e-02  -4.630 3.65e-06 ***
## as.factor(educlevel)919     -4.036e-01  5.205e-02  -7.753 8.99e-15 ***
## as.factor(locsize01)2       -6.249e-05  4.175e-02  -0.001 0.998806    
## as.factor(locsize01)3       -1.101e-01  5.139e-02  -2.142 0.032167 *  
## as.factor(locsize01)4       -1.274e-01  4.163e-02  -3.059 0.002219 ** 
## as.factor(age5)55-59:female  1.220e-01  2.298e-01   0.531 0.595531    
## as.factor(age5)60-64:female  1.280e-01  2.065e-01   0.620 0.535282    
## as.factor(age5)65-69:female -7.397e-02  1.958e-01  -0.378 0.705584    
## as.factor(age5)70-74:female  4.692e-02  1.932e-01   0.243 0.808105    
## as.factor(age5)75-79:female -6.840e-02  1.932e-01  -0.354 0.723387    
## as.factor(age5)80-84:female  6.628e-02  1.947e-01   0.340 0.733551    
## as.factor(age5)85+:female    1.977e-01  1.907e-01   1.036 0.299993    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63765  on 1442750  degrees of freedom
## Residual deviance: 61475  on 1442729  degrees of freedom
##   (2781 observations deleted due to missingness)
## AIC: 61519
## 
## Number of Fisher Scoring iterations: 9
AIC(fit.pch5phtest1)
## [1] 61518.58
BIC(fit.pch5phtest1)
## [1] 61786.58
fit.pch5phtest2<-glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5):as.factor(educlevel), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.pch5phtest2)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01) + as.factor(age5):as.factor(educlevel), 
##     family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients:
##                                               Estimate Std. Error z value
## (Intercept)                                  -5.827239   0.244199 -23.863
## as.factor(age5)55-59                         -0.438498   0.287786  -1.524
## as.factor(age5)60-64                         -0.262373   0.266011  -0.986
## as.factor(age5)65-69                          0.035448   0.256065   0.138
## as.factor(age5)70-74                          0.362121   0.251629   1.439
## as.factor(age5)75-79                          0.685242   0.250587   2.735
## as.factor(age5)80-84                          0.983404   0.250907   3.919
## as.factor(age5)85+                            1.591310   0.247310   6.434
## female                                       -0.237839   0.029563  -8.045
## as.factor(educlevel)15                       -0.046685   0.292178  -0.160
## as.factor(educlevel)68                       -0.036009   0.291208  -0.124
## as.factor(educlevel)919                      -0.275967   0.299277  -0.922
## as.factor(locsize01)2                         0.003565   0.041788   0.085
## as.factor(locsize01)3                        -0.106783   0.051426  -2.076
## as.factor(locsize01)4                        -0.127415   0.041647  -3.059
## as.factor(age5)55-59:as.factor(educlevel)15  -0.129735   0.350866  -0.370
## as.factor(age5)60-64:as.factor(educlevel)15  -0.094974   0.323271  -0.294
## as.factor(age5)65-69:as.factor(educlevel)15  -0.001499   0.309903  -0.005
## as.factor(age5)70-74:as.factor(educlevel)15  -0.091933   0.304971  -0.301
## as.factor(age5)75-79:as.factor(educlevel)15  -0.179109   0.304439  -0.588
## as.factor(age5)80-84:as.factor(educlevel)15   0.014752   0.304652   0.048
## as.factor(age5)85+:as.factor(educlevel)15     0.038389   0.300376   0.128
## as.factor(age5)55-59:as.factor(educlevel)68  -0.467369   0.360953  -1.295
## as.factor(age5)60-64:as.factor(educlevel)68  -0.232827   0.326961  -0.712
## as.factor(age5)65-69:as.factor(educlevel)68  -0.114474   0.312017  -0.367
## as.factor(age5)70-74:as.factor(educlevel)68  -0.372909   0.309159  -1.206
## as.factor(age5)75-79:as.factor(educlevel)68  -0.131561   0.307734  -0.428
## as.factor(age5)80-84:as.factor(educlevel)68  -0.125771   0.311746  -0.403
## as.factor(age5)85+:as.factor(educlevel)68    -0.062773   0.307571  -0.204
## as.factor(age5)55-59:as.factor(educlevel)919 -0.490377   0.375457  -1.306
## as.factor(age5)60-64:as.factor(educlevel)919 -0.258983   0.339754  -0.762
## as.factor(age5)65-69:as.factor(educlevel)919 -0.181081   0.324638  -0.558
## as.factor(age5)70-74:as.factor(educlevel)919 -0.214773   0.320250  -0.671
## as.factor(age5)75-79:as.factor(educlevel)919 -0.155961   0.323585  -0.482
## as.factor(age5)80-84:as.factor(educlevel)919  0.078330   0.328187   0.239
## as.factor(age5)85+:as.factor(educlevel)919    0.074235   0.325260   0.228
##                                              Pr(>|z|)    
## (Intercept)                                   < 2e-16 ***
## as.factor(age5)55-59                          0.12758    
## as.factor(age5)60-64                          0.32397    
## as.factor(age5)65-69                          0.88990    
## as.factor(age5)70-74                          0.15012    
## as.factor(age5)75-79                          0.00625 ** 
## as.factor(age5)80-84                         8.88e-05 ***
## as.factor(age5)85+                           1.24e-10 ***
## female                                       8.61e-16 ***
## as.factor(educlevel)15                        0.87305    
## as.factor(educlevel)68                        0.90159    
## as.factor(educlevel)919                       0.35647    
## as.factor(locsize01)2                         0.93202    
## as.factor(locsize01)3                         0.03785 *  
## as.factor(locsize01)4                         0.00222 ** 
## as.factor(age5)55-59:as.factor(educlevel)15   0.71156    
## as.factor(age5)60-64:as.factor(educlevel)15   0.76892    
## as.factor(age5)65-69:as.factor(educlevel)15   0.99614    
## as.factor(age5)70-74:as.factor(educlevel)15   0.76307    
## as.factor(age5)75-79:as.factor(educlevel)15   0.55631    
## as.factor(age5)80-84:as.factor(educlevel)15   0.96138    
## as.factor(age5)85+:as.factor(educlevel)15     0.89831    
## as.factor(age5)55-59:as.factor(educlevel)68   0.19538    
## as.factor(age5)60-64:as.factor(educlevel)68   0.47641    
## as.factor(age5)65-69:as.factor(educlevel)68   0.71370    
## as.factor(age5)70-74:as.factor(educlevel)68   0.22774    
## as.factor(age5)75-79:as.factor(educlevel)68   0.66900    
## as.factor(age5)80-84:as.factor(educlevel)68   0.68662    
## as.factor(age5)85+:as.factor(educlevel)68     0.83828    
## as.factor(age5)55-59:as.factor(educlevel)919  0.19152    
## as.factor(age5)60-64:as.factor(educlevel)919  0.44590    
## as.factor(age5)65-69:as.factor(educlevel)919  0.57698    
## as.factor(age5)70-74:as.factor(educlevel)919  0.50245    
## as.factor(age5)75-79:as.factor(educlevel)919  0.62982    
## as.factor(age5)80-84:as.factor(educlevel)919  0.81136    
## as.factor(age5)85+:as.factor(educlevel)919    0.81946    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63765  on 1442750  degrees of freedom
## Residual deviance: 61466  on 1442715  degrees of freedom
##   (2781 observations deleted due to missingness)
## AIC: 61538
## 
## Number of Fisher Scoring iterations: 9
AIC(fit.pch5phtest2)
## [1] 61538.47
BIC(fit.pch5phtest2)
## [1] 61977.02
fit.pch5phtest3<-glm(dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + as.factor(age5):as.factor(locsize01), data=mhas_personmonth, family=binomial(link=logit))
summary(fit.pch5phtest3)
## 
## Call:
## glm(formula = dead ~ as.factor(age5) + female + as.factor(educlevel) + 
##     as.factor(locsize01) + as.factor(age5):as.factor(locsize01), 
##     family = binomial(link = logit), data = mhas_personmonth)
## 
## Coefficients:
##                                             Estimate Std. Error z value
## (Intercept)                                -5.721159   0.116880 -48.949
## as.factor(age5)55-59                       -0.756116   0.144948  -5.216
## as.factor(age5)60-64                       -0.403371   0.128775  -3.132
## as.factor(age5)65-69                       -0.045074   0.122243  -0.369
## as.factor(age5)70-74                        0.187612   0.120830   1.553
## as.factor(age5)75-79                        0.522511   0.121334   4.306
## as.factor(age5)80-84                        0.895420   0.122793   7.292
## as.factor(age5)85+                          1.543889   0.120049  12.860
## female                                     -0.235247   0.029556  -7.959
## as.factor(educlevel)15                     -0.084878   0.035844  -2.368
## as.factor(educlevel)68                     -0.202662   0.044229  -4.582
## as.factor(educlevel)919                    -0.401219   0.052102  -7.701
## as.factor(locsize01)2                      -0.054369   0.245320  -0.222
## as.factor(locsize01)3                      -0.048387   0.352021  -0.137
## as.factor(locsize01)4                      -0.191316   0.274408  -0.697
## as.factor(age5)55-59:as.factor(locsize01)2  0.288775   0.306240   0.943
## as.factor(age5)60-64:as.factor(locsize01)2 -0.097032   0.287849  -0.337
## as.factor(age5)65-69:as.factor(locsize01)2 -0.017681   0.269230  -0.066
## as.factor(age5)70-74:as.factor(locsize01)2  0.027489   0.265051   0.104
## as.factor(age5)75-79:as.factor(locsize01)2  0.032854   0.265581   0.124
## as.factor(age5)80-84:as.factor(locsize01)2  0.231991   0.265063   0.875
## as.factor(age5)85+:as.factor(locsize01)2    0.016155   0.261777   0.062
## as.factor(age5)55-59:as.factor(locsize01)3 -0.163973   0.457588  -0.358
## as.factor(age5)60-64:as.factor(locsize01)3 -0.004814   0.395706  -0.012
## as.factor(age5)65-69:as.factor(locsize01)3 -0.004260   0.377124  -0.011
## as.factor(age5)70-74:as.factor(locsize01)3 -0.028176   0.372564  -0.076
## as.factor(age5)75-79:as.factor(locsize01)3 -0.210105   0.374659  -0.561
## as.factor(age5)80-84:as.factor(locsize01)3 -0.050381   0.372944  -0.135
## as.factor(age5)85+:as.factor(locsize01)3   -0.033206   0.366359  -0.091
## as.factor(age5)55-59:as.factor(locsize01)4  0.022504   0.347457   0.065
## as.factor(age5)60-64:as.factor(locsize01)4  0.003475   0.310156   0.011
## as.factor(age5)65-69:as.factor(locsize01)4  0.045089   0.294713   0.153
## as.factor(age5)70-74:as.factor(locsize01)4  0.028466   0.291043   0.098
## as.factor(age5)75-79:as.factor(locsize01)4  0.193899   0.289285   0.670
## as.factor(age5)80-84:as.factor(locsize01)4  0.039454   0.292759   0.135
## as.factor(age5)85+:as.factor(locsize01)4    0.045325   0.285835   0.159
##                                            Pr(>|z|)    
## (Intercept)                                 < 2e-16 ***
## as.factor(age5)55-59                       1.82e-07 ***
## as.factor(age5)60-64                        0.00173 ** 
## as.factor(age5)65-69                        0.71233    
## as.factor(age5)70-74                        0.12050    
## as.factor(age5)75-79                       1.66e-05 ***
## as.factor(age5)80-84                       3.05e-13 ***
## as.factor(age5)85+                          < 2e-16 ***
## female                                     1.73e-15 ***
## as.factor(educlevel)15                      0.01789 *  
## as.factor(educlevel)68                     4.60e-06 ***
## as.factor(educlevel)919                    1.35e-14 ***
## as.factor(locsize01)2                       0.82460    
## as.factor(locsize01)3                       0.89067    
## as.factor(locsize01)4                       0.48568    
## as.factor(age5)55-59:as.factor(locsize01)2  0.34570    
## as.factor(age5)60-64:as.factor(locsize01)2  0.73605    
## as.factor(age5)65-69:as.factor(locsize01)2  0.94764    
## as.factor(age5)70-74:as.factor(locsize01)2  0.91740    
## as.factor(age5)75-79:as.factor(locsize01)2  0.90155    
## as.factor(age5)80-84:as.factor(locsize01)2  0.38145    
## as.factor(age5)85+:as.factor(locsize01)2    0.95079    
## as.factor(age5)55-59:as.factor(locsize01)3  0.72009    
## as.factor(age5)60-64:as.factor(locsize01)3  0.99029    
## as.factor(age5)65-69:as.factor(locsize01)3  0.99099    
## as.factor(age5)70-74:as.factor(locsize01)3  0.93972    
## as.factor(age5)75-79:as.factor(locsize01)3  0.57494    
## as.factor(age5)80-84:as.factor(locsize01)3  0.89254    
## as.factor(age5)85+:as.factor(locsize01)3    0.92778    
## as.factor(age5)55-59:as.factor(locsize01)4  0.94836    
## as.factor(age5)60-64:as.factor(locsize01)4  0.99106    
## as.factor(age5)65-69:as.factor(locsize01)4  0.87840    
## as.factor(age5)70-74:as.factor(locsize01)4  0.92209    
## as.factor(age5)75-79:as.factor(locsize01)4  0.50269    
## as.factor(age5)80-84:as.factor(locsize01)4  0.89280    
## as.factor(age5)85+:as.factor(locsize01)4    0.87401    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63765  on 1442750  degrees of freedom
## Residual deviance: 61474  on 1442715  degrees of freedom
##   (2781 observations deleted due to missingness)
## AIC: 61546
## 
## Number of Fisher Scoring iterations: 9
AIC(fit.pch5phtest3)
## [1] 61545.69
BIC(fit.pch5phtest3)
## [1] 61984.25
#install.packages("lmtest")
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# lrtest() compares nested models: list the restricted model first and the model with the added interactions second

lrtest(logit_model_iv, fit.pch5phtest1)
## Likelihood ratio test
## 
## Model 1: dead ~ female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5)
## Model 2: dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5):female
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1  15 -30743                     
## 2  22 -30737  7 11.706     0.1106
lrtest(logit_model_iv, fit.pch5phtest2)
## Likelihood ratio test
## 
## Model 1: dead ~ female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5)
## Model 2: dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5):as.factor(educlevel)
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1  15 -30743                     
## 2  36 -30733 21 19.819     0.5327
lrtest(logit_model_iv, fit.pch5phtest3)
## Likelihood ratio test
## 
## Model 1: dead ~ female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5)
## Model 2: dead ~ as.factor(age5) + female + as.factor(educlevel) + as.factor(locsize01) + 
##     as.factor(age5):as.factor(locsize01)
##   #Df LogLik Df  Chisq Pr(>Chisq)
## 1  15 -30743                     
## 2  36 -30737 21 12.591     0.9223
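# A joint test of proportionality for gender and education together (a sketch;
# fit.pch5phtest_joint is a new, illustrative name): add both sets of age
# interactions at once and compare against the main-effects model.
fit.pch5phtest_joint <- glm(dead ~ as.factor(age5) + female + as.factor(educlevel) +
                              as.factor(locsize01) +
                              as.factor(age5):female +
                              as.factor(age5):as.factor(educlevel),
                            data = mhas_personmonth, family = binomial(link = logit))
lrtest(logit_model_iv, fit.pch5phtest_joint)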