knitr::opts_chunk$set(error = TRUE, message = FALSE, warning = FALSE)
library(dplyr)
library(flexsurv)
library(muhaz)
library(data.table)
library(ggplot2)
dat <- readRDS(here::here("Reports","WHAMPsurvey","MM","Testing",
                          "coxdata.RDS"))

# remove never testers: never_tested = "T" and age 45+
# 8 men

dat <- dat %>% filter(!(never_tested == "T" & age > 45))

sdf <- with(dat, Surv(mlt, tested))

Compare Null Models

The code here is taken from Devin Vincenti’s excellent tutorial.

Of the null models, the Gomperz fits the best.

# Estimate the overall hazard function using nonparametric kernel density estimator 
# from the muhaz package.

kernel_haz_est <- muhaz(dat$mlt, dat$tested)
kernel_haz <- data.table(time = kernel_haz_est$est.grid,
                         est = kernel_haz_est$haz.est,
                         method = "Kernel density")

# Use flexsurv to estimate intercept only models for a range of probability distributions.  Ordered to highlight Gomperz as final choice

dists <- c("gompertz", "exp", "gamma", "gengamma",
           "lognormal", "llogis", "weibull")
dists_long <- c("Gompertz", "Exponential",
                "Gamma", "Generalized gamma",
                "Lognormal", "Log-logistic",
                "Weibull (AFT)")
parametric_haz <- vector(mode = "list", length = length(dists))

for (i in 1:length(dists)){
  fit <- flexsurvreg(sdf ~ 1, data = dat, dist = dists[i]) 
  parametric_haz[[i]] <- summary(fit, type = "hazard", 
                                     ci = FALSE, tidy = TRUE)
  parametric_haz[[i]]$method <- dists_long[i]
}
parametric_haz <- rbindlist(parametric_haz)

# Plot the hazard functions from the parametric models and compare them 
# to the kernel density estimate.

haz <- rbind(kernel_haz, parametric_haz)
haz[, method := factor(method,
                       levels = c("Kernel density",
                                  dists_long))]
n_dists <- length(dists)

ggplot(haz, 
       aes(x = time, y = est, col = method, size = method)) +
  geom_line() + #lwd=c(0.8, rep(0.6, 7))) +
  labs(title = "Testing Hazard: Null model comparison",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  xlim(0,200) +
  scale_colour_manual(name = "Model", 
                      values = c("black", rainbow(n_dists))) +
  scale_size_manual(name = "Model",
                    values = c(rep(0.8, 2),rep(0.6, n_dists-1)))

Fit models

Exponential

fit_exp <- flexsurvreg(sdf ~
                          age.young + 
                          race2 +
                          region.ewa + 
                          snap.grp3,
                        data = dat,
                        weights = ego.wawt,
                        dist = "exponential")
print(fit_exp)
## Call:
## flexsurvreg(formula = sdf ~ age.young + race2 + region.ewa + 
##     snap.grp3, data = dat, weights = ego.wawt, dist = "exponential")
## 
## Estimates: 
##                  data mean  est       L95%      U95%      se        exp(est)
## rate                   NA    0.00572   0.00382   0.00856   0.00118        NA
## age.young25+      0.78860    0.52343   0.27007   0.77680   0.12927   1.68781
## race2B            0.03125   -0.29455  -0.74573   0.15664   0.23020   0.74487
## race2H            0.12684   -0.75337  -1.07486  -0.43188   0.16403   0.47078
## region.ewaOther   0.84743    0.77160   0.45608   1.08711   0.16098   2.16322
## snap.grp32-5      0.28309    0.59813   0.38013   0.81614   0.11123   1.81872
## snap.grp36-125    0.15257    1.82542   1.55024   2.10061   0.14040   6.20541
##                  L95%      U95%    
## rate                   NA        NA
## age.young25+      1.31005   2.17451
## race2B            0.47439   1.16958
## race2H            0.34134   0.64929
## region.ewaOther   1.57788   2.96570
## snap.grp32-5      1.46248   2.26175
## snap.grp36-125    4.71258   8.17112
## 
## N = 544,  Events: 461,  Censored: 83
## Total time at risk: 21729.77
## Log-likelihood = -2081.794, df = 7
## AIC = 4177.587

Gompertz rate

Covariates on rate parameter only

fit_gomp <- flexsurvreg(sdf ~
                          age.young + 
                          race2 +
                          region.ewa + 
                          snap.grp3,
                        data = dat,
                        weights = ego.wawt,
                        dist = "gompertz")
print(fit_gomp)
## Call:
## flexsurvreg(formula = sdf ~ age.young + race2 + region.ewa + 
##     snap.grp3, data = dat, weights = ego.wawt, dist = "gompertz")
## 
## Estimates: 
##                  data mean  est       L95%      U95%      se        exp(est)
## shape                  NA   -0.02100  -0.02438  -0.01762   0.00173        NA
## rate                   NA    0.01469   0.00980   0.02201   0.00303        NA
## age.young25+      0.78860    0.73620   0.48273   0.98967   0.12932   2.08798
## race2B            0.03125   -0.36598  -0.81779   0.08583   0.23052   0.69352
## race2H            0.12684   -0.44278  -0.75122  -0.13434   0.15737   0.64225
## region.ewaOther   0.84743    0.44395   0.13383   0.75408   0.15823   1.55886
## snap.grp32-5      0.28309    0.60302   0.38382   0.82223   0.11184   1.82764
## snap.grp36-125    0.15257    1.35952   1.08911   1.62992   0.13796   3.89432
##                  L95%      U95%    
## shape                  NA        NA
## rate                   NA        NA
## age.young25+      1.62049   2.69034
## race2B            0.44141   1.08963
## race2H            0.47179   0.87429
## region.ewaOther   1.14320   2.12565
## snap.grp32-5      1.46789   2.27556
## snap.grp36-125    2.97164   5.10349
## 
## N = 544,  Events: 461,  Censored: 83
## Total time at risk: 21729.77
## Log-likelihood = -1943.402, df = 8
## AIC = 3902.804

Gompertz rate+scale

Add age.young to scale fit (other terms were n.s.). Makes sense because the age range for 15-24 is truncated, which gives it a distinct scale.

fit_gomp_scale <- flexsurvreg(sdf ~
                                age.young + 
                                race2 +
                                region.ewa + 
                                snap.grp3,
                              anc = list(shape =~ age.young),
                              data = dat,
                              weights = ego.wawt,
                              dist = "gompertz")
print(fit_gomp_scale)
## Call:
## flexsurvreg(formula = sdf ~ age.young + race2 + region.ewa + 
##     snap.grp3, anc = list(shape = ~age.young), data = dat, weights = ego.wawt, 
##     dist = "gompertz")
## 
## Estimates: 
##                      data mean  est       L95%      U95%      se      
## shape                      NA   -0.05223  -0.06891  -0.03554   0.00851
## rate                       NA    0.02467   0.01572   0.03870   0.00567
## age.young25+          0.78860    0.15669  -0.18540   0.49878   0.17454
## race2B                0.03125   -0.34109  -0.79311   0.11092   0.23062
## race2H                0.12684   -0.48154  -0.79147  -0.17160   0.15813
## region.ewaOther       0.84743    0.45291   0.14234   0.76349   0.15846
## snap.grp32-5          0.28309    0.60675   0.38735   0.82616   0.11194
## snap.grp36-125        0.15257    1.38688   1.11510   1.65866   0.13866
## shape(age.young25+)   0.78860    0.03339   0.01639   0.05040   0.00868
##                      exp(est)  L95%      U95%    
## shape                      NA        NA        NA
## rate                       NA        NA        NA
## age.young25+          1.16963   0.83077   1.64671
## race2B                0.71099   0.45244   1.11730
## race2H                0.61783   0.45318   0.84232
## region.ewaOther       1.57289   1.15296   2.14575
## snap.grp32-5          1.83447   1.47307   2.28452
## snap.grp36-125        4.00233   3.04987   5.25224
## shape(age.young25+)   1.03396   1.01652   1.05169
## 
## N = 544,  Events: 461,  Censored: 83
## Total time at risk: 21729.77
## Log-likelihood = -1933.356, df = 9
## AIC = 3884.712

Predicted hazards

# define covariate grid

covardf <- with(dat, expand.grid(
  age.young = unique(age.young),
  race2 = unique(race2),
  region.ewa = unique(region.ewa),
  snap.grp3 = unique(snap.grp3)))

# weights for each strata

wtdf <- dat %>%
  group_by(age.young, race2, region.ewa, snap.grp3) %>%
  summarize(wt = sum(ego.wawt))

# predict hazards

haz_exp <- summary(fit_exp, newdata = covardf, 
                   type = "hazard", tidy = TRUE)
haz_gomp <- summary(fit_gomp_scale, newdata = covardf, 
                    type = "hazard", tidy = TRUE)

Exponential model

Overall

colors <- c("obsKD" = "grey4", "expfit" = "blue")

ggplot(haz_exp, aes(x = time, y = est)) + 
  geom_smooth(aes(col="expfit")) +
  geom_smooth(data = kernel_haz,
              aes(x=time, y=est, col = "obsKD")) +
  labs(title = "Testing Hazard: Exponential model vs. Observed kernel density",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard",
       color = "Legend") + 
  xlim(0,200) +
  scale_color_manual(name = "", 
                     values = colors,
                     guide = guide_legend(reverse=TRUE),
                     labels = c("Exponential fit",
                                "Obs kernel density")) +
  theme(legend.position = "bottom")

Age

ggplot(haz_exp, aes(x = time, y = est, col = age.young)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by Age: Exp model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "Age Group") +
  theme(legend.position = "bottom") +
  facet_grid(race2 ~ snap.grp3)

Race

ggplot(haz_exp, aes(x = time, y = est, col = race2)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by Race: Exp model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "Race") +
  theme(legend.position = "bottom") +
  facet_grid(age.young ~ snap.grp3)

Region

ggplot(haz_exp, aes(x = time, y = est, col = region.ewa)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by Region: Exp model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "Region") +
  theme(legend.position = "bottom") +
  facet_grid(race2 ~ snap.grp3)

SNAP

ggplot(haz_exp, aes(x = time, y = est, col = snap.grp3)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by SNAP: Exp model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "SNAP") +
  theme(legend.position = "bottom") +
  facet_grid(race2 ~ age.young)

Gomperz model

Overall

colors <- c("obsKD" = "grey4", "GomperzFit" = "blue")

ggplot(haz_gomp, aes(x = time, y = est)) + 
  geom_smooth(aes(col="GomperzFit")) +
  geom_smooth(data = kernel_haz %>% 
                filter(time < max(haz_gomp$time)),
              aes(x=time, y=est, col = "obsKD")) +
  xlim(0,200) +
  labs(title = "Testing Hazard: Gomperz model vs. Observed kernel density",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard",
       color = "Legend") + 
    scale_color_manual(name = "", 
                       values = colors,
                       guide = guide_legend(reverse=TRUE),
                       labels = c("Gomperz fit",
                                  "Obs kernel density")) +
  theme(legend.position = "bottom")

Age

ggplot(haz_gomp, aes(x = time, y = est, col = age.young)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by Age: Gomperz model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "Age Group") +
  theme(legend.position = "bottom") +
  facet_grid(race2 ~ snap.grp3)

Race

ggplot(haz_gomp, aes(x = time, y = est, col = race2)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by Race: Gomperz model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "Race") +
  theme(legend.position = "bottom") +
  facet_grid(age.young ~ snap.grp3)

Region

ggplot(haz_gomp, aes(x = time, y = est, col = region.ewa)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by Region: Gomperz model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "Region") +
  theme(legend.position = "bottom") +
  facet_grid(race2 ~ snap.grp3)

SNAP

ggplot(haz_gomp, aes(x = time, y = est, col = snap.grp3)) + 
  geom_smooth() +
  xlim(0,200) +
  labs(title = "Testing Hazard by SNAP: Gomperz model",
       caption = "Not on PrEP, excl. never-testers",
       x = "Months", y = "Hazard") + 
  scale_color_discrete(name = "SNAP") +
  theme(legend.position = "bottom") +
  facet_grid(race2 ~ age.young)

Make lookup table

Here we re-estimate the Gomperz model in weeks, and take advantage of the proportional hazard feature of this model (thanks Adam!) to store a lookup table the size of the covariate grid: (36, 4).

The lookup table stores the base (week 1) hazard, and the weekly decay rate, for each covariate group. The base hazard varies by the full joint covariate grid. The decay rate is determined by the Gomperz scale parameter \(\gamma\): \(decay = 1/exp(\gamma)\). We have specified our Gomperz model with an age-group dependent scale, so our decay rate is age-group specific.

For simulation, the weekly testing hazard at time \(t\) for person \(j\) in full joint-covariate group \(i\), and age-group \(k\), with last negative test \(wlt\) weeks ago is given by:

\[ haz_{i,j,k,}(t) = basehaz_{i} * decay_k^{wlt_{j}(t)-1} \]

Note that age-group \(k\) aggregates over the other covariates reference by \(i\).

The lookup table is stored in the WHAMP2 Data folder

# define the survival object in weeks
sdfwk <- with(dat, Surv(mlt*4, tested))

# refit
fit_gomp_scalewk <- flexsurvreg(sdfwk ~
                                age.young + 
                                race2 +
                                region.ewa + 
                                snap.grp3,
                              anc = list(shape =~ age.young),
                              data = dat,
                              weights = ego.wawt,
                              dist = "gompertz")
print(fit_gomp_scalewk)
## Call:
## flexsurvreg(formula = sdfwk ~ age.young + race2 + region.ewa + 
##     snap.grp3, anc = list(shape = ~age.young), data = dat, weights = ego.wawt, 
##     dist = "gompertz")
## 
## Estimates: 
##                      data mean  est       L95%      U95%      se      
## shape                      NA   -0.01306  -0.01721  -0.00890   0.00212
## rate                       NA    0.00617   0.00393   0.00967   0.00142
## age.young25+          0.78860    0.15671  -0.18478   0.49820   0.17423
## race2B                0.03125   -0.34114  -0.79317   0.11088   0.23063
## race2H                0.12684   -0.48155  -0.79145  -0.17165   0.15812
## region.ewaOther       0.84743    0.45285   0.14227   0.76342   0.15846
## snap.grp32-5          0.28309    0.60680   0.38741   0.82618   0.11193
## snap.grp36-125        0.15257    1.38690   1.11525   1.65855   0.13860
## shape(age.young25+)   0.78860    0.00835   0.00412   0.01258   0.00216
##                      exp(est)  L95%      U95%    
## shape                      NA        NA        NA
## rate                       NA        NA        NA
## age.young25+          1.16966   0.83129   1.64576
## race2B                0.71096   0.45241   1.11726
## race2H                0.61783   0.45319   0.84227
## region.ewaOther       1.57278   1.15289   2.14560
## snap.grp32-5          1.83454   1.47317   2.28457
## snap.grp36-125        4.00241   3.05032   5.25168
## shape(age.young25+)   1.00838   1.00412   1.01266
## 
## N = 544,  Events: 461,  Censored: 83
## Total time at risk: 86919.08
## Log-likelihood = -2552.978, df = 9
## AIC = 5123.956
# pull the coefs to set the decay/scale rates in the lookup table
coef <- fit_gomp_scalewk$coefficients

# calculate hazards
haz_gompwk <- summary(fit_gomp_scalewk, newdata = covardf, 
                      type = "hazard", tidy = TRUE)

# create lookup table
haz_lookup <- haz_gompwk %>% 
  filter(time==2) %>%
  mutate(hazard = est,
         race = race2,
         phaz = ifelse(age.young == "15-24",
                       exp(coef["shape"]),
                       exp(coef["shape"] + coef["shape(age.young25+)"])),
         haz1 = hazard/phaz) %>%
  mutate(week = 1) %>%
  select(week, haz1, phaz, age.young, race, region.ewa, snap.grp3) %>%
  data.table::setDT()
  
data.table::setkeyv(haz_lookup, 
                    c("age.young", "race", "region.ewa", "snap.grp3"))

saveRDS(haz_lookup,
        file = here::here("Data","test_haz_lookup.RDS"))