knitr::opts_chunk$set(error = TRUE, message = FALSE, warning = FALSE)
library(dplyr)
library(flexsurv)
library(muhaz)
library(data.table)
library(ggplot2)
dat <- readRDS(here::here("Reports","WHAMPsurvey","MM","Testing",
"coxdata.RDS"))
# remove never testers: never_tested = "T" and age 45+
# 8 men
dat <- dat %>% filter(!(never_tested == "T" & age > 45))
sdf <- with(dat, Surv(mlt, tested))
The code here is taken from Devin Vincenti’s excellent tutorial.
Of the null models, the Gomperz fits the best.
# Estimate the overall hazard function using nonparametric kernel density estimator
# from the muhaz package.
kernel_haz_est <- muhaz(dat$mlt, dat$tested)
kernel_haz <- data.table(time = kernel_haz_est$est.grid,
est = kernel_haz_est$haz.est,
method = "Kernel density")
# Use flexsurv to estimate intercept only models for a range of probability distributions. Ordered to highlight Gomperz as final choice
dists <- c("gompertz", "exp", "gamma", "gengamma",
"lognormal", "llogis", "weibull")
dists_long <- c("Gompertz", "Exponential",
"Gamma", "Generalized gamma",
"Lognormal", "Log-logistic",
"Weibull (AFT)")
parametric_haz <- vector(mode = "list", length = length(dists))
for (i in 1:length(dists)){
fit <- flexsurvreg(sdf ~ 1, data = dat, dist = dists[i])
parametric_haz[[i]] <- summary(fit, type = "hazard",
ci = FALSE, tidy = TRUE)
parametric_haz[[i]]$method <- dists_long[i]
}
parametric_haz <- rbindlist(parametric_haz)
# Plot the hazard functions from the parametric models and compare them
# to the kernel density estimate.
haz <- rbind(kernel_haz, parametric_haz)
haz[, method := factor(method,
levels = c("Kernel density",
dists_long))]
n_dists <- length(dists)
ggplot(haz,
aes(x = time, y = est, col = method, size = method)) +
geom_line() + #lwd=c(0.8, rep(0.6, 7))) +
labs(title = "Testing Hazard: Null model comparison",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
xlim(0,200) +
scale_colour_manual(name = "Model",
values = c("black", rainbow(n_dists))) +
scale_size_manual(name = "Model",
values = c(rep(0.8, 2),rep(0.6, n_dists-1)))
fit_exp <- flexsurvreg(sdf ~
age.young +
race2 +
region.ewa +
snap.grp3,
data = dat,
weights = ego.wawt,
dist = "exponential")
print(fit_exp)
## Call:
## flexsurvreg(formula = sdf ~ age.young + race2 + region.ewa +
## snap.grp3, data = dat, weights = ego.wawt, dist = "exponential")
##
## Estimates:
## data mean est L95% U95% se exp(est)
## rate NA 0.00572 0.00382 0.00856 0.00118 NA
## age.young25+ 0.78860 0.52343 0.27007 0.77680 0.12927 1.68781
## race2B 0.03125 -0.29455 -0.74573 0.15664 0.23020 0.74487
## race2H 0.12684 -0.75337 -1.07486 -0.43188 0.16403 0.47078
## region.ewaOther 0.84743 0.77160 0.45608 1.08711 0.16098 2.16322
## snap.grp32-5 0.28309 0.59813 0.38013 0.81614 0.11123 1.81872
## snap.grp36-125 0.15257 1.82542 1.55024 2.10061 0.14040 6.20541
## L95% U95%
## rate NA NA
## age.young25+ 1.31005 2.17451
## race2B 0.47439 1.16958
## race2H 0.34134 0.64929
## region.ewaOther 1.57788 2.96570
## snap.grp32-5 1.46248 2.26175
## snap.grp36-125 4.71258 8.17112
##
## N = 544, Events: 461, Censored: 83
## Total time at risk: 21729.77
## Log-likelihood = -2081.794, df = 7
## AIC = 4177.587
Covariates on rate parameter only
fit_gomp <- flexsurvreg(sdf ~
age.young +
race2 +
region.ewa +
snap.grp3,
data = dat,
weights = ego.wawt,
dist = "gompertz")
print(fit_gomp)
## Call:
## flexsurvreg(formula = sdf ~ age.young + race2 + region.ewa +
## snap.grp3, data = dat, weights = ego.wawt, dist = "gompertz")
##
## Estimates:
## data mean est L95% U95% se exp(est)
## shape NA -0.02100 -0.02438 -0.01762 0.00173 NA
## rate NA 0.01469 0.00980 0.02201 0.00303 NA
## age.young25+ 0.78860 0.73620 0.48273 0.98967 0.12932 2.08798
## race2B 0.03125 -0.36598 -0.81779 0.08583 0.23052 0.69352
## race2H 0.12684 -0.44278 -0.75122 -0.13434 0.15737 0.64225
## region.ewaOther 0.84743 0.44395 0.13383 0.75408 0.15823 1.55886
## snap.grp32-5 0.28309 0.60302 0.38382 0.82223 0.11184 1.82764
## snap.grp36-125 0.15257 1.35952 1.08911 1.62992 0.13796 3.89432
## L95% U95%
## shape NA NA
## rate NA NA
## age.young25+ 1.62049 2.69034
## race2B 0.44141 1.08963
## race2H 0.47179 0.87429
## region.ewaOther 1.14320 2.12565
## snap.grp32-5 1.46789 2.27556
## snap.grp36-125 2.97164 5.10349
##
## N = 544, Events: 461, Censored: 83
## Total time at risk: 21729.77
## Log-likelihood = -1943.402, df = 8
## AIC = 3902.804
Add age.young to scale fit (other terms were n.s.). Makes sense because the age range for 15-24 is truncated, which gives it a distinct scale.
fit_gomp_scale <- flexsurvreg(sdf ~
age.young +
race2 +
region.ewa +
snap.grp3,
anc = list(shape =~ age.young),
data = dat,
weights = ego.wawt,
dist = "gompertz")
print(fit_gomp_scale)
## Call:
## flexsurvreg(formula = sdf ~ age.young + race2 + region.ewa +
## snap.grp3, anc = list(shape = ~age.young), data = dat, weights = ego.wawt,
## dist = "gompertz")
##
## Estimates:
## data mean est L95% U95% se
## shape NA -0.05223 -0.06891 -0.03554 0.00851
## rate NA 0.02467 0.01572 0.03870 0.00567
## age.young25+ 0.78860 0.15669 -0.18540 0.49878 0.17454
## race2B 0.03125 -0.34109 -0.79311 0.11092 0.23062
## race2H 0.12684 -0.48154 -0.79147 -0.17160 0.15813
## region.ewaOther 0.84743 0.45291 0.14234 0.76349 0.15846
## snap.grp32-5 0.28309 0.60675 0.38735 0.82616 0.11194
## snap.grp36-125 0.15257 1.38688 1.11510 1.65866 0.13866
## shape(age.young25+) 0.78860 0.03339 0.01639 0.05040 0.00868
## exp(est) L95% U95%
## shape NA NA NA
## rate NA NA NA
## age.young25+ 1.16963 0.83077 1.64671
## race2B 0.71099 0.45244 1.11730
## race2H 0.61783 0.45318 0.84232
## region.ewaOther 1.57289 1.15296 2.14575
## snap.grp32-5 1.83447 1.47307 2.28452
## snap.grp36-125 4.00233 3.04987 5.25224
## shape(age.young25+) 1.03396 1.01652 1.05169
##
## N = 544, Events: 461, Censored: 83
## Total time at risk: 21729.77
## Log-likelihood = -1933.356, df = 9
## AIC = 3884.712
# define covariate grid
covardf <- with(dat, expand.grid(
age.young = unique(age.young),
race2 = unique(race2),
region.ewa = unique(region.ewa),
snap.grp3 = unique(snap.grp3)))
# weights for each strata
wtdf <- dat %>%
group_by(age.young, race2, region.ewa, snap.grp3) %>%
summarize(wt = sum(ego.wawt))
# predict hazards
haz_exp <- summary(fit_exp, newdata = covardf,
type = "hazard", tidy = TRUE)
haz_gomp <- summary(fit_gomp_scale, newdata = covardf,
type = "hazard", tidy = TRUE)
colors <- c("obsKD" = "grey4", "expfit" = "blue")
ggplot(haz_exp, aes(x = time, y = est)) +
geom_smooth(aes(col="expfit")) +
geom_smooth(data = kernel_haz,
aes(x=time, y=est, col = "obsKD")) +
labs(title = "Testing Hazard: Exponential model vs. Observed kernel density",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard",
color = "Legend") +
xlim(0,200) +
scale_color_manual(name = "",
values = colors,
guide = guide_legend(reverse=TRUE),
labels = c("Exponential fit",
"Obs kernel density")) +
theme(legend.position = "bottom")
ggplot(haz_exp, aes(x = time, y = est, col = age.young)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by Age: Exp model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "Age Group") +
theme(legend.position = "bottom") +
facet_grid(race2 ~ snap.grp3)
ggplot(haz_exp, aes(x = time, y = est, col = race2)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by Race: Exp model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "Race") +
theme(legend.position = "bottom") +
facet_grid(age.young ~ snap.grp3)
ggplot(haz_exp, aes(x = time, y = est, col = region.ewa)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by Region: Exp model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "Region") +
theme(legend.position = "bottom") +
facet_grid(race2 ~ snap.grp3)
ggplot(haz_exp, aes(x = time, y = est, col = snap.grp3)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by SNAP: Exp model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "SNAP") +
theme(legend.position = "bottom") +
facet_grid(race2 ~ age.young)
colors <- c("obsKD" = "grey4", "GomperzFit" = "blue")
ggplot(haz_gomp, aes(x = time, y = est)) +
geom_smooth(aes(col="GomperzFit")) +
geom_smooth(data = kernel_haz %>%
filter(time < max(haz_gomp$time)),
aes(x=time, y=est, col = "obsKD")) +
xlim(0,200) +
labs(title = "Testing Hazard: Gomperz model vs. Observed kernel density",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard",
color = "Legend") +
scale_color_manual(name = "",
values = colors,
guide = guide_legend(reverse=TRUE),
labels = c("Gomperz fit",
"Obs kernel density")) +
theme(legend.position = "bottom")
ggplot(haz_gomp, aes(x = time, y = est, col = age.young)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by Age: Gomperz model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "Age Group") +
theme(legend.position = "bottom") +
facet_grid(race2 ~ snap.grp3)
ggplot(haz_gomp, aes(x = time, y = est, col = race2)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by Race: Gomperz model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "Race") +
theme(legend.position = "bottom") +
facet_grid(age.young ~ snap.grp3)
ggplot(haz_gomp, aes(x = time, y = est, col = region.ewa)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by Region: Gomperz model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "Region") +
theme(legend.position = "bottom") +
facet_grid(race2 ~ snap.grp3)
ggplot(haz_gomp, aes(x = time, y = est, col = snap.grp3)) +
geom_smooth() +
xlim(0,200) +
labs(title = "Testing Hazard by SNAP: Gomperz model",
caption = "Not on PrEP, excl. never-testers",
x = "Months", y = "Hazard") +
scale_color_discrete(name = "SNAP") +
theme(legend.position = "bottom") +
facet_grid(race2 ~ age.young)
Here we re-estimate the Gomperz model in weeks, and take advantage of the proportional hazard feature of this model (thanks Adam!) to store a lookup table the size of the covariate grid: (36, 4).
The lookup table stores the base (week 1) hazard, and the weekly decay rate, for each covariate group. The base hazard varies by the full joint covariate grid. The decay rate is determined by the Gomperz scale parameter \(\gamma\): \(decay = 1/exp(\gamma)\). We have specified our Gomperz model with an age-group dependent scale, so our decay rate is age-group specific.
For simulation, the weekly testing hazard at time \(t\) for person \(j\) in full joint-covariate group \(i\), and age-group \(k\), with last negative test \(wlt\) weeks ago is given by:
\[ haz_{i,j,k,}(t) = basehaz_{i} * decay_k^{wlt_{j}(t)-1} \]
Note that age-group \(k\) aggregates over the other covariates reference by \(i\).
The lookup table is stored in the WHAMP2 Data folder
# define the survival object in weeks
sdfwk <- with(dat, Surv(mlt*4, tested))
# refit
fit_gomp_scalewk <- flexsurvreg(sdfwk ~
age.young +
race2 +
region.ewa +
snap.grp3,
anc = list(shape =~ age.young),
data = dat,
weights = ego.wawt,
dist = "gompertz")
print(fit_gomp_scalewk)
## Call:
## flexsurvreg(formula = sdfwk ~ age.young + race2 + region.ewa +
## snap.grp3, anc = list(shape = ~age.young), data = dat, weights = ego.wawt,
## dist = "gompertz")
##
## Estimates:
## data mean est L95% U95% se
## shape NA -0.01306 -0.01721 -0.00890 0.00212
## rate NA 0.00617 0.00393 0.00967 0.00142
## age.young25+ 0.78860 0.15671 -0.18478 0.49820 0.17423
## race2B 0.03125 -0.34114 -0.79317 0.11088 0.23063
## race2H 0.12684 -0.48155 -0.79145 -0.17165 0.15812
## region.ewaOther 0.84743 0.45285 0.14227 0.76342 0.15846
## snap.grp32-5 0.28309 0.60680 0.38741 0.82618 0.11193
## snap.grp36-125 0.15257 1.38690 1.11525 1.65855 0.13860
## shape(age.young25+) 0.78860 0.00835 0.00412 0.01258 0.00216
## exp(est) L95% U95%
## shape NA NA NA
## rate NA NA NA
## age.young25+ 1.16966 0.83129 1.64576
## race2B 0.71096 0.45241 1.11726
## race2H 0.61783 0.45319 0.84227
## region.ewaOther 1.57278 1.15289 2.14560
## snap.grp32-5 1.83454 1.47317 2.28457
## snap.grp36-125 4.00241 3.05032 5.25168
## shape(age.young25+) 1.00838 1.00412 1.01266
##
## N = 544, Events: 461, Censored: 83
## Total time at risk: 86919.08
## Log-likelihood = -2552.978, df = 9
## AIC = 5123.956
# pull the coefs to set the decay/scale rates in the lookup table
coef <- fit_gomp_scalewk$coefficients
# calculate hazards
haz_gompwk <- summary(fit_gomp_scalewk, newdata = covardf,
type = "hazard", tidy = TRUE)
# create lookup table
haz_lookup <- haz_gompwk %>%
filter(time==2) %>%
mutate(hazard = est,
race = race2,
phaz = ifelse(age.young == "15-24",
exp(coef["shape"]),
exp(coef["shape"] + coef["shape(age.young25+)"])),
haz1 = hazard/phaz) %>%
mutate(week = 1) %>%
select(week, haz1, phaz, age.young, race, region.ewa, snap.grp3) %>%
data.table::setDT()
data.table::setkeyv(haz_lookup,
c("age.young", "race", "region.ewa", "snap.grp3"))
saveRDS(haz_lookup,
file = here::here("Data","test_haz_lookup.RDS"))