knitr::opts_chunk$set(error = TRUE, message = FALSE, warning = FALSE)
library(dplyr)
library(tidyr)
library(lubridate)
library(spatstat)
library(ggplot2)
library(kableExtra)
library(survival)
library(survminer)

Preliminaries

egoDF <- WHAMPData::whampArtnetWA_egodata$main$egos

wDF <- WHAMPData::whamp_wide %>%
  haven::zap_label() %>%
  mutate(RCNTESTMONTH = as.numeric(RCNTESTMONTH),
         RCNTESTYEAR = as.numeric(RCNTESTYEAR)) %>%
  select(ego.id = Vrid, Vdatesub,
         EVERTEST, TEST2YRS,
         PREPCHKFREQ_CURR, PREP_HIVTESTFREQ, 
         RCNTESTMONTH, RCNTESTYEAR, RCNTRSLT) 

aDF <- ARTnetData::ARTnet.wide %>% 
  filter(State == "WA") %>%
  select(ego.id = AMIS_ID, Vdatesub = SUB_DATE,
         EVERTEST, TEST2YRS, #aprep,
         PREPCHKFREQ_CURR, PREP_HIVTESTFREQ, 
         RCNTESTMONTH = RCNTSTMONTH, RCNTESTYEAR = RCNTSTYEAR,
         RCNTRSLT)

newvars <- bind_rows(wDF,aDF)

allDF <- egoDF %>% left_join(newvars, by = "ego.id") 

# note we lose 181 cases if we filter on both
# EVERTEST != 1 (91), and diag.status=1 (90)
allDF %>% 
  with(table(notest.or.pos=(EVERTEST != 1 | diag.status==1), 
             if_artnet, useNA = "al"))
##              if_artnet
## notest.or.pos   0   1 <NA>
##         FALSE 541 110    0
##         TRUE  140  40    0
##         <NA>    1   0    0
# About 40% of HIV+ were dx this year 
allDF %>% 
  with(table(last.testyr = RCNTESTYEAR, 
             HIV.status = diag.status, useNA = "al"))
##            HIV.status
## last.testyr   0   1 <NA>
##        0      3   0    0
##        1962   0   1    0
##        1963   0   1    0
##        1985   0   1    0
##        1986   0   2    0
##        1987   0   3    0
##        1989   0   1    0
##        1992   0   3    0
##        1994   0   3    0
##        1995   1   1    0
##        1996   0   1    0
##        1997   0   4    0
##        1998   0   1    0
##        2000   0   2    0
##        2001   2   1    0
##        2002   1   0    0
##        2003   0   2    0
##        2004   0   1    0
##        2005   4   0    0
##        2006   3   1    0
##        2007   4   1    0
##        2008   2   5    0
##        2009   1   1    0
##        2010   3   1    0
##        2011   5   1    0
##        2012   3   0    0
##        2013  13   1    0
##        2014   8   0    0
##        2015  20   0    0
##        2016  25   1    0
##        2017  91   7    0
##        2018 121   5    1
##        2019 332  33    0
##        <NA>  66   0    0
##        NaN   34   4    0
allDF %>% 
  with(table(last.test.2019 = RCNTESTYEAR==2019, 
             HIV.status = diag.status, useNA = "al"))
##               HIV.status
## last.test.2019   0   1 <NA>
##          FALSE 310  52    1
##          TRUE  332  33    0
##          <NA>  100   4    0
# ## and for ARTnet
# ARTnetData::ARTnet.wide %>% 
#   with(table(last.testyr = RCNTSTYEAR, 
#              HIV.status = RCNTRSLT==2, useNA = "al"))
# 
# ARTnetData::ARTnet.wide %>% 
#   with(table(last.test.2017 = RCNTSTYEAR > 2016, 
#              HIV.status = RCNTRSLT==2, useNA = "al"))

# Dx after 2017 have been testing in last 2 yrs
allDF %>%  
  filter(diag.status==1) %>% 
  with(table(TEST2YRS, diag.after.2017 = RCNTESTYEAR > 2017, 
             useNA = "al"))
##         diag.after.2017
## TEST2YRS FALSE TRUE <NA>
##     0       35    1    2
##     1        7    3    0
##     2        2    6    0
##     3        0    2    0
##     4        1   12    0
##     5        0    4    0
##     6        0    3    0
##     7        0    1    0
##     8        0    4    0
##     12       0    1    0
##     <NA>     2    1    0
##     NaN      0    0    2
# So we filter on diag.status==1 (Negative), 
# and set tests in last 2 yrs to 0 for EVERTEST != 1.  
# Last test date is still NA for those folks.

allTestDF <- egoDF %>%
  left_join(newvars, by = "ego.id") %>%
  filter(diag.status == 0) %>%
  mutate(age.grp = ifelse(age.grp == 6, 5, age.grp),
         snap.grp = cut(snap, 
                        breaks = c(0, .5, 1, 5, 10, 20,
                                   max(snap)),
                        include.lowest = T,
                        include.hightest = T),
         snap.grp = recode(snap.grp, "[0,0.5]" = "[0]",
                           "(0.5,1]" = "[1]"),
         snap.grp3 = cut(snap, 
                        breaks = c(0, 1, 5,
                                   max(snap)),
                        include.lowest = T,
                        include.hightest = T),
         snap.grp3 = recode(snap.grp3, "[0,1]" = "0-1",
                            "(1,5]" = "2-5",
                            "(5,125]" = "6-125"),
         never_tested = ifelse(is.na(EVERTEST) | EVERTEST != 1, 
                               "T", "F"),
         tests_2yr = case_when(never_tested == "T" ~ 0,
                               is.na(TEST2YRS) ~ 0,
                               TEST2YRS == 100 ~ NA_real_,
                               TRUE ~ TEST2YRS),
         no_tests_2yrs = ifelse(tests_2yr==0, "T", "F"),
         testratemo_2yr = round(tests_2yr/24,3),
         RCNTESTYEAR = ifelse(RCNTESTYEAR==0, 2019, RCNTESTYEAR),
         last_test = case_when(
           is.na(RCNTESTYEAR) | is.na(RCNTESTMONTH) ~ NA_character_,
           as.numeric(RCNTESTYEAR) - (2019-age) < 13 ~ NA_character_,
           as.numeric(RCNTESTYEAR) > 2019 ~ NA_character_,
           TRUE ~ paste(RCNTESTYEAR, RCNTESTMONTH, 1, sep="-")),
         mos_last_test = interval(ymd(last_test), 
                                  ymd(Vdatesub)) %/% months(1),
         mos_last_test = ifelse(mos_last_test<0, 0, mos_last_test),
         test_group = case_when(never_tested == "T" ~ 0,
                                never_tested == "F" & no_tests_2yrs == "T" ~ 1,
                                !is.na(mos_last_test) & mos_last_test < 13 ~ 3,
                                TRUE ~ 2),
         prep_checkup = ifelse(prep==1, 
                               case_when(PREPCHKFREQ_CURR==1 ~ 1,
                                         PREPCHKFREQ_CURR==2 ~ 3,
                                         PREPCHKFREQ_CURR==3 ~ 6,
                                         PREPCHKFREQ_CURR==5 ~ 12,
                                         TRUE ~ 12), 
                               NA_real_),
         prep_hivtest = ifelse(prep==1, 
                              case_when(PREP_HIVTESTFREQ==1 ~ 1,
                                        TRUE ~ 2), 
                              NA_real_),
         prep_rate = prep_checkup * prep_hivtest) 

labelled::val_labels(allTestDF$test_group) <- 
  c("Never"=0, ">2yrs ago"=1, "1-2yrs"=2, "this yr"=3)

saveRDS(allTestDF, 
        file = here::here("Reports","WHAMPsurvey","MM","Testing",
                          "allTestDF.RDS"))


# # look at key variables
# table(allTestDF$never_tested, useNA = "al")
# table(allTestDF$no_tests_2yr, useNA = "al")
# table(allTestDF$tests_2yr, useNA = "al")
# table(allTestDF$testratemo_2yr, useNA = "al")
# table(allTestDF$mos_last_test, useNA = "al")
# 
# table(allTestDF$prep, useNA = "al")
# table(allTestDF$prep_checkup, useNA = "al")
# table(allTestDF$prep_hivtest, useNA = "al")
# table(allTestDF$prep_rate, useNA = "al")

# Evertesters DF, for analyses of months since last test
mltDF <- allTestDF %>% 
  filter(!is.na(mos_last_test) & mos_last_test >= 0)

Descriptives

Test group distribution

Age group

tab.alltest <- allTestDF %>% 
  group_by(age.grp, test_group) %>%
  summarize(n.test.grp = sum(ego.wawt)) 

# table of counts
tab.alltest %>%
  pivot_wider(names_from = test_group, 
              values_from = n.test.grp) %>%
  kable(caption = "Test group by age group", 
        col.names = c("Age grp", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 0) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by age group
Test group
Age grp Never >2yrs ago 1-2yrs ago In last yr
1 56 5 24 58
2 21 30 34 118
3 11 24 20 102
4 4 17 18 88
5 4 12 18 72
tab.alltest %>%
  group_by(age.grp) %>%
  mutate(prop.test.grp = scales::percent(n.test.grp / sum(n.test.grp))) %>%
  pivot_wider(-n.test.grp, names_from = test_group, 
              values_from = prop.test.grp) %>%
  kable(caption = "Test group by age group (%)", 
        col.names = c("Age grp", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 1) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by age group (%)
Test group
Age grp Never >2yrs ago 1-2yrs ago In last yr
1 39.3% 3.6% 16.6% 40.5%
2 10.2% 14.9% 16.7% 58.3%
3 7.0% 15.0% 13.0% 65.0%
4 3.46% 13.56% 14.03% 68.95%
5 4.2% 10.9% 16.8% 68.0%
tab.alltest %>% 
  group_by(age.grp) %>%
  mutate(prop.test.grp = n.test.grp / sum(n.test.grp)) %>%
  ungroup() %>%
  ggplot(aes(x = age.grp, y=prop.test.grp, color = factor(test_group))) +
  geom_line() +
  geom_point() +
  scale_color_discrete(name="Test group",
                      labels=c("Never", ">2yrs ago", "1-2 yrs ago", "In last yr")) +
  labs(title="Testing patterns by age",
       x = "Age group",
       y = "Proportion in test group")

tab.alltest %>% 
  group_by(age.grp) %>%
  mutate(prop.test.grp = n.test.grp / sum(n.test.grp)) %>%
  ungroup() %>%
  ggplot(aes(x = test_group, y=prop.test.grp, color = factor(age.grp))) +
  geom_line() +
  geom_point() +
  # scale_color_discrete(name="Test group",
  #                     labels=c("Never", ">2yrs ago", "1-2 yrs ago", "In last yr")) +
  labs(title="Testing patterns by age",
       x = "Test group",
       y = "Proportion in test group")

Race

## Rest is for no Prep
# table of counts
tab.alltest <- allTestDF %>% filter(prep == 0) %>%
  group_by(race, test_group) %>%
  summarize(n.test.grp = sum(ego.wawt)) 

tab.alltest %>%
  pivot_wider(names_from = test_group, 
              values_from = n.test.grp) %>%
  kable(caption = "Test group by race (not on PrEP)", 
        col.names = c("", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 0) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by race (not on PrEP)
Test group
Never >2yrs ago 1-2yrs ago In last yr
B 12 5 3 11
H 19 9 9 31
O 66 72 94 216
tab.alltest %>%
  group_by(race) %>%
  mutate(prop.test.grp = scales::percent(n.test.grp / sum(n.test.grp))) %>%
  pivot_wider(-n.test.grp, names_from = test_group, 
              values_from = prop.test.grp) %>%
  kable(caption = "Test group by race (not on PrEP)", 
        col.names = c("Race", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 1) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by race (not on PrEP)
Test group
Race Never >2yrs ago 1-2yrs ago In last yr
B 36.8% 16.7% 10.7% 35.7%
H 28.16% 13.70% 12.99% 45.15%
O 14.7% 16.2% 21.0% 48.2%
tab.alltest %>% 
  group_by(race) %>%
  mutate(prop.test.grp = n.test.grp / sum(n.test.grp)) %>%
  ungroup() %>%
  ggplot(aes(x = test_group, y=prop.test.grp, color = race)) +
  geom_line() +
  geom_point() +
  # scale_color_discrete(name="Test group",
  #                     labels=c("Never", ">2yrs ago", "1-2 yrs ago", "In last yr")) +
  labs(title="Testing patterns by race (not on PrEP)",
       x = "Test group",
       y = "Proportion in test group")

Region

# Rest is no Prep
# table of counts
tab.alltest <- allTestDF %>% 
  group_by(region, test_group) %>%
  summarize(n.test.grp = sum(ego.wawt)) 

tab.alltest %>%
  pivot_wider(names_from = test_group, 
              values_from = n.test.grp) %>%
  kable(caption = "Test group by region (not on PrEP)", 
        col.names = c("", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 0) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by region (not on PrEP)
Test group
Never >2yrs ago 1-2yrs ago In last yr
EasternWA 20 5 16 36
King 50 51 57 262
WesternWA 28 32 41 140
tab.alltest %>%
  group_by(region) %>%
  mutate(prop.test.grp = scales::percent(n.test.grp / sum(n.test.grp))) %>%
  pivot_wider(-n.test.grp, names_from = test_group, 
              values_from = prop.test.grp) %>%
  kable(caption = "Test group by region (not on PrEP)", 
        col.names = c("Region", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 1) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by region (not on PrEP)
Test group
Region Never >2yrs ago 1-2yrs ago In last yr
EasternWA 25.7% 6.4% 20.5% 47.4%
King 11.81% 12.09% 13.66% 62.45%
WesternWA 11.5% 13.4% 16.9% 58.2%
tab.alltest %>% 
  group_by(region) %>%
  mutate(prop.test.grp = n.test.grp / sum(n.test.grp)) %>%
  ungroup() %>%
  ggplot(aes(x = test_group, y=prop.test.grp, color = region)) +
  geom_line() +
  geom_point() +
  # scale_color_discrete(name="Test group",
  #                     labels=c("Never", ">2yrs ago", "1-2 yrs ago", "In last yr")) +
  labs(title="Testing patterns by region (not on PrEP)",
       x = "Test group",
       y = "Proportion in test group")

### SNAP Group

# table of counts
tab.alltest <- allTestDF %>% 
  group_by(snap.grp, test_group) %>%
  summarize(n.test.grp = sum(ego.wawt)) 

tab.alltest %>%
  pivot_wider(names_from = test_group, 
              values_from = n.test.grp) %>%
  kable(caption = "Test group by snap.grp (not on PrEP)", 
        col.names = c("", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 0) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by snap.grp (not on PrEP)
Test group
Never >2yrs ago 1-2yrs ago In last yr
[0] 14 22 15 19
[1] 50 57 53 98
(1,5] 27 6 31 148
(5,10] 2 1 5 80
(10,20] 3 1 4 48
(20,125] NA 1 5 45
tab.alltest %>%
  group_by(snap.grp) %>%
  mutate(prop.test.grp = scales::percent(n.test.grp / sum(n.test.grp))) %>%
  pivot_wider(-n.test.grp, names_from = test_group, 
              values_from = prop.test.grp) %>%
  kable(caption = "Test group by snap.grp (not on PrEP)", 
        col.names = c("snap.grp", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 1) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by snap.grp (not on PrEP)
Test group
snap.grp Never >2yrs ago 1-2yrs ago In last yr
[0] 19.8% 31.4% 21.7% 27.1%
[1] 19.5% 22.0% 20.7% 37.9%
(1,5] 12.8% 2.9% 14.7% 69.7%
(5,10] 2.6% 0.9% 6.1% 90.4%
(10,20] 5.77% 2.33% 6.29% 85.61%
(20,125] NA 1.7% 9.7% 88.5%
tab.alltest %>% 
  group_by(snap.grp) %>%
  mutate(prop.test.grp = n.test.grp / sum(n.test.grp)) %>%
  ungroup() %>%
  ggplot(aes(x = test_group, y=prop.test.grp, color = snap.grp)) +
  geom_line() +
  geom_point() +
  # scale_color_discrete(name="Test group",
  #                     labels=c("Never", ">2yrs ago", "1-2 yrs ago", "In last yr")) +
  labs(title="Testing patterns by snap.grp (not on PrEP)",
       x = "Test group",
       y = "Proportion in test group")

SNAP Group (3 category)

# table of counts
tab.alltest <- allTestDF %>% 
  group_by(snap.grp3, test_group) %>%
  summarize(n.test.grp = sum(ego.wawt)) 

tab.alltest %>%
  pivot_wider(names_from = test_group, 
              values_from = n.test.grp) %>%
  kable(caption = "Test group by snap.grp3 (not on PrEP)", 
        col.names = c("", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 0) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by snap.grp3 (not on PrEP)
Test group
Never >2yrs ago 1-2yrs ago In last yr
0-1 64 79 69 117
2-5 27 6 31 148
6-125 6 3 14 174
tab.alltest %>%
  group_by(snap.grp3) %>%
  mutate(prop.test.grp = scales::percent(n.test.grp / sum(n.test.grp))) %>%
  pivot_wider(-n.test.grp, names_from = test_group, 
              values_from = prop.test.grp) %>%
  kable(caption = "Test group by snap.grp3 (not on PrEP)", 
        col.names = c("snap.grp3", "Never", ">2yrs ago", "1-2yrs ago", "In last yr"),
        digits = 1) %>%
  kable_styling(bootstrap_options = c("striped")) %>%
  add_header_above(c(" " = 1, "Test group" = 4))
Test group by snap.grp3 (not on PrEP)
Test group
snap.grp3 Never >2yrs ago 1-2yrs ago In last yr
0-1 19.5% 24.0% 20.9% 35.6%
2-5 12.8% 2.9% 14.7% 69.7%
6-125 2.8% 1.5% 7.1% 88.5%
tab.alltest %>% 
  group_by(snap.grp3) %>%
  mutate(prop.test.grp = n.test.grp / sum(n.test.grp)) %>%
  ungroup() %>%
  ggplot(aes(x = test_group, y=prop.test.grp, color = snap.grp3)) +
  geom_line() +
  geom_point() +
  # scale_color_discrete(name="Test group",
  #                     labels=c("Never", ">2yrs ago", "1-2 yrs ago", "In last yr")) +
  labs(title="Testing patterns by snap.grp3 (not on PrEP)",
       x = "Test group",
       y = "Proportion in test group")

Never testers

Age

allTestDF %>% group_by(age.grp) %>%
  mutate(nt = ifelse(never_tested == "F", 0, ego.wawt)) %>%
  summarize(neverTestProp = scales::percent(sum(nt)/sum(ego.wawt), 0.1)) %>%
  kable(caption = "Never tested by age group", 
        col.names = c("Age", "Percent")) %>%
  kable_styling(bootstrap_options = c("striped"))
Never tested by age group
Age Percent
1 39.3%
2 10.2%
3 7.0%
4 3.5%
5 4.2%
p <- allTestDF %>% group_by(age.grp) %>%
  mutate(nt = ifelse(never_tested == "F", 0, ego.wawt)) %>%
  summarize(neverTestProp = sum(nt)/sum(ego.wawt)) %>%
  ggplot(aes(x = age.grp, y = neverTestProp)) +
  ylim(0,1) +
  geom_point() + geom_line() +
  labs(title = "Never test proportion by age")

plotly::ggplotly(p)

Race

allTestDF %>% group_by(race) %>%
  mutate(nt = ifelse(never_tested == "F", 0, ego.wawt)) %>%
  summarize(neverTestProp = scales::percent(sum(nt)/sum(ego.wawt), 0.1)) %>%
  kable(caption = "Never tested by race", 
        col.names = c("Race", "Percent")) %>%
  kable_styling(bootstrap_options = c("striped"))
Never tested by race
Race Percent
B 23.8%
H 23.8%
O 10.9%
p <- allTestDF %>% group_by(age.grp, race) %>%
  mutate(nt = ifelse(never_tested == "F", 0, ego.wawt)) %>%
  summarize(neverTestProp = sum(nt)/sum(ego.wawt)) %>%
  ggplot(aes(x = age.grp, y = neverTestProp, color = race)) +
  ylim(0,1) +
  geom_point() + geom_line() +
  labs(title = "Never test proportion by race and age")

plotly::ggplotly(p)

Region

allTestDF %>% group_by(region) %>%
  mutate(nt = ifelse(never_tested == "F", 0, ego.wawt)) %>%
  summarize(neverTestProp = scales::percent(sum(nt)/sum(ego.wawt), 0.1)) %>%
  kable(caption = "Never tested by region", 
        col.names = c("Region", "Percent")) %>%
  kable_styling(bootstrap_options = c("striped"))
Never tested by region
Region Percent
EasternWA 25.7%
King 11.8%
WesternWA 11.5%
p <- allTestDF %>% group_by(age.grp, region) %>%
  mutate(nt = ifelse(never_tested == "F", 0, ego.wawt)) %>%
  summarize(neverTestProp = sum(nt)/sum(ego.wawt)) %>%
  ggplot(aes(x = age.grp, y = neverTestProp, color = region)) +
  ylim(0,1) +
  geom_point() + geom_line() +
  labs(title = "Never test proportion by region and age")
  
plotly::ggplotly(p)

Months since last test

Overall

# compare mos since last test by prep status
with(mltDF, boxplot(mos_last_test ~ prep, 
        ylim=c(0,100), varwidth=T))

# is mos since last test exponentially distributed?
# here just look for linear trend on log scale, will
# do more below.
# NB: ever testers only, not on PrEP

hist.data.wtd = with(mltDF[mltDF$prep==0,],
                 plotrix::weighted.hist(mos_last_test, 
                                        w = ego.wawt,
                                        breaks=20, plot=F))

hist.data.wtd$counts = log(hist.data.wtd$counts+1)
class(hist.data.wtd) <- "histogram"
plot(hist.data.wtd,
     ylab='log(Frequency)', 
     main = "Log plot of months since last test",
     sub = "(filter: Ever tested & not on PrEP)",
     xlab = "Months")

Exp fits by race

# Compare exponential fits for mlt for those not on PrEP

obs <- mltDF %>%
  filter(prep == 0) %>%
  mutate(mlt = mos_last_test,
         rate = 1/mlt) %>%
  select(race, age.grp, mlt, rate, ego.wawt)

## by Race  ------------------------------------------

byrace <- obs %>%
  group_by(race) %>%
  summarize(wtd.mean.mlt = weighted.mean(mlt, ego.wawt, na.rm=T),
            wtd.med.mean.mlt = weighted.median(mlt, ego.wawt)/log(2),
            wtd.median.mlt = weighted.median(mlt, ego.wawt),
            max = max(mlt),
            wtd.num = sum(ego.wawt),
            num = n())

all <- obs %>%
  summarize(race = "Total",
            wtd.mean.mlt = weighted.mean(mlt, ego.wawt, na.rm=T),
            wtd.med.mean.mlt = weighted.median(mlt, ego.wawt)/log(2),
            wtd.median.mlt = weighted.median(mlt, ego.wawt),
            max = max(mlt),
            wtd.num = sum(ego.wawt),
            num = n())

testtab <- bind_rows(byrace, all)


pred <- data.frame(t = rep(1:max(obs$mlt), 4),
                   race = c(rep("B", max(obs$mlt)),
                            rep("H", max(obs$mlt)),
                            rep("O", max(obs$mlt)),
                            rep("Total", max(obs$mlt)))) %>%
  left_join(testtab, by="race") %>%
  mutate(y.mean = wtd.num * 1/wtd.mean.mlt * 1/exp(t/wtd.mean.mlt),
         y.med.mean = wtd.num * 1/wtd.med.mean.mlt * 1/exp(t/wtd.med.mean.mlt),
         y.median = wtd.num * 1/wtd.median.mlt * 1/exp(t/wtd.median.mlt))

chkpred <- pred %>%
  group_by(race) %>%
  summarize(tot.mean = sum(y.mean),
         tot.medmean = sum(y.med.mean),
         tot.median = sum(y.median))

# Plot regular scale

# Observed data
obs2 <- obs %>%
  mutate(race = "Total") %>%
  bind_rows(., obs)

colors <- c("Adj.median" = "blue", 
            "Mean" = "red", 
            "Median" = "green")

ggplot(obs2, aes(x=mlt)) +
  geom_bar() +
  geom_line(data=pred, 
            aes(x=t, y=y.mean, color="Mean")) +
  geom_line(data=pred, 
            aes(x=t, y=y.med.mean, color="Adj.median")) +
  geom_line(data=pred, 
            aes(x=t, y=y.median, color="Median")) +
  xlim(0,100) +
  labs(title = "Months since last HIV test",
       caption = "Not on PrEP",
       x = "Months",
       color = "Legend") +
  scale_color_manual(values = colors) +
  facet_wrap(~race, scales = "free_y")

By age

# Histogram overlay

ggplot(obs, aes(x = pmin(36, mlt), 
                after_stat(density), fill=race, color=race)) +
  geom_histogram(alpha=0.5, binwidth = 3) +
  labs(title = "Histogram of MLT density by race",
       caption = "Not on Prep")

## Age analysis ------------------------------------------

byage.grp5 <- obs %>%
  mutate(age.grp = case_when(
           age.grp == 1 ~ "15-24",
           age.grp == 2 ~ "25-34",
           age.grp == 3 ~ "35-44",
           age.grp == 4 ~ "45-54",
           age.grp >4  ~ "55-65")) %>%
  group_by(age.grp) %>%
  summarize(wtd.mean.mlt = weighted.mean(mlt, ego.wawt, na.rm=T),
            wtd.med.mean.mlt = weighted.median(mlt, ego.wawt)/log(2),
            wtd.median.mlt = weighted.median(mlt, ego.wawt),
            max = max(mlt),
            wtd.num = sum(ego.wawt),
            num = n())

all <- obs %>%
  summarize(age.grp = "Total",
            wtd.mean.mlt = weighted.mean(mlt, ego.wawt, na.rm=T),
            wtd.med.mean.mlt = weighted.median(mlt, ego.wawt)/log(2),
            wtd.median.mlt = weighted.median(mlt, ego.wawt),
            max = max(mlt),
            wtd.num = sum(ego.wawt),
            num = n())

testtab5 <- bind_rows(byage.grp5, all)

# Split in 2 groups: 55+ vs. everyone else
byage.grp2 <- obs %>%
  mutate(age.grp = case_when(
    age.grp < 5 ~ "15-54",
    TRUE  ~ "55-65")) %>%
  group_by(age.grp) %>%
  summarize(wtd.mean.mlt = weighted.mean(mlt, ego.wawt, na.rm=T),
            wtd.med.mean.mlt = weighted.median(mlt, ego.wawt)/log(2),
            wtd.median.mlt = weighted.median(mlt, ego.wawt),
            max = max(mlt),
            wtd.num = sum(ego.wawt),
            num = n())

testtab2 <- bind_rows(byage.grp2, all)

# Plot 5 group version
ggplot(obs, aes(x = pmin(36, mlt), 
                after_stat(density),
                color=factor(age.grp),
                fill=factor(age.grp)) ) +
  geom_density(alpha = 0.3) +
  labs(title = "Histogram of MLT by age group",
       x = "Months (capped at 36+)",
       caption = "Not on Prep") +
  scale_fill_brewer(name="Age Group",
                    palette = "YlGnBu", direction = -1) +
  scale_color_brewer(palette = "YlGnBu", direction = -1) +
  guides(color=F)

# Plot 2 group version
ggplot(obs, aes(x = pmin(36, mlt), 
                after_stat(density),
                fill=factor(age.grp < 5)) ) +
  geom_density(alpha = 0.5) +
  labs(title = "Histogram of MLT by <45 vs 45+",
       x = "Months (capped at 36+)",
       caption = "Not on Prep") +
  scale_fill_discrete(name="Age Group",
                      breaks=c("FALSE", "TRUE"),
                      labels=c("45+", "15-44"))

Tests in last 2 years

Histograms

# is number of tests in last 2 yrs Poisson distributed?
# NB: never testers included
with(allTestDF, hist(tests_2yr[prep==0 & tests_2yr < 13], 
                  freq=F, breaks=13, right=F,
                  main = "Number of tests in last 2 yrs",
                  xlab = "Number (truncated at 10)",
                  sub = "Poisson overlay with lambda=mean(tests)"))
p = dpois(x = allTestDF$tests_2yr, #size = 1000, 
          lambda = mean(allTestDF$tests_2yr, na.rm=T))
lines(x = allTestDF$tests_2yr+0.5, y = p,
     type="h", lwd=2, col="blue", ylab="p")
points(x= allTestDF$tests_2yr+0.5,y = p,
       pch=16,cex=1.2,col="dark red")

# a bit better, but still way off with lambda = median
with(allTestDF, hist(tests_2yr[prep==0 & tests_2yr < 13], 
                  freq=F, breaks=13, right=F,
                  main = "Number of tests in last 2 yrs",
                  xlab = "Number (truncated at 10)",
                  sub = "Poisson overlay with lambda=median(tests)"))
p = dpois(x = allTestDF$tests_2yr, #size = 1000, 
          lambda = median(allTestDF$tests_2yr, na.rm=T))
lines(x = allTestDF$tests_2yr+0.5, y = p,
      type="h", lwd=2, col="blue", ylab="p")
points(x= allTestDF$tests_2yr+0.5,y = p,
       pch=16,cex=1.2,col="dark red")

QQplot

qqplot(rpois(1000, weighted.mean(allTestDF$tests_2yr, na.rm=T)),
       allTestDF$tests_2yr[allTestDF$prep==0],
       xlim = c(0,20), ylim = c(0,20),
       main = "QQplot for Poisson Distn of Tests",
       sub = "filter: Ever tested & not on PrEP",
       xlab = "Poisson quantiles",
       ylab = "Obs number of tests")
abline(0,1, col="red")

dev.off()
## null device 
##           1

Models

Method 1: Mixed exponential

The model assumes the distribution of months since last test is a mixed exponential, with two latent groups:

\[ MLT \sim exp(\phi\frac{1}{\lambda_1} + (1 - \phi)\frac{1}{\lambda_2}) \]

Estimated using the package VGAM, you get estimates of all 3 parameters:

  • \(\lambda_1\), the testing rate in latent grp 1

  • \(\lambda_2\), the testing rate in latent grp 2

  • \(\phi\), for the proportion in grp 1

Sample filtering:

We need to have value for months since last test, and we will recode 0 months to 0.1 months.

Size Sample criteria
832 Starting Sample
742 HIV-
557 Not on PrEP
461 Valid MLT

Overall

library(VGAM)

data <- mltDF %>% filter(prep==0) %>% 
  select(Y=mos_last_test, race) %>%
  mutate(Y = ifelse(Y==0, 0.1, Y))
summary(data)
##        Y              race          
##  Min.   :  0.10   Length:461        
##  1st Qu.:  4.00   Class :character  
##  Median : 10.00   Mode  :character  
##  Mean   : 22.48                     
##  3rd Qu.: 24.00                     
##  Max.   :295.00
myfit <- vglm(Y ~ 1, mix2exp, data = data, trace = TRUE)
## VGLM    linear loop  1 :  loglikelihood = -1843.5064
## VGLM    linear loop  2 :  loglikelihood = -1837.4827
## VGLM    linear loop  3 :  loglikelihood = -1835.8508
## VGLM    linear loop  4 :  loglikelihood = -1835.8315
## VGLM    linear loop  5 :  loglikelihood = -1835.8315
## VGLM    linear loop  6 :  loglikelihood = -1835.8315
Coef(myfit)
##        phi    lambda1    lambda2 
## 0.30055077 0.01892653 0.10597097
pred <- data.frame(y1 = rexp(nn <- 1000, Coef(myfit)[2])) %>%
  mutate(y2 = rexp(nn, Coef(myfit)[3]),
         Y  = ifelse(runif(nn) < Coef(myfit)[1], y1, y2))

with(data, plot(density(Y), xlim = c(-5, 100), 
                ylim = c(0,0.07), lwd=2,
                main = "All cases: Exponential mixture fit to MLT",
                xlab = "Months since last test (scale truncated at 100)",
                sub = paste("No PrEP: n=", dim(data)[1])))
with(pred, lines(density(y1), col="blue", lty=2))
with(pred, lines(density(y2), col="orange", lty=2))
with(pred, lines(density(Y), col="gray", lty=2))

abline(v=1/Coef(myfit)[2], col="blue")
abline(v=1/Coef(myfit)[3], col="orange")
abline(v=mean(data$Y))

text(1/Coef(myfit)[2], 0.06, paste(round(1/Coef(myfit)[2], 1)), 
     pos=4, col="blue")
text(1/Coef(myfit)[3], 0.06, paste(round(1/Coef(myfit)[3], 1)), 
     pos=4, col="orange")
text(mean(data$Y), 0.06, round(mean(data$Y), 1),
     pos=4)
text(80, 0.04, paste(scales::percent(Coef(myfit)[1]), "late testers"))

By race: B&H

data.bh <- data %>% filter(race !="O")
summary(data.bh)
##        Y              race          
##  Min.   :  0.10   Length:66         
##  1st Qu.:  2.00   Class :character  
##  Median :  6.50   Mode  :character  
##  Mean   : 19.70                     
##  3rd Qu.: 20.75                     
##  Max.   :170.00
myfit <- vglm(Y ~ 1, mix2exp, data = data.bh, trace = TRUE)
## VGLM    linear loop  1 :  loglikelihood = -249.23229
## VGLM    linear loop  2 :  loglikelihood = -246.69049
## VGLM    linear loop  3 :  loglikelihood = -246.33987
## VGLM    linear loop  4 :  loglikelihood = -246.33787
## VGLM    linear loop  5 :  loglikelihood = -246.33787
Coef(myfit)
##        phi    lambda1    lambda2 
## 0.44443977 0.02528401 0.26142372
pred.bh <- data.frame(y1 = rexp(nn <- 1000, Coef(myfit)[2])) %>%
  mutate(y2 = rexp(nn, Coef(myfit)[3]),
         Y  = ifelse(runif(nn) < Coef(myfit)[1], y1, y2))


with(data.bh, plot(density(Y), xlim = c(-5, 100), 
                ylim = c(0,0.07), lwd=2,
                main = "B&H:  Exponential mixture fit to MLT",
                xlab = "Months since last test (scale truncated at 100)",
                sub = paste("No PrEP: n=", dim(data.bh)[1])))
with(pred.bh, lines(density(y1), col="blue", lty=2))
with(pred.bh, lines(density(y2), col="orange", lty=2))
with(pred.bh, lines(density(Y), col="gray", lty=2))

abline(v=1/Coef(myfit)[2], col="blue")
abline(v=1/Coef(myfit)[3], col="orange")
abline(v=mean(data.bh$Y))
abline(v=mean(pred.bh$Y), col="grey")

text(1/Coef(myfit)[2], 0.06, paste(round(1/Coef(myfit)[2], 1)), 
     pos=4, col="blue")
text(1/Coef(myfit)[3], 0.06, paste(round(1/Coef(myfit)[3], 1)), 
     pos=4, col="orange")
text(mean(data.bh$Y), 0.06, round(mean(data.bh$Y), 1),
     pos=4)
text(mean(pred.bh$Y), 0.05, round(mean(pred.bh$Y), 1),
     pos=4, col='grey')
text(80, 0.04, paste(scales::percent(Coef(myfit)[1]), "late testers"))

By race: O

data.o <- data %>% filter(race == "O")
summary(data.o)
##        Y              race          
##  Min.   :  0.10   Length:395        
##  1st Qu.:  4.00   Class :character  
##  Median : 11.00   Mode  :character  
##  Mean   : 22.94                     
##  3rd Qu.: 24.00                     
##  Max.   :295.00
myfit <- vglm(Y ~ 1, mix2exp, data = data.o, trace = TRUE)
## VGLM    linear loop  1 :  loglikelihood = -1594.0674
## VGLM    linear loop  2 :  loglikelihood = -1588.6032
## VGLM    linear loop  3 :  loglikelihood = -1585.6365
## VGLM    linear loop  4 :  loglikelihood = -1585.6126
## VGLM    linear loop  5 :  loglikelihood = -1585.6126
Coef(myfit)
##        phi    lambda1    lambda2 
## 0.28649333 0.01830980 0.09777659
pred.o <- data.frame(y1 = rexp(nn <- 1000, Coef(myfit)[2])) %>%
  mutate(y2 = rexp(nn, Coef(myfit)[3]),
         Y  = ifelse(runif(nn) < Coef(myfit)[1], y1, y2))


with(data.o, plot(density(Y), xlim = c(-5, 100), 
                ylim = c(0,0.07), lwd=2,
                main = "O: Exponential mixture fit to MLT",
                xlab = "Months since last test (scale truncated at 100)",
                sub = paste("No PrEP: n=", dim(data.o)[1])))
with(pred.o, lines(density(y1), col="blue", lty=2))
with(pred.o, lines(density(y2), col="orange", lty=2))
with(pred.o, lines(density(Y), col="gray", lty=2))

abline(v=1/Coef(myfit)[2], col="blue")
abline(v=1/Coef(myfit)[3], col="orange")
abline(v=mean(data.o$Y))
abline(v=mean(pred.o$Y), col="grey")

text(1/Coef(myfit)[2], 0.06, paste(round(1/Coef(myfit)[2], 1)), 
     pos=4, col="blue")
text(1/Coef(myfit)[3], 0.06, paste(round(1/Coef(myfit)[3], 1)), 
     pos=4, col="orange")
text(mean(data.o$Y), 0.06, round(mean(data.o$Y), 1),
     pos=4)
text(mean(pred.o$Y), 0.05, round(mean(pred.o$Y), 1),
     pos=4, col='grey')
text(80, 0.04, paste(scales::percent(Coef(myfit)[1]), "late testers"))

Method 2: Survival models

Here we can use the never tested cases also, and treat them as tested. We will assume first possible test age is 15 so their observation time is (current.age-15-1) years.

We also add epsilon to the value 0 months.

Sample filtering

Size Sample criteria
832 Starting Sample
742 HIV-
557 Not on PrEP
coxdata <- allTestDF %>% 
  filter(prep==0) %>%
  mutate(mlt = case_when(never_tested=="T" ~ (age-14)*12,
                         mos_last_test == 0 ~ 0.5,
                         TRUE ~ mos_last_test),
         tested = ifelse(never_tested=="T", 0, 1),
         age.young = factor(ifelse(age.grp==1, "15-24", "25+")),
                            #levels = c("25+", "15-24")),
         race2 = factor(race, 
                        levels = c("O", "B", "H")),
         race.bipoc = factor(ifelse(race == "O", "O", "B+H"),
                             levels = c("O", "B+H")),
         region.ewa = factor(ifelse(region == "EasternWA", "EWA", "Other"))
         ) %>%
  filter(!is.na(mlt)) %>%
  select(age, age.grp, age.young, race, race2, race.bipoc, 
         region, region.ewa, snap.grp3,
         mlt, tested, never_tested, tests_2yr, ego.wawt)

saveRDS(coxdata, 
        file = here::here("Reports","WHAMPsurvey","MM","Testing",
                          "coxdata.RDS"))


# par(mfrow=c(1,3), oma=c(1,1,2,1))
# with(coxdata, boxplot(mlt~trunc(age)))
# with(coxdata, boxplot(mlt~age.grp, varwidth=T, ylim = c(0,100)))
# with(coxdata, boxplot(mlt~age.young, varwidth=T, ylim = c(0,100)))
# mtext("Months since last test by age", outer=T)

Survival plots

Overall

# survfit creates the KM plots

sfit <- survfit(Surv(mlt, tested) ~ 1 ,
                weights = ego.wawt,
                data=coxdata)

ggsurvplot(sfit, data=coxdata,
           xlim = c(0,200),
           break.time.by = 12,
           surv.median.line = "hv",
           main = "Survival Curve",
           xlab="Months",
           ylab="Proportion Not Tested")

Age

Clear age effects for youth.

sfit <- survfit(Surv(mlt, tested) ~ age.young ,
                weights = ego.wawt,
                data=coxdata)
ggsurvplot(sfit,
           pval = TRUE,
           xlim = c(0,120),
           break.time.by = 12,
           conf.int = T,
           title = "By age group (15-24 vs. older)",
           xlab = "Months since last test")

Race

Main

The small sample sizes for B & H really don’t support a detailed estimate.

sfit <- survfit(Surv(mlt, tested) ~ race ,
                weights = ego.wawt,
                data=coxdata)
ggsurvplot(sfit,
           pval = TRUE,
           conf.int = T,
           title = "By Race",
           xlab = "Months since last test")

Race x age
ggsurvplot_facet(sfit, data=coxdata, 
                 facet.by =  "age.young",
                 xlim = c(0,120),
                 break.time.by = 12,
                 pval = TRUE,
                 title = "Race, by age (15-24 vs. older)",
                 xlab = "Months since last test")

Race x region
ggsurvplot_facet(sfit, data=coxdata, 
                 facet.by =  "region",
                 pval = TRUE,
                 title = "Race, by region",
           xlab = "Months since last test")

Race x snap

Noisy, but highest risk group does show borderline significant differences. So there is an interaction by race and risk at the highest risk levels.

ggsurvplot_facet(sfit, data=coxdata, 
                 facet.by =  "snap.grp3",
                 pval = TRUE,
                 title = "Race, by snap.grp",
                 xlab = "Months since last test")

Region

Main

There’s almost no difference between King and Western WA, which supports collapsing these regions.

sfit <- survfit(Surv(mlt, tested) ~ region ,
                weights = ego.wawt,
                data=coxdata)

ggsurvplot(sfit,
           pval = TRUE,
           title = "By Region",
           conf.int = T,
           xlab = "Months since last test")

Region x age

Evidence of main effects, minimal interaction.

ggsurvplot_facet(sfit, data=coxdata, 
                 facet.by =  "age.young",
                 pval = TRUE,
                 title = "Region by age group (15-24 vs. older)",
                 xlab = "Months since last test")

SNAP

Significant differences.

sfit <- survfit(Surv(mlt, tested) ~ snap.grp3 ,
                weights = ego.wawt,
                data=coxdata)

ggsurvplot(sfit,
           pval = TRUE,
           conf.int = T,
           title = "By SNAP risk group",
           xlab = "Months since last test")

Age x Race x region

The lower rates of testing for 15-24 year olds is consistent except in Western WA. But the small sample size for this group is still a concern.

sfit <- survfit(Surv(mlt, tested) ~ age.young ,
                weights = ego.wawt,
                data=coxdata)

ggsp <- ggsurvplot_facet(sfit, data=coxdata, 
                         facet.by = c("race.bipoc", "region"),
                         xlim = c(0,120), # only show comparable time frame
                         break.time.by = 12,
                         pval = TRUE,
                         title = "Age 15-24, by race and region",
                         xlab = "Months since last test")
ggsp <- ggpar(ggsp,
              font.xtickslab = c(7, "plain"))
ggsp

Parametric Plots

For these models we will remove never-testers, defined as those who have not tested by age 45.

with(coxdata, table(never_tested, age.grp, useNA = "al"))
##             age.grp
## never_tested   1   2   3   4   5 <NA>
##         F     69 156  96  71  69    0
##         T     46  25  12   4   4    0
##         <NA>   0   0   0   0   0    0
coxdata <- coxdata %>% 
  filter(!(never_tested == "T" & age > 45))

Null model

sdf <- with(coxdata, Surv(mlt, tested))

KM.fit <- survfit(sdf ~ 1, data=coxdata, weights = ego.wawt)
exp.fit <- survreg(sdf ~ 1, data=coxdata, weights = ego.wawt, 
                   dist = "exponential")
weib.fit <- survreg(sdf ~ 1, data=coxdata, weights = ego.wawt, 
                    dist = "weibull")
lolog.fit <- survreg(sdf ~ 1, data=coxdata, weights = ego.wawt, 
                     dist = "loglogistic")

probs <- seq(.01,.99,by=.01)

pred.df = data.frame(y = rev(probs), 
                     exponential = predict(exp.fit, type="quantile", p=probs)[1,], 
                     weibull = predict(weib.fit, type="quantile", p=probs)[1,], 
                     lolog = predict(lolog.fit, type="quantile", p=probs)[1,])
pred.df_long = pivot_longer(pred.df, !y, names_to = "model", values_to = "x")

p <- ggsurvplot(KM.fit, data = coxdata,
           xlim = c(0,240),
           break.time.by = 24,
           surv.median.line = "hv",
           main = "Survival Curves: KM and parametric",
           xlab="Months",
           ylab="Proportion Not Tested",
           caption = "HIV-, not on PrEP, xlim 20 yrs") 

p <- p$plot + geom_line(data=pred.df_long, 
            aes(x=x, y=y, color=model),
            lwd=1) +
  scale_fill_discrete(guide=FALSE)

p

Age

age.levels = levels(coxdata$age.young)

## Kaplan-Meier estimator
km.age <- survfit(sdf ~ age.young, data = coxdata, weight = ego.wawt)

p <- ggsurvplot(km.age, 
                xlim = c(0,240),
                break.time.by = 24,
                #surv.median.line = "hv",
                title = "Cox PH Curves by age",
                xlab="Months",
                ylab="Proportion Not Tested",
                caption = "HIV-, not on PrEP, xlim 20 yrs",
                legend.labs = age.levels,
                legend.title = "KM:")
#p
Cox PH
## Cox regression by Efron method
cox.age <- coxph(sdf ~ age.young, data = coxdata, weight = ego.wawt)

pred.age <- survfit(cox.age, 
                    newdata = data.frame(age.young = age.levels))
pred.age.df <- data.frame(x = rep(pred.age$time, ncol(pred.age$surv)),
                          y = c(pred.age$surv[,1], 
                                pred.age$surv[,2]),
                          age.young = c(rep(age.levels[1], nrow(pred.age$surv)),
                                        rep(age.levels[2], nrow(pred.age$surv))))

coxp <- p$plot +
  geom_line(data = pred.age.df,
            aes(x=x, y=y, group=age.young, linetype=age.young, color=age.young), 
            lwd = .7) +
  labs(title = "Cox PH Curves by Age") +
  scale_linetype_discrete(name="CoxPH: ")

coxp

Exponential
## Exponential regression
exp.age <- survreg(sdf ~ age.young,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "exponential")

# pred now returns the x values (quantiles)
pred.age <- predict(exp.age, 
                     newdata = data.frame(age.young = age.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.age.df <- data.frame(x = c(pred.age[1,], pred.age[2,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.age)),
                           age = c(rep(age.levels[1], ncol(pred.age)),
                                    rep(age.levels[2], ncol(pred.age))))

expp <- p$plot +
  geom_line(data = pred.age.df,
            aes(x=x, y=y, group=age, color = age, linetype=age), 
            lwd=.8) +
  labs(title = "Exponential Curves by Age") +
  scale_linetype_discrete(name="Exponential: ")

expp

Weibull
## Weibull regression
weib.age <- survreg(sdf ~ age.young,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "weibull")

# pred now returns the x values (quantiles)
pred.age <- predict(weib.age, 
                     newdata = data.frame(age.young = age.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.age.df <- data.frame(x = c(pred.age[1,], pred.age[2,]),
                          y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.age)),
                          age = c(rep(age.levels[1], ncol(pred.age)),
                                  rep(age.levels[2], ncol(pred.age))))

weibp <- p$plot +
  geom_line(data = pred.age.df,
            aes(x=x, y=y, group=age, color = age, linetype=age), 
            lwd=.8) +
  labs(title = "Weibull Curves by Age") +
  scale_linetype_discrete(name="Weibull: ")

weibp

Log-logistic
lolog.age <- survreg(sdf ~ age.young,
                     data=coxdata,
                     weights = ego.wawt,
                     dist = "loglogistic")

# pred now returns the x values (quantiles)
pred.age <- predict(lolog.age, 
                     newdata = data.frame(age.young = age.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.age.df <- data.frame(x = c(pred.age[1,], pred.age[2,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.age)),
                           age = c(rep(age.levels[1], ncol(pred.age)),
                                    rep(age.levels[2], ncol(pred.age))))

lologp <- p$plot +
  geom_line(data = pred.age.df,
            aes(x=x, y=y, group=age, color = age, linetype=age), 
            lwd=.8) +
  labs(title = "Log-logistic Curves by Age") +
  scale_linetype_discrete(name="Lolog: ")

lologp

Race

race.levels = sort(unique(coxdata$race)) # alphabetic

## Kaplan-Meier curves
km.race <- survfit(sdf ~ race, data = coxdata, weight = ego.wawt)

p <- ggsurvplot(km.race, 
                xlim = c(0,240),
                break.time.by = 24,
                #surv.median.line = "hv",
                title = "Cox PH Curves by Race",
                xlab="Months",
                ylab="Proportion Not Tested",
                caption = "HIV-, not on PrEP, xlim 20 yrs",
                legend.labs = race.levels, # comment out to verify
                legend.title = "KM Race")
#p
Cox PH
## Cox regression by Efron method
cox.race <- coxph(sdf ~ race, data = coxdata, weight = ego.wawt)
pred.race <- survfit(cox.race, 
                newdata = data.frame(race = race.levels))
pred.race.df <- data.frame(x = rep(pred.race$time, ncol(pred.race$surv)),
                           y = c(pred.race$surv[,1], 
                                 pred.race$surv[,2], 
                                 pred.race$surv[,3]),
                           race = c(rep(race.levels[1],nrow(pred.race$surv)),
                                    rep(race.levels[2],nrow(pred.race$surv)),
                                    rep(race.levels[3],nrow(pred.race$surv))))

coxp <- p$plot +
  geom_line(data = pred.race.df,
            aes(x=x, y=y, group=race, color = race, linetype=race), 
            lwd=.8) +
  labs(title = "Cox PH Curves by Race") +
  scale_linetype_discrete(name="CoxPH: ")

coxp

Exponential
## Exponential regression
exp.race <- survreg(sdf ~ race,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "exponential")

# pred now returns the x values (quantiles)
pred.race <- predict(exp.race, 
                     newdata = data.frame(race = race.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.race.df <- data.frame(x = c(pred.race[1,], pred.race[2,], pred.race[3,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.race)),
                           race = c(rep(race.levels[1], ncol(pred.race)),
                                    rep(race.levels[2], ncol(pred.race)),
                                    rep(race.levels[3], ncol(pred.race))))

expp <- p$plot +
  geom_line(data = pred.race.df,
            aes(x=x, y=y, group=race, color = race, linetype=race), 
            lwd=.8) +
  labs(title = "Exponential Curves by Race") +
  scale_linetype_discrete(name="Exponential: ")

expp

Weibull
## Weibull regression
weib.race <- survreg(sdf ~ race,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "weibull")

# pred now returns the x values (quantiles)
pred.race <- predict(weib.race, 
                     newdata = data.frame(race = race.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.race.df <- data.frame(x = c(pred.race[1,], pred.race[2,], pred.race[3,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.race)),
                           race = c(rep(race.levels[1], ncol(pred.race)),
                                    rep(race.levels[2], ncol(pred.race)),
                                    rep(race.levels[3], ncol(pred.race))))

weibp <- p$plot +
  geom_line(data = pred.race.df,
            aes(x=x, y=y, group=race, color = race, linetype=race), 
            lwd=.8) +
  labs(title = "Weibull Curves by Race") +
  scale_linetype_discrete(name="Weibull: ")

weibp

Log-logistic
lolog.race <- survreg(sdf ~ race,
                     data=coxdata,
                     weights = ego.wawt,
                     dist = "loglogistic")

# pred now returns the x values (quantiles)
pred.race <- predict(lolog.race, 
                     newdata = data.frame(race = race.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.race.df <- data.frame(x = c(pred.race[1,], pred.race[2,], pred.race[3,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.race)),
                           race = c(rep(race.levels[1], ncol(pred.race)),
                                    rep(race.levels[2], ncol(pred.race)),
                                    rep(race.levels[3], ncol(pred.race))))

lologp <- p$plot +
  geom_line(data = pred.race.df,
            aes(x=x, y=y, group=race, color = race, linetype=race), 
            lwd=.8) +
  labs(title = "Log-logistic Curves by Race") +
  scale_linetype_discrete(name="Lolog: ")

lologp

Region

region.levels <- levels(coxdata$region.ewa)

## Kaplan-Meier estimator
km.region <- survfit(sdf ~ region.ewa, data = coxdata, weight = ego.wawt)

p <- ggsurvplot(km.region, 
                xlim = c(0,240),
                break.time.by = 24,
                #surv.median.line = "hv",
                title = "Cox PH Curves by region",
                xlab="Months",
                ylab="Proportion Not Tested",
                caption = "HIV-, not on PrEP, xlim 20 yrs",
                legend.labs = region.levels, # comment out to verify
                legend.title = "KM region")
#p
Cox PH
## Cox regression by Efron method
cox.region <- coxph(sdf ~ region.ewa, data = coxdata, weight = ego.wawt)

pred.region <- survfit(cox.region, 
                       newdata = data.frame(region.ewa = region.levels))
pred.region.df <- data.frame(x = rep(pred.region$time, ncol(pred.region$surv)),
                          region.ewa = c(rep(region.levels[1], 
                                             nrow(pred.region$surv)), 
                                         rep(region.levels[2],
                                             nrow(pred.region$surv))),
                          y = c(pred.region$surv[,1], 
                                pred.region$surv[,2]))


coxp <- p$plot +
  geom_line(data = pred.region.df,
            aes(x=x, y=y, group=region.ewa, linetype=region.ewa, color=region.ewa), 
            lwd = .7) +
  labs(title = "Cox PH Curves by Region") +
  scale_linetype_discrete(name="CoxPH: ")

coxp

Exponential
## Exponential regression
exp.region <- survreg(sdf ~ region.ewa,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "exponential")

# pred now returns the x values (quantiles)
pred.region <- predict(exp.region, 
                     newdata = data.frame(region.ewa = region.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.region.df <- data.frame(x = c(pred.region[1,], pred.region[2,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.region)),
                           region = c(rep(region.levels[1], ncol(pred.region)),
                                    rep(region.levels[2], ncol(pred.region))))

expp <- p$plot +
  geom_line(data = pred.region.df,
            aes(x=x, y=y, group=region, color = region, linetype=region), 
            lwd=.8) +
  labs(title = "Exponential Curves by Region") +
  scale_linetype_discrete(name="Exponential: ")

expp

Weibull
## Weibull regression
weib.region <- survreg(sdf ~ region.ewa,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "weibull")

# pred now returns the x values (quantiles)
pred.region <- predict(weib.region, 
                     newdata = data.frame(region.ewa = region.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.region.df <- data.frame(x = c(pred.region[1,], pred.region[2,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.region)),
                           region = c(rep(region.levels[1], ncol(pred.region)),
                                    rep(region.levels[2], ncol(pred.region))))

weibp <- p$plot +
  geom_line(data = pred.region.df,
            aes(x=x, y=y, group=region, color = region, linetype=region), 
            lwd=.8) +
  labs(title = "Weibull Curves by Region") +
  scale_linetype_discrete(name="Weibull: ")

weibp

Log-logistic
lolog.region <- survreg(sdf ~ region.ewa,
                     data=coxdata,
                     weights = ego.wawt,
                     dist = "loglogistic")

# pred now returns the x values (quantiles)
pred.region <- predict(lolog.region, 
                     newdata = data.frame(region.ewa = region.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.region.df <- data.frame(x = c(pred.region[1,], pred.region[2,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.region)),
                           region = c(rep(region.levels[1], ncol(pred.region)),
                                    rep(region.levels[2], ncol(pred.region))))

lologp <- p$plot +
  geom_line(data = pred.region.df,
            aes(x=x, y=y, group=region, color = region, linetype=region), 
            lwd=.8) +
  labs(title = "Log-logistic Curves by Region") +
  scale_linetype_discrete(name="Lolog: ")

lologp

SNAP

snap.levels = levels(coxdata$snap.grp3)

## Kaplan-Meier estimator
km.snap <- survfit(sdf ~ snap.grp3, data = coxdata, weight = ego.wawt)

p <- ggsurvplot(km.snap, 
                xlim = c(0,240),
                break.time.by = 24,
                #surv.median.line = "hv",
                xlab="Months",
                ylab="Proportion Not Tested",
                caption = "HIV-, not on PrEP, xlim 20 yrs",
                legend.labs = snap.levels,
                legend.title = "KM SNAP")
#p
Cox PH
## Cox regression by Efron method
cox.snap <- coxph(sdf ~ snap.grp3, data = coxdata, weight = ego.wawt)

pred.snap <- survfit(cox.snap, 
                newdata = data.frame(snap.grp3 = snap.levels))
pred.snap.df <- data.frame(x = rep(pred.snap$time, ncol(pred.snap$surv)),
                       snap.grp3 = c(rep(snap.levels[1],
                                         nrow(pred.snap$surv)), # order follows factor
                                     rep(snap.levels[2],
                                         nrow(pred.snap$surv)),
                                     rep(snap.levels[3],
                                         nrow(pred.snap$surv))),
                       y = c(pred.snap$surv[,1], 
                             pred.snap$surv[,2], 
                             pred.snap$surv[,3]))

coxp <- p$plot +
  geom_line(data = pred.snap.df,
            aes(x=x, y=y, group=snap.grp3, linetype=snap.grp3, color=snap.grp3), 
            lwd = .7) +
  labs(title = "Cox PH Curves by SNAP") +
  scale_linetype_discrete(name="CoxPH: ")

coxp

Exponential
## Exponential regression
exp.snap <- survreg(sdf ~ snap.grp3,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "exponential")

# pred now returns the x values (quantiles)
pred.snap <- predict(exp.snap, 
                     newdata = data.frame(snap.grp3 = snap.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.snap.df <- data.frame(x = c(pred.snap[1,], pred.snap[2,], pred.snap[3,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.snap)),
                           snap = c(rep(snap.levels[1], ncol(pred.snap)),
                                    rep(snap.levels[2], ncol(pred.snap)),
                                    rep(snap.levels[3], ncol(pred.snap))))

expp <- p$plot +
  geom_line(data = pred.snap.df,
            aes(x=x, y=y, group=snap, color = snap, linetype=snap), 
            lwd=.8) +
  labs(title = "Exponential Curves by SNAP") +
  scale_linetype_discrete(name="Exponential: ")

expp

Weibull
## Weibull regression
weib.snap <- survreg(sdf ~ snap.grp3,
                    data=coxdata,
                    weights = ego.wawt,
                    dist = "weibull")

# pred now returns the x values (quantiles)
pred.snap <- predict(weib.snap, 
                     newdata = data.frame(snap.grp3 = snap.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.snap.df <- data.frame(x = c(pred.snap[1,], pred.snap[2,], pred.snap[3,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.snap)),
                           snap = c(rep(snap.levels[1], ncol(pred.snap)),
                                    rep(snap.levels[2], ncol(pred.snap)),
                                    rep(snap.levels[3], ncol(pred.snap))))

weibp <- p$plot +
  geom_line(data = pred.snap.df,
            aes(x=x, y=y, group=snap, color = snap, linetype=snap), 
            lwd=.8) +
  labs(title = "Weibull Curves by SNAP") +
  scale_linetype_discrete(name="Weibull: ")

weibp

Log-logistic
lolog.snap <- survreg(sdf ~ snap.grp3,
                     data=coxdata,
                     weights = ego.wawt,
                     dist = "loglogistic")

# pred now returns the x values (quantiles)
pred.snap <- predict(lolog.snap, 
                     newdata = data.frame(snap.grp3 = snap.levels),
                     type="quantile", p=seq(.01,.99,by=.01))
pred.snap.df <- data.frame(x = c(pred.snap[1,], pred.snap[2,], pred.snap[3,]),
                           y = rep(rev(seq(0.01, 0.99, by = 0.01)), nrow(pred.snap)),
                           snap = c(rep(snap.levels[1], ncol(pred.snap)),
                                    rep(snap.levels[2], ncol(pred.snap)),
                                    rep(snap.levels[3], ncol(pred.snap))))

lologp <- p$plot +
  geom_line(data = pred.snap.df,
            aes(x=x, y=y, group=snap, color = snap, linetype=snap), 
            lwd=.8) +
  labs(title = "Log-logistic Curves by SNAP") +
  scale_linetype_discrete(name="Lolog: ")

lologp

Full model fits

Cox PH

Estimates
cox.fit <- coxph(Surv(mlt, tested) ~
                   age.young + 
                   race2 +
                   region.ewa + snap.grp3,
                 weights = ego.wawt,
                 data=coxdata)
summary(cox.fit)
## Call:
## coxph(formula = Surv(mlt, tested) ~ age.young + race2 + region.ewa + 
##     snap.grp3, data = coxdata, weights = ego.wawt)
## 
##   n= 544, number of events= 461 
## 
##                    coef exp(coef) se(coef) robust se      z Pr(>|z|)    
## age.young25+     0.6692    1.9527   0.1301    0.1634  4.095 4.23e-05 ***
## race2B          -0.3323    0.7173   0.2307    0.3335 -0.996   0.3191    
## race2H          -0.3977    0.6719   0.1574    0.1814 -2.193   0.0283 *  
## region.ewaOther  0.4454    1.5611   0.1584    0.1735  2.566   0.0103 *  
## snap.grp32-5     0.5404    1.7167   0.1125    0.1292  4.182 2.89e-05 ***
## snap.grp36-125   1.2495    3.4885   0.1398    0.1574  7.936 2.09e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## age.young25+       1.9527     0.5121    1.4175    2.6899
## race2B             0.7173     1.3941    0.3731    1.3791
## race2H             0.6719     1.4884    0.4709    0.9587
## region.ewaOther    1.5611     0.6406    1.1110    2.1936
## snap.grp32-5       1.7167     0.5825    1.3326    2.2115
## snap.grp36-125     3.4885     0.2867    2.5623    4.7496
## 
## Concordance= 0.654  (se = 0.017 )
## Likelihood ratio test= 113.2  on 6 df,   p=<2e-16
## Wald test            = 92.86  on 6 df,   p=<2e-16
## Score (logrank) test = 123.2  on 6 df,   p=<2e-16,   Robust = 74.35  p=5e-14
## 
##   (Note: the likelihood ratio and score tests assume independence of
##      observations within a cluster, the Wald and robust score tests do not).
Forest plot
ggforest(cox.fit, data = coxdata)

Residual plot
ggcoxdiagnostics(cox.fit, type = "deviance")

Term plots

Base R plot, kinda neat

termplot(cox.fit, data=coxdata, partial.resid = T)

Exponential

exp.fit <- survreg(Surv(mlt, tested) ~
                   age.young + 
                   race2 +
                   region.ewa + snap.grp3,
                 data=coxdata,
                 weights = ego.wawt,
                 dist = "exponential")
summary(exp.fit)
## 
## Call:
## survreg(formula = Surv(mlt, tested) ~ age.young + race2 + region.ewa + 
##     snap.grp3, data = coxdata, weights = ego.wawt, dist = "exponential")
##                  Value Std. Error      z       p
## (Intercept)      5.164      0.205  25.14 < 2e-16
## age.young25+    -0.523      0.129  -4.05 5.1e-05
## race2B           0.295      0.230   1.28     0.2
## race2H           0.753      0.164   4.59 4.4e-06
## region.ewaOther -0.772      0.161  -4.79 1.6e-06
## snap.grp32-5    -0.598      0.111  -5.38 7.5e-08
## snap.grp36-125  -1.825      0.140 -13.00 < 2e-16
## 
## Scale fixed at 1 
## 
## Exponential distribution
## Loglik(model)= -2081.8   Loglik(intercept only)= -2165.6
##  Chisq= 167.6 on 6 degrees of freedom, p= 1.4e-33 
## Number of Newton-Raphson Iterations: 6 
## n= 544

Weibull

weib.fit <- survreg(Surv(mlt, tested) ~
                   age.young + 
                   race2 +
                   region.ewa + snap.grp3,
                 data=coxdata,
                 weights = ego.wawt,
                 dist = "weibull")
summary(weib.fit)
## 
## Call:
## survreg(formula = Surv(mlt, tested) ~ age.young + race2 + region.ewa + 
##     snap.grp3, data = coxdata, weights = ego.wawt, dist = "weibull")
##                   Value Std. Error     z       p
## (Intercept)      5.3802     0.3152 17.07 < 2e-16
## age.young25+    -0.9066     0.2057 -4.41 1.0e-05
## race2B           0.4843     0.3626  1.34 0.18159
## race2H           0.7321     0.2480  2.95 0.00315
## region.ewaOther -0.8321     0.2486 -3.35 0.00082
## snap.grp32-5    -0.8010     0.1754 -4.57 4.9e-06
## snap.grp36-125  -1.9970     0.2143 -9.32 < 2e-16
## Log(scale)       0.4531     0.0363 12.50 < 2e-16
## 
## Scale= 1.57 
## 
## Weibull distribution
## Loglik(model)= -1981.1   Loglik(intercept only)= -2034.9
##  Chisq= 107.73 on 6 degrees of freedom, p= 6.1e-21 
## Number of Newton-Raphson Iterations: 5 
## n= 544

Log-logistic

lolog.fit <- survreg(Surv(mlt, tested) ~
                   age.young + 
                   race2 +
                   region.ewa + snap.grp3,
                 data=coxdata,
                 weights = ego.wawt,
                 dist = "loglogistic")
summary(lolog.fit)
## 
## Call:
## survreg(formula = Surv(mlt, tested) ~ age.young + race2 + region.ewa + 
##     snap.grp3, data = coxdata, weights = ego.wawt, dist = "loglogistic")
##                   Value Std. Error     z       p
## (Intercept)      4.2909     0.2792 15.37 < 2e-16
## age.young25+    -0.8176     0.1807 -4.53 6.0e-06
## race2B           0.5861     0.3407  1.72   0.085
## race2H           0.3059     0.2368  1.29   0.196
## region.ewaOther -0.4879     0.2349 -2.08   0.038
## snap.grp32-5    -1.0378     0.1674 -6.20 5.7e-10
## snap.grp36-125  -1.9731     0.2050 -9.62 < 2e-16
## Log(scale)      -0.0482     0.0396 -1.22   0.224
## 
## Scale= 0.953 
## 
## Log logistic distribution
## Loglik(model)= -1932.6   Loglik(intercept only)= -1993.8
##  Chisq= 122.35 on 6 degrees of freedom, p= 5.2e-24 
## Number of Newton-Raphson Iterations: 4 
## n= 544

Observed vs. predicted MLT by model

For the MLT metric, it’s not clear that there is a Cox PH prediction, but we can compare all of the parametric models

Individual level

These all look pretty wonky. Note the y-axis scale differences across the models.

# for CoxPH we predict using "expected" 
# not sure how to compare this to observed, since 
# it looks nothing like MLT
pred.cox <- predict(cox.fit, type="expected", se.fit=TRUE)

# for survreg, response=mlt so risk = 1/response
pred.exp <- predict(exp.fit, type="response")
pred.weib <- predict(weib.fit, type="response")
pred.lolog <- predict(lolog.fit, type="response")

result.df <- data.frame(mlt = coxdata$mlt,
                        exp = pred.exp,
                        weibull = pred.weib,
                        lolog = pred.lolog)
GGally::ggpairs(result.df,
                title = "Individual level observed vs. predicted MLT")

Distribution level

Log-logistic looks like the best fit, but it does not do a great job with the long tail.

ggplot(result.df) +
  geom_histogram(aes(x=mlt, y = ..density..)) +
  scale_x_continuous(limits = c(0, 300)) +
  geom_density(data = result.df %>% 
                 select(-mlt) %>%
                 pivot_longer(everything(),
                              names_to = "model",
                              values_to = "fit"),
               aes(x=fit, group=model, color=model),
               adjust=3) +
  labs(title = "Observed vs. Predicted MLT Distns: Parametric survival models",
       x = "Months since last test",
       caption = "Full Models; xlab truncated at 300")

Predicted testing risks by model

These are the predicted risks of testing, by covariate, that we will use to govern the dynamics of testing in EpiModel. Here we can compare to the Cox PH estimates, and again, the Log-logistic wins – it is almost identical to the non-parametric Cox PH risk estimates by covariate.

All models

# Set up covariate DF for newdata
covardf <- with(coxdata, expand.grid(
  age.young = unique(age.young),
  race2 = unique(race2),
  region.ewa = unique(region.ewa),
  snap.grp3 = unique(snap.grp3)))

# for CoxPH we predict relative risk; 
# rr = baseline hazard * absolute risk
pred.cox <- predict(cox.fit, newdata = covardf, type="risk", se.fit=TRUE)
pred.cox.df <- data.frame(covardf, pred = pred.cox$fit)

# for survreg, response=mlt so risk = 1/response
pred.exp <- predict(exp.fit, newdata=covardf, type="response")
pred.exp.df <- data.frame(covardf, pred = 1/pred.exp)

pred.weib <- predict(weib.fit, newdata=covardf, type="response")
pred.weib.df <- data.frame(covardf, pred = 1/pred.weib)

pred.lolog <- predict(lolog.fit, newdata=covardf, type="response")
pred.lolog.df <- data.frame(covardf, pred = 1/pred.lolog)

result.df <- data.frame(cox = pred.cox.df$pred,
                        exp = pred.exp.df$pred,
                        weibull = pred.weib.df$pred,
                        lolog = pred.lolog.df$pred)
GGally::ggpairs(result.df,
                title = "Predicted risk of testing in covariate groups by model")

Cox PH vs. Lolog

lmfit <- lm(result.df$lolog ~ result.df$cox)

plot(result.df$cox, result.df$lolog,
     #xlim = c(0, 0.3), ylim = c(0, 0.3),
     lwd = 2, col="slateblue",
     main = "Predicted Risk: Cox PH vs. Log-logistic",
     xlab = "Cox PH RR", ylab = "Lolog risk",
     sub = "Covariate grid: age.young, race, region.ewa, snap.grp3")
abline(lmfit$coef, col="red")