Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  readxl,
  patchwork,
  mgcv,
  ggeffects
)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## 
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
theme_set(theme_bw())

options(
    digits = 3
)

Data

#read data
le = read_csv("data/life-expectancy/life-expectancy.csv") %>% df_legalize_names()
## Rows: 21565 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Period life expectancy at birth - Sex: total - Age: 0
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hiv = read_csv("data/share-of-the-population-infected-with-hiv/share-of-the-population-infected-with-hiv.csv") %>% df_legalize_names()
## Rows: 6289 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, HIV prevalence - Sex: total - Age: 15-49 - Central estimate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
antivirals = read_csv("data/antivirals.csv") %>% df_legalize_names()
## Rows: 1552 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (11): IndicatorCode, Indicator, ValueType, ParentLocationCode, ParentLo...
## dbl   (5): Period, FactValueNumeric, FactValueNumericLow, FactValueNumericHi...
## lgl  (17): IsLatestYear, Dim1 type, Dim1, Dim1ValueCode, Dim2 type, Dim2, Di...
## dttm  (1): DateModified
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
regions = read_excel("data/UNSD — Methodology.xlsx") %>% df_legalize_names()

#join
d = full_join(
  le %>% select(-Entity) %>% rename(ISO = Code, LE = Period_life_expectancy_at_birth_Sex_total_Age_0) %>% filter(!is.na(ISO)),
  hiv %>% select(-Entity) %>% rename(ISO = Code, HIV = HIV_prevalence_Sex_total_Age_15_49_Central_estimate) %>% filter(!is.na(ISO))
) %>% full_join(
  antivirals %>% select(SpatialDimValueCode, Period, FactValueNumeric) %>% rename(ISO = SpatialDimValueCode, antivirals = FactValueNumeric, Year = Period) %>% filter(!is.na(ISO))
) %>% full_join(
  regions %>% select(ISO = ISO_alpha3_Code, Region1 = Region_Name, Region2 = Sub_region_Name, Region3 = Intermediate_Region_Code) %>% filter(!is.na(ISO))
) %>% mutate(
  name = pu_translate(ISO, reverse = T)
)
## Joining with `by = join_by(ISO, Year)`
## Joining with `by = join_by(ISO, Year)`
## Joining with `by = join_by(ISO)`
## No match: OWID_KOS
## No match: OWID_USS
## No match: OWID_WRL
#codings
d %<>% mutate(
  HIV_region = name %in% (d %>% filter(Year == 2019, HIV > 10) %>% pull(name))
)

#no dups for a single year
d %>% filter(Year == 2019) %>% pull(ISO) %>% anyDuplicated()
## [1] 0

Analysis

#reproduce initial plot
d %>% 
  filter(Region2 == "Sub-Saharan Africa", Year == 2019) %>% 
  GG_scatter("HIV", "LE", case_names = "name") +
  labs(
    title = "HIV vs. life expectancy in Sub-Saharan Africa",
    x = "HIV prevalence (%)",
    y = "Life expectancy (years)"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/HIV_LE_Africa_2019.png")
## `geom_smooth()` using formula = 'y ~ x'
#timelines of LE in SSA by HIV prevalence
p_le = d %>% 
  filter(
    Year %in% (1980:2019),
    Region2 == "Sub-Saharan Africa"
  ) %>% 
  ggplot(aes(Year, LE, color = HIV_region, group = ISO)) +
  geom_line() +
  labs(
    title = "Life expectancy in Sub-Saharan Africas countries",
    x = "Year",
    y = "Life expectancy (years)",
    color = "HIV > 10% in 2019"
  ) +
  theme(plot.title = element_text(size = 9))

#average across countries
p_le_avg = d %>% 
  filter(
    Year %in% (1980:2019),
    Region2 == "Sub-Saharan Africa"
  ) %>% 
  group_by(Year, HIV_region) %>%
  summarise(LE = mean(LE)) %>% 
  ungroup() %>% 
  ggplot(aes(Year, LE, color = HIV_region)) +
  geom_line() +
  labs(
    title = "Average by group",
    x = "Year",
    y = "Life expectancy (years)",
    color = "HIV > 10% in 2019"
  )
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
#combine plots
wrap_plots(
  p_le, p_le_avg,
  guides = "collect",
  axes = "collect"
  )

GG_save("figs/LE_Africa_1980_2019_HIV.png")

#start with linear model
lm_model = lm(
  LE ~ HIV * antivirals + Year,
  data = d %>% filter(Region2 == "Sub-Saharan Africa")
)

lm_model %>% summary()
## 
## Call:
## lm(formula = LE ~ HIV * antivirals + Year, data = d %>% filter(Region2 == 
##     "Sub-Saharan Africa"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.383  -3.011   0.187   2.916  16.202 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -5.92e+02   1.37e+02   -4.31  2.1e-05 ***
## HIV            -3.94e-01   7.23e-02   -5.45  9.7e-08 ***
## antivirals      2.01e-02   1.89e-02    1.06   0.2881    
## Year            3.24e-01   6.84e-02    4.73  3.3e-06 ***
## HIV:antivirals  3.52e-03   1.20e-03    2.92   0.0037 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.95 on 333 degrees of freedom
##   (3458 observations deleted due to missingness)
## Multiple R-squared:  0.422,  Adjusted R-squared:  0.415 
## F-statistic: 60.8 on 4 and 333 DF,  p-value: <2e-16
p_hiv_lm = lm_model %>% predict_response(terms = c("HIV", "antivirals [0, 50, 100]")) %>% 
  as_tibble() %>% 
  ggplot(aes(x, predicted)) +
  geom_line(aes(color = group)) +
  #ribbon for the CI, but remove the outer lines
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1) +
  labs(
    title = "Predicted life expectancy by HIV and antivirals",
    subtitle = "Linear model",
    x = "HIV prevalence (%)",
    y = "Life expectancy (years)",
    color = "HIV in treatment%"
  ) +
  #remove guide for "fill", move "color" to bottom
  guides(fill = F, color = guide_legend(position = "bottom"))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p_hiv_lm

GG_save("figs/LE_HIV_antivirals.png")

#time series model
# Fit a GAMM
gamm_model <- gamm(
  LE ~ s(Year, k=5) + te(HIV, antivirals, k=5),
  random = list(ISO = ~1),  # Random intercept for each country
  correlation = corAR1(form = ~ Year | ISO),  # AR1 correlation structure for time series
  data = d %>% filter(Region2 == "Sub-Saharan Africa")
)

summary(gamm_model$gam)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## LE ~ s(Year, k = 5) + te(HIV, antivirals, k = 5)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   60.251      0.721    83.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df     F p-value    
## s(Year)            2.81   2.81  8.81 1.7e-05 ***
## te(HIV,antivirals) 4.61   4.61 13.10 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.404   
##   Scale est. = 5.0023    n = 338
plot(gamm_model$gam, pages=1, all.terms=TRUE)

plot(gamm_model$gam, select=2, scheme=1)  # 'select=2' refers to the interaction term

p_hiv_gam = gamm_model %>% predict_response(terms = c("HIV", "antivirals [0, 50, 100]")) %>% 
  as_tibble() %>% 
  ggplot(aes(x, predicted)) +
  geom_line(aes(color = group)) +
  #ribbon for the CI, but remove the outer lines
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1) +
  labs(
    title = "Predicted life expectancy by HIV and antivirals",
    subtitle = "GAMM model",
    x = "HIV prevalence (%)",
    y = "Life expectancy (years)",
    color = "HIV in treatment%"
  ) +
  #remove guide for "fill", move "color" to bottom
  guides(fill = F, color = guide_legend(position = "bottom"))

p_hiv_gam

GG_save("figs/LE_HIV_antivirals_gamm.png")

#combined plot
wrap_plots(
  p_hiv_lm, p_hiv_gam,
  guides = "keep",
  axes = "collect"
  )

GG_save("figs/LE_HIV_antivirals_combined.png")

Meta

#versions
write_sessioninfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggeffects_2.2.1       mgcv_1.9-1            nlme_3.1-167         
##  [4] patchwork_1.3.0       readxl_1.4.5          kirkegaard_2025-03-12
##  [7] psych_2.4.12          assertthat_0.2.1      weights_1.0.4        
## [10] Hmisc_5.2-2           magrittr_2.0.3        lubridate_1.9.4      
## [13] forcats_1.0.0         stringr_1.5.1         dplyr_1.1.4          
## [16] purrr_1.0.4           readr_2.1.5           tidyr_1.3.1          
## [19] tibble_3.2.1          ggplot2_3.5.1         tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] Rdpack_2.6.2      mnormt_2.1.1      gridExtra_2.3     rlang_1.1.5      
##  [5] compiler_4.4.3    gdata_3.0.1       systemfonts_1.2.1 vctrs_0.6.5      
##  [9] pkgconfig_2.0.3   shape_1.4.6.1     crayon_1.5.3      fastmap_1.2.0    
## [13] backports_1.5.0   labeling_0.4.3    rmarkdown_2.29    tzdb_0.4.0       
## [17] haven_2.5.4       nloptr_2.1.1      ragg_1.3.3        bit_4.6.0        
## [21] xfun_0.51         glmnet_4.1-8      jomo_2.7-6        cachem_1.1.0     
## [25] jsonlite_1.9.1    pan_1.9           broom_1.0.7       parallel_4.4.3   
## [29] cluster_2.1.8     R6_2.6.1          bslib_0.9.0       stringi_1.8.4    
## [33] boot_1.3-31       rpart_4.1.24      jquerylib_0.1.4   cellranger_1.1.0 
## [37] Rcpp_1.0.14       iterators_1.0.14  knitr_1.49        base64enc_0.1-3  
## [41] Matrix_1.7-2      splines_4.4.3     nnet_7.3-20       timechange_0.3.0 
## [45] tidyselect_1.2.1  rstudioapi_0.17.1 yaml_2.3.10       codetools_0.2-19 
## [49] lattice_0.22-5    withr_3.0.2       evaluate_1.0.3    foreign_0.8-88   
## [53] survival_3.8-3    pillar_1.10.1     mice_3.17.0       checkmate_2.3.2  
## [57] foreach_1.5.2     reformulas_0.4.0  insight_1.1.0     generics_0.1.3   
## [61] vroom_1.6.5       hms_1.1.3         munsell_0.5.1     scales_1.3.0     
## [65] minqa_1.2.8       gtools_3.9.5      glue_1.8.0        tools_4.4.3      
## [69] data.table_1.17.0 lme4_1.1-36       grid_4.4.3        rbibutils_2.3    
## [73] datawizard_1.0.1  colorspace_2.1-1  htmlTable_2.4.3   Formula_1.2-5    
## [77] cli_3.6.4         textshaping_1.0.0 gtable_0.3.6      sass_0.4.9       
## [81] digest_0.6.37     htmlwidgets_1.6.4 farver_2.1.2      htmltools_0.5.8.1
## [85] lifecycle_1.0.4   mitml_0.4-5       bit64_4.6.0-1     MASS_7.3-64
#write data to file for reuse
# d %>% write_rds("data/data_for_reuse.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}