Init
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: Hmisc
##
##
## Attaching package: 'Hmisc'
##
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
readxl,
patchwork,
mgcv,
ggeffects
)
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
theme_set(theme_bw())
options(
digits = 3
)
Data
#read data
le = read_csv("data/life-expectancy/life-expectancy.csv") %>% df_legalize_names()
## Rows: 21565 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Period life expectancy at birth - Sex: total - Age: 0
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hiv = read_csv("data/share-of-the-population-infected-with-hiv/share-of-the-population-infected-with-hiv.csv") %>% df_legalize_names()
## Rows: 6289 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, HIV prevalence - Sex: total - Age: 15-49 - Central estimate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
antivirals = read_csv("data/antivirals.csv") %>% df_legalize_names()
## Rows: 1552 Columns: 34
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): IndicatorCode, Indicator, ValueType, ParentLocationCode, ParentLo...
## dbl (5): Period, FactValueNumeric, FactValueNumericLow, FactValueNumericHi...
## lgl (17): IsLatestYear, Dim1 type, Dim1, Dim1ValueCode, Dim2 type, Dim2, Di...
## dttm (1): DateModified
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
regions = read_excel("data/UNSD — Methodology.xlsx") %>% df_legalize_names()
#join
d = full_join(
le %>% select(-Entity) %>% rename(ISO = Code, LE = Period_life_expectancy_at_birth_Sex_total_Age_0) %>% filter(!is.na(ISO)),
hiv %>% select(-Entity) %>% rename(ISO = Code, HIV = HIV_prevalence_Sex_total_Age_15_49_Central_estimate) %>% filter(!is.na(ISO))
) %>% full_join(
antivirals %>% select(SpatialDimValueCode, Period, FactValueNumeric) %>% rename(ISO = SpatialDimValueCode, antivirals = FactValueNumeric, Year = Period) %>% filter(!is.na(ISO))
) %>% full_join(
regions %>% select(ISO = ISO_alpha3_Code, Region1 = Region_Name, Region2 = Sub_region_Name, Region3 = Intermediate_Region_Code) %>% filter(!is.na(ISO))
) %>% mutate(
name = pu_translate(ISO, reverse = T)
)
## Joining with `by = join_by(ISO, Year)`
## Joining with `by = join_by(ISO, Year)`
## Joining with `by = join_by(ISO)`
## No match: OWID_KOS
## No match: OWID_USS
## No match: OWID_WRL
#codings
d %<>% mutate(
HIV_region = name %in% (d %>% filter(Year == 2019, HIV > 10) %>% pull(name))
)
#no dups for a single year
d %>% filter(Year == 2019) %>% pull(ISO) %>% anyDuplicated()
## [1] 0
Analysis
#reproduce initial plot
d %>%
filter(Region2 == "Sub-Saharan Africa", Year == 2019) %>%
GG_scatter("HIV", "LE", case_names = "name") +
labs(
title = "HIV vs. life expectancy in Sub-Saharan Africa",
x = "HIV prevalence (%)",
y = "Life expectancy (years)"
)
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/HIV_LE_Africa_2019.png")
## `geom_smooth()` using formula = 'y ~ x'
#timelines of LE in SSA by HIV prevalence
p_le = d %>%
filter(
Year %in% (1980:2019),
Region2 == "Sub-Saharan Africa"
) %>%
ggplot(aes(Year, LE, color = HIV_region, group = ISO)) +
geom_line() +
labs(
title = "Life expectancy in Sub-Saharan Africas countries",
x = "Year",
y = "Life expectancy (years)",
color = "HIV > 10% in 2019"
) +
theme(plot.title = element_text(size = 9))
#average across countries
p_le_avg = d %>%
filter(
Year %in% (1980:2019),
Region2 == "Sub-Saharan Africa"
) %>%
group_by(Year, HIV_region) %>%
summarise(LE = mean(LE)) %>%
ungroup() %>%
ggplot(aes(Year, LE, color = HIV_region)) +
geom_line() +
labs(
title = "Average by group",
x = "Year",
y = "Life expectancy (years)",
color = "HIV > 10% in 2019"
)
## `summarise()` has grouped output by 'Year'. You can override using the
## `.groups` argument.
#combine plots
wrap_plots(
p_le, p_le_avg,
guides = "collect",
axes = "collect"
)

GG_save("figs/LE_Africa_1980_2019_HIV.png")
#start with linear model
lm_model = lm(
LE ~ HIV * antivirals + Year,
data = d %>% filter(Region2 == "Sub-Saharan Africa")
)
lm_model %>% summary()
##
## Call:
## lm(formula = LE ~ HIV * antivirals + Year, data = d %>% filter(Region2 ==
## "Sub-Saharan Africa"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.383 -3.011 0.187 2.916 16.202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.92e+02 1.37e+02 -4.31 2.1e-05 ***
## HIV -3.94e-01 7.23e-02 -5.45 9.7e-08 ***
## antivirals 2.01e-02 1.89e-02 1.06 0.2881
## Year 3.24e-01 6.84e-02 4.73 3.3e-06 ***
## HIV:antivirals 3.52e-03 1.20e-03 2.92 0.0037 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.95 on 333 degrees of freedom
## (3458 observations deleted due to missingness)
## Multiple R-squared: 0.422, Adjusted R-squared: 0.415
## F-statistic: 60.8 on 4 and 333 DF, p-value: <2e-16
p_hiv_lm = lm_model %>% predict_response(terms = c("HIV", "antivirals [0, 50, 100]")) %>%
as_tibble() %>%
ggplot(aes(x, predicted)) +
geom_line(aes(color = group)) +
#ribbon for the CI, but remove the outer lines
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1) +
labs(
title = "Predicted life expectancy by HIV and antivirals",
subtitle = "Linear model",
x = "HIV prevalence (%)",
y = "Life expectancy (years)",
color = "HIV in treatment%"
) +
#remove guide for "fill", move "color" to bottom
guides(fill = F, color = guide_legend(position = "bottom"))
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p_hiv_lm

GG_save("figs/LE_HIV_antivirals.png")
#time series model
# Fit a GAMM
gamm_model <- gamm(
LE ~ s(Year, k=5) + te(HIV, antivirals, k=5),
random = list(ISO = ~1), # Random intercept for each country
correlation = corAR1(form = ~ Year | ISO), # AR1 correlation structure for time series
data = d %>% filter(Region2 == "Sub-Saharan Africa")
)
summary(gamm_model$gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## LE ~ s(Year, k = 5) + te(HIV, antivirals, k = 5)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.251 0.721 83.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Year) 2.81 2.81 8.81 1.7e-05 ***
## te(HIV,antivirals) 4.61 4.61 13.10 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.404
## Scale est. = 5.0023 n = 338
plot(gamm_model$gam, pages=1, all.terms=TRUE)

plot(gamm_model$gam, select=2, scheme=1) # 'select=2' refers to the interaction term

p_hiv_gam = gamm_model %>% predict_response(terms = c("HIV", "antivirals [0, 50, 100]")) %>%
as_tibble() %>%
ggplot(aes(x, predicted)) +
geom_line(aes(color = group)) +
#ribbon for the CI, but remove the outer lines
geom_ribbon(aes(ymin = conf.low, ymax = conf.high, fill = group), alpha = 0.1) +
labs(
title = "Predicted life expectancy by HIV and antivirals",
subtitle = "GAMM model",
x = "HIV prevalence (%)",
y = "Life expectancy (years)",
color = "HIV in treatment%"
) +
#remove guide for "fill", move "color" to bottom
guides(fill = F, color = guide_legend(position = "bottom"))
p_hiv_gam

GG_save("figs/LE_HIV_antivirals_gamm.png")
#combined plot
wrap_plots(
p_hiv_lm, p_hiv_gam,
guides = "keep",
axes = "collect"
)

GG_save("figs/LE_HIV_antivirals_combined.png")