Association of ART Duration and ARV Regimen Type with Elevated NT-proBNP among People Living with HIV Aged 40 Years and Older
# Read the Excel file from the same folder as this Rmd
hiv_raw <- read_excel("C:/Users/userp/OneDrive/Рабочий стол/HSTA553/Project EPI 553/nursultans full.xlsx")
# Starting sample size
n_start <- nrow(hiv_raw)
n_start
## [1] 150
# Recode helper function for regimen groups
classify_regimen <- function(x) {
x <- tolower(as.character(x))
has_pi <- grepl("резолста|rezolsta|калетра|kaletra|darun", x)
has_insti <- grepl("теград|dtg|долутегравир|dolutegravir|тивикай|tivicay|триумек|triumeq", x)
has_nnrti <- grepl("efv|эфавиренз|атрипла|тенмифа|комплера|complera|rilp", x)
n_classes <- sum(c(has_pi, has_insti, has_nnrti))
if (n_classes > 1) {
"Other/mixed"
} else if (has_pi) {
"PI-based"
} else if (has_insti) {
"INSTI-based"
} else if (has_nnrti) {
"NNRTI-based"
} else {
"Other/mixed"
}
}
hiv <- hiv_raw %>%
mutate(
bmi = Weight / (Height / 100)^2,
elevated_ntprobnp = ifelse(proBNP >= 125, 1, 0),
sex_label = ifelse(sex == 1, "Male", "Female"),
current_smoker = ifelse(smoke100 == 1 & smokeday %in% c(1, 2), 1, 0),
smoker_label = ifelse(current_smoker == 1, "Current", "Non-current"),
htn_label = ifelse(HTN == 1, "Yes", "No"),
regimen_group = sapply(ART, classify_regimen),
art_duration_cat = case_when(
ARTyrs < 5 ~ "<5 years",
ARTyrs >= 5 & ARTyrs <= 9 ~ "5-9 years",
ARTyrs >= 10 ~ ">=10 years",
TRUE ~ NA_character_
)
) %>%
filter(age >= 40)
n_final <- nrow(hiv)
n_excluded_age <- n_start - n_final
data.frame(
Step = c("Raw dataset", "Exclude age < 40 years", "Final analytic sample"),
N = c(n_start, n_excluded_age, n_final)
) %>%
kable()
| Step | N |
|---|---|
| Raw dataset | 150 |
| Exclude age < 40 years | 0 |
| Final analytic sample | 150 |
The dataset was imported from a single Excel worksheet, so no merging was required.The final analytic sample was 150.
The primary modeled outcome is elevated NT-proBNP, defined as proBNP >= 125 pg/mL.
hiv %>%
summarise(
elevated_n = sum(elevated_ntprobnp, na.rm = TRUE),
elevated_pct = mean(elevated_ntprobnp, na.rm = TRUE) * 100,
ntprobnp_median = median(proBNP, na.rm = TRUE),
ntprobnp_q1 = quantile(proBNP, 0.25, na.rm = TRUE),
ntprobnp_q3 = quantile(proBNP, 0.75, na.rm = TRUE)
) %>%
kable(digits = 1)
| elevated_n | elevated_pct | ntprobnp_median | ntprobnp_q1 | ntprobnp_q3 |
|---|---|---|---|---|
| 55 | 36.7 | 90.5 | 46.2 | 190.2 |
ARTyrs) and categorized as <5 years,
5-9 years, and >=10 yearsART) into NNRTI-based,
INSTI-based, PI-based, or
Other/mixedage), continuous in yearssex), recoded as male/femalebmi), calculated from height and
weightcurrent_smoker), recoded as
current vs non-current using smoke100 and
smokedayHTN), binary yes/nokey_vars <- c("proBNP", "ARTyrs", "ART", "age", "sex", "Height", "Weight", "HTN", "smokeday")
missing_table <- data.frame(
Variable = key_vars,
Missing_n = sapply(key_vars, function(v) sum(is.na(hiv[[v]]))),
Missing_pct = sapply(key_vars, function(v) mean(is.na(hiv[[v]])) * 100)
)
missing_table %>%
kable(digits = 1)
| Variable | Missing_n | Missing_pct | |
|---|---|---|---|
| proBNP | proBNP | 0 | 0.0 |
| ARTyrs | ARTyrs | 0 | 0.0 |
| ART | ART | 0 | 0.0 |
| age | age | 0 | 0.0 |
| sex | sex | 0 | 0.0 |
| Height | Height | 0 | 0.0 |
| Weight | Weight | 0 | 0.0 |
| HTN | HTN | 0 | 0.0 |
| smokeday | smokeday | 31 | 20.7 |
Smoking-day values are missing only for participants who reported never smoking 100 cigarettes in their lifetime; these participants were classified as non-current smokers. All other key variables had complete data in the final analytic sample.
table1 <- bind_rows(
data.frame(Characteristic = "Age, years", Summary = sprintf("%.1f (%.1f)", mean(hiv$age, na.rm = TRUE), sd(hiv$age, na.rm = TRUE))),
data.frame(Characteristic = "BMI, kg/m^2", Summary = sprintf("%.1f (%.1f)", mean(hiv$bmi, na.rm = TRUE), sd(hiv$bmi, na.rm = TRUE))),
data.frame(Characteristic = "NT-proBNP, pg/mL", Summary = sprintf("%.1f [%.1f, %.1f]",
median(hiv$proBNP, na.rm = TRUE),
quantile(hiv$proBNP, 0.25, na.rm = TRUE),
quantile(hiv$proBNP, 0.75, na.rm = TRUE))),
data.frame(Characteristic = "Elevated NT-proBNP: Yes", Summary = sprintf("%d (%.1f%%)",
sum(hiv$elevated_ntprobnp == 1, na.rm = TRUE),
mean(hiv$elevated_ntprobnp == 1, na.rm = TRUE) * 100)),
data.frame(Characteristic = "Sex: Male", Summary = sprintf("%d (%.1f%%)",
sum(hiv$sex_label == "Male", na.rm = TRUE),
mean(hiv$sex_label == "Male", na.rm = TRUE) * 100)),
data.frame(Characteristic = "Sex: Female", Summary = sprintf("%d (%.1f%%)",
sum(hiv$sex_label == "Female", na.rm = TRUE),
mean(hiv$sex_label == "Female", na.rm = TRUE) * 100)),
data.frame(Characteristic = "Current smoker: Current", Summary = sprintf("%d (%.1f%%)",
sum(hiv$smoker_label == "Current", na.rm = TRUE),
mean(hiv$smoker_label == "Current", na.rm = TRUE) * 100)),
data.frame(Characteristic = "Current smoker: Non-current", Summary = sprintf("%d (%.1f%%)",
sum(hiv$smoker_label == "Non-current", na.rm = TRUE),
mean(hiv$smoker_label == "Non-current", na.rm = TRUE) * 100)),
data.frame(Characteristic = "Hypertension: Yes", Summary = sprintf("%d (%.1f%%)",
sum(hiv$htn_label == "Yes", na.rm = TRUE),
mean(hiv$htn_label == "Yes", na.rm = TRUE) * 100)),
data.frame(Characteristic = "ARV regimen: INSTI-based", Summary = sprintf("%d (%.1f%%)",
sum(hiv$regimen_group == "INSTI-based", na.rm = TRUE),
mean(hiv$regimen_group == "INSTI-based", na.rm = TRUE) * 100)),
data.frame(Characteristic = "ARV regimen: NNRTI-based", Summary = sprintf("%d (%.1f%%)",
sum(hiv$regimen_group == "NNRTI-based", na.rm = TRUE),
mean(hiv$regimen_group == "NNRTI-based", na.rm = TRUE) * 100)),
data.frame(Characteristic = "ARV regimen: PI-based", Summary = sprintf("%d (%.1f%%)",
sum(hiv$regimen_group == "PI-based", na.rm = TRUE),
mean(hiv$regimen_group == "PI-based", na.rm = TRUE) * 100)),
data.frame(Characteristic = "ARV regimen: Other/mixed", Summary = sprintf("%d (%.1f%%)",
sum(hiv$regimen_group == "Other/mixed", na.rm = TRUE),
mean(hiv$regimen_group == "Other/mixed", na.rm = TRUE) * 100)),
data.frame(Characteristic = "ART duration: <5 years", Summary = sprintf("%d (%.1f%%)",
sum(hiv$art_duration_cat == "<5 years", na.rm = TRUE),
mean(hiv$art_duration_cat == "<5 years", na.rm = TRUE) * 100)),
data.frame(Characteristic = "ART duration: 5-9 years", Summary = sprintf("%d (%.1f%%)",
sum(hiv$art_duration_cat == "5-9 years", na.rm = TRUE),
mean(hiv$art_duration_cat == "5-9 years", na.rm = TRUE) * 100)),
data.frame(Characteristic = "ART duration: >=10 years", Summary = sprintf("%d (%.1f%%)",
sum(hiv$art_duration_cat == ">=10 years", na.rm = TRUE),
mean(hiv$art_duration_cat == ">=10 years", na.rm = TRUE) * 100))
)
kable(table1, col.names = c("Characteristic", paste0("Overall (n = ", n_final, ")")))
| Characteristic | Overall (n = 150) |
|---|---|
| Age, years | 50.6 (7.9) |
| BMI, kg/m^2 | 23.8 (3.6) |
| NT-proBNP, pg/mL | 90.5 [46.2, 190.2] |
| Elevated NT-proBNP: Yes | 55 (36.7%) |
| Sex: Male | 82 (54.7%) |
| Sex: Female | 68 (45.3%) |
| Current smoker: Current | 111 (74.0%) |
| Current smoker: Non-current | 39 (26.0%) |
| Hypertension: Yes | 22 (14.7%) |
| ARV regimen: INSTI-based | 75 (50.0%) |
| ARV regimen: NNRTI-based | 63 (42.0%) |
| ARV regimen: PI-based | 7 (4.7%) |
| ARV regimen: Other/mixed | 5 (3.3%) |
| ART duration: <5 years | 56 (37.3%) |
| ART duration: 5-9 years | 59 (39.3%) |
| ART duration: >=10 years | 35 (23.3%) |
Note: Continuous variables are reported as mean (SD), except NT-proBNP, which is reported as median [IQR] because of right skew. Smoking status was coded as current vs non-current. Because the main exposure of interest is ART duration measured continuously, an overall unstratified Table 1 is presented.
ggplot(hiv, aes(x = proBNP)) +
geom_histogram(bins = 30, color = "black", fill = "steelblue", alpha = 0.8) +
labs(
title = "Figure 1. Distribution of NT-proBNP",
x = "NT-proBNP (pg/mL)",
y = "Frequency"
) +
theme_minimal()
The continuous NT-proBNP distribution is strongly right-skewed, with most observations clustered at lower values and a small number of very high values. This pattern supports using a clinically meaningful binary threshold for the main analysis rather than treating the raw biomarker as normally distributed.
ggplot(hiv, aes(x = ARTyrs, y = proBNP)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = TRUE) +
labs(
title = "Figure 2. NT-proBNP by ART Duration",
x = "ART duration (years)",
y = "NT-proBNP (pg/mL)"
) +
theme_minimal()
The scatterplot suggests a moderate positive association between longer ART duration and higher NT-proBNP values, although the pattern is influenced by a few extreme values. The direction is consistent with the hypothesis, but the wide spread indicates that confounding by age, hypertension, and regimen type should be addressed in later models.
ggplot(hiv, aes(x = regimen_group, y = proBNP)) +
geom_boxplot() +
labs(
title = "Figure 3. NT-proBNP across ARV Regimen Groups",
x = "ARV regimen group",
y = "NT-proBNP (pg/mL)"
) +
theme_minimal()
NT-proBNP distributions overlap across the major regimen groups, although the mixed/other group appears to have a higher center and wider spread. Because PI-based and mixed regimens include few participants, these apparent differences should be interpreted cautiously until adjusted models are fit.
model_unadj <- glm(elevated_ntprobnp ~ ARTyrs, data = hiv, family = binomial())
coef_table <- summary(model_unadj)$coefficients
or <- exp(coef(model_unadj)["ARTyrs"])
ci <- exp(confint(model_unadj, parm = "ARTyrs"))
data.frame(
Term = "ART duration (per year)",
OR = round(or, 2),
CI_Lower = round(ci[1], 2),
CI_Upper = round(ci[2], 2),
p_value = round(coef_table["ARTyrs", "Pr(>|z|)"], 3)
) %>%
kable()
| Term | OR | CI_Lower | CI_Upper | p_value | |
|---|---|---|---|---|---|
| ARTyrs | ART duration (per year) | 1.07 | 0.99 | 1.15 | 0.094 |
The unadjusted logistic regression gives a preliminary estimate of the association between ART duration and elevated NT-proBNP. This estimate is in the hypothesized positive direction, but the confidence interval includes the null, suggesting that precision is limited and adjustment for confounding will be important in Progress Check-in 2.