Project Title

Association of ART Duration and ARV Regimen Type with Elevated NT-proBNP among People Living with HIV Aged 40 Years and Older

1. Data Loading and Analytical Sample

# Read the Excel file from the same folder as this Rmd
hiv_raw <- read_excel("C:/Users/userp/OneDrive/Рабочий стол/HSTA553/Project EPI 553/nursultans full.xlsx")

# Starting sample size
n_start <- nrow(hiv_raw)
n_start

## [1] 150

# Recode helper function for regimen groups
classify_regimen <- function(x) {
  x <- tolower(as.character(x))

  has_pi <- grepl("резолста|rezolsta|калетра|kaletra|darun", x)
  has_insti <- grepl("теград|dtg|долутегравир|dolutegravir|тивикай|tivicay|триумек|triumeq", x)
  has_nnrti <- grepl("efv|эфавиренз|атрипла|тенмифа|комплера|complera|rilp", x)

  n_classes <- sum(c(has_pi, has_insti, has_nnrti))

  if (n_classes > 1) {
    "Other/mixed"
  } else if (has_pi) {
    "PI-based"
  } else if (has_insti) {
    "INSTI-based"
  } else if (has_nnrti) {
    "NNRTI-based"
  } else {
    "Other/mixed"
  }
}

hiv <- hiv_raw %>%
  mutate(
    bmi = Weight / (Height / 100)^2,
    elevated_ntprobnp = ifelse(proBNP >= 125, 1, 0),
    sex_label = ifelse(sex == 1, "Male", "Female"),
    current_smoker = ifelse(smoke100 == 1 & smokeday %in% c(1, 2), 1, 0),
    smoker_label = ifelse(current_smoker == 1, "Current", "Non-current"),
    htn_label = ifelse(HTN == 1, "Yes", "No"),
    regimen_group = sapply(ART, classify_regimen),
    art_duration_cat = case_when(
      ARTyrs < 5 ~ "<5 years",
      ARTyrs >= 5 & ARTyrs <= 9 ~ "5-9 years",
      ARTyrs >= 10 ~ ">=10 years",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(age >= 40)

n_final <- nrow(hiv)
n_excluded_age <- n_start - n_final

data.frame(
  Step = c("Raw dataset", "Exclude age < 40 years", "Final analytic sample"),
  N = c(n_start, n_excluded_age, n_final)
) %>%
  kable()

Step	N
Raw dataset	150
Exclude age < 40 years	0
Final analytic sample	150

The dataset was imported from a single Excel worksheet, so no merging was required.The final analytic sample was 150.

2. Variable Selection, Recoding, and Data Cleaning

Outcome

The primary modeled outcome is elevated NT-proBNP, defined as proBNP >= 125 pg/mL.

hiv %>%
  summarise(
    elevated_n = sum(elevated_ntprobnp, na.rm = TRUE),
    elevated_pct = mean(elevated_ntprobnp, na.rm = TRUE) * 100,
    ntprobnp_median = median(proBNP, na.rm = TRUE),
    ntprobnp_q1 = quantile(proBNP, 0.25, na.rm = TRUE),
    ntprobnp_q3 = quantile(proBNP, 0.75, na.rm = TRUE)
  ) %>%
  kable(digits = 1)

elevated_n	elevated_pct	ntprobnp_median	ntprobnp_q1	ntprobnp_q3
55	36.7	90.5	46.2	190.2

Exposure variables

ART duration: continuous in years (ARTyrs) and categorized as <5 years, 5-9 years, and >=10 years
ARV regimen type: recoded from the raw regimen string (ART) into NNRTI-based, INSTI-based, PI-based, or Other/mixed

Covariates

Age (age), continuous in years
Sex (sex), recoded as male/female
BMI (bmi), calculated from height and weight
Smoking (current_smoker), recoded as current vs non-current using smoke100 and smokeday
Hypertension (HTN), binary yes/no

Missing data

key_vars <- c("proBNP", "ARTyrs", "ART", "age", "sex", "Height", "Weight", "HTN", "smokeday")

missing_table <- data.frame(
  Variable = key_vars,
  Missing_n = sapply(key_vars, function(v) sum(is.na(hiv[[v]]))),
  Missing_pct = sapply(key_vars, function(v) mean(is.na(hiv[[v]])) * 100)
)

missing_table %>%
  kable(digits = 1)

	Variable	Missing_n	Missing_pct
proBNP	proBNP	0	0.0
ARTyrs	ARTyrs	0	0.0
ART	ART	0	0.0
age	age	0	0.0
sex	sex	0	0.0
Height	Height	0	0.0
Weight	Weight	0	0.0
HTN	HTN	0	0.0
smokeday	smokeday	31	20.7

Smoking-day values are missing only for participants who reported never smoking 100 cigarettes in their lifetime; these participants were classified as non-current smokers. All other key variables had complete data in the final analytic sample.

3. Descriptive Statistics (Table 1)

table1 <- bind_rows(
  data.frame(Characteristic = "Age, years", Summary = sprintf("%.1f (%.1f)", mean(hiv$age, na.rm = TRUE), sd(hiv$age, na.rm = TRUE))),
  data.frame(Characteristic = "BMI, kg/m^2", Summary = sprintf("%.1f (%.1f)", mean(hiv$bmi, na.rm = TRUE), sd(hiv$bmi, na.rm = TRUE))),
  data.frame(Characteristic = "NT-proBNP, pg/mL", Summary = sprintf("%.1f [%.1f, %.1f]",
                                                                    median(hiv$proBNP, na.rm = TRUE),
                                                                    quantile(hiv$proBNP, 0.25, na.rm = TRUE),
                                                                    quantile(hiv$proBNP, 0.75, na.rm = TRUE))),
  data.frame(Characteristic = "Elevated NT-proBNP: Yes", Summary = sprintf("%d (%.1f%%)",
                                                                            sum(hiv$elevated_ntprobnp == 1, na.rm = TRUE),
                                                                            mean(hiv$elevated_ntprobnp == 1, na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Sex: Male", Summary = sprintf("%d (%.1f%%)",
                                                              sum(hiv$sex_label == "Male", na.rm = TRUE),
                                                              mean(hiv$sex_label == "Male", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Sex: Female", Summary = sprintf("%d (%.1f%%)",
                                                                sum(hiv$sex_label == "Female", na.rm = TRUE),
                                                                mean(hiv$sex_label == "Female", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Current smoker: Current", Summary = sprintf("%d (%.1f%%)",
                                                                            sum(hiv$smoker_label == "Current", na.rm = TRUE),
                                                                            mean(hiv$smoker_label == "Current", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Current smoker: Non-current", Summary = sprintf("%d (%.1f%%)",
                                                                                sum(hiv$smoker_label == "Non-current", na.rm = TRUE),
                                                                                mean(hiv$smoker_label == "Non-current", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Hypertension: Yes", Summary = sprintf("%d (%.1f%%)",
                                                                      sum(hiv$htn_label == "Yes", na.rm = TRUE),
                                                                      mean(hiv$htn_label == "Yes", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: INSTI-based", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$regimen_group == "INSTI-based", na.rm = TRUE),
                                                                             mean(hiv$regimen_group == "INSTI-based", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: NNRTI-based", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$regimen_group == "NNRTI-based", na.rm = TRUE),
                                                                             mean(hiv$regimen_group == "NNRTI-based", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: PI-based", Summary = sprintf("%d (%.1f%%)",
                                                                          sum(hiv$regimen_group == "PI-based", na.rm = TRUE),
                                                                          mean(hiv$regimen_group == "PI-based", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: Other/mixed", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$regimen_group == "Other/mixed", na.rm = TRUE),
                                                                             mean(hiv$regimen_group == "Other/mixed", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ART duration: <5 years", Summary = sprintf("%d (%.1f%%)",
                                                                           sum(hiv$art_duration_cat == "<5 years", na.rm = TRUE),
                                                                           mean(hiv$art_duration_cat == "<5 years", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ART duration: 5-9 years", Summary = sprintf("%d (%.1f%%)",
                                                                            sum(hiv$art_duration_cat == "5-9 years", na.rm = TRUE),
                                                                            mean(hiv$art_duration_cat == "5-9 years", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ART duration: >=10 years", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$art_duration_cat == ">=10 years", na.rm = TRUE),
                                                                             mean(hiv$art_duration_cat == ">=10 years", na.rm = TRUE) * 100))
)

kable(table1, col.names = c("Characteristic", paste0("Overall (n = ", n_final, ")")))

Characteristic	Overall (n = 150)
Age, years	50.6 (7.9)
BMI, kg/m^2	23.8 (3.6)
NT-proBNP, pg/mL	90.5 [46.2, 190.2]
Elevated NT-proBNP: Yes	55 (36.7%)
Sex: Male	82 (54.7%)
Sex: Female	68 (45.3%)
Current smoker: Current	111 (74.0%)
Current smoker: Non-current	39 (26.0%)
Hypertension: Yes	22 (14.7%)
ARV regimen: INSTI-based	75 (50.0%)
ARV regimen: NNRTI-based	63 (42.0%)
ARV regimen: PI-based	7 (4.7%)
ARV regimen: Other/mixed	5 (3.3%)
ART duration: <5 years	56 (37.3%)
ART duration: 5-9 years	59 (39.3%)
ART duration: >=10 years	35 (23.3%)

Note: Continuous variables are reported as mean (SD), except NT-proBNP, which is reported as median [IQR] because of right skew. Smoking status was coded as current vs non-current. Because the main exposure of interest is ART duration measured continuously, an overall unstratified Table 1 is presented.

4. Exploratory Data Analysis (EDA)

Figure 1. Distribution of NT-proBNP

ggplot(hiv, aes(x = proBNP)) +
  geom_histogram(bins = 30, color = "black", fill = "steelblue", alpha = 0.8) +
  labs(
    title = "Figure 1. Distribution of NT-proBNP",
    x = "NT-proBNP (pg/mL)",
    y = "Frequency"
  ) +
  theme_minimal()

The continuous NT-proBNP distribution is strongly right-skewed, with most observations clustered at lower values and a small number of very high values. This pattern supports using a clinically meaningful binary threshold for the main analysis rather than treating the raw biomarker as normally distributed.

Figure 2. NT-proBNP and ART duration

ggplot(hiv, aes(x = ARTyrs, y = proBNP)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Figure 2. NT-proBNP by ART Duration",
    x = "ART duration (years)",
    y = "NT-proBNP (pg/mL)"
  ) +
  theme_minimal()

The scatterplot suggests a moderate positive association between longer ART duration and higher NT-proBNP values, although the pattern is influenced by a few extreme values. The direction is consistent with the hypothesis, but the wide spread indicates that confounding by age, hypertension, and regimen type should be addressed in later models.

Figure 3. NT-proBNP across ARV regimen groups

ggplot(hiv, aes(x = regimen_group, y = proBNP)) +
  geom_boxplot() +
  labs(
    title = "Figure 3. NT-proBNP across ARV Regimen Groups",
    x = "ARV regimen group",
    y = "NT-proBNP (pg/mL)"
  ) +
  theme_minimal()

NT-proBNP distributions overlap across the major regimen groups, although the mixed/other group appears to have a higher center and wider spread. Because PI-based and mixed regimens include few participants, these apparent differences should be interpreted cautiously until adjusted models are fit.

Optional Preliminary Regression Preview

model_unadj <- glm(elevated_ntprobnp ~ ARTyrs, data = hiv, family = binomial())

coef_table <- summary(model_unadj)$coefficients
or <- exp(coef(model_unadj)["ARTyrs"])
ci <- exp(confint(model_unadj, parm = "ARTyrs"))

data.frame(
  Term = "ART duration (per year)",
  OR = round(or, 2),
  CI_Lower = round(ci[1], 2),
  CI_Upper = round(ci[2], 2),
  p_value = round(coef_table["ARTyrs", "Pr(>|z|)"], 3)
) %>%
  kable()

	Term	OR	CI_Lower	CI_Upper	p_value
ARTyrs	ART duration (per year)	1.07	0.99	1.15	0.094

The unadjusted logistic regression gives a preliminary estimate of the association between ART duration and elevated NT-proBNP. This estimate is in the hypothesized positive direction, but the confidence interval includes the null, suggesting that precision is limited and adjustment for confounding will be important in Progress Check-in 2.

EPI 553 Progress Check-in 1

Nursultan Nurzhigitov

2026-03-11