Project Title

Association of ART Duration and ARV Regimen Type with Elevated NT-proBNP among People Living with HIV Aged 40 Years and Older

1. Data Loading and Analytical Sample

# Read the Excel file from the same folder as this Rmd
hiv_raw <- read_excel("C:/Users/userp/OneDrive/Рабочий стол/HSTA553/Project EPI 553/nursultans full.xlsx")

# Starting sample size
n_start <- nrow(hiv_raw)
n_start
## [1] 150
# Recode helper function for regimen groups
classify_regimen <- function(x) {
  x <- tolower(as.character(x))

  has_pi <- grepl("резолста|rezolsta|калетра|kaletra|darun", x)
  has_insti <- grepl("теград|dtg|долутегравир|dolutegravir|тивикай|tivicay|триумек|triumeq", x)
  has_nnrti <- grepl("efv|эфавиренз|атрипла|тенмифа|комплера|complera|rilp", x)

  n_classes <- sum(c(has_pi, has_insti, has_nnrti))

  if (n_classes > 1) {
    "Other/mixed"
  } else if (has_pi) {
    "PI-based"
  } else if (has_insti) {
    "INSTI-based"
  } else if (has_nnrti) {
    "NNRTI-based"
  } else {
    "Other/mixed"
  }
}

hiv <- hiv_raw %>%
  mutate(
    bmi = Weight / (Height / 100)^2,
    elevated_ntprobnp = ifelse(proBNP >= 125, 1, 0),
    sex_label = ifelse(sex == 1, "Male", "Female"),
    current_smoker = ifelse(smoke100 == 1 & smokeday %in% c(1, 2), 1, 0),
    smoker_label = ifelse(current_smoker == 1, "Current", "Non-current"),
    htn_label = ifelse(HTN == 1, "Yes", "No"),
    regimen_group = sapply(ART, classify_regimen),
    art_duration_cat = case_when(
      ARTyrs < 5 ~ "<5 years",
      ARTyrs >= 5 & ARTyrs <= 9 ~ "5-9 years",
      ARTyrs >= 10 ~ ">=10 years",
      TRUE ~ NA_character_
    )
  ) %>%
  filter(age >= 40)

n_final <- nrow(hiv)
n_excluded_age <- n_start - n_final

data.frame(
  Step = c("Raw dataset", "Exclude age < 40 years", "Final analytic sample"),
  N = c(n_start, n_excluded_age, n_final)
) %>%
  kable()
Step N
Raw dataset 150
Exclude age < 40 years 0
Final analytic sample 150

The dataset was imported from a single Excel worksheet, so no merging was required.The final analytic sample was 150.

2. Variable Selection, Recoding, and Data Cleaning

Outcome

The primary modeled outcome is elevated NT-proBNP, defined as proBNP >= 125 pg/mL.

hiv %>%
  summarise(
    elevated_n = sum(elevated_ntprobnp, na.rm = TRUE),
    elevated_pct = mean(elevated_ntprobnp, na.rm = TRUE) * 100,
    ntprobnp_median = median(proBNP, na.rm = TRUE),
    ntprobnp_q1 = quantile(proBNP, 0.25, na.rm = TRUE),
    ntprobnp_q3 = quantile(proBNP, 0.75, na.rm = TRUE)
  ) %>%
  kable(digits = 1)
elevated_n elevated_pct ntprobnp_median ntprobnp_q1 ntprobnp_q3
55 36.7 90.5 46.2 190.2

Exposure variables

  • ART duration: continuous in years (ARTyrs) and categorized as <5 years, 5-9 years, and >=10 years
  • ARV regimen type: recoded from the raw regimen string (ART) into NNRTI-based, INSTI-based, PI-based, or Other/mixed

Covariates

  • Age (age), continuous in years
  • Sex (sex), recoded as male/female
  • BMI (bmi), calculated from height and weight
  • Smoking (current_smoker), recoded as current vs non-current using smoke100 and smokeday
  • Hypertension (HTN), binary yes/no

Missing data

key_vars <- c("proBNP", "ARTyrs", "ART", "age", "sex", "Height", "Weight", "HTN", "smokeday")

missing_table <- data.frame(
  Variable = key_vars,
  Missing_n = sapply(key_vars, function(v) sum(is.na(hiv[[v]]))),
  Missing_pct = sapply(key_vars, function(v) mean(is.na(hiv[[v]])) * 100)
)

missing_table %>%
  kable(digits = 1)
Variable Missing_n Missing_pct
proBNP proBNP 0 0.0
ARTyrs ARTyrs 0 0.0
ART ART 0 0.0
age age 0 0.0
sex sex 0 0.0
Height Height 0 0.0
Weight Weight 0 0.0
HTN HTN 0 0.0
smokeday smokeday 31 20.7

Smoking-day values are missing only for participants who reported never smoking 100 cigarettes in their lifetime; these participants were classified as non-current smokers. All other key variables had complete data in the final analytic sample.

3. Descriptive Statistics (Table 1)

table1 <- bind_rows(
  data.frame(Characteristic = "Age, years", Summary = sprintf("%.1f (%.1f)", mean(hiv$age, na.rm = TRUE), sd(hiv$age, na.rm = TRUE))),
  data.frame(Characteristic = "BMI, kg/m^2", Summary = sprintf("%.1f (%.1f)", mean(hiv$bmi, na.rm = TRUE), sd(hiv$bmi, na.rm = TRUE))),
  data.frame(Characteristic = "NT-proBNP, pg/mL", Summary = sprintf("%.1f [%.1f, %.1f]",
                                                                    median(hiv$proBNP, na.rm = TRUE),
                                                                    quantile(hiv$proBNP, 0.25, na.rm = TRUE),
                                                                    quantile(hiv$proBNP, 0.75, na.rm = TRUE))),
  data.frame(Characteristic = "Elevated NT-proBNP: Yes", Summary = sprintf("%d (%.1f%%)",
                                                                            sum(hiv$elevated_ntprobnp == 1, na.rm = TRUE),
                                                                            mean(hiv$elevated_ntprobnp == 1, na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Sex: Male", Summary = sprintf("%d (%.1f%%)",
                                                              sum(hiv$sex_label == "Male", na.rm = TRUE),
                                                              mean(hiv$sex_label == "Male", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Sex: Female", Summary = sprintf("%d (%.1f%%)",
                                                                sum(hiv$sex_label == "Female", na.rm = TRUE),
                                                                mean(hiv$sex_label == "Female", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Current smoker: Current", Summary = sprintf("%d (%.1f%%)",
                                                                            sum(hiv$smoker_label == "Current", na.rm = TRUE),
                                                                            mean(hiv$smoker_label == "Current", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Current smoker: Non-current", Summary = sprintf("%d (%.1f%%)",
                                                                                sum(hiv$smoker_label == "Non-current", na.rm = TRUE),
                                                                                mean(hiv$smoker_label == "Non-current", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "Hypertension: Yes", Summary = sprintf("%d (%.1f%%)",
                                                                      sum(hiv$htn_label == "Yes", na.rm = TRUE),
                                                                      mean(hiv$htn_label == "Yes", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: INSTI-based", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$regimen_group == "INSTI-based", na.rm = TRUE),
                                                                             mean(hiv$regimen_group == "INSTI-based", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: NNRTI-based", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$regimen_group == "NNRTI-based", na.rm = TRUE),
                                                                             mean(hiv$regimen_group == "NNRTI-based", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: PI-based", Summary = sprintf("%d (%.1f%%)",
                                                                          sum(hiv$regimen_group == "PI-based", na.rm = TRUE),
                                                                          mean(hiv$regimen_group == "PI-based", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ARV regimen: Other/mixed", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$regimen_group == "Other/mixed", na.rm = TRUE),
                                                                             mean(hiv$regimen_group == "Other/mixed", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ART duration: <5 years", Summary = sprintf("%d (%.1f%%)",
                                                                           sum(hiv$art_duration_cat == "<5 years", na.rm = TRUE),
                                                                           mean(hiv$art_duration_cat == "<5 years", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ART duration: 5-9 years", Summary = sprintf("%d (%.1f%%)",
                                                                            sum(hiv$art_duration_cat == "5-9 years", na.rm = TRUE),
                                                                            mean(hiv$art_duration_cat == "5-9 years", na.rm = TRUE) * 100)),
  data.frame(Characteristic = "ART duration: >=10 years", Summary = sprintf("%d (%.1f%%)",
                                                                             sum(hiv$art_duration_cat == ">=10 years", na.rm = TRUE),
                                                                             mean(hiv$art_duration_cat == ">=10 years", na.rm = TRUE) * 100))
)

kable(table1, col.names = c("Characteristic", paste0("Overall (n = ", n_final, ")")))
Characteristic Overall (n = 150)
Age, years 50.6 (7.9)
BMI, kg/m^2 23.8 (3.6)
NT-proBNP, pg/mL 90.5 [46.2, 190.2]
Elevated NT-proBNP: Yes 55 (36.7%)
Sex: Male 82 (54.7%)
Sex: Female 68 (45.3%)
Current smoker: Current 111 (74.0%)
Current smoker: Non-current 39 (26.0%)
Hypertension: Yes 22 (14.7%)
ARV regimen: INSTI-based 75 (50.0%)
ARV regimen: NNRTI-based 63 (42.0%)
ARV regimen: PI-based 7 (4.7%)
ARV regimen: Other/mixed 5 (3.3%)
ART duration: <5 years 56 (37.3%)
ART duration: 5-9 years 59 (39.3%)
ART duration: >=10 years 35 (23.3%)

Note: Continuous variables are reported as mean (SD), except NT-proBNP, which is reported as median [IQR] because of right skew. Smoking status was coded as current vs non-current. Because the main exposure of interest is ART duration measured continuously, an overall unstratified Table 1 is presented.

4. Exploratory Data Analysis (EDA)

Figure 1. Distribution of NT-proBNP

ggplot(hiv, aes(x = proBNP)) +
  geom_histogram(bins = 30, color = "black", fill = "steelblue", alpha = 0.8) +
  labs(
    title = "Figure 1. Distribution of NT-proBNP",
    x = "NT-proBNP (pg/mL)",
    y = "Frequency"
  ) +
  theme_minimal()

The continuous NT-proBNP distribution is strongly right-skewed, with most observations clustered at lower values and a small number of very high values. This pattern supports using a clinically meaningful binary threshold for the main analysis rather than treating the raw biomarker as normally distributed.

Figure 2. NT-proBNP and ART duration

ggplot(hiv, aes(x = ARTyrs, y = proBNP)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "Figure 2. NT-proBNP by ART Duration",
    x = "ART duration (years)",
    y = "NT-proBNP (pg/mL)"
  ) +
  theme_minimal()

The scatterplot suggests a moderate positive association between longer ART duration and higher NT-proBNP values, although the pattern is influenced by a few extreme values. The direction is consistent with the hypothesis, but the wide spread indicates that confounding by age, hypertension, and regimen type should be addressed in later models.

Figure 3. NT-proBNP across ARV regimen groups

ggplot(hiv, aes(x = regimen_group, y = proBNP)) +
  geom_boxplot() +
  labs(
    title = "Figure 3. NT-proBNP across ARV Regimen Groups",
    x = "ARV regimen group",
    y = "NT-proBNP (pg/mL)"
  ) +
  theme_minimal()

NT-proBNP distributions overlap across the major regimen groups, although the mixed/other group appears to have a higher center and wider spread. Because PI-based and mixed regimens include few participants, these apparent differences should be interpreted cautiously until adjusted models are fit.

Optional Preliminary Regression Preview

model_unadj <- glm(elevated_ntprobnp ~ ARTyrs, data = hiv, family = binomial())

coef_table <- summary(model_unadj)$coefficients
or <- exp(coef(model_unadj)["ARTyrs"])
ci <- exp(confint(model_unadj, parm = "ARTyrs"))

data.frame(
  Term = "ART duration (per year)",
  OR = round(or, 2),
  CI_Lower = round(ci[1], 2),
  CI_Upper = round(ci[2], 2),
  p_value = round(coef_table["ARTyrs", "Pr(>|z|)"], 3)
) %>%
  kable()
Term OR CI_Lower CI_Upper p_value
ARTyrs ART duration (per year) 1.07 0.99 1.15 0.094

The unadjusted logistic regression gives a preliminary estimate of the association between ART duration and elevated NT-proBNP. This estimate is in the hypothesized positive direction, but the confidence interval includes the null, suggesting that precision is limited and adjustment for confounding will be important in Progress Check-in 2.