1 Data exploration

# Explore the data structure using str()
str(htndat)
## 'data.frame':    4998 obs. of  17 variables:
##  $ DBP               : int  60 75 60 60 60 60 59 60 70 58 ...
##  $ SBP               : int  90 110 80 90 100 120 100 100 120 110 ...
##  $ BMI               : num  NA 27.3 17.7 19.9 21.3 ...
##  $ age               : num  28 26.5 43 50.1 30.6 ...
##  $ married           : int  0 1 0 1 0 1 0 0 1 1 ...
##  $ male.gender       : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ hgb_centered      : num  NA -3.9 -3.2 NA -0.4 ...
##  $ adv_HIV           : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ survtime          : int  338 439 752 526 215 272 1669 388 14 419 ...
##  $ event             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ arv_naive         : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ urban.clinic      : int  0 1 0 1 0 0 1 0 0 0 ...
##  $ log_creat_centered: num  NA 5.42e-02 -3.60e-01 NA -1.00e-07 ...
##  $ IPW_weight        : num  0.924 1.164 0.721 0.829 0.856 ...
##  $ SBP_ge120         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ survmonth         : num  12 15 26 18 8 10 56 13 1 14 ...
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...
# Explore % of missing values by variable
miss <- apply(htndat, 2, function(x) sum(is.na(x))/nrow(htndat)*100)

# Use kable() function from kableExtra package to get a clean table
miss <- miss[miss != 0] %>% 
  kableExtra::kable(booktabs = TRUE,
        digits = 2, 
        col.names = "",
        caption="Missing data percentage among variables in the dataset") %>% 
  kableExtra::kable_styling(latex_options = c("striped", "HOLD_position")
  )

miss
Missing data percentage among variables in the dataset
BMI 13.39
married 3.26
hgb_centered 27.87
adv_HIV 39.22
log_creat_centered 30.75

2 Descriptive statistics

## ----summaries and tables for HTN data  -----------------------------------------------------

# Create a summary table
gtsummary::tbl_summary(data = htndat,
                              by = adv_HIV,
                              include = c(SBP, BMI, age, married, 
                                          male.gender, hgb_centered, event,
                                          arv_naive, urban.clinic,
                                          log_creat_centered, 
                                          SBP_ge120, adv_HIV),
                              label = list(adv_HIV = "Advanced HIV, 1 = Yes", 
                                           event = "Event (Death)"),
                              missing = "ifany",
                              missing_text = "Missing %",
                              missing_stat = "{p_miss}") %>% 
  gtsummary::bold_labels() %>% 
  gtsummary::add_overall() %>% 
  gtsummary::add_p()
Characteristic Overall
N = 3,038
1
0
N = 1,130
1
1
N = 1,908
1
p-value2
SBP 110 (100, 120) 110 (100, 120) 110 (100, 120) 0.003
BMI 21.0 (18.9, 23.4) 21.9 (19.7, 24.7) 20.5 (18.6, 22.8) <0.001
    Missing % 5.4 5.5 5.3
age 34 (27, 42) 31 (26, 38) 35 (29, 44) <0.001
married 1,773 (59%) 693 (62%) 1,080 (58%) 0.010
    Missing % 1.9 1.9 1.9
male.gender 743 (24%) 206 (18%) 537 (28%) <0.001
hgb_centered 0.20 (-1.60, 1.70) 0.60 (-1.10, 1.80) -0.20 (-2.00, 1.50) <0.001
    Missing % 12 12 12
Event (Death) 398 (13%) 99 (8.8%) 299 (16%) <0.001
arv_naive 2,871 (95%) 1,057 (94%) 1,814 (95%) 0.073
urban.clinic 1,526 (50%) 573 (51%) 953 (50%) 0.7
log_creat_centered -0.09 (-0.28, 0.10) -0.13 (-0.29, 0.06) -0.08 (-0.28, 0.12) 0.002
    Missing % 12 13 12
SBP_ge120 174 (5.7%) 63 (5.6%) 111 (5.8%) 0.8
1 Median (Q1, Q3); n (%)
2 Wilcoxon rank sum test; Pearson’s Chi-squared test

3 Inspect key variables and creating the survival object

## ---view table of HTN data
# Inspect key variables
str(htndat[c("survmonth", "event")])
## 'data.frame':    4998 obs. of  2 variables:
##  $ survmonth: num  12 15 26 18 8 10 56 13 1 14 ...
##  $ event    : int  1 1 1 1 1 1 1 1 1 1 ...
#  'survmonth': num  ...
#  'event'    : int  ...

## ----Create survival object, fit KM curve 
# Create the survival object

surv_obj <- with(htndat, Surv(time = survmonth, event = event))

# Explanation:
#  - 'time'  = months from baseline to death or censoring
#  - 'event' = 1 if death occurred, 0 if censored
# Fit overall KM curve
km_overall <- survfit(surv_obj ~ 1, data = htndat)
print(km_overall)
## Call: survfit(formula = surv_obj ~ 1, data = htndat)
## 
##         n events median 0.95LCL 0.95UCL
## [1,] 4998    749     NA      NA      NA
# basic plot
plot(km_overall, xlab="Months", ylab="Survival proportion")

plot(km_overall, xlab="Months", ylab="Survival proportion", 
     ylim=c(.6,1),
     conf.int=FALSE)

# fancy plot
p = ggsurvplot(
  km_overall,
  data       = htndat,
  risk.table = TRUE,
  conf.int   = TRUE,
  xlab       = "Time (months)",
  ylab       = "Survival probability",
  ggtheme = theme_minimal()
)
print(p)

4 KM Summaries

5 Hazard Functions

5.1 Plot log-hazard