1 Data exploration

# Explore the data structure using str()
str(htndat)

## 'data.frame':    4998 obs. of  17 variables:
##  $ DBP               : int  60 75 60 60 60 60 59 60 70 58 ...
##  $ SBP               : int  90 110 80 90 100 120 100 100 120 110 ...
##  $ BMI               : num  NA 27.3 17.7 19.9 21.3 ...
##  $ age               : num  28 26.5 43 50.1 30.6 ...
##  $ married           : int  0 1 0 1 0 1 0 0 1 1 ...
##  $ male.gender       : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ hgb_centered      : num  NA -3.9 -3.2 NA -0.4 ...
##  $ adv_HIV           : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ survtime          : int  338 439 752 526 215 272 1669 388 14 419 ...
##  $ event             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ arv_naive         : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ urban.clinic      : int  0 1 0 1 0 0 1 0 0 0 ...
##  $ log_creat_centered: num  NA 5.42e-02 -3.60e-01 NA -1.00e-07 ...
##  $ IPW_weight        : num  0.924 1.164 0.721 0.829 0.856 ...
##  $ SBP_ge120         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ survmonth         : num  12 15 26 18 8 10 56 13 1 14 ...
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...

# Explore % of missing values by variable
miss <- apply(htndat, 2, function(x) sum(is.na(x))/nrow(htndat)*100)

# Use kable() function from kableExtra package to get a clean table
miss <- miss[miss != 0] %>% 
  kableExtra::kable(booktabs = TRUE,
        digits = 2, 
        col.names = "",
        caption="Missing data percentage among variables in the dataset") %>% 
  kableExtra::kable_styling(latex_options = c("striped", "HOLD_position")
  )

miss

Missing data percentage among variables in the dataset

BMI	13.39
married	3.26
hgb_centered	27.87
adv_HIV	39.22
log_creat_centered	30.75

2 Descriptive statistics

## ----summaries and tables for HTN data  -----------------------------------------------------

# Create a summary table
gtsummary::tbl_summary(data = htndat,
                              by = adv_HIV,
                              include = c(SBP, BMI, age, married, 
                                          male.gender, hgb_centered, event,
                                          arv_naive, urban.clinic,
                                          log_creat_centered, 
                                          SBP_ge120, adv_HIV),
                              label = list(adv_HIV = "Advanced HIV, 1 = Yes", 
                                           event = "Event (Death)"),
                              missing = "ifany",
                              missing_text = "Missing %",
                              missing_stat = "{p_miss}") %>% 
  gtsummary::bold_labels() %>% 
  gtsummary::add_overall() %>% 
  gtsummary::add_p()

Characteristic	Overall N = 3,038¹	0 N = 1,130¹	1 N = 1,908¹	p-value²
SBP	110 (100, 120)	110 (100, 120)	110 (100, 120)	0.003
BMI	21.0 (18.9, 23.4)	21.9 (19.7, 24.7)	20.5 (18.6, 22.8)	<0.001
Missing %	5.4	5.5	5.3
age	34 (27, 42)	31 (26, 38)	35 (29, 44)	<0.001
married	1,773 (59%)	693 (62%)	1,080 (58%)	0.010
Missing %	1.9	1.9	1.9
male.gender	743 (24%)	206 (18%)	537 (28%)	<0.001
hgb_centered	0.20 (-1.60, 1.70)	0.60 (-1.10, 1.80)	-0.20 (-2.00, 1.50)	<0.001
Missing %	12	12	12
Event (Death)	398 (13%)	99 (8.8%)	299 (16%)	<0.001
arv_naive	2,871 (95%)	1,057 (94%)	1,814 (95%)	0.073
urban.clinic	1,526 (50%)	573 (51%)	953 (50%)	0.7
log_creat_centered	-0.09 (-0.28, 0.10)	-0.13 (-0.29, 0.06)	-0.08 (-0.28, 0.12)	0.002
Missing %	12	13	12
SBP_ge120	174 (5.7%)	63 (5.6%)	111 (5.8%)	0.8
¹ Median (Q1, Q3); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

3 Inspect key variables and creating the survival object

## ---view table of HTN data
# Inspect key variables
str(htndat[c("survmonth", "event")])

## 'data.frame':    4998 obs. of  2 variables:
##  $ survmonth: num  12 15 26 18 8 10 56 13 1 14 ...
##  $ event    : int  1 1 1 1 1 1 1 1 1 1 ...

#  'survmonth': num  ...
#  'event'    : int  ...

## ----Create survival object, fit KM curve 
# Create the survival object

surv_obj <- with(htndat, Surv(time = survmonth, event = event))

# Explanation:
#  - 'time'  = months from baseline to death or censoring
#  - 'event' = 1 if death occurred, 0 if censored

# Fit overall KM curve
km_overall <- survfit(surv_obj ~ 1, data = htndat)
print(km_overall)

## Call: survfit(formula = surv_obj ~ 1, data = htndat)
## 
##         n events median 0.95LCL 0.95UCL
## [1,] 4998    749     NA      NA      NA

# basic plot
plot(km_overall, xlab="Months", ylab="Survival proportion")

plot(km_overall, xlab="Months", ylab="Survival proportion", 
     ylim=c(.6,1),
     conf.int=FALSE)

# fancy plot
p = ggsurvplot(
  km_overall,
  data       = htndat,
  risk.table = TRUE,
  conf.int   = TRUE,
  xlab       = "Time (months)",
  ylab       = "Survival probability",
  ggtheme = theme_minimal()
)
print(p)

Survival Analysis on HTN data

Bernard Osang’ir, Rudradev Sungepta, Ziv Shkedy, Joseph Hogan, Richard Mugo, Ann Mwangi (July, 2025)

2025-07-14

1 Data exploration

2 Descriptive statistics

3 Inspect key variables and creating the survival object

4 KM Summaries

5 Hazard Functions

5.1 Plot log-hazard