1 Data exploration

# Explore the data structure using str()
str(htndat)

## 'data.frame':    4998 obs. of  17 variables:
##  $ DBP               : int  60 75 60 60 60 60 59 60 70 58 ...
##  $ SBP               : int  90 110 80 90 100 120 100 100 120 110 ...
##  $ BMI               : num  NA 27.3 17.7 19.9 21.3 ...
##  $ age               : num  28 26.5 43 50.1 30.6 ...
##  $ married           : int  0 1 0 1 0 1 0 0 1 1 ...
##  $ male.gender       : int  0 0 0 1 1 0 0 1 0 0 ...
##  $ hgb_centered      : num  NA -3.9 -3.2 NA -0.4 ...
##  $ adv_HIV           : int  NA 0 NA NA NA NA NA NA NA NA ...
##  $ survtime          : int  338 439 752 526 215 272 1669 388 14 419 ...
##  $ event             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ arv_naive         : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ urban.clinic      : int  0 1 0 1 0 0 1 0 0 0 ...
##  $ log_creat_centered: num  NA 5.42e-02 -3.60e-01 NA -1.00e-07 ...
##  $ IPW_weight        : num  0.924 1.164 0.721 0.829 0.856 ...
##  $ SBP_ge120         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ survmonth         : num  12 15 26 18 8 10 56 13 1 14 ...
##  $ ID                : int  1 2 3 4 5 6 7 8 9 10 ...

# Explore % of missing values by variable
miss <- apply(htndat, 2, function(x) sum(is.na(x))/nrow(htndat)*100)

# Use kable() function from kableExtra package to get a clean table
miss <- miss[miss != 0] %>% 
  kableExtra::kable(booktabs = TRUE,
        digits = 2, 
        col.names = "",
        caption="Missing data percentage among variables in the dataset") %>% 
  kableExtra::kable_styling(latex_options = c("striped", "HOLD_position")
  )

miss

Missing data percentage among variables in the dataset

BMI	13.39
married	3.26
hgb_centered	27.87
adv_HIV	39.22
log_creat_centered	30.75

2 Descriptive statistics

## ----summaries and tables for HTN data  -----------------------------------------------------

# Create a summary table
gtsummary::tbl_summary(data = htndat,
                              by = adv_HIV,
                              include = c(SBP, BMI, age, married, 
                                          male.gender, hgb_centered, event,
                                          arv_naive, urban.clinic,
                                          log_creat_centered, 
                                          SBP_ge120, adv_HIV),
                              label = list(adv_HIV = "Advanced HIV, 1 = Yes", 
                                           event = "Event (Death)"),
                              missing = "ifany",
                              missing_text = "Missing %",
                              missing_stat = "{p_miss}") %>% 
  gtsummary::bold_labels() %>% 
  gtsummary::add_overall() %>% 
  gtsummary::add_p()

Characteristic	Overall N = 3,038¹	0 N = 1,130¹	1 N = 1,908¹	p-value²
SBP	110 (100, 120)	110 (100, 120)	110 (100, 120)	0.003
BMI	21.0 (18.9, 23.4)	21.9 (19.7, 24.7)	20.5 (18.6, 22.8)	<0.001
Missing %	5.4	5.5	5.3
age	34 (27, 42)	31 (26, 38)	35 (29, 44)	<0.001
married	1,773 (59%)	693 (62%)	1,080 (58%)	0.010
Missing %	1.9	1.9	1.9
male.gender	743 (24%)	206 (18%)	537 (28%)	<0.001
hgb_centered	0.20 (-1.60, 1.70)	0.60 (-1.10, 1.80)	-0.20 (-2.00, 1.50)	<0.001
Missing %	12	12	12
Event (Death)	398 (13%)	99 (8.8%)	299 (16%)	<0.001
arv_naive	2,871 (95%)	1,057 (94%)	1,814 (95%)	0.073
urban.clinic	1,526 (50%)	573 (51%)	953 (50%)	0.7
log_creat_centered	-0.09 (-0.28, 0.10)	-0.13 (-0.29, 0.06)	-0.08 (-0.28, 0.12)	0.002
Missing %	12	13	12
SBP_ge120	174 (5.7%)	63 (5.6%)	111 (5.8%)	0.8
¹ Median (Q1, Q3); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

3 Inspect key variables and creating the survival object

## ---view table of HTN data
# Inspect key variables
str(htndat[c("survmonth", "event")])

## 'data.frame':    4998 obs. of  2 variables:
##  $ survmonth: num  12 15 26 18 8 10 56 13 1 14 ...
##  $ event    : int  1 1 1 1 1 1 1 1 1 1 ...

#  'survmonth': num  ...
#  'event'    : int  ...

## ----Create survival object, fit KM curve 
# Create the survival object

surv_obj <- with(htndat, Surv(time = survmonth, event = event))

# Explanation:
#  - 'time'  = months from baseline to death or censoring
#  - 'event' = 1 if death occurred, 0 if censored

# Fit overall KM curve
km_overall <- survfit(surv_obj ~ 1, data = htndat)
print(km_overall)

## Call: survfit(formula = surv_obj ~ 1, data = htndat)
## 
##         n events median 0.95LCL 0.95UCL
## [1,] 4998    749     NA      NA      NA

# basic plot
plot(km_overall, xlab="Months", ylab="Survival proportion")

plot(km_overall, xlab="Months", ylab="Survival proportion", 
     ylim=c(.6,1),
     conf.int=FALSE)

# fancy plot
p = ggsurvplot(
  km_overall,
  data       = htndat,
  risk.table = TRUE,
  conf.int   = TRUE,
  xlab       = "Time (months)",
  ylab       = "Survival probability",
  ggtheme = theme_minimal()
)
print(p)

4 KM Summaries

summary(km_overall)

## Call: survfit(formula = surv_obj ~ 1, data = htndat)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1   4998      65    0.987 0.00160        0.984        0.990
##     2   4394      44    0.977 0.00217        0.973        0.981
##     3   4210      43    0.967 0.00263        0.962        0.972
##     4   4066      28    0.960 0.00290        0.955        0.966
##     5   3955      29    0.953 0.00316        0.947        0.960
##     6   3849      26    0.947 0.00338        0.940        0.954
##     7   3741      23    0.941 0.00357        0.934        0.948
##     8   3626      24    0.935 0.00377        0.928        0.942
##     9   3508      18    0.930 0.00391        0.923        0.938
##    10   3397      13    0.927 0.00402        0.919        0.934
##    11   3306      16    0.922 0.00415        0.914        0.930
##    12   3213      21    0.916 0.00433        0.908        0.925
##    13   3130      25    0.909 0.00454        0.900        0.918
##    14   3024      13    0.905 0.00464        0.896        0.914
##    15   2947      21    0.898 0.00482        0.889        0.908
##    16   2852      19    0.892 0.00498        0.883        0.902
##    17   2764      19    0.886 0.00514        0.876        0.896
##    18   2680      17    0.881 0.00528        0.870        0.891
##    19   2609       7    0.878 0.00535        0.868        0.889
##    20   2547      20    0.871 0.00552        0.861        0.882
##    21   2475       7    0.869 0.00558        0.858        0.880
##    22   2410      11    0.865 0.00569        0.854        0.876
##    23   2349      14    0.860 0.00582        0.848        0.871
##    24   2274      10    0.856 0.00591        0.845        0.868
##    25   2217      11    0.852 0.00602        0.840        0.864
##    26   2160      12    0.847 0.00614        0.835        0.859
##    27   2082       7    0.844 0.00621        0.832        0.856
##    28   2018       7    0.841 0.00629        0.829        0.854
##    29   1948       9    0.837 0.00639        0.825        0.850
##    30   1876       9    0.833 0.00650        0.821        0.846
##    31   1805       3    0.832 0.00654        0.819        0.845
##    32   1758      13    0.826 0.00671        0.813        0.839
##    33   1696       6    0.823 0.00679        0.810        0.836
##    34   1644       9    0.818 0.00692        0.805        0.832
##    35   1590       9    0.814 0.00705        0.800        0.828
##    36   1534       4    0.812 0.00711        0.798        0.826
##    37   1477       5    0.809 0.00719        0.795        0.823
##    38   1432       8    0.804 0.00733        0.790        0.819
##    39   1381      10    0.799 0.00750        0.784        0.813
##    40   1321       3    0.797 0.00756        0.782        0.812
##    41   1283       3    0.795 0.00761        0.780        0.810
##    42   1237       6    0.791 0.00774        0.776        0.806
##    45   1119       3    0.789 0.00781        0.774        0.804
##    46   1078       4    0.786 0.00792        0.771        0.802
##    47   1034      11    0.778 0.00823        0.762        0.794
##    48    987       2    0.776 0.00829        0.760        0.792
##    49    944       1    0.775 0.00832        0.759        0.792
##    50    890       7    0.769 0.00857        0.753        0.786
##    51    845       6    0.764 0.00879        0.747        0.781
##    52    806       3    0.761 0.00891        0.744        0.778
##    53    766       7    0.754 0.00921        0.736        0.772
##    54    703       2    0.752 0.00931        0.734        0.770
##    55    653       4    0.747 0.00953        0.729        0.766
##    56    616       6    0.740 0.00989        0.721        0.759
##    57    581       4    0.735 0.01014        0.715        0.755
##    58    536       2    0.732 0.01029        0.712        0.752
##    59    502       1    0.731 0.01037        0.710        0.751
##    60    471       4    0.724 0.01074        0.704        0.746
##    62    396       4    0.717 0.01123        0.695        0.739
##    65    318       1    0.715 0.01142        0.693        0.738
##    66    292       6    0.700 0.01266        0.676        0.725
##    72    155       3    0.687 0.01464        0.658        0.716
##    73    133       1    0.681 0.01541        0.652        0.712

summary(km_overall, times = 12)

## Call: survfit(formula = surv_obj ~ 1, data = htndat)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##    12   3213     350    0.916 0.00433        0.908        0.925

quantile(km_overall, probs=c(.05, .10, .25, .5))

## $quantile
##  5 10 25 50 
##  6 15 55 NA 
## 
## $lower
##  5 10 25 50 
##  5 13 51 NA 
## 
## $upper
##  5 10 25 50 
##  7 17 60 NA

tbl_survfit(km_overall, probs=c(.05, .10, .25, .5))

Characteristic	5.0% Percentile	10% Percentile	25% Percentile	50% Percentile
Overall	6.0 (5.0, 7.0)	15 (13, 17)	55 (51, 60)	— (—, —)

5 Hazard Functions

# Hazard functions
fit.hazard <- bshazard(surv_obj ~ 1, data = htndat)

## Iterations: relative error in phi-hat = 1e-04 
## phi= 1.414298   sv2= 0.03232137   df= 12.53428   lambda= 43.75735 
## phi= 1.453477   sv2= 0.01388223   df= 9.037744   lambda= 104.7005 
## phi= 1.477015   sv2= 0.00819069   df= 7.467542   lambda= 180.3286 
## phi= 1.495896   sv2= 0.005662076   df= 6.645518   lambda= 264.1958 
## phi= 1.511699   sv2= 0.004268779   df= 6.130303   lambda= 354.129 
## phi= 1.525176   sv2= 0.003393509   df= 5.766437   lambda= 449.4393 
## phi= 1.536825   sv2= 0.002797968   df= 5.488998   lambda= 549.2647 
## phi= 1.54695   sv2= 0.002373234   df= 5.26765   lambda= 651.8322 
## phi= 1.555726   sv2= 0.002062497   df= 5.087056   lambda= 754.2924 
## phi= 1.563251   sv2= 0.001832578   df= 4.938872   lambda= 853.034 
## phi= 1.569599   sv2= 0.001662059   df= 4.818013   lambda= 944.3702 
## phi= 1.574844   sv2= 0.001535947   df= 4.720765   lambda= 1025.325 
## phi= 1.57908   sv2= 0.001443162   df= 4.643867   lambda= 1094.181 
## phi= 1.582427   sv2= 0.001375298   df= 4.58417   lambda= 1150.606 
## phi= 1.585015   sv2= 0.001325938   df= 4.538627   lambda= 1195.392 
## phi= 1.586982   sv2= 0.001290207   df= 4.50441   lambda= 1230.021 
## phi= 1.588453   sv2= 0.001264441   df= 4.479026   lambda= 1256.249 
## phi= 1.589541   sv2= 0.001245918   df= 4.460384   lambda= 1275.799 
## phi= 1.590337   sv2= 0.001232631   df= 4.4468   lambda= 1290.197 
## phi= 1.590916   sv2= 0.001223116   df= 4.43696   lambda= 1300.707 
## phi= 1.591335   sv2= 0.001216311   df= 4.429863   lambda= 1308.328 
## phi= 1.591636   sv2= 0.001211449   df= 4.42476   lambda= 1313.829 
## phi= 1.591853   sv2= 0.001207977   df= 4.421101   lambda= 1317.784 
## phi= 1.592008   sv2= 0.001205499   df= 4.418481   lambda= 1320.622 
## phi= 1.592119   sv2= 0.00120373   df= 4.416608   lambda= 1322.654 
## phi= 1.592199   sv2= 0.001202469   df= 4.415269   lambda= 1324.108 
## phi= 1.592255   sv2= 0.001201569   df= 4.414313   lambda= 1325.146 
## phi= 1.592296   sv2= 0.001200928   df= 4.413631   lambda= 1325.888 
## phi= 1.592325   sv2= 0.00120047   df= 4.413144   lambda= 1326.417 
## phi= 1.592345   sv2= 0.001200144   df= 4.412797   lambda= 1326.795 
## phi= 1.59236   sv2= 0.001199912   df= 4.41255   lambda= 1327.064 
## phi= 1.592371   sv2= 0.001199746   df= 4.412373   lambda= 1327.256 
## phi= 1.592378   sv2= 0.001199628   df= 4.412247   lambda= 1327.393 
## phi= 1.592383   sv2= 0.001199543   df= 4.412157   lambda= 1327.491

# Extract smoothed hazard and log-hazard
time_vals  <- fit.hazard$time
hazard     <- fit.hazard$hazard
log_hazard <- log(hazard)

## Plot hazard
plot(time_vals, hazard, type = "l", lwd = 2,
     xlab = "Time", ylab = "h(t)", main = "Smoothed Hazard Function")

polygon(c(fit.hazard$time, rev(fit.hazard$time)),
        c(fit.hazard$upper, rev(fit.hazard$lower)),
        col = rgb(0, 0, 1, 0.2), border = NA)

5.1 Plot log-hazard

plot(time_vals, log_hazard, type = "l", lwd = 2,
     xlab = "Time", ylab = "log h(t)", main = "Smoothed Log-Hazard Function")

6 Stratified survival curves

6.1 Overall KM survival plot by advanced HIV status

km_adv <- survfit(surv_obj ~ factor(adv_HIV), data = htndat)

plot(km_adv, lwd=3, col=c("blue","red"), ylim=c(.6,1) ,
     xlab="Time in months", ylab="Survival proportion S(t)",
     main="Survival curves by HIV Disease Stage")

legend("topright",
       legend = c("Advanced HIV", "Not advanced"),
       col    = c("red", "blue"),
       lwd    = 2)

6.2 Plot the stratified survival function by hiv status

ggsurvplot(
  km_adv,
  data       = htndat,
  risk.table = TRUE,
  pval       = TRUE,
  legend.labs= c("Not advanced","Advanced"),
  xlab       = "Time (months)",
  ylab       = "Survival probability",
  title      = "Survival by advanced HIV status",
  conf.int = TRUE,
  ggtheme = theme_minimal()
)

7 Stratified Hazard Curves

7.1 Fit smoothed hazard models for each level

fit_adv    <- bshazard(Surv(time = survmonth, event = event) ~ 1,
                       data = htndat[htndat$adv_HIV == 1, ])

## Iterations: relative error in phi-hat = 1e-04 
## phi= 1.211447   sv2= 0.06181708   df= 10.05581   lambda= 19.59728 
## phi= 1.268917   sv2= 0.033306   df= 8.687428   lambda= 38.09876 
## phi= 1.307798   sv2= 0.01831245   df= 7.529565   lambda= 71.41576 
## phi= 1.338337   sv2= 0.01047754   df= 6.590567   lambda= 127.7338 
## phi= 1.366714   sv2= 0.006141416   df= 5.83713   lambda= 222.5406 
## phi= 1.39498   sv2= 0.003541221   df= 5.207246   lambda= 393.9264 
## phi= 1.423359   sv2= 0.001916264   df= 4.63941   lambda= 742.7782 
## phi= 1.451633   sv2= 0.0009413058   df= 4.092872   lambda= 1542.148 
## phi= 1.481331   sv2= 0.0004359487   df= 3.559289   lambda= 3397.948 
## phi= 1.51785   sv2= 0.0002243509   df= 3.0804   lambda= 6765.517 
## phi= 1.558386   sv2= 0.0001416363   df= 2.735216   lambda= 11002.73

fit_notadv <- bshazard(Surv(time = survmonth, event = event) ~ 1, 
                       data = htndat[htndat$adv_HIV == 0, ])

## Iterations: relative error in phi-hat = 1e-04 
## phi= 1.181919   sv2= 0.01696105   df= 8.003624   lambda= 69.68426 
## phi= 1.202664   sv2= 0.003641724   df= 5.290794   lambda= 330.2457 
## phi= 1.214462   sv2= 0.001546525   df= 3.904228   lambda= 785.2846 
## phi= 1.232795   sv2= 0.0007790577   df= 3.335568   lambda= 1582.419 
## phi= 1.249134   sv2= 0.0003846332   df= 2.955468   lambda= 3247.598 
## phi= 1.262423   sv2= 0.0001732224   df= 2.632501   lambda= 7287.877 
## phi= 1.271822   sv2= 7.016679e-05   df= 2.358936   lambda= 18125.69

## Common y-limits so the curves share the same scale
ylim_vals <- range(c(fit_adv$hazard, fit_notadv$hazard))

7.2 Plot the hazard functions

plot(fit_adv$time, fit_adv$hazard, type = "l", lwd = 2, col = "red",
     xlab = "Time", ylab = "Hazard",
     main = "Smoothed Hazard by HIV Disease Stage",
     ylim = ylim_vals)

lines(fit_notadv$time, fit_notadv$hazard, lwd = 2, col = "blue")

legend("topright",
       legend = c("Advanced HIV", "Not advanced"),
       col    = c("red", "blue"),
       lwd    = 2)

7.3 Plot log hazards

loghazard_adv = log(fit_adv$hazard)
loghazard_notadv = log(fit_notadv$hazard)

## Common y-limits so the curves share the same scale
ylim_vals <- range(c(loghazard_adv, loghazard_notadv))

## Plot the hazard functions
plot(fit_adv$time, loghazard_adv, type = "l", lwd = 2, col = "red",
     xlab = "Time", ylab = "log Hazard",
     main = "Smoothed log Hazard by HIV Disease Stage",
     ylim = ylim_vals)

lines(fit_notadv$time, loghazard_notadv, lwd = 2, col = "blue")

legend("topright",
       legend = c("Advanced HIV", "Not advanced"),
       col    = c("red", "blue"),
       lwd    = 2)

8 Logrank test

8.1 Overall Stratified KM: Advanced HIV status

km_adv <- survfit(surv_obj ~ factor(adv_HIV), data = htndat)


# Plot the stratified survival function by HIV status
ggsurvplot(
  km_adv,
  data       = htndat,
  risk.table = TRUE,
  pval       = TRUE,
  legend.labs= c("Not advanced","Advanced"),
  xlab       = "Time (months)",
  ylab       = "Survival probability",
  title      = "Survival by advanced HIV status",
  conf.int = TRUE,
  ggtheme = theme_minimal()
)

# Log‐rank test for HIV strata
lr_adv <- survdiff(surv_obj ~ factor(adv_HIV), data = htndat)
print(lr_adv)

## Call:
## survdiff(formula = surv_obj ~ factor(adv_HIV), data = htndat)
## 
## n=3038, 1960 observations deleted due to missingness.
## 
##                      N Observed Expected (O-E)^2/E (O-E)^2/V
## factor(adv_HIV)=0 1130       99      159      22.5      37.6
## factor(adv_HIV)=1 1908      299      239      14.9      37.6
## 
##  Chisq= 37.6  on 1 degrees of freedom, p= 9e-10

8.2 Cox regression with one binary variable

8.2.1 Univariate Cox regression

univ_cox <- coxph(Surv(survmonth, event) ~ adv_HIV, 
                            data = htndat,
                            ties = "efron")

# Get summary of the first regression model that has only adv_HIV as covariate
summary(univ_cox)

## Call:
## coxph(formula = Surv(survmonth, event) ~ adv_HIV, data = htndat, 
##     ties = "efron")
## 
##   n= 3038, number of events= 398 
##    (1960 observations deleted due to missingness)
## 
##          coef exp(coef) se(coef)    z Pr(>|z|)    
## adv_HIV 0.697     2.008    0.116 6.01 1.86e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## adv_HIV     2.008     0.4981       1.6      2.52
## 
## Concordance= 0.584  (se = 0.011 )
## Likelihood ratio test= 40.02  on 1 df,   p=3e-10
## Wald test            = 36.12  on 1 df,   p=2e-09
## Score (logrank) test = 37.6  on 1 df,   p=9e-10

8.2.2 Display the results in a nice table

# return a nice table of summary Cox regression results using
gtsummary::tbl_regression(univ_cox, exponentiate = TRUE)

Characteristic	HR¹	95% CI¹	p-value
adv_HIV	2.01	1.60, 2.52	<0.001
¹ HR = Hazard Ratio, CI = Confidence Interval

# time-varying hazard ratio
# Fit Cox model with time-varying effect of adv_HIV
tv_cox <- coxph(surv_obj ~ adv_HIV + tt(adv_HIV),
             data = htndat,
             tt = function(x, t, ...) x * t)

# View model summary
summary(tv_cox)

## Call:
## coxph(formula = surv_obj ~ adv_HIV + tt(adv_HIV), data = htndat, 
##     tt = function(x, t, ...) x * t)
## 
##   n= 3038, number of events= 398 
##    (1960 observations deleted due to missingness)
## 
##                  coef exp(coef)  se(coef)      z Pr(>|z|)    
## adv_HIV      0.972570  2.644733  0.188536  5.159 2.49e-07 ***
## tt(adv_HIV) -0.013329  0.986759  0.006883 -1.937   0.0528 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##             exp(coef) exp(-coef) lower .95 upper .95
## adv_HIV        2.6447     0.3781    1.8277     3.827
## tt(adv_HIV)    0.9868     1.0134    0.9735     1.000
## 
## Concordance= 0.584  (se = 0.011 )
## Likelihood ratio test= 43.72  on 2 df,   p=3e-10
## Wald test            = 38.28  on 2 df,   p=5e-09
## Score (logrank) test = 40.45  on 2 df,   p=2e-09

8.3 Models

8.3.1 model1: SBP only

# Center SBP at 120 for interpretability and save data in dat
dat <- htndat %>%
  dplyr::mutate(sbp_c120 = SBP - 120)

# Fit Cox model with SBP only
model1 <- coxph(
  Surv(survmonth, event) ~ sbp_c120,
  data    = dat,
  ties    = "efron"
)

summary(model1)

## Call:
## coxph(formula = Surv(survmonth, event) ~ sbp_c120, data = dat, 
##     ties = "efron")
## 
##   n= 4998, number of events= 749 
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)    
## sbp_c120 -0.02246   0.97779  0.00286 -7.853 4.05e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##          exp(coef) exp(-coef) lower .95 upper .95
## sbp_c120    0.9778      1.023    0.9723    0.9833
## 
## Concordance= 0.601  (se = 0.012 )
## Likelihood ratio test= 67.31  on 1 df,   p=2e-16
## Wald test            = 61.68  on 1 df,   p=4e-15
## Score (logrank) test = 59.98  on 1 df,   p=1e-14

8.3.2 model 2a: Use b-splines for SBP

# set knot points
knots <- quantile(dat$sbp_c120, probs = c(0.20, 0.50, 0.80))

# fit Cox model with a 3-df B-spline for sbp_c120 
model_spline <- coxph(
  Surv(survmonth, event) ~ bs(sbp_c120, knots = knots, degree = 3),
  data    = dat,
  ties    = "efron"
)
summary(model_spline)

## Call:
## coxph(formula = Surv(survmonth, event) ~ bs(sbp_c120, knots = knots, 
##     degree = 3), data = dat, ties = "efron")
## 
##   n= 4998, number of events= 749 
## 
##                                               coef exp(coef)  se(coef)      z
## bs(sbp_c120, knots = knots, degree = 3)1 -2.037785  0.130317  1.264068 -1.612
## bs(sbp_c120, knots = knots, degree = 3)2 -1.525040  0.217612  0.806992 -1.890
## bs(sbp_c120, knots = knots, degree = 3)3 -2.890363  0.055556  0.905937 -3.190
## bs(sbp_c120, knots = knots, degree = 3)4 -2.315914  0.098676  0.947017 -2.445
## bs(sbp_c120, knots = knots, degree = 3)5 -2.554967  0.077695  2.070859 -1.234
## bs(sbp_c120, knots = knots, degree = 3)6 -7.040151  0.000876  4.756966 -1.480
##                                          Pr(>|z|)   
## bs(sbp_c120, knots = knots, degree = 3)1  0.10694   
## bs(sbp_c120, knots = knots, degree = 3)2  0.05879 . 
## bs(sbp_c120, knots = knots, degree = 3)3  0.00142 **
## bs(sbp_c120, knots = knots, degree = 3)4  0.01447 * 
## bs(sbp_c120, knots = knots, degree = 3)5  0.21729   
## bs(sbp_c120, knots = knots, degree = 3)6  0.13888   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                          exp(coef) exp(-coef) lower .95
## bs(sbp_c120, knots = knots, degree = 3)1  0.130317      7.674 1.094e-02
## bs(sbp_c120, knots = knots, degree = 3)2  0.217612      4.595 4.475e-02
## bs(sbp_c120, knots = knots, degree = 3)3  0.055556     18.000 9.410e-03
## bs(sbp_c120, knots = knots, degree = 3)4  0.098676     10.134 1.542e-02
## bs(sbp_c120, knots = knots, degree = 3)5  0.077695     12.871 1.342e-03
## bs(sbp_c120, knots = knots, degree = 3)6  0.000876   1141.560 7.823e-08
##                                          upper .95
## bs(sbp_c120, knots = knots, degree = 3)1    1.5523
## bs(sbp_c120, knots = knots, degree = 3)2    1.0583
## bs(sbp_c120, knots = knots, degree = 3)3    0.3280
## bs(sbp_c120, knots = knots, degree = 3)4    0.6314
## bs(sbp_c120, knots = knots, degree = 3)5    4.4989
## bs(sbp_c120, knots = knots, degree = 3)6    9.8093
## 
## Concordance= 0.604  (se = 0.011 )
## Likelihood ratio test= 96.33  on 6 df,   p=<2e-16
## Wald test            = 103.5  on 6 df,   p=<2e-16
## Score (logrank) test = 112.3  on 6 df,   p=<2e-16

termplot(
  model_spline,
  terms         = 1,            # first term is the bs(sbp_c150, ...) spline
  se            = TRUE,
  rug           = TRUE,
  col.term      = "blue",
  col.se        = "red",
  main          = "Partial Effect of SBP (B-spline) on Log Hazard",
  xlab          = "SBP centered at 120 mmHg",
  ylab          = "Partial log hazard",
  ylim=c(-6,5)
)

8.3.3 model 2b: use natural regression splines for SBP

# fit Cox model with a 3-df natural spline for sbp_c120 plus adv_HIV as a covariate
model_spline <- coxph(
  Surv(survmonth, event) ~ ns(sbp_c120, df=3),
  data    = dat,
  ties    = "efron"
)
summary(model_spline)

## Call:
## coxph(formula = Surv(survmonth, event) ~ ns(sbp_c120, df = 3), 
##     data = dat, ties = "efron")
## 
##   n= 4998, number of events= 749 
## 
##                           coef exp(coef) se(coef)      z Pr(>|z|)    
## ns(sbp_c120, df = 3)1 -1.92262   0.14622  0.23230 -8.276  < 2e-16 ***
## ns(sbp_c120, df = 3)2 -4.09300   0.01669  0.80929 -5.057 4.25e-07 ***
## ns(sbp_c120, df = 3)3 -1.45872   0.23253  0.81783 -1.784   0.0745 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## ns(sbp_c120, df = 3)1   0.14622      6.839  0.092743   0.23054
## ns(sbp_c120, df = 3)2   0.01669     59.919  0.003416   0.08153
## ns(sbp_c120, df = 3)3   0.23253      4.300  0.046811   1.15512
## 
## Concordance= 0.601  (se = 0.012 )
## Likelihood ratio test= 83.82  on 3 df,   p=<2e-16
## Wald test            = 96.41  on 3 df,   p=<2e-16
## Score (logrank) test = 103.3  on 3 df,   p=<2e-16

termplot(
  model_spline,
  terms         = 1,            
  se            = TRUE,
  rug           = TRUE,
  col.term      = "blue",
  col.se        = "red",
  main          = "Partial Effect of SBP (natural cubic spline) on Log Hazard",
  xlab          = "SBP centered at 120 mmHg",
  ylab          = "Partial log hazard",
  ylim=c(-6,5)
)

8.3.4 model3: SBP + adv_HIV

# Fit Cox model with main effects of SBP and advanced HIV in a single model
model3 <- coxph(
  Surv(survmonth, event) ~ sbp_c120 + as.factor(adv_HIV),
  data    = dat,
  ties    = "efron"
)

summary(model3)

## Call:
## coxph(formula = Surv(survmonth, event) ~ sbp_c120 + as.factor(adv_HIV), 
##     data = dat, ties = "efron")
## 
##   n= 3038, number of events= 398 
##    (1960 observations deleted due to missingness)
## 
##                          coef exp(coef)  se(coef)      z Pr(>|z|)    
## sbp_c120            -0.016011  0.984117  0.003723 -4.301 1.70e-05 ***
## as.factor(adv_HIV)1  0.686845  1.987435  0.116028  5.920 3.23e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## sbp_c120               0.9841     1.0161     0.977    0.9913
## as.factor(adv_HIV)1    1.9874     0.5032     1.583    2.4949
## 
## Concordance= 0.631  (se = 0.015 )
## Likelihood ratio test= 60.01  on 2 df,   p=9e-14
## Wald test            = 54.9  on 2 df,   p=1e-12
## Score (logrank) test = 56.14  on 2 df,   p=6e-13

Evidence of significant effect of both SBP and advanced HIV on the time to death

8.3.5 model4: SBP + adv_HIV + (SBP x adv_HIV)

# Fit Cox model with SBP × advanced HIV interaction
model4 <- coxph(
  Surv(survmonth, event) ~ sbp_c120 * as.factor(adv_HIV),
  data    = dat,
  ties    = "efron"
)
summary(model4)

## Call:
## coxph(formula = Surv(survmonth, event) ~ sbp_c120 * as.factor(adv_HIV), 
##     data = dat, ties = "efron")
## 
##   n= 3038, number of events= 398 
##    (1960 observations deleted due to missingness)
## 
##                                   coef exp(coef)  se(coef)      z Pr(>|z|)    
## sbp_c120                     -0.010847  0.989212  0.007533 -1.440    0.150    
## as.factor(adv_HIV)1           0.609250  1.839052  0.151291  4.027 5.65e-05 ***
## sbp_c120:as.factor(adv_HIV)1 -0.006764  0.993259  0.008665 -0.781    0.435    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                              exp(coef) exp(-coef) lower .95 upper .95
## sbp_c120                        0.9892     1.0109    0.9747     1.004
## as.factor(adv_HIV)1             1.8391     0.5438    1.3671     2.474
## sbp_c120:as.factor(adv_HIV)1    0.9933     1.0068    0.9765     1.010
## 
## Concordance= 0.63  (se = 0.015 )
## Likelihood ratio test= 60.61  on 3 df,   p=4e-13
## Wald test            = 57.16  on 3 df,   p=2e-12
## Score (logrank) test = 59.59  on 3 df,   p=7e-13

No evidence of a dependence of the effect of advanced HIV on the average SBP on time to death

8.3.6 model5: SBP + adv_HIV + age + gender

# Fit Cox model with several covariates
model5 <- coxph(
  Surv(survmonth, event) ~ sbp_c120 + as.factor(adv_HIV)
  + as.factor(male.gender) + age,
  data    = dat,
  ties    = "efron"
)
summary(model5)

## Call:
## coxph(formula = Surv(survmonth, event) ~ sbp_c120 + as.factor(adv_HIV) + 
##     as.factor(male.gender) + age, data = dat, ties = "efron")
## 
##   n= 3038, number of events= 398 
##    (1960 observations deleted due to missingness)
## 
##                              coef exp(coef)  se(coef)      z Pr(>|z|)    
## sbp_c120                -0.019796  0.980399  0.003810 -5.196 2.04e-07 ***
## as.factor(adv_HIV)1      0.587573  1.799616  0.118416  4.962 6.98e-07 ***
## as.factor(male.gender)1  0.622844  1.864222  0.108476  5.742 9.37e-09 ***
## age                      0.004928  1.004940  0.004644  1.061    0.289    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## sbp_c120                   0.9804     1.0200    0.9731    0.9877
## as.factor(adv_HIV)1        1.7996     0.5557    1.4269    2.2697
## as.factor(male.gender)1    1.8642     0.5364    1.5072    2.3059
## age                        1.0049     0.9951    0.9958    1.0141
## 
## Concordance= 0.654  (se = 0.015 )
## Likelihood ratio test= 97.02  on 4 df,   p=<2e-16
## Wald test            = 93.94  on 4 df,   p=<2e-16
## Score (logrank) test = 95.94  on 4 df,   p=<2e-16

8.3.7 model6: Model 5 with splines for SBP and age

# Fit Cox model with several covariates and splines
model6 <- coxph(
  Surv(survmonth, event) ~ ns(sbp_c120, df=3) + ns(age, df=3)
  + as.factor(male.gender) 
  + adv_HIV + tt(adv_HIV),
  tt = function(x, t, ...) x * t,
  data    = dat,
  ties    = "efron"
)


model6 <- coxph(
  Surv(survmonth, event) ~ ns(sbp_c120, df=3) + ns(age, df=3)
  + as.factor(male.gender) 
  + as.factor(adv_HIV),
  data    = dat,
  ties    = "efron"
)

summary(model6)

## Call:
## coxph(formula = Surv(survmonth, event) ~ ns(sbp_c120, df = 3) + 
##     ns(age, df = 3) + as.factor(male.gender) + as.factor(adv_HIV), 
##     data = dat, ties = "efron")
## 
##   n= 3038, number of events= 398 
##    (1960 observations deleted due to missingness)
## 
##                              coef exp(coef)  se(coef)      z Pr(>|z|)    
## ns(sbp_c120, df = 3)1   -1.921020  0.146457  0.310251 -6.192 5.95e-10 ***
## ns(sbp_c120, df = 3)2   -4.609319  0.009959  0.988404 -4.663 3.11e-06 ***
## ns(sbp_c120, df = 3)3   -1.296022  0.273618  0.931644 -1.391    0.164    
## ns(age, df = 3)1         0.368627  1.445748  0.251386  1.466    0.143    
## ns(age, df = 3)2        -0.270376  0.763092  0.657682 -0.411    0.681    
## ns(age, df = 3)3        -0.293124  0.745930  0.488120 -0.601    0.548    
## as.factor(male.gender)1  0.630648  1.878828  0.109065  5.782 7.37e-09 ***
## as.factor(adv_HIV)1      0.569175  1.766808  0.118548  4.801 1.58e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## ns(sbp_c120, df = 3)1    0.146457     6.8279  0.079731   0.26903
## ns(sbp_c120, df = 3)2    0.009959   100.4158  0.001435   0.06911
## ns(sbp_c120, df = 3)3    0.273618     3.6547  0.044069   1.69887
## ns(age, df = 3)1         1.445748     0.6917  0.883309   2.36632
## ns(age, df = 3)2         0.763092     1.3105  0.210260   2.76947
## ns(age, df = 3)3         0.745930     1.3406  0.286556   1.94172
## as.factor(male.gender)1  1.878828     0.5322  1.517227   2.32661
## as.factor(adv_HIV)1      1.766808     0.5660  1.400495   2.22893
## 
## Concordance= 0.662  (se = 0.014 )
## Likelihood ratio test= 115.2  on 8 df,   p=<2e-16
## Wald test            = 124.3  on 8 df,   p=<2e-16
## Score (logrank) test = 131.7  on 8 df,   p=<2e-16

termplot(model=model6, terms=c(1,2,3,4),  
         se            = TRUE,
         rug           = TRUE,
         col.term      = "blue",
         col.se        = "red"
)

8.3.8 Martingale residuals and proportional hazards assumption

# Martingale residuals using model 1
htndat$mart_resid <- residuals(model1, type = "martingale")

8.3.9 Plot Martingale vs. linear predictor, SBP

plot(
  htndat$SBP-120,
  htndat$mart_resid,
  xlab = "Systolic Blood Pressure (mmHg)",
  ylab = "Martingale residuals",
  main = "Martingale Residuals vs. SBP",
  col="gray"
)
lines(
  lowess(htndat$SBP-120, htndat$mart_resid, f = 0.5),
  col = "red",
  lwd = 3
)

8.3.10 Testing Proportional Hazards assumption

## Using a test of hypothesis (H0: proportional hazards holds)
cox.zph(univ_cox)

##         chisq df     p
## adv_HIV  3.65  1 0.056
## GLOBAL   3.65  1 0.056

## Using Schoenfeld residuals
plot(cox.zph(univ_cox), lwd=3)
abline(h=0,col="red", lwd=2)  # add red line at 0

9 Parametric modeling using Weibull

# Fully parametric fits via survreg
## Weibull AFT model
wei_aft <- survreg(surv_obj ~ as.factor(adv_HIV), 
                   data = htndat, 
                   dist = "weibull")
summary(wei_aft)

## 
## Call:
## survreg(formula = surv_obj ~ as.factor(adv_HIV), data = htndat, 
##     dist = "weibull")
##                       Value Std. Error     z      p
## (Intercept)          5.9704     0.1479 40.36 <2e-16
## as.factor(adv_HIV)1 -0.7601     0.1300 -5.85  5e-09
## Log(scale)           0.0869     0.0421  2.06  0.039
## 
## Scale= 1.09 
## 
## Weibull distribution
## Loglik(model)= -2485   Loglik(intercept only)= -2505
##  Chisq= 40 on 1 degrees of freedom, p= 2.5e-10 
## Number of Newton-Raphson Iterations: 10 
## n=3038 (1960 observations deleted due to missingness)

# Get the PH model parameters from the AFT model fitted earlier
# Extract AFT scale
sigma_aft <- wei_aft$scale

# Convert to PH shape
shape_ph <- 1 / sigma_aft

# AFT coefficient for adv_HIV
beta_aft <- wei_aft$coefficients

# Convert to PH log–hazard ratio
beta_ph <- - beta_aft / sigma_aft
hr_ph   <- exp(beta_ph)

# Display and note similarity to the Cox model fitted earlier
shape_ph

## [1] 0.9167777

beta_ph

##         (Intercept) as.factor(adv_HIV)1 
##          -5.4735319           0.6968243

hr_ph

##         (Intercept) as.factor(adv_HIV)1 
##         0.004196385         2.007367801

Survival Analysis on HTN data

Bernard Osang’ir, Rudradev Sungepta, Ziv Shkedy, Joseph Hogan, Richard Mugo, Ann Mwangi (July, 2025)

1 Data exploration

2 Descriptive statistics

3 Inspect key variables and creating the survival object

4 KM Summaries

5 Hazard Functions

5.1 Plot log-hazard

6 Stratified survival curves

6.1 Overall KM survival plot by advanced HIV status

6.2 Plot the stratified survival function by hiv status

7 Stratified Hazard Curves

7.1 Fit smoothed hazard models for each level

7.2 Plot the hazard functions

7.3 Plot log hazards

8 Logrank test

8.1 Overall Stratified KM: Advanced HIV status

8.2 Cox regression with one binary variable

8.2.1 Univariate Cox regression

8.2.2 Display the results in a nice table

8.3 Models

8.3.1 model1: SBP only

8.3.2 model 2a: Use b-splines for SBP

8.3.3 model 2b: use natural regression splines for SBP

8.3.4 model3: SBP + adv_HIV

8.3.5 model4: SBP + adv_HIV + (SBP x adv_HIV)

8.3.6 model5: SBP + adv_HIV + age + gender

8.3.7 model6: Model 5 with splines for SBP and age

8.3.8 Martingale residuals and proportional hazards assumption

8.3.9 Plot Martingale vs. linear predictor, SBP

8.3.10 Testing Proportional Hazards assumption

9 Parametric modeling using Weibull