library(haven) # read Stata .dta
library(dplyr)
library(lubridate)
library(survival)
library(survminer) # ggsurvplot
library(broom) # tidy()
library(epitools)
library(gtsummary)
library(gt)
library(mice)
library(readr)
library(Epi)
library(flexsurv)
library(Epi)
library(mgcv)
library(popEpi)
library(R2BayesX)
library(speedglm)
Part I
setwd("C:/Users/VUSI/Downloads/BIO3")
<- read_dta("trinmlsh.dta")
dat head(dat, n=10)
ethgp | ageent | death | cvdeath | alc | smokenum | hdlc | diabp | sysbp | chdstart | days | years | bmi | id | timein | timeout | y | timebth |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 62.984 | 1 | 0 | 0 | 0 | 0.767 | 120 | 198 | NA | 1409 | 3.857632 | NA | 62 | 1977-01-01 | 1980-11-10 | 3.857632 | 1914-01-07 |
3 | 60.553 | 0 | 0 | 0 | 5 | NA | 66 | 150 | 0 | 1526 | 4.177960 | NA | 166 | 1977-01-01 | 1981-03-07 | 4.177960 | 1916-06-13 |
5 | 67.617 | 1 | 0 | 0 | 1 | 0.563 | 81 | 135 | 0 | 367 | 1.004791 | 20.58 | 68 | 1977-01-01 | 1978-01-03 | 1.004791 | 1909-05-20 |
4 | 60.424 | 0 | 0 | 0 | 1 | 0.684 | 90 | 170 | 1 | 3494 | 9.566050 | 26.24 | 49 | 1977-01-01 | 1986-07-27 | 9.566050 | 1916-07-30 |
1 | 71.723 | 0 | 0 | 0 | 1 | 1.099 | 98 | 158 | 0 | 2546 | 6.970568 | 23.40 | 287 | 1977-01-01 | 1983-12-22 | 6.970568 | 1905-04-12 |
1 | 69.051 | 0 | 0 | 0 | 1 | NA | 64 | 112 | NA | 3452 | 9.451061 | NA | 69 | 1977-01-01 | 1986-06-15 | 9.451061 | 1907-12-14 |
1 | 67.578 | 0 | 0 | 0 | 0 | 0.850 | 80 | 126 | 0 | 3123 | 8.550308 | 19.73 | 139 | 1977-01-01 | 1985-07-21 | 8.550308 | 1909-06-04 |
3 | 61.895 | 0 | 0 | 0 | 0 | 0.850 | 80 | 115 | 0 | 3262 | 8.930869 | 15.31 | 109 | 1977-01-01 | 1985-12-07 | 8.930869 | 1915-02-08 |
1 | 62.316 | 0 | 0 | 0 | 0 | 0.979 | 91 | 129 | 0 | 3416 | 9.352498 | 23.74 | 81 | 1977-01-01 | 1986-05-10 | 9.352498 | 1914-09-08 |
1 | 68.950 | 0 | 0 | 0 | 0 | 0.748 | 83 | 145 | 0 | 3207 | 8.780288 | 26.36 | 153 | 1977-01-01 | 1985-10-13 | 8.780288 | 1908-01-20 |
summary(dat)
#> ethgp ageent death cvdeath
#> Min. :1.000 Min. :60.01 Min. :0.0000 Min. :0.00000
#> 1st Qu.:1.000 1st Qu.:62.39 1st Qu.:0.0000 1st Qu.:0.00000
#> Median :1.000 Median :64.59 Median :0.0000 Median :0.00000
#> Mean :2.104 Mean :65.09 Mean :0.2767 Mean :0.06918
#> 3rd Qu.:3.000 3rd Qu.:67.67 3rd Qu.:1.0000 3rd Qu.:0.00000
#> Max. :5.000 Max. :73.93 Max. :1.0000 Max. :1.00000
#>
#> alc smokenum hdlc diabp
#> Min. :0.000 Min. :0.000 Min. :0.185 Min. : 55.00
#> 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.748 1st Qu.: 80.00
#> Median :1.000 Median :1.000 Median :0.924 Median : 90.00
#> Mean :1.031 Mean :1.388 Mean :0.988 Mean : 91.09
#> 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:1.201 3rd Qu.:100.00
#> Max. :3.000 Max. :5.000 Max. :2.411 Max. :159.00
#> NA's :1 NA's :17 NA's :4
#> sysbp chdstart days years
#> Min. : 95.0 Min. :0.000 Min. : 6 Min. :0.01643
#> 1st Qu.:135.0 1st Qu.:0.000 1st Qu.:2107 1st Qu.:5.76934
#> Median :150.0 Median :0.000 Median :2804 Median :7.67830
#> Mean :154.8 Mean :0.131 Mean :2532 Mean :6.93251
#> 3rd Qu.:170.0 3rd Qu.:0.000 3rd Qu.:3232 3rd Qu.:8.85010
#> Max. :243.0 Max. :1.000 Max. :3579 Max. :9.79877
#> NA's :4 NA's :28
#> bmi id timein timeout
#> Min. :11.25 Min. : 1.00 Min. :1977-01-01 Min. :1977-01-07
#> 1st Qu.:20.27 1st Qu.: 80.25 1st Qu.:1977-01-01 1st Qu.:1982-10-09
#> Median :23.21 Median :159.50 Median :1977-01-01 Median :1984-09-05
#> Mean :23.25 Mean :159.50 Mean :1977-01-01 Mean :1983-12-08
#> 3rd Qu.:26.10 3rd Qu.:238.75 3rd Qu.:1977-01-01 3rd Qu.:1985-11-07
#> Max. :36.38 Max. :318.00 Max. :1977-01-01 Max. :1986-10-20
#> NA's :35
#> y timebth
#> Min. :0.01643 Min. :1903-01-27
#> 1st Qu.:5.76934 1st Qu.:1909-04-30
#> Median :7.67830 Median :1912-05-31
#> Mean :6.93251 Mean :1911-11-28
#> 3rd Qu.:8.85010 3rd Qu.:1914-08-13
#> Max. :9.79877 Max. :1916-12-30
#>
unique(dat$smokenum)
#> [1] 0 5 1 3 2 4 NA
# --- Step C: Recode smoking categories ---
<- dat %>%
dat mutate(
smoke_cat = case_when(
== 0 ~ "non-smoker",
smokenum == 1 ~ "ex-smoker",
smokenum >= 2 ~ "current-smoker",
smokenum TRUE ~ NA_character_ # any unusual values (like 5) become NA
),smoke_cat = factor(smoke_cat,
levels = c("non-smoker", "ex-smoker", "current-smoker"))
)saveRDS(dat, "data_new.rds")
c) Examine the effect of smoking on all-cause mortality using strate and stcox. Hint: You may wish to recode smokenum into a smaller number of categories, say: 0 = non-smoker, 1 = ex-smoker, 2 = current smoker.
When mortality rates were examined by smoking category, a graded increase was observed from non-smokers (30.48 deaths per 1,000 person-years) to ex-smokers (39.00 per 1,000 person-years) and current smokers (53.40 per 1,000 person-years).
Poisson regression using non-smokers as the reference group indicated that current smokers had a statistically significant 75% higher mortality rate (rate ratio [RR] = 1.75; 95% CI: 1.09–2.39; p = 0.020), while the increase for ex-smokers was smaller and not statistically significant (RR = 1.28; 95% CI: 0.71–2.10; p = 0.41).
d) Using the rates and rate ratios in part (c), can you see a trend in the relationship between smoking category and all-cause mortality? How would you assess this more formally? How would you present the results from your analyses in parts (c) and (d)?
Based on the crude rates, there is a clear increase in all-cause mortality from non-smokers (30.5 deaths/1,000 person-years) to ex-smokers (39.0/1,000 person-years) and current smokers (53.4/1,000 person-years).
To formally assess this trend, a Poisson regression model was fitted with smoking category (0 = non-smoker, 1 = ex-smoker, 2 = current smoker) and person-time as the offset. This approach yields incidence rate ratios (IRRs) with 95% confidence intervals, allowing hypothesis testing for both overall differences between categories. I also used kaplan meire plots and log rank test to see if there’s a difference in survival probabilities between the categories.
The results were presented as a table showing the Poisson regression IRRs, CIs, and p-values, providing both descriptive and formal statistical evidence of the association. I also supplemented this by a forest plot to visually convey the magnitude and direction of associations.
# --- crude incidence rates (deaths / person-years) by smoking category ---
<- dat %>%
rates_tab group_by(smoke_cat) %>%
summarize(
deaths = sum(death, na.rm=TRUE),
person_years = sum(y, na.rm=TRUE)
%>%
) mutate(
rate_per_1000py = deaths / person_years * 1000
)head(rates_tab, n=10)
smoke_cat | deaths | person_years | rate_per_1000py |
---|---|---|---|
non-smoker | 30 | 984.407939 | 30.47517 |
ex-smoker | 18 | 461.593432 | 38.99536 |
current-smoker | 40 | 749.015740 | 53.40342 |
NA | 0 | 9.522245 | 0.00000 |
<- glm(death ~ smoke_cat, offset = log(y),
poisson_crude family = poisson(link = "log"), data = dat)
summary(poisson_crude)
#>
#> Call:
#> glm(formula = death ~ smoke_cat, family = poisson(link = "log"),
#> data = dat, offset = log(y))
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -3.4908 0.1826 -19.120 <2e-16 ***
#> smoke_catex-smoker 0.2465 0.2981 0.827 0.4083
#> smoke_catcurrent-smoker 0.5610 0.2415 2.323 0.0202 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for poisson family taken to be 1)
#>
#> Null deviance: 395.44 on 316 degrees of freedom
#> Residual deviance: 389.95 on 314 degrees of freedom
#> (1 observation deleted due to missingness)
#> AIC: 571.95
#>
#> Number of Fisher Scoring iterations: 7
# Create gtsummary table with bold formatting
<- tbl_regression(
tbl_crude # Make sure this matches your object name
poisson_crude, exponentiate = TRUE, # Exponentiate to get Rate Ratios
estimate_fun = ~sprintf("%.2f", .x), # Format RR to 2 decimals
pvalue_fun = ~sprintf("%.3f", .x) # Format p-value to 3 decimals
%>%
) # Bold formatting for labels
bold_labels() %>%
# Modify headers
modify_header(
~ "**Characteristics**",
label ~ "**IRR**",
estimate ~ "**SE**",
std.error ~ "**p-value**"
p.value %>%
) # Add spanning header
modify_spanning_header(
c(estimate, std.error, ci) ~ "**Poisson Regression Model Estimates**"
%>%
) # Convert to gt and add title/source note
as_gt() %>%
::tab_header(
gttitle = gt::md("**Rate Ratios for All-Cause Mortality by Smoking Status**"),
subtitle = gt::md("Poisson regression with offset for person-years")
%>%
) ::tab_source_note(
gtsource_note = gt::md("Data updated August by Vusi, 2025")
)
tbl_crude
Rate Ratios for All-Cause Mortality by Smoking Status | ||||
---|---|---|---|---|
Poisson regression with offset for person-years | ||||
Characteristics |
Poisson Regression Model Estimates
|
95% CI | p-value | |
IRR | SE | |||
smoke_cat | ||||
non-smoker | — | — | — | |
ex-smoker | 1.28 | 0.298 | 0.70, 2.27 | 0.408 |
current-smoker | 1.75 | 0.242 | 1.09, 2.83 | 0.020 |
Abbreviations: CI = Confidence Interval, IRR = Incidence Rate Ratio, SE = Standard Error | ||||
Data updated August by Vusi, 2025 |
library(ggplot2)
library(dplyr)
library(glue)
# Create forest plot data with color categories
<- data.frame(
forest_data term = c("Ex-smoker vs Non-smoker", "Current-smoker vs Non-smoker"),
estimate = c(0.246530, 0.5610),
std.error = c(0.298142, 0.2415),
p.value = c(0.408, 0.0202)
%>%
) mutate(
# Calculate RR and CI
RR = exp(estimate),
CI_lower = exp(estimate - 1.96 * std.error),
CI_upper = exp(estimate + 1.96 * std.error),
# Create color groups (using term as color variable)
color_group = term,
# Format labels
term = factor(term, levels = rev(term)),
label = glue("{term}\nRR = {sprintf('%.2f', RR)} (95% CI: {sprintf('%.2f', CI_lower)}-{sprintf('%.2f', CI_upper)})\np = {ifelse(p.value < 0.001, '<0.001', sprintf('%.3f', p.value))}"),
# Position for text labels (right side)
text_x = 3.5
)
# Custom color palette
<- c("Ex-smoker vs Non-smoker" = "#4E79A7", # Blue
term_colors "Current-smoker vs Non-smoker" = "#E15759") # Red
# Create the forest plot
ggplot(forest_data, aes(x = RR, y = term, color = color_group)) +
# Reference line
geom_vline(xintercept = 1, linetype = "dashed", color = "gray50", linewidth = 0.8) +
# Point estimates with CIs
geom_pointrange(aes(xmin = CI_lower, xmax = CI_upper),
size = 1, linewidth = 1.2, fatten = 2.5) +
# Text labels (acting as legend)
geom_text(aes(x = text_x, label = label, color = color_group),
hjust = 0, vjust = 0.5, size = 3.8, show.legend = FALSE) +
# Scale adjustments
scale_x_continuous(
trans = "log",
breaks = c(0.5, 0.75, 1, 1.5, 2, 3),
limits = c(0.4, 5), # Extended limit for text
expand = expansion(mult = c(0, 0.1))
+
)
# Apply custom colors
scale_color_manual(values = term_colors) +
# Labels and titles
labs(
title = "Smoking Status Mortality Risk",
subtitle = "Poisson regression with person-years offset",
x = "IRR with 95% Confidence Interval",
y = NULL,
caption = glue("Null deviance: 280.09, Residual deviance: 279.37\n",
"AIC: 395.37 | 81 observations excluded due to missingness")
+
)
# Theme customization
theme_minimal(base_size = 12) +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 11),
axis.text.y = element_blank(), # Remove y-axis text since we have labels
axis.title.x = element_text(size = 11),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
legend.position = "none", # We're using text as legend
plot.caption = element_text(size = 9, hjust = 0),
plot.margin = margin(1, 8, 1, 1, "cm") # Increased right margin for text
+
)
# Ensure nothing gets clipped
coord_cartesian(clip = "off")
# Save with proper dimensions
ggsave("forest_plot.png", width = 11, height = 5, dpi = 300)
Measuring trend using a Kaplan Meier plot
# --- Kaplan-Meier curves and log-rank test (strate analogue) ---
<- with(dat, Surv(time = y, event = death))
surv_obj
# KM by smoking
<- survfit(surv_obj ~ smoke_cat, data = dat)
fit_km ggsurvplot(fit_km, data = dat,
risk.table = F,
conf.int = T,
pval = TRUE,
xlab = "Follow-up time (years)",
legend.title = "Smoking")
e) Possible confounding effect of current age on the relationship between smoking and mortality
The mean ages at study entry were very similar across smoking groups (64.71–65.60 years), with overlapping age ranges of approximately 60–73 years. This indicates no large differences in baseline age, so adjusting for current age is unlikely to substantially change the smoking–mortality association compared with using follow-up time as the analysis scale. So age is not a possible confounding factor between smoking and and mortality.
# --- Step E: Check age distribution by smoking category ---
%>%
dat group_by(smoke_cat) %>%
summarise(
n = n(),
mean_age_entry = mean(ageent, na.rm = TRUE),
sd_age_entry = sd(ageent, na.rm = TRUE),
min_age_entry = min(ageent, na.rm = TRUE),
max_age_entry = max(ageent, na.rm = TRUE)
)
smoke_cat | n | mean_age_entry | sd_age_entry | min_age_entry | max_age_entry |
---|---|---|---|---|---|
non-smoker | 140 | 65.12610 | 3.203284 | 60.077 | 72.016 |
ex-smoker | 68 | 65.60484 | 3.571255 | 60.093 | 73.467 |
current-smoker | 109 | 64.71269 | 2.902213 | 60.005 | 73.930 |
NA | 1 | 67.24400 | NA | 67.244 | 67.244 |
# Visual check: density plot of baseline age by smoking category using facet_grid
library(ggplot2)
ggplot(dat, aes(x = ageent)) + # Removed color/fill from aes() as faceting separates them
geom_density(alpha = 0.5, fill = "pink") + # Added a common fill color
labs(x = "Age at study entry", y = "Density", title = "Age Distribution by Smoking Category") +
facet_grid(. ~ smoke_cat, scales = "free_y") + # Faceting by smoke_cat in columns, free y-scales
theme_minimal()
f) What was the age range at study entry (ageent) in each smoking group? Are there large differences in age between the groups?
The age range at study entry was 60–72 years for non-smokers, 60–73 years for ex-smokers, and 60–73 years for current smokers. Mean ages were 65.13, 65.60, and 64.71 years respectively, indicating no large age differences between groups. Since baseline ages are similar, using follow-up time as the analysis scale will not result in comparing men of very different actual ages, although minor age variation still warrants adjustment.
# --- Step F: If follow-up time is the analysis scale,
# are we comparing men of very different ages?
%>%
dat group_by(smoke_cat) %>%
summarise(
min_age = min(ageent, na.rm = TRUE),
max_age = max(ageent, na.rm = TRUE),
mean_age = mean(ageent, na.rm = TRUE),
sd_age = sd(ageent, na.rm = TRUE),
n = n()
)
smoke_cat | min_age | max_age | mean_age | sd_age | n |
---|---|---|---|---|---|
non-smoker | 60.077 | 72.016 | 65.12610 | 3.203284 | 140 |
ex-smoker | 60.093 | 73.467 | 65.60484 | 3.571255 | 68 |
current-smoker | 60.005 | 73.930 | 64.71269 | 2.902213 | 109 |
NA | 67.244 | 67.244 | 67.24400 | NA | 1 |
g) Cox regression and assumptions
After adjusting for age at study entry, there is no statistically significant difference in all-cause mortality risk between ex-smokers or current smokers and non-smokers (HR = 1.05 per year older; p = 0.12).
No strong evidence of PH violation overall (global p = 0.156). Smoking shows borderline evidence of time-varying effects (p = 0.076), and visual inspection of Schoenfeld residual plots confirm this (figures below).
# --- Step G: Age-adjusted Cox model ---
<- with(dat, Surv(y, death))
surv_obj
<- coxph(surv_obj ~ smoke_cat + ageent, data = dat)
cox_age_adj summary(cox_age_adj)
#> Call:
#> coxph(formula = surv_obj ~ smoke_cat + ageent, data = dat)
#>
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> coef exp(coef) se(coef) z Pr(>|z|)
#> smoke_catex-smoker 0.23028 1.25896 0.29904 0.770 0.4413
#> smoke_catcurrent-smoker 0.57251 1.77271 0.24180 2.368 0.0179 *
#> ageent 0.05228 1.05367 0.03377 1.548 0.1216
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> smoke_catex-smoker 1.259 0.7943 0.7006 2.262
#> smoke_catcurrent-smoker 1.773 0.5641 1.1036 2.847
#> ageent 1.054 0.9491 0.9862 1.126
#>
#> Concordance= 0.569 (se = 0.029 )
#> Likelihood ratio test= 7.82 on 3 df, p=0.05
#> Wald test = 7.75 on 3 df, p=0.05
#> Score (logrank) test = 7.88 on 3 df, p=0.05
# --- PH assumption check for adjusted model ---
<- cox.zph(cox_age_adj)
zph_age_adj
zph_age_adj#> chisq df p
#> smoke_cat 5.159 2 0.076
#> ageent 0.035 1 0.852
#> GLOBAL 5.225 3 0.156
plot(zph_age_adj)
H to J - Whitehal_new.dta
<- read_dta("whitehal_new.dta")
whitehal head(whitehal, n=10)
id | all | chd | sbp | smok | agein | grade | sbpgrp | timein | timeout | timebth | cholgrp | chol |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5001 | 0 | 0 | 120 | 4 | 47.48255 | 1 | 2 | 1967-09-13 | 1987-01-30 | 1920-03-20 | 4 | 273 |
5019 | 0 | 0 | 118 | 3 | 56.10678 | 0 | 1 | 1967-09-13 | 1987-01-30 | 1911-08-06 | 3 | 234 |
5038 | 0 | 0 | 147 | 3 | 51.44422 | 0 | 3 | 1967-09-13 | 1987-01-30 | 1916-04-03 | 4 | 295 |
5039 | 0 | 0 | 92 | 1 | 51.11567 | 0 | 1 | 1967-09-13 | 1987-01-30 | 1916-08-02 | 3 | 210 |
5042 | 1 | 0 | 128 | 2 | 55.48528 | 0 | 2 | 1967-09-13 | 1983-04-12 | 1912-03-20 | 4 | 287 |
5052 | 1 | 0 | 109 | 2 | 54.43669 | 0 | 1 | 1967-09-13 | 1984-07-10 | 1913-04-07 | 3 | 209 |
5064 | 1 | 0 | 145 | 1 | 46.37645 | 0 | 3 | 1967-09-15 | 1984-05-13 | 1921-04-30 | 4 | 262 |
5078 | 1 | 1 | 144 | 2 | 47.72348 | 0 | 3 | 1967-09-15 | 1983-11-29 | 1919-12-25 | 4 | 268 |
5089 | 0 | 0 | 154 | 1 | 47.92608 | 0 | 3 | 1967-09-15 | 1987-01-30 | 1919-10-11 | 2 | 187 |
5090 | 0 | 0 | 114 | 1 | 59.45790 | 0 | 1 | 1967-09-15 | 1987-01-30 | 1908-03-31 | 3 | 222 |
# packages
library(haven); library(dplyr); library(survival); library(survminer); library(broom)
# assume your data is in `wh` (from read_dta)
# wh <- read_dta("whitehal_new.dta")
# Clean labels → factors, build survival time (years)
<- whitehal %>%
wh2 mutate(
grade = haven::as_factor(grade, levels = "labels"), # 1=high, 2=low
smok = haven::as_factor(smok, levels = "labels"),
sbpgrp= haven::as_factor(sbpgrp, levels = "labels"),
cholgrp = haven::as_factor(cholgrp, levels = "labels"),
fu_years = as.numeric(timeout - timein) / 365.25,
chd = as.integer(chd) # ensure 0/1
)
# quick checks
summary(dplyr::
select(wh2, fu_years, chd, grade, agein, sbp, chol, smok))
#> fu_years chd grade agein
#> Min. : 0.1506 Min. :0.00000 High job grade:1194 Min. :40.05
#> 1st Qu.:17.1636 1st Qu.:0.00000 Low job grade : 483 1st Qu.:46.77
#> Median :17.9602 Median :0.00000 Median :51.79
#> Mean :16.4612 Mean :0.09183 Mean :52.10
#> 3rd Qu.:18.5626 3rd Qu.:0.00000 3rd Qu.:57.48
#> Max. :19.3812 Max. :1.00000 Max. :69.47
#>
#> sbp chol smok
#> Min. : 87.0 Min. : 75.0 Never smoked :317
#> 1st Qu.:120.0 1st Qu.:165.0 Ex-smoker :646
#> Median :133.0 Median :192.0 1-14 cig/day :310
#> Mean :135.8 Mean :196.4 15-24 cig/day:279
#> 3rd Qu.:148.0 3rd Qu.:225.0 25+ cig/day :125
#> Max. :252.0 Max. :480.0
#> NA's :26
table(wh2$grade, useNA = "ifany")
#>
#> High job grade Low job grade
#> 1194 483
head(wh2, n= 10)
id | all | chd | sbp | smok | agein | grade | sbpgrp | timein | timeout | timebth | cholgrp | chol | fu_years |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5001 | 0 | 0 | 120 | 15-24 cig/day | 47.48255 | Low job grade | 120-139 | 1967-09-13 | 1987-01-30 | 1920-03-20 | 250+ | 273 | 19.38123 |
5019 | 0 | 0 | 118 | 1-14 cig/day | 56.10678 | High job grade | <120 | 1967-09-13 | 1987-01-30 | 1911-08-06 | 200-249 | 234 | 19.38123 |
5038 | 0 | 0 | 147 | 1-14 cig/day | 51.44422 | High job grade | 140-159 | 1967-09-13 | 1987-01-30 | 1916-04-03 | 250+ | 295 | 19.38123 |
5039 | 0 | 0 | 92 | Never smoked | 51.11567 | High job grade | <120 | 1967-09-13 | 1987-01-30 | 1916-08-02 | 200-249 | 210 | 19.38123 |
5042 | 1 | 0 | 128 | Ex-smoker | 55.48528 | High job grade | 120-139 | 1967-09-13 | 1983-04-12 | 1912-03-20 | 250+ | 287 | 15.57837 |
5052 | 1 | 0 | 109 | Ex-smoker | 54.43669 | High job grade | <120 | 1967-09-13 | 1984-07-10 | 1913-04-07 | 200-249 | 209 | 16.82141 |
5064 | 1 | 0 | 145 | Never smoked | 46.37645 | High job grade | 140-159 | 1967-09-15 | 1984-05-13 | 1921-04-30 | 250+ | 262 | 16.66248 |
5078 | 1 | 1 | 144 | Ex-smoker | 47.72348 | High job grade | 140-159 | 1967-09-15 | 1983-11-29 | 1919-12-25 | 250+ | 268 | 16.20801 |
5089 | 0 | 0 | 154 | Never smoked | 47.92608 | High job grade | 140-159 | 1967-09-15 | 1987-01-30 | 1919-10-11 | 150-199 | 187 | 19.37842 |
5090 | 0 | 0 | 114 | Never smoked | 59.45790 | High job grade | <120 | 1967-09-15 | 1987-01-30 | 1908-03-31 | 200-249 | 222 | 19.37842 |
i) Cox model: effect of job grade (follow-up time scale)
Using follow-up time as the analysis scale, the unadjusted Cox regression (output below) showed that participants in the low job grade had more than twice the hazard of CHD mortality compared with those in the high job grade (HR = 2.04; 95% CI: 1.48–2.82; p < 0.001). However, after adjusting for age at entry, systolic blood pressure, cholesterol, and smoking, the association attenuated and became non-significant (HR for low vs high = 1.11; equivalent to HR for high vs low = 0.905; 95% CI: 0.637–1.284; p = 0.575).
In the adjusted model, age at entry (HR = 1.10 per year; 95% CI: 1.07–1.13), systolic blood pressure (HR = 1.016 per mmHg; 95% CI: 1.009–1.023), cholesterol (HR = 1.007 per unit; 95% CI: 1.004–1.011), and all categories of smoking were strong independent predictors of CHD mortality. Proportional hazards testing indicated some evidence of violation for grade in the unadjusted model (p = 0.0067) and borderline evidence in the adjusted model (p = 0.031), but the global test for the adjusted model was not significant (p = 0.232).
# survival object
<- with(wh2, Surv(fu_years, chd))
S_chd
# Ensure reference category is grade=1 high
<- wh2 %>% mutate(grade = relevel(factor(grade), ref = "High job grade"))
wh2
## Unadjusted Cox: CHD ~ grade
<- coxph(S_chd ~ grade, data = wh2)
cox_grade summary(cox_grade)
#> Call:
#> coxph(formula = S_chd ~ grade, data = wh2)
#>
#> n= 1677, number of events= 154
#>
#> coef exp(coef) se(coef) z Pr(>|z|)
#> gradeLow job grade 0.7140 2.0421 0.1637 4.362 1.29e-05 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> gradeLow job grade 2.042 0.4897 1.482 2.815
#>
#> Concordance= 0.585 (se = 0.02 )
#> Likelihood ratio test= 17.94 on 1 df, p=2e-05
#> Wald test = 19.03 on 1 df, p=1e-05
#> Score (logrank) test = 19.85 on 1 df, p=8e-06
# KM curves + log-rank (nice analogue to a simple stratified comparison)
<- survfit(S_chd ~ grade, data = wh2)
fit_km1 ggsurvplot(fit_km1, data = wh2, risk.table = F, pval = TRUE, conf.int = T,
xlab = "Follow-up (years)", legend.title = "Job grade")
## Adjusted Cox (optional; include if you decide to adjust)
<- coxph(S_chd ~ grade + agein + sbp + chol + smok, data = wh2)
cox_adj
summary(cox_adj)
#> Call:
#> coxph(formula = S_chd ~ grade + agein + sbp + chol + smok, data = wh2)
#>
#> n= 1651, number of events= 151
#> (26 observations deleted due to missingness)
#>
#> coef exp(coef) se(coef) z Pr(>|z|)
#> gradeLow job grade 0.100190 1.105381 0.178687 0.561 0.575000
#> agein 0.095873 1.100620 0.013930 6.882 5.88e-12 ***
#> sbp 0.016100 1.016230 0.003615 4.453 8.45e-06 ***
#> chol 0.007367 1.007394 0.001633 4.513 6.41e-06 ***
#> smokEx-smoker 0.769555 2.158804 0.332196 2.317 0.020527 *
#> smok1-14 cig/day 1.050483 2.859032 0.349535 3.005 0.002653 **
#> smok15-24 cig/day 1.320346 3.744718 0.348945 3.784 0.000154 ***
#> smok25+ cig/day 1.265059 3.543302 0.398100 3.178 0.001484 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> gradeLow job grade 1.105 0.9047 0.7788 1.569
#> agein 1.101 0.9086 1.0710 1.131
#> sbp 1.016 0.9840 1.0091 1.023
#> chol 1.007 0.9927 1.0042 1.011
#> smokEx-smoker 2.159 0.4632 1.1258 4.140
#> smok1-14 cig/day 2.859 0.3498 1.4411 5.672
#> smok15-24 cig/day 3.745 0.2670 1.8897 7.421
#> smok25+ cig/day 3.543 0.2822 1.6238 7.732
#>
#> Concordance= 0.759 (se = 0.018 )
#> Likelihood ratio test= 135.4 on 8 df, p=<2e-16
#> Wald test = 126.8 on 8 df, p=<2e-16
#> Score (logrank) test = 136.6 on 8 df, p=<2e-16
# PH checks
<- cox.zph(cox_grade); zph_unadj
zph_unadj #> chisq df p
#> grade 7.35 1 0.0067
#> GLOBAL 7.35 1 0.0067
<- cox.zph(cox_adj); zph_adj
zph_adj #> chisq df p
#> grade 4.644 1 0.031
#> agein 3.597 1 0.058
#> sbp 0.682 1 0.409
#> chol 1.326 1 0.250
#> smok 2.907 4 0.574
#> GLOBAL 10.488 8 0.232
j) Parametric (preferred) vs Frailty; interpretation for categorical & continuous
A Weibull parametric model was fitted as the most suitable choice given the absence of clustering. In the unadjusted model, the shape parameter (k = 1.36; 95% CI: 1.17–1.58) indicated an increasing hazard over time, and men in the high job grade had a 69% longer expected survival time to CHD death compared with those in the low grade (time ratio [TR] = 1.69; 95% CI: 1.32–2.16). After adjustment for age, systolic blood pressure, cholesterol, and smoking, the shape remained >1 (k = 1.45; 95% CI: 1.25–1.68), but the grade effect was no longer significant (TR = 1.07; 95% CI: 0.84–1.36).
For the continuous variable age at entry, the adjusted time ratio was 0.94 per year (95% CI: 0.92–0.96), indicating that each additional year of baseline age shortened the expected time to CHD death by approximately 6%. Similarly, higher systolic blood pressure and cholesterol were associated with shorter survival times (TR = 0.989 and TR = 0.995 per unit, respectively), and all smoking categories had TRs well below 1, indicating shorter survival compared with non-smokers. These parametric findings were consistent with the adjusted Cox model, reinforcing that job grade was not an independent predictor once key cardiovascular risk factors were included, while age, blood pressure, cholesterol, and smoking remained important determinants of CHD mortality.
# install.packages("flexsurv")
library(flexsurv)
# Unadjusted Weibull
<- flexsurvreg(S_chd ~ grade, data = wh2, dist = "weibull")
wb_unadj
wb_unadj#> Call:
#> flexsurvreg(formula = S_chd ~ grade, data = wh2, dist = "weibull")
#>
#> Estimates:
#> data mean est L95% U95% se exp(est)
#> shape NA 1.357 1.166 1.581 0.105 NA
#> scale NA 115.375 83.463 159.490 19.060 NA
#> gradeLow job grade 0.288 -0.525 -0.771 -0.278 0.126 0.592
#> L95% U95%
#> shape NA NA
#> scale NA NA
#> gradeLow job grade 0.462 0.757
#>
#> N = 1677, Events: 154, Censored: 1523
#> Total time at risk: 27605.37
#> Log-likelihood = -937.7307, df = 3
#> AIC = 1881.461
# Adjusted Weibull (if mirroring cox_adj)
<- flexsurvreg(S_chd ~ grade + agein + sbp + chol + smok,
wb_adj data = wh2, dist = "weibull")
wb_adj#> Call:
#> flexsurvreg(formula = S_chd ~ grade + agein + sbp + chol + smok,
#> data = wh2, dist = "weibull")
#>
#> Estimates:
#> data mean est L95% U95% se
#> shape NA 1.45e+00 1.25e+00 1.68e+00 1.11e-01
#> scale NA 7.30e+04 1.30e+04 4.11e+05 6.44e+04
#> gradeLow job grade 2.89e-01 -6.96e-02 -3.11e-01 1.72e-01 1.23e-01
#> agein 5.21e+01 -6.56e-02 -8.61e-02 -4.50e-02 1.05e-02
#> sbp 1.36e+02 -1.11e-02 -1.62e-02 -6.01e-03 2.59e-03
#> chol 1.96e+02 -5.06e-03 -7.35e-03 -2.77e-03 1.17e-03
#> smokEx-smoker 3.86e-01 -5.29e-01 -9.85e-01 -7.32e-02 2.32e-01
#> smok1-14 cig/day 1.82e-01 -7.24e-01 -1.21e+00 -2.40e-01 2.47e-01
#> smok15-24 cig/day 1.67e-01 -9.07e-01 -1.39e+00 -4.20e-01 2.48e-01
#> smok25+ cig/day 7.45e-02 -8.66e-01 -1.42e+00 -3.15e-01 2.81e-01
#> exp(est) L95% U95%
#> shape NA NA NA
#> scale NA NA NA
#> gradeLow job grade 9.33e-01 7.33e-01 1.19e+00
#> agein 9.37e-01 9.17e-01 9.56e-01
#> sbp 9.89e-01 9.84e-01 9.94e-01
#> chol 9.95e-01 9.93e-01 9.97e-01
#> smokEx-smoker 5.89e-01 3.74e-01 9.29e-01
#> smok1-14 cig/day 4.85e-01 2.99e-01 7.87e-01
#> smok15-24 cig/day 4.04e-01 2.48e-01 6.57e-01
#> smok25+ cig/day 4.21e-01 2.42e-01 7.30e-01
#>
#> N = 1651, Events: 151, Censored: 1500
#> Total time at risk: 27200.02
#> Log-likelihood = -861.7449, df = 10
#> AIC = 1743.49
Missing values
<- read_dta("trinmlsh.dta")
dat head(dat, n=10)
ethgp | ageent | death | cvdeath | alc | smokenum | hdlc | diabp | sysbp | chdstart | days | years | bmi | id | timein | timeout | y | timebth |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 62.984 | 1 | 0 | 0 | 0 | 0.767 | 120 | 198 | NA | 1409 | 3.857632 | NA | 62 | 1977-01-01 | 1980-11-10 | 3.857632 | 1914-01-07 |
3 | 60.553 | 0 | 0 | 0 | 5 | NA | 66 | 150 | 0 | 1526 | 4.177960 | NA | 166 | 1977-01-01 | 1981-03-07 | 4.177960 | 1916-06-13 |
5 | 67.617 | 1 | 0 | 0 | 1 | 0.563 | 81 | 135 | 0 | 367 | 1.004791 | 20.58 | 68 | 1977-01-01 | 1978-01-03 | 1.004791 | 1909-05-20 |
4 | 60.424 | 0 | 0 | 0 | 1 | 0.684 | 90 | 170 | 1 | 3494 | 9.566050 | 26.24 | 49 | 1977-01-01 | 1986-07-27 | 9.566050 | 1916-07-30 |
1 | 71.723 | 0 | 0 | 0 | 1 | 1.099 | 98 | 158 | 0 | 2546 | 6.970568 | 23.40 | 287 | 1977-01-01 | 1983-12-22 | 6.970568 | 1905-04-12 |
1 | 69.051 | 0 | 0 | 0 | 1 | NA | 64 | 112 | NA | 3452 | 9.451061 | NA | 69 | 1977-01-01 | 1986-06-15 | 9.451061 | 1907-12-14 |
1 | 67.578 | 0 | 0 | 0 | 0 | 0.850 | 80 | 126 | 0 | 3123 | 8.550308 | 19.73 | 139 | 1977-01-01 | 1985-07-21 | 8.550308 | 1909-06-04 |
3 | 61.895 | 0 | 0 | 0 | 0 | 0.850 | 80 | 115 | 0 | 3262 | 8.930869 | 15.31 | 109 | 1977-01-01 | 1985-12-07 | 8.930869 | 1915-02-08 |
1 | 62.316 | 0 | 0 | 0 | 0 | 0.979 | 91 | 129 | 0 | 3416 | 9.352498 | 23.74 | 81 | 1977-01-01 | 1986-05-10 | 9.352498 | 1914-09-08 |
1 | 68.950 | 0 | 0 | 0 | 0 | 0.748 | 83 | 145 | 0 | 3207 | 8.780288 | 26.36 | 153 | 1977-01-01 | 1985-10-13 | 8.780288 | 1908-01-20 |
#pre-proccesssed
<- read_rds("data_new.rds")
data_new head(data_new, n=10)
ethgp | ageent | death | cvdeath | alc | smokenum | hdlc | diabp | sysbp | chdstart | days | years | bmi | id | timein | timeout | y | timebth | smoke_cat |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 62.984 | 1 | 0 | 0 | 0 | 0.767 | 120 | 198 | NA | 1409 | 3.857632 | NA | 62 | 1977-01-01 | 1980-11-10 | 3.857632 | 1914-01-07 | non-smoker |
3 | 60.553 | 0 | 0 | 0 | 5 | NA | 66 | 150 | 0 | 1526 | 4.177960 | NA | 166 | 1977-01-01 | 1981-03-07 | 4.177960 | 1916-06-13 | current-smoker |
5 | 67.617 | 1 | 0 | 0 | 1 | 0.563 | 81 | 135 | 0 | 367 | 1.004791 | 20.58 | 68 | 1977-01-01 | 1978-01-03 | 1.004791 | 1909-05-20 | ex-smoker |
4 | 60.424 | 0 | 0 | 0 | 1 | 0.684 | 90 | 170 | 1 | 3494 | 9.566050 | 26.24 | 49 | 1977-01-01 | 1986-07-27 | 9.566050 | 1916-07-30 | ex-smoker |
1 | 71.723 | 0 | 0 | 0 | 1 | 1.099 | 98 | 158 | 0 | 2546 | 6.970568 | 23.40 | 287 | 1977-01-01 | 1983-12-22 | 6.970568 | 1905-04-12 | ex-smoker |
1 | 69.051 | 0 | 0 | 0 | 1 | NA | 64 | 112 | NA | 3452 | 9.451061 | NA | 69 | 1977-01-01 | 1986-06-15 | 9.451061 | 1907-12-14 | ex-smoker |
1 | 67.578 | 0 | 0 | 0 | 0 | 0.850 | 80 | 126 | 0 | 3123 | 8.550308 | 19.73 | 139 | 1977-01-01 | 1985-07-21 | 8.550308 | 1909-06-04 | non-smoker |
3 | 61.895 | 0 | 0 | 0 | 0 | 0.850 | 80 | 115 | 0 | 3262 | 8.930869 | 15.31 | 109 | 1977-01-01 | 1985-12-07 | 8.930869 | 1915-02-08 | non-smoker |
1 | 62.316 | 0 | 0 | 0 | 0 | 0.979 | 91 | 129 | 0 | 3416 | 9.352498 | 23.74 | 81 | 1977-01-01 | 1986-05-10 | 9.352498 | 1914-09-08 | non-smoker |
1 | 68.950 | 0 | 0 | 0 | 0 | 0.748 | 83 | 145 | 0 | 3207 | 8.780288 | 26.36 | 153 | 1977-01-01 | 1985-10-13 | 8.780288 | 1908-01-20 | non-smoker |
<- data_new %>% dplyr:: select(-smokenum)
data_new1 head(data_new1, n=10)
ethgp | ageent | death | cvdeath | alc | hdlc | diabp | sysbp | chdstart | days | years | bmi | id | timein | timeout | y | timebth | smoke_cat |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2 | 62.984 | 1 | 0 | 0 | 0.767 | 120 | 198 | NA | 1409 | 3.857632 | NA | 62 | 1977-01-01 | 1980-11-10 | 3.857632 | 1914-01-07 | non-smoker |
3 | 60.553 | 0 | 0 | 0 | NA | 66 | 150 | 0 | 1526 | 4.177960 | NA | 166 | 1977-01-01 | 1981-03-07 | 4.177960 | 1916-06-13 | current-smoker |
5 | 67.617 | 1 | 0 | 0 | 0.563 | 81 | 135 | 0 | 367 | 1.004791 | 20.58 | 68 | 1977-01-01 | 1978-01-03 | 1.004791 | 1909-05-20 | ex-smoker |
4 | 60.424 | 0 | 0 | 0 | 0.684 | 90 | 170 | 1 | 3494 | 9.566050 | 26.24 | 49 | 1977-01-01 | 1986-07-27 | 9.566050 | 1916-07-30 | ex-smoker |
1 | 71.723 | 0 | 0 | 0 | 1.099 | 98 | 158 | 0 | 2546 | 6.970568 | 23.40 | 287 | 1977-01-01 | 1983-12-22 | 6.970568 | 1905-04-12 | ex-smoker |
1 | 69.051 | 0 | 0 | 0 | NA | 64 | 112 | NA | 3452 | 9.451061 | NA | 69 | 1977-01-01 | 1986-06-15 | 9.451061 | 1907-12-14 | ex-smoker |
1 | 67.578 | 0 | 0 | 0 | 0.850 | 80 | 126 | 0 | 3123 | 8.550308 | 19.73 | 139 | 1977-01-01 | 1985-07-21 | 8.550308 | 1909-06-04 | non-smoker |
3 | 61.895 | 0 | 0 | 0 | 0.850 | 80 | 115 | 0 | 3262 | 8.930869 | 15.31 | 109 | 1977-01-01 | 1985-12-07 | 8.930869 | 1915-02-08 | non-smoker |
1 | 62.316 | 0 | 0 | 0 | 0.979 | 91 | 129 | 0 | 3416 | 9.352498 | 23.74 | 81 | 1977-01-01 | 1986-05-10 | 9.352498 | 1914-09-08 | non-smoker |
1 | 68.950 | 0 | 0 | 0 | 0.748 | 83 | 145 | 0 | 3207 | 8.780288 | 26.36 | 153 | 1977-01-01 | 1985-10-13 | 8.780288 | 1908-01-20 | non-smoker |
k) Redo the modelling after administering the appropriate missing values imputation procedures
Multiple imputation using the mice package was performed to address missing values in variables including smoke_cat, hdlc, diabp, sysbp, chdstart, and bmi. Variables were imputed under the assumption of Missing at Random (MAR), meaning the probability of missingness could be explained by the observed data. For categorical variables such as smoke_cat, polytomous logistic regression (polyreg) was used, while continuous variables were imputed using predictive mean matching (pmm). The imputation model excluded outcome and follow-up time variables to avoid bias. Ten imputed datasets were generated, and Cox regression models were fitted within each dataset and pooled to obtain final estimates.
When comparing the Cox regression results before and after multiple imputation, the imputed model shows greater statistical efficiency and retention of sample size, as it avoided the loss of 44 observations due to missingness.
The hazard ratio for ex-smokers was attenuated after imputation (HR 1.12, 95% CI 0.61–2.04, p = 0.71) compared with the complete-case analysis (HR 1.79, 95% CI 0.91–3.55, p = 0.09), suggesting that the apparent excess risk among ex-smokers in the complete-case analysis may have been overestimated due to selection bias from missing data.
For current smokers, the association remained robust in both models, although slightly attenuated after imputation (HR 1.83, 95% CI 1.12–3.00, p = 0.017) versus the complete-case estimate (HR 2.45, 95% CI 1.35–4.46, p = 0.003).
Other covariates showed similar directions of effect in both models, but the imputed model provided narrower confidence intervals for several predictors, indicating improved precision.
Overall, the imputation improved the model by preserving statistical power, reducing bias from case-wise deletion, and yielding more stable effect estimates, particularly clarifying that the elevated risk is strongest and most consistent among current smokers.
# Count missing values for each variable
<- sapply(data_new1, function(x) sum(is.na(x)))
missing_summary
# Turn into a data frame for nicer display
<- data.frame(
missing_df Variable = names(missing_summary),
Missing_Count = missing_summary,
Missing_Percent = round(100 * missing_summary / nrow(data_new1), 2)
)
# Show only variables with at least 1 missing value
<- missing_df[missing_df$Missing_Count > 0, ]
missing_df
missing_df
Variable | Missing_Count | Missing_Percent | |
---|---|---|---|
hdlc | hdlc | 17 | 5.35 |
diabp | diabp | 4 | 1.26 |
sysbp | sysbp | 4 | 1.26 |
chdstart | chdstart | 28 | 8.81 |
bmi | bmi | 35 | 11.01 |
smoke_cat | smoke_cat | 1 | 0.31 |
# Make sure smoke_cat is a factor
$smoke_cat <- factor(data_new1$smoke_cat)
data_new1
# Correct the imputation method for categorical variables
<- make.method(data_new1)
my_methods "smoke_cat"] <- "polyreg" # unordered categorical
my_methods["hdlc"] <- "pmm"
my_methods["diabp"] <- "pmm"
my_methods["sysbp"] <- "pmm"
my_methods["chdstart"] <- "pmm"
my_methods["bmi"] <- "pmm"
my_methods[
<- make.predictorMatrix(data_new1)
pred_matrix c("death", "y")] <- 0 # Do not use outcome/time to impute predictors pred_matrix[,
print(my_methods)
#> ethgp ageent death cvdeath alc hdlc diabp sysbp
#> "" "" "" "" "" "pmm" "pmm" "pmm"
#> chdstart days years bmi id timein timeout y
#> "pmm" "" "" "pmm" "" "" "" ""
#> timebth smoke_cat
#> "" "polyreg"
set.seed(123)
<- mice(data_new, m = 10,
imp method = my_methods,
predictorMatrix = pred_matrix,
printFlag = TRUE)
#>
#> iter imp variable
#> 1 1 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 2 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 3 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 4 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 5 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 6 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 7 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 8 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 9 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 1 10 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 1 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 2 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 3 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 4 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 5 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 6 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 7 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 8 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 9 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 2 10 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 1 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 2 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 3 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 4 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 5 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 6 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 7 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 8 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 9 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 3 10 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 1 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 2 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 3 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 4 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 5 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 6 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 7 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 8 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 9 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 4 10 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 1 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 2 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 3 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 4 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 5 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 6 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 7 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 8 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 9 hdlc* diabp* sysbp* chdstart* bmi smoke_cat
#> 5 10 hdlc* diabp* sysbp* chdstart bmi smoke_cat
# ===============================
# 7. Check convergence
# ===============================
plot(imp)
<- coxph(Surv(y, death) ~ smoke_cat+ageent+sysbp+bmi+diabp+ hdlc+chdstart, data =data_new1)
fit_cox
fit_cox#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart, data = data_new1)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.584613 1.794296 0.346474 1.687 0.09154
#> smoke_catcurrent-smoker 0.897908 2.454463 0.297975 3.013 0.00258
#> ageent 0.035336 1.035968 0.040442 0.874 0.38226
#> sysbp 0.007397 1.007425 0.006667 1.110 0.26716
#> bmi -0.035497 0.965125 0.032282 -1.100 0.27151
#> diabp 0.001391 1.001392 0.013074 0.106 0.91529
#> hdlc -0.360503 0.697326 0.343396 -1.050 0.29380
#> chdstart 0.608962 1.838522 0.322438 1.889 0.05894
#>
#> Likelihood ratio test=18.36 on 8 df, p=0.01865
#> n= 274, number of events= 68
#> (44 observations deleted due to missingness)
<- with(imp, coxph(Surv(y, death) ~ smoke_cat+ageent+sysbp+bmi+diabp+ hdlc+chdstart)))
(fit_imp #> call :
#> with.mids(data = imp, expr = coxph(Surv(y, death) ~ smoke_cat +
#> ageent + sysbp + bmi + diabp + hdlc + chdstart))
#>
#> call1 :
#> mice(data = data_new, m = 10, method = my_methods, predictorMatrix = pred_matrix,
#> printFlag = TRUE)
#>
#> nmis :
#> [1] 0 0 0 0 0 1 17 4 4 28 0 0 35 0 0 0 0 0 1
#>
#> analyses :
#> [[1]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.133415 1.142724 0.300007 0.445 0.6565
#> smoke_catcurrent-smoker 0.571208 1.770404 0.246502 2.317 0.0205
#> ageent 0.048825 1.050036 0.034659 1.409 0.1589
#> sysbp 0.014213 1.014315 0.005173 2.748 0.0060
#> bmi -0.031616 0.968879 0.028512 -1.109 0.2675
#> diabp -0.012251 0.987823 0.009782 -1.252 0.2104
#> hdlc -0.172897 0.841224 0.288575 -0.599 0.5491
#> chdstart 0.618227 1.855634 0.288445 2.143 0.0321
#>
#> Likelihood ratio test=20.37 on 8 df, p=0.009033
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[2]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.141884 1.152443 0.300017 0.473 0.6363
#> smoke_catcurrent-smoker 0.578078 1.782610 0.247540 2.335 0.0195
#> ageent 0.050276 1.051562 0.034670 1.450 0.1470
#> sysbp 0.011320 1.011385 0.005407 2.094 0.0363
#> bmi -0.021171 0.979052 0.028069 -0.754 0.4507
#> diabp -0.008648 0.991389 0.010112 -0.855 0.3924
#> hdlc -0.248338 0.780096 0.292367 -0.849 0.3957
#> chdstart 0.615726 1.850999 0.288033 2.138 0.0325
#>
#> Likelihood ratio test=17.48 on 8 df, p=0.0255
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[3]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.116696 1.123777 0.302141 0.386 0.6993
#> smoke_catcurrent-smoker 0.619795 1.858547 0.247450 2.505 0.0123
#> ageent 0.058941 1.060713 0.034507 1.708 0.0876
#> sysbp 0.003372 1.003378 0.005857 0.576 0.5649
#> bmi -0.010719 0.989338 0.027513 -0.390 0.6968
#> diabp 0.012465 1.012544 0.010670 1.168 0.2427
#> hdlc -0.153901 0.857357 0.282982 -0.544 0.5865
#> chdstart 0.641471 1.899273 0.286918 2.236 0.0254
#>
#> Likelihood ratio test=18.47 on 8 df, p=0.01794
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[4]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.089126 1.093219 0.301436 0.296 0.767480
#> smoke_catcurrent-smoker 0.649951 1.915448 0.247782 2.623 0.008714
#> ageent 0.053973 1.055456 0.034808 1.551 0.120997
#> sysbp 0.013209 1.013297 0.005086 2.597 0.009404
#> bmi -0.011250 0.988813 0.028185 -0.399 0.689777
#> diabp -0.011651 0.988417 0.009491 -1.228 0.219596
#> hdlc -0.342012 0.710340 0.309381 -1.105 0.268955
#> chdstart 0.844013 2.325682 0.243671 3.464 0.000533
#>
#> Likelihood ratio test=26.22 on 8 df, p=0.0009633
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[5]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.080068 1.083361 0.301436 0.266 0.79053
#> smoke_catcurrent-smoker 0.590055 1.804088 0.247735 2.382 0.01723
#> ageent 0.056250 1.057862 0.034423 1.634 0.10225
#> sysbp 0.002542 1.002545 0.006048 0.420 0.67426
#> bmi -0.034094 0.966481 0.027950 -1.220 0.22253
#> diabp 0.015026 1.015139 0.010697 1.405 0.16012
#> hdlc -0.672523 0.510419 0.322773 -2.084 0.03720
#> chdstart 0.768687 2.156932 0.274265 2.803 0.00507
#>
#> Likelihood ratio test=25.06 on 8 df, p=0.001517
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[6]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.083990 1.087618 0.300349 0.280 0.77975
#> smoke_catcurrent-smoker 0.621128 1.861026 0.246596 2.519 0.01178
#> ageent 0.056893 1.058542 0.034282 1.660 0.09700
#> sysbp 0.004767 1.004779 0.005817 0.820 0.41250
#> bmi -0.024026 0.976261 0.028206 -0.852 0.39433
#> diabp 0.010908 1.010968 0.010556 1.033 0.30140
#> hdlc -0.513054 0.598664 0.320305 -1.602 0.10921
#> chdstart 0.867019 2.379807 0.254888 3.402 0.00067
#>
#> Likelihood ratio test=25.52 on 8 df, p=0.00127
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[7]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.124178 1.132218 0.301129 0.412 0.6801
#> smoke_catcurrent-smoker 0.632753 1.882786 0.246917 2.563 0.0104
#> ageent 0.060462 1.062327 0.034421 1.757 0.0790
#> sysbp 0.002638 1.002641 0.005848 0.451 0.6519
#> bmi 0.001439 1.001440 0.027427 0.052 0.9582
#> diabp 0.013544 1.013636 0.010651 1.272 0.2035
#> hdlc 0.019240 1.019426 0.269333 0.071 0.9431
#> chdstart 0.636044 1.888994 0.286768 2.218 0.0266
#>
#> Likelihood ratio test=18.71 on 8 df, p=0.01649
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[8]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.132729 1.141941 0.300117 0.442 0.65830
#> smoke_catcurrent-smoker 0.572822 1.773265 0.246455 2.324 0.02011
#> ageent 0.046800 1.047913 0.034900 1.341 0.17993
#> sysbp 0.015836 1.015962 0.005094 3.109 0.00188
#> bmi -0.029831 0.970609 0.028819 -1.035 0.30061
#> diabp -0.014591 0.985515 0.009696 -1.505 0.13236
#> hdlc -0.029871 0.970571 0.281960 -0.106 0.91563
#> chdstart 0.596817 1.816329 0.288393 2.069 0.03850
#>
#> Likelihood ratio test=21.88 on 8 df, p=0.005139
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[9]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.088270 1.092283 0.299646 0.295 0.768315
#> smoke_catcurrent-smoker 0.593003 1.809415 0.246542 2.405 0.016160
#> ageent 0.058144 1.059867 0.034420 1.689 0.091176
#> sysbp 0.009854 1.009903 0.005672 1.737 0.082325
#> bmi -0.045921 0.955118 0.028693 -1.600 0.109503
#> diabp 0.004997 1.005010 0.010288 0.486 0.627163
#> hdlc -0.747038 0.473768 0.332247 -2.248 0.024548
#> chdstart 0.872675 2.393305 0.250838 3.479 0.000503
#>
#> Likelihood ratio test=31.26 on 8 df, p=0.0001265
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
#>
#> [[10]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp +
#> bmi + diabp + hdlc + chdstart)
#>
#> coef exp(coef) se(coef) z p
#> smoke_catex-smoker 0.148782 1.160420 0.299364 0.497 0.61919
#> smoke_catcurrent-smoker 0.617236 1.853797 0.247265 2.496 0.01255
#> ageent 0.054285 1.055785 0.034400 1.578 0.11456
#> sysbp 0.011574 1.011641 0.005416 2.137 0.03260
#> bmi -0.019960 0.980237 0.028591 -0.698 0.48509
#> diabp -0.007853 0.992178 0.010108 -0.777 0.43723
#> hdlc -0.649782 0.522160 0.327854 -1.982 0.04749
#> chdstart 0.807447 2.242176 0.270274 2.988 0.00281
#>
#> Likelihood ratio test=23.96 on 8 df, p=0.002329
#> n= 317, number of events= 88
#> (1 observation deleted due to missingness)
<- pool(fit_imp)
pooled_fit summary(pooled_fit, conf.int = TRUE, exponentiate = TRUE)
term | estimate | std.error | statistic | df | p.value | 2.5 % | 97.5 % | conf.low | conf.high |
---|---|---|---|---|---|---|---|---|---|
smoke_catex-smoker | 1.1206554 | 0.3018151 | 0.3774289 | 77.38172 | 0.7068881 | 0.6144489 | 2.043894 | 0.6144489 | 2.043894 |
smoke_catcurrent-smoker | 1.8305254 | 0.2487275 | 2.4307845 | 76.92589 | 0.0173945 | 1.1155144 | 3.003837 | 1.1155144 | 3.003837 |
ageent | 1.0559965 | 0.0348788 | 1.5621195 | 76.37623 | 0.1223941 | 0.9851348 | 1.131955 | 0.9851348 | 1.131955 |
sysbp | 1.0089726 | 0.0077252 | 1.1562847 | 19.69379 | 0.2613995 | 0.9928278 | 1.025380 | 0.9928278 | 1.025380 |
bmi | 0.9775411 | 0.0316092 | -0.7186186 | 48.26172 | 0.4758409 | 0.9173552 | 1.041676 | 0.9173552 | 1.041676 |
diabp | 1.0001947 | 0.0163862 | 0.0118832 | 13.42001 | 0.9906936 | 0.9655154 | 1.036120 | 0.9655154 | 1.036120 |
hdlc | 0.7039713 | 0.4211070 | -0.8335594 | 19.87747 | 0.4144341 | 0.2923572 | 1.695103 | 0.2923572 | 1.695103 |
chdstart | 2.0684771 | 0.2992412 | 2.4288523 | 54.76648 | 0.0184571 | 1.1354879 | 3.768070 | 1.1354879 | 3.768070 |
Part II
<- read_dta("paedsurv_new.dta")
paed head(paed, n=10)
idno | sex | chiv | dob | date_enr | date_out | died | ill3m | stunted | cd4cat |
---|---|---|---|---|---|---|---|---|---|
1001 | 1 | 0 | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | 0 | 1 | NA |
1002 | 2 | 0 | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | 0 | 1 | NA |
1003 | 2 | 1 | 2003-08-10 | 2006-05-24 | 2010-03-12 | 0 | 1 | 0 | 3 |
1004 | 1 | 0 | 2009-01-24 | 2009-03-09 | 2010-03-18 | 0 | 0 | 0 | NA |
1005 | 2 | 0 | 2004-12-23 | 2005-04-18 | 2010-01-25 | 0 | 0 | 0 | NA |
1006 | 1 | 0 | 2008-04-05 | 2008-05-27 | 2010-01-25 | 0 | 1 | 0 | NA |
1007 | 1 | 0 | 2002-06-03 | 2002-09-24 | 2010-03-18 | 0 | 1 | 0 | 1 |
1008 | 1 | 0 | 2007-05-02 | 2007-07-25 | 2009-10-23 | 0 | 1 | 0 | 1 |
1009 | 2 | 0 | 2007-07-25 | 2007-10-26 | 2009-11-03 | 0 | 1 | 1 | 1 |
1010 | 1 | 1 | 2008-11-16 | 2009-01-09 | 2010-01-13 | 0 | 1 | 0 | NA |
<- paed %>%
paed mutate(
sex = if (inherits(sex, "haven_labelled")) as_factor(sex) else as.factor(sex),
chiv = if (inherits(chiv, "haven_labelled")) as_factor(chiv) else as.factor(chiv),
ill3m = if (inherits(ill3m, "haven_labelled")) as_factor(ill3m) else as.factor(ill3m),
cd4cat = if (inherits(cd4cat, "haven_labelled")) as_factor(cd4cat) else as.factor(cd4cat),
stunted = as.factor(stunted)
)
head(paed, n=10)
idno | sex | chiv | dob | date_enr | date_out | died | ill3m | stunted | cd4cat |
---|---|---|---|---|---|---|---|---|---|
1001 | Male | HIV exposed | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | No | 1 | NA |
1002 | Female | HIV exposed | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | No | 1 | NA |
1003 | Female | HIV positive | 2003-08-10 | 2006-05-24 | 2010-03-12 | 0 | Yes | 0 | <15% |
1004 | Male | HIV exposed | 2009-01-24 | 2009-03-09 | 2010-03-18 | 0 | No | 0 | NA |
1005 | Female | HIV exposed | 2004-12-23 | 2005-04-18 | 2010-01-25 | 0 | No | 0 | NA |
1006 | Male | HIV exposed | 2008-04-05 | 2008-05-27 | 2010-01-25 | 0 | Yes | 0 | NA |
1007 | Male | HIV exposed | 2002-06-03 | 2002-09-24 | 2010-03-18 | 0 | Yes | 0 | 25%+ |
1008 | Male | HIV exposed | 2007-05-02 | 2007-07-25 | 2009-10-23 | 0 | Yes | 0 | 25%+ |
1009 | Female | HIV exposed | 2007-07-25 | 2007-10-26 | 2009-11-03 | 0 | Yes | 1 | 25%+ |
1010 | Male | HIV positive | 2008-11-16 | 2009-01-09 | 2010-01-13 | 0 | Yes | 0 | NA |
sapply(paed[, c("sex","chiv","ill3m","stunted","cd4cat","dob","date_enr","date_out","died")], class)
#> sex chiv ill3m stunted cd4cat dob date_enr date_out
#> "factor" "factor" "factor" "factor" "factor" "Date" "Date" "Date"
#> died
#> "numeric"
# ---- 2) Derive age at enrolment & person-time ----
<- paed %>%
paed mutate(
age_enrol_years = as.numeric(difftime(date_enr, dob, units = "days")) / 365.25,
overall_survival = as.numeric(difftime(date_out, date_enr, units = "days")), # follow-up in days
overall_survival = ifelse(is.na(overall_survival), NA_real_, pmax(overall_survival, 0)),
person_years = overall_survival / 365.25,
died = as.integer(died)
)head(paed, n=10)
idno | sex | chiv | dob | date_enr | date_out | died | ill3m | stunted | cd4cat | age_enrol_years | overall_survival | person_years |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | Male | HIV exposed | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | No | 1 | NA | 0.1341547 | 33.09961 | 0.0906218 |
1002 | Female | HIV exposed | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | No | 1 | NA | 0.1341547 | 33.09961 | 0.0906218 |
1003 | Female | HIV positive | 2003-08-10 | 2006-05-24 | 2010-03-12 | 0 | Yes | 0 | <15% | 2.7871321 | 1388.00000 | 3.8001369 |
1004 | Male | HIV exposed | 2009-01-24 | 2009-03-09 | 2010-03-18 | 0 | No | 0 | NA | 0.1204654 | 374.00000 | 1.0239562 |
1005 | Female | HIV exposed | 2004-12-23 | 2005-04-18 | 2010-01-25 | 0 | No | 0 | NA | 0.3175907 | 1743.00000 | 4.7720739 |
1006 | Male | HIV exposed | 2008-04-05 | 2008-05-27 | 2010-01-25 | 0 | Yes | 0 | NA | 0.1423682 | 608.00000 | 1.6646133 |
1007 | Male | HIV exposed | 2002-06-03 | 2002-09-24 | 2010-03-18 | 0 | Yes | 0 | 25%+ | 0.3093771 | 2732.00000 | 7.4798084 |
1008 | Male | HIV exposed | 2007-05-02 | 2007-07-25 | 2009-10-23 | 0 | Yes | 0 | 25%+ | 0.2299795 | 821.00000 | 2.2477755 |
1009 | Female | HIV exposed | 2007-07-25 | 2007-10-26 | 2009-11-03 | 0 | Yes | 1 | 25%+ | 0.2546201 | 739.00000 | 2.0232717 |
1010 | Male | HIV positive | 2008-11-16 | 2009-01-09 | 2010-01-13 | 0 | Yes | 0 | NA | 0.1478439 | 369.00000 | 1.0102669 |
# ---- 3) Basic descriptive rates by HIV status (crude) ----
<- paed %>%
rates_chiv filter(!is.na(chiv)) %>%
group_by(chiv) %>%
summarise(
deaths = sum(died == 1, na.rm = TRUE),
person_y = sum(person_years, na.rm = TRUE),
rate_per_1000py = 1000 * deaths / person_y,
.groups = "drop"
) rates_chiv
chiv | deaths | person_y | rate_per_1000py |
---|---|---|---|
HIV exposed | 16 | 1217.7708 | 13.13876 |
HIV positive | 27 | 638.3912 | 42.29381 |
b) Examine the effect of child’s HIV status (chiv) using strate and stcox. What is the HR and 95% CI?
A crude Cox model showed that HIV-positive children had a hazard ratio (HR) of 3.67 (95% CI: 1.98–6.82, p < 0.001) compared with HIV-exposed children. This indicates that HIV-positive children had over three times the hazard of death when compared on the same follow-up time scale.
c) Consider the possible confounding effect of current age on the relationship between child’s HIV status and mortality. Before doing any analyses, would you expect a Cox model controlling for current age to give different results from the one controlling for time in the study?
Before adjusting for age, there was a marked difference in mean age at study entry between HIV groups. HIV-exposed children entered at an average of 0.32 years, whereas HIV-positive children entered at 4.33 years. Since older age can be protective against mortality in this cohort, controlling for age is expected to alter the estimated effect of HIV status.
<- paed %>% filter(!is.na(overall_survival) & !is.na(died) & !is.na(chiv))
cox_dat
# Crude Cox: HIV status only
<- coxph(Surv(overall_survival, died) ~ chiv, data = cox_dat)
cox_crude summary(cox_crude)
#> Call:
#> coxph(formula = Surv(overall_survival, died) ~ chiv, data = cox_dat)
#>
#> n= 590, number of events= 43
#>
#> coef exp(coef) se(coef) z Pr(>|z|)
#> chivHIV positive 1.3009 3.6725 0.3158 4.119 3.81e-05 ***
#> chivHIV neg exposed NA NA 0.0000 NA NA
#> chivHIV neg unexposed NA NA 0.0000 NA NA
#> chivUnknown NA NA 0.0000 NA NA
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> chivHIV positive 3.672 0.2723 1.978 6.82
#> chivHIV neg exposed NA NA NA NA
#> chivHIV neg unexposed NA NA NA NA
#> chivUnknown NA NA NA NA
#>
#> Concordance= 0.654 (se = 0.038 )
#> Likelihood ratio test= 17.7 on 1 df, p=3e-05
#> Wald test = 16.96 on 1 df, p=4e-05
#> Score (logrank) test = 19.48 on 1 df, p=1e-05
d) Are there large differences in age at entry between the HIV groups?
Yes, there are large diffrences in age at entry between the HIV groups. The age range for HIV-exposed children was 0–6.80 years, while for HIV-positive children it was 0–13.18 years. These large differences in baseline age also confirm the potential for confounding, making age adjustment necessary.
<-cox_dat %>% mutate(age_entry = age_enrol_years) # just a readable alias
dat <- dat %>%
age_tab group_by(chiv) %>%
summarise(
n = sum(!is.na(age_entry)),
mean = mean(age_entry, na.rm = TRUE),
sd = sd(age_entry, na.rm = TRUE),
min = min(age_entry, na.rm = TRUE),
max = max(age_entry, na.rm = TRUE),
.groups = "drop"
) age_tab
chiv | n | mean | sd | min | max |
---|---|---|---|---|---|
HIV exposed | 412 | 0.3238306 | 0.6747565 | 0.0000000 | 6.795346 |
HIV positive | 178 | 4.3317106 | 4.1456232 | 0.0054757 | 13.182752 |
e) Now perform a Cox regression controlling for current age to investigate whether age is a confounder
After adjusting for age at entry, the HR for HIV-positive versus HIV-exposed increased to 5.44 (95% CI: 2.78–10.65, p < 0.001). Age itself was inversely associated with mortality (HR per year = 0.88, 95% CI: 0.78–0.99, p = 0.029). This strengthening of the HIV effect after age adjustment suggests negative confounding by age.
Traditional Cox model
<- coxph(Surv(overall_survival, died) ~ chiv + age_entry, data = dat)
cox_fit
summary(cox_fit)
#> Call:
#> coxph(formula = Surv(overall_survival, died) ~ chiv + age_entry,
#> data = dat)
#>
#> n= 590, number of events= 43
#>
#> coef exp(coef) se(coef) z Pr(>|z|)
#> chivHIV positive 1.69303 5.43591 0.34295 4.937 7.94e-07 ***
#> chivHIV neg exposed NA NA 0.00000 NA NA
#> chivHIV neg unexposed NA NA 0.00000 NA NA
#> chivUnknown NA NA 0.00000 NA NA
#> age_entry -0.13096 0.87725 0.06003 -2.181 0.0292 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> exp(coef) exp(-coef) lower .95 upper .95
#> chivHIV positive 5.4359 0.184 2.7756 10.6460
#> chivHIV neg exposed NA NA NA NA
#> chivHIV neg unexposed NA NA NA NA
#> chivUnknown NA NA NA NA
#> age_entry 0.8773 1.140 0.7799 0.9868
#>
#> Concordance= 0.632 (se = 0.05 )
#> Likelihood ratio test= 23.41 on 2 df, p=8e-06
#> Wald test = 24.45 on 2 df, p=5e-06
#> Score (logrank) test = 29.05 on 2 df, p=5e-07
f) With age as the analysis scale, use a Poisson model to assess the relationship between child’s HIV status and mortality. How do the results compare to those from using a Cox model?
Using the Poisson model with log–person-years offset, the estimated rate ratio (RR) for mortality comparing HIV-positive to HIV-exposed children is 5.31 (e^1.669 = 5.31), with a 95% CI of 2.69–10.46 (p = 1.4×10⁻⁶). Age is protective: each additional year at entry is associated with RR = 0.85 (95% CI 0.75–0.96, p = 0.011). These results closely match the age-adjusted Cox model (HR = 5.44, 95% CI 2.78–10.65, p < 0.001; age HR 0.88, 95% CI 0.78–0.99), indicating strong and consistent evidence that HIV-positive status is associated with roughly a five-fold higher mortality, and that older age is associated with lower risk.
Traditional Poisson Model
<- dat %>% filter(!is.na(person_years) & !is.na(died) & !is.na(chiv))
pois_dat <- glm(died ~ chiv +age_entry + offset(log(person_years)), family = poisson(link = "log"), data = pois_dat)
Poisson_Trad summary(Poisson_Trad)
#>
#> Call:
#> glm(formula = died ~ chiv + age_entry + offset(log(person_years)),
#> family = poisson(link = "log"), data = pois_dat)
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) -4.27294 0.25091 -17.030 < 2e-16 ***
#> chivHIV positive 1.66934 0.34598 4.825 1.4e-06 ***
#> age_entry -0.16282 0.06408 -2.541 0.0111 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for poisson family taken to be 1)
#>
#> Null deviance: 348.17 on 589 degrees of freedom
#> Residual deviance: 325.84 on 587 degrees of freedom
#> AIC: 417.84
#>
#> Number of Fisher Scoring iterations: 7
g) In order to make a like-for-like comparison with the Cox model, what would be a more appropriate analysis using Poisson regression?
For a like-for-like comparison with the Cox model, the Poisson regression should be parameterised as a piecewise exponential model, with the baseline hazard split into appropriate time intervals (e.g., age bands) and an offset for log person-time. This allows the Poisson model to mimic the Cox model’s handling of time-to-event data.
h) Use the stsplit (Lexis in R) command to create 1-year age bands of current age from 0 to 5 years, and 5-year bands for children aged over 5. Re-run the Poisson model controlling for current age. How do these results compare with those obtained from the Cox model?
When comparing the traditional Cox model with the Poisson regression using 1-year age bands for children 0–5 years and 5-year bands for older children, the results are directionally consistent but differ in magnitude. Both models show that HIV positivity substantially increases mortality, while older age is protective. In the Cox model, the hazard ratio for HIV positive children is approximately 5.44 (95% CI: 2.78–10.65), whereas the Poisson model with age bands gives a higher incidence rate ratio of about 9.47 (95% CI: 8.82–10.17).
head(pois_dat, n=10)
idno | sex | chiv | dob | date_enr | date_out | died | ill3m | stunted | cd4cat | age_enrol_years | overall_survival | person_years | age_entry |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1001 | Male | HIV exposed | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | No | 1 | NA | 0.1341547 | 33.09961 | 0.0906218 | 0.1341547 |
1002 | Female | HIV exposed | 2010-01-11 | 2010-03-01 | 2010-04-03 | 0 | No | 1 | NA | 0.1341547 | 33.09961 | 0.0906218 | 0.1341547 |
1003 | Female | HIV positive | 2003-08-10 | 2006-05-24 | 2010-03-12 | 0 | Yes | 0 | <15% | 2.7871321 | 1388.00000 | 3.8001369 | 2.7871321 |
1004 | Male | HIV exposed | 2009-01-24 | 2009-03-09 | 2010-03-18 | 0 | No | 0 | NA | 0.1204654 | 374.00000 | 1.0239562 | 0.1204654 |
1005 | Female | HIV exposed | 2004-12-23 | 2005-04-18 | 2010-01-25 | 0 | No | 0 | NA | 0.3175907 | 1743.00000 | 4.7720739 | 0.3175907 |
1006 | Male | HIV exposed | 2008-04-05 | 2008-05-27 | 2010-01-25 | 0 | Yes | 0 | NA | 0.1423682 | 608.00000 | 1.6646133 | 0.1423682 |
1007 | Male | HIV exposed | 2002-06-03 | 2002-09-24 | 2010-03-18 | 0 | Yes | 0 | 25%+ | 0.3093771 | 2732.00000 | 7.4798084 | 0.3093771 |
1008 | Male | HIV exposed | 2007-05-02 | 2007-07-25 | 2009-10-23 | 0 | Yes | 0 | 25%+ | 0.2299795 | 821.00000 | 2.2477755 | 0.2299795 |
1009 | Female | HIV exposed | 2007-07-25 | 2007-10-26 | 2009-11-03 | 0 | Yes | 1 | 25%+ | 0.2546201 | 739.00000 | 2.0232717 | 0.2546201 |
1010 | Male | HIV positive | 2008-11-16 | 2009-01-09 | 2010-01-13 | 0 | Yes | 0 | NA | 0.1478439 | 369.00000 | 1.0102669 | 0.1478439 |
<- Lexis(exit= list(tfe =overall_survival),
poisson exit.status= factor(died,
labels = c("Alive","Dead")),
data = pois_dat)
#> NOTE: entry.status has been set to "Alive" for all.
#> NOTE: entry is assumed to be 0 on the tfe timescale.
# 1. Create age bands (1-year up to 5, then 5-year)
<- c(0:5, seq(10, max(poisson$overall_survival, na.rm=TRUE), by=5))
age_breaks
# 2. Split data by current age
<- splitMulti(poisson,
poisson_split tfe = age_breaks,
time.scale = "overall_survival")
# 3. Create age band factor
$age_band <- cut(poisson_split$age_entry,
poisson_splitbreaks = age_breaks,
right = FALSE,
include.lowest = TRUE)
# 4. Poisson model
<- glm(died ~ chiv + age_band + offset(log(lex.dur)),
poisson_Annual family = "poisreg",
data = poisson_split)
Comparing the traditional Cox model with the poisson regression (1-year age bands of current age from 0 to 5 years, and 5-year bands for children aged over 5)
rbind(ci.exp(cox_fit), ci.exp(poisson_Annual, subset = c("chiv", "age_entry")))
#> exp(Est.) 2.5% 97.5%
#> chivHIV positive 5.4359084 2.775594 10.6460459
#> chivHIV neg exposed 1.0000000 1.000000 1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000 1.0000000
#> chivUnknown 1.0000000 1.000000 1.0000000
#> age_entry 0.8772548 0.779876 0.9867927
#> chivHIV positive 9.4718927 8.819615 10.1724110
i) Does age confound the relationship between HIV status and mortality?
Yes. Age confounds the HIV–mortality relationship because it is associated with both HIV status and mortality; after adjustment, HIV positive children still had a much higher risk of death (HR ≈ 5.4), but age showed an independent protective effect (HR ≈ 0.88).
j) Redo part (h) and (i) to refit another Poisson model but splitting time at every event?
When comparing the traditional Cox proportional hazards model with the Poisson regression approach (where follow-up time is split at each event to approximate the partial likelihood), the results are nearly identical. In both models, HIV-positive status is strongly associated with an increased risk of mortality, with hazard/IRR estimates around 5.4 (95% CI ≈ 2.8–10.6). Similarly, age at entry is protective, with an effect estimate of about 0.88 (95% CI ≈ 0.78–0.99), meaning each additional year of age is associated with a ~12% reduction in risk.
Spliting data into small intervals
<- splitMulti(poisson, tfe = c(0, sort(unique(poisson$overall_survival))))
poisson.s summary(poisson.s)
#>
#> Transitions:
#> To
#> From Alive Dead Records: Events: Risk time: Persons:
#> Alive 137287 43 137330 43 677963.2 590
<- glm(cbind(lex.Xst == "Dead", lex.dur) ~ 0 + factor(tfe) + chiv + age_entry, family = "poisreg", data = poisson.s) mLs.pois.fc
Comparing the traditional Cox model with the poisson regression (splitting time at every event)
rbind(ci.exp(cox_fit), ci.exp(mLs.pois.fc, subset = c("chiv", "age_entry")))
#> exp(Est.) 2.5% 97.5%
#> chivHIV positive 5.4359084 2.7755939 10.6460459
#> chivHIV neg exposed 1.0000000 1.0000000 1.0000000
#> chivHIV neg unexposed 1.0000000 1.0000000 1.0000000
#> chivUnknown 1.0000000 1.0000000 1.0000000
#> age_entry 0.8772548 0.7798760 0.9867927
#> chivHIV positive 5.4337909 2.7745567 10.6417304
#> age_entry 0.8772983 0.7799259 0.9868274
k) Bonus: Fit an appropriate Parametric model or Frailty model whichever is most suitable and distinguish how you will interpret the results for a single categorical and also a single continuous variable.
Appropriate Parametric Model
Among the candidate models fitted, the Gompertz parametric survival model was the most suitable, as it had the lowest AIC (875.0) and BIC, indicating the best balance of fit and parsimony. Frailty models were not required, as there was no indication of clustering or unmeasured shared risk within subgroups.
Interpretation of a Categorical Variable (HIV Status)
For the categorical predictor HIV status, the Gompertz model estimated that HIV-positive children had a hazard of death approximately 5.46 times higher than HIV-negative children (HR = 5.46, 95% CI: 2.78–10.7, p < 0.001). This interpretation means that, holding other factors constant, HIV infection is associated with a markedly elevated risk of mortality throughout follow-up. In practical terms, HIV infection substantially accelerates progression to death compared with uninfected children.
Interpretation of a Continuous Variable (Age at Enrolment)
For the continuous predictor age at enrolment, the Gompertz model estimated a hazard ratio of 0.88 per year (95% CI: 0.78–0.99, p = 0.03). This indicates that each additional year of age at enrolment is associated with about a 12% reduction in the hazard of death. In other words, older children at the time of enrolment are at lower risk of mortality compared with younger children, suggesting age acts as a protective factor.
Running 6 parameric survival models in flexsurvreg to choose the model that fits the data well
<- Surv(overall_survival, died) ~ chiv + age_enrol_years
s_object
<- list(
models expo = try(flexsurvreg(s_object, data = paed, dist = "exponential"), silent = TRUE),
weib = try(flexsurvreg(s_object, data = paed, dist = "weibull"), silent = TRUE),
gomp = try(flexsurvreg(s_object, data = paed, dist = "gompertz"), silent = TRUE),
lnorm = try(flexsurvreg(s_object, data = paed, dist = "lnorm"), silent = TRUE),
llog = try(flexsurvreg(s_object, data = paed, dist = "llogis"), silent = TRUE),
ggamma = try(flexsurvreg(s_object, data = paed, dist = "gengamma"), silent = TRUE)
)
Extracting AIC and BIC values for ranking the best model
<- function(m, n) {
extract_ic if (inherits(m, "try-error")) return(c(AIC = NA, BIC = NA))
<- length(m$res[, "est"]) # number of parameters
k <- m$loglik
ll <- m$AIC
aic <- -2 * ll + k * log(n)
bic c(AIC = aic, BIC = bic)
}
<- nrow(paed) # sample size
n
<- t(sapply(models, extract_ic, n = n))
ic_values
# Convert to data.frame for easy sorting
<- as.data.frame(ic_values)
ic_df $Model <- rownames(ic_df)
ic_df
# Rankings
<- ic_df[order(ic_df$AIC), c("Model", "AIC")]
aic_rank <- ic_df[order(ic_df$BIC), c("Model", "BIC")]
bic_rank
print(aic_rank)
#> Model AIC
#> gomp gomp 875.0119
#> ggamma ggamma 880.0386
#> expo expo NA
#> weib weib NA
#> lnorm lnorm NA
#> llog llog NA
print(bic_rank)
#> Model BIC
#> gomp gomp 892.5324
#> ggamma ggamma 901.9392
#> expo expo NA
#> weib weib NA
#> lnorm lnorm NA
#> llog llog NA
# Define time grid for plotting
<- seq(0, max(paed$overall_survival, na.rm = TRUE), length.out = 200)
time_grid
# Function to extract hazard estimates from a fitted flexsurvreg model
<- function(model, model_name, time_grid) {
get_hazard_df if (inherits(model, "try-error")) return(NULL)
<- summary(model, type = "hazard", t = time_grid, ci = FALSE)
pred data.frame(
time = time_grid,
hazard = pred[[1]]$est,
model = model_name
)
}
# Collect hazards from your models
<- list(
haz_list get_hazard_df(models$gomp, "Gompertz", time_grid),
get_hazard_df(models$ggamma, "GenGamma", time_grid)
# Add others if they converge, e.g. Weibull, Lognormal, Log-logistic
)
<- bind_rows(haz_list)
haz_df
# Plot with ggplot
ggplot(haz_df, aes(x = time, y = hazard, colour = model)) +
geom_line(size = 1.2) +
labs(
title = "Hazard Functions from Parametric Models",
x = "Time (years)",
y = "Hazard Rate"
+
) theme_minimal(base_size = 14) +
theme(legend.position = "top")
Running the best model based on AIC and BIC
<- flexsurvreg(s_object, data = paed, dist = "gompertz")
Gompertz_model
Gompertz_model#> Call:
#> flexsurvreg(formula = s_object, data = paed, dist = "gompertz")
#>
#> Estimates:
#> data mean est L95% U95% se
#> shape NA -1.68e-03 -2.23e-03 -1.12e-03 2.84e-04
#> rate NA 9.78e-05 5.50e-05 1.74e-04 2.87e-05
#> chivHIV positive 3.02e-01 1.70e+00 1.02e+00 2.37e+00 3.43e-01
#> age_enrol_years 1.53e+00 -1.32e-01 -2.50e-01 -1.39e-02 6.03e-02
#> exp(est) L95% U95%
#> shape NA NA NA
#> rate NA NA NA
#> chivHIV positive 5.46e+00 2.78e+00 1.07e+01
#> age_enrol_years 8.76e-01 7.79e-01 9.86e-01
#>
#> N = 590, Events: 43, Censored: 547
#> Total time at risk: 677963.2
#> Log-likelihood = -433.5059, df = 4
#> AIC = 875.0119
Part III
Across all the modeling frameworks applied, the Cox proportional hazards model on the Lexis dataset, the time-split Cox model, the Poisson regression under a piecewise exponential formulation, the parametric Gompertz survival model, and the flexible additive hazard model (GAMM) using R2BayesX , the results were consistent.
In each case, HIV status emerged as the strongest predictor of mortality, with HIV-positive children having a markedly higher hazard of death compared to exposed or unexposed peers. This was confirmed by hazard ratios greater than 5 in both the Cox and GAMM models. Similarly, age at enrolment consistently showed a protective effect, with older children experiencing reduced mortality risk across all models, regardless of whether age was modeled linearly (Cox, Poisson, Gompertz) or flexibly through smoothing (GAMM).
The flexible GAMM added value by allowing the hazard to vary smoothly with survival time, yet the smooth term did not reveal substantial deviations from proportionality, suggesting that the Cox model’s proportional hazards assumption was adequate. Importantly, the GAMM’s parametric coefficients aligned closely with the Cox and Gompertz estimates, reinforcing the robustness of the conclusions.
The consistency also highlights that the substantive conclusions are not sensitive to the chosen modeling strategy, although the GAMM provided additional reassurance by flexibly checking for non-linearities and time-varying effects.
Comparing the traditionalCox model with the Cox(splitting time at every event), and Cox (Lexis dataset - unsplit)
Cox-Unsplit
<- coxph(Surv(tfe, tfe+lex.dur,lex.Xst == "Dead") ~ chiv+ age_enrol_years ,
Cox_lexis eps = 10^-11,iter.max = 25, data =poisson)
Cox-Split at every time interval
<- coxph(Surv(tfe, tfe+lex.dur, lex.Xst == "Dead")~ chiv+ age_enrol_years ,
Cox_timeSpliteps = 10^-11,iter.max = 25, data =poisson.s)
rbind(ci.exp(cox_fit),
ci.exp(Cox_lexis),
ci.exp(Cox_timeSplit, subset = c("chiv", "age_enrol_years ")))
#> exp(Est.) 2.5% 97.5%
#> chivHIV positive 5.4359084 2.775594 10.6460459
#> chivHIV neg exposed 1.0000000 1.000000 1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000 1.0000000
#> chivUnknown 1.0000000 1.000000 1.0000000
#> age_entry 0.8772548 0.779876 0.9867927
#> chivHIV positive 5.4359084 2.775594 10.6460459
#> chivHIV neg exposed 1.0000000 1.000000 1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000 1.0000000
#> chivUnknown 1.0000000 1.000000 1.0000000
#> age_enrol_years 0.8772548 0.779876 0.9867927
#> chivHIV positive 5.4359084 2.775594 10.6460459
#> chivHIV neg exposed 1.0000000 1.000000 1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000 1.0000000
#> chivUnknown 1.0000000 1.000000 1.0000000
A flexible GAMM model
<- bayesx(died ~ chiv + age_enrol_years +
gam_fit sx(overall_survival, bs = "bl"),
family = "cox",
data = paed,
method = "REML"
)
summary(gam_fit)
#> Call:
#> bayesx(formula = died ~ chiv + age_enrol_years + sx(overall_survival,
#> bs = "bl"), data = paed, family = "cox", method = "REML")
#>
#> Fixed effects estimation results:
#>
#> Parametric coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -11.2382 0.2609 -43.0723 <2e-16 ***
#> chivHIV positive 1.6973 0.3432 4.9452 <2e-16 ***
#> age_enrol_years -0.1315 0.0602 -2.1843 0.0293 *
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Smooth terms:
#> Variance Smooth Par. df Stopped
#> sx(overall_survival) 0.0005 1993.4700 1.0742 0
#>
#> N = 590 df = 4.07418 AIC = 875.595 BIC = 893.441
#> logLik = -433.7235 method = REML family = cox
plot(gam_fit, resid = TRUE, cex.resid = 0.1)
Model equations of the models fitted
1. Traditional Cox Proportional Hazards Model
\[ h(t \mid X) = h_0(t)\exp(\beta^\top X) \]
2. Piecewise Cox model (time split lexis data)
\[ \lambda(t \mid X) = \lambda_0^{(k)} \exp(\beta^\top X), \quad \text{for } t \in (\tau_{k-1}, \tau_k] \]
3. Traditional Poisson model
\[ \mu_i = Y_i \cdot \exp(\alpha_k + \beta^\top X_i) \]
4. Piecewise Poisson model (time split lexis data)
\[ \lambda_i = \frac{\mu_i}{Y_i} = \exp\left(\alpha_{k(i)} + \beta^\top X_i\right) \]
5. GAMM
\[ h(t \mid X) = h_0(t)\exp\left(\beta^\top X + f(t)\right) \]
6. Gompertz model
\[ h(t \mid X) = \lambda \exp(\gamma t)\exp(\beta^\top X) \]
7. Weibull
PH model
\[ h(t \mid X) = \lambda_k t^{k-1} \exp(\beta^\top X) \]