library(haven)      # read Stata .dta
library(dplyr)
library(lubridate)
library(survival)
library(survminer)  # ggsurvplot
library(broom)      # tidy()
library(epitools)  
library(gtsummary)
library(gt)
library(mice)
library(readr)
library(Epi)
library(flexsurv)
library(Epi)
library(mgcv)
library(popEpi)
library(R2BayesX)
library(speedglm)

Part I

setwd("C:/Users/VUSI/Downloads/BIO3")
dat <- read_dta("trinmlsh.dta")
head(dat, n=10)

ethgp	ageent	death	smokenum	hdlc	diabp	sysbp	chdstart	days	years	bmi	id	timein	timeout	y	timebth
2	62.984	1	0	0.767	120	198	NA	1409	3.857632	NA	62	1977-01-01	1980-11-10	3.857632	1914-01-07
3	60.553	0	5	NA	66	150	0	1526	4.177960	NA	166	1977-01-01	1981-03-07	4.177960	1916-06-13
5	67.617	1	1	0.563	81	135	0	367	1.004791	20.58	68	1977-01-01	1978-01-03	1.004791	1909-05-20
4	60.424	0	1	0.684	90	170	1	3494	9.566050	26.24	49	1977-01-01	1986-07-27	9.566050	1916-07-30
1	71.723	0	1	1.099	98	158	0	2546	6.970568	23.40	287	1977-01-01	1983-12-22	6.970568	1905-04-12
1	69.051	0	1	NA	64	112	NA	3452	9.451061	NA	69	1977-01-01	1986-06-15	9.451061	1907-12-14
1	67.578	0	0	0.850	80	126	0	3123	8.550308	19.73	139	1977-01-01	1985-07-21	8.550308	1909-06-04
3	61.895	0	0	0.850	80	115	0	3262	8.930869	15.31	109	1977-01-01	1985-12-07	8.930869	1915-02-08
1	62.316	0	0	0.979	91	129	0	3416	9.352498	23.74	81	1977-01-01	1986-05-10	9.352498	1914-09-08
1	68.950	0	0	0.748	83	145	0	3207	8.780288	26.36	153	1977-01-01	1985-10-13	8.780288	1908-01-20

summary(dat)
#>      ethgp           ageent          death           cvdeath       
#>  Min.   :1.000   Min.   :60.01   Min.   :0.0000   Min.   :0.00000  
#>  1st Qu.:1.000   1st Qu.:62.39   1st Qu.:0.0000   1st Qu.:0.00000  
#>  Median :1.000   Median :64.59   Median :0.0000   Median :0.00000  
#>  Mean   :2.104   Mean   :65.09   Mean   :0.2767   Mean   :0.06918  
#>  3rd Qu.:3.000   3rd Qu.:67.67   3rd Qu.:1.0000   3rd Qu.:0.00000  
#>  Max.   :5.000   Max.   :73.93   Max.   :1.0000   Max.   :1.00000  
#>                                                                    
#>       alc           smokenum          hdlc           diabp       
#>  Min.   :0.000   Min.   :0.000   Min.   :0.185   Min.   : 55.00  
#>  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.748   1st Qu.: 80.00  
#>  Median :1.000   Median :1.000   Median :0.924   Median : 90.00  
#>  Mean   :1.031   Mean   :1.388   Mean   :0.988   Mean   : 91.09  
#>  3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:1.201   3rd Qu.:100.00  
#>  Max.   :3.000   Max.   :5.000   Max.   :2.411   Max.   :159.00  
#>                  NA's   :1       NA's   :17      NA's   :4       
#>      sysbp          chdstart          days          years        
#>  Min.   : 95.0   Min.   :0.000   Min.   :   6   Min.   :0.01643  
#>  1st Qu.:135.0   1st Qu.:0.000   1st Qu.:2107   1st Qu.:5.76934  
#>  Median :150.0   Median :0.000   Median :2804   Median :7.67830  
#>  Mean   :154.8   Mean   :0.131   Mean   :2532   Mean   :6.93251  
#>  3rd Qu.:170.0   3rd Qu.:0.000   3rd Qu.:3232   3rd Qu.:8.85010  
#>  Max.   :243.0   Max.   :1.000   Max.   :3579   Max.   :9.79877  
#>  NA's   :4       NA's   :28                                      
#>       bmi              id             timein              timeout          
#>  Min.   :11.25   Min.   :  1.00   Min.   :1977-01-01   Min.   :1977-01-07  
#>  1st Qu.:20.27   1st Qu.: 80.25   1st Qu.:1977-01-01   1st Qu.:1982-10-09  
#>  Median :23.21   Median :159.50   Median :1977-01-01   Median :1984-09-05  
#>  Mean   :23.25   Mean   :159.50   Mean   :1977-01-01   Mean   :1983-12-08  
#>  3rd Qu.:26.10   3rd Qu.:238.75   3rd Qu.:1977-01-01   3rd Qu.:1985-11-07  
#>  Max.   :36.38   Max.   :318.00   Max.   :1977-01-01   Max.   :1986-10-20  
#>  NA's   :35                                                                
#>        y              timebth          
#>  Min.   :0.01643   Min.   :1903-01-27  
#>  1st Qu.:5.76934   1st Qu.:1909-04-30  
#>  Median :7.67830   Median :1912-05-31  
#>  Mean   :6.93251   Mean   :1911-11-28  
#>  3rd Qu.:8.85010   3rd Qu.:1914-08-13  
#>  Max.   :9.79877   Max.   :1916-12-30  
#> 
unique(dat$smokenum)
#> [1]  0  5  1  3  2  4 NA

# --- Step C: Recode smoking categories ---
dat <- dat %>%
  mutate(
    smoke_cat = case_when(
      smokenum == 0 ~ "non-smoker",
      smokenum == 1 ~ "ex-smoker",
      smokenum >= 2 ~ "current-smoker",
      TRUE ~ NA_character_  # any unusual values (like 5) become NA
    ),
    smoke_cat = factor(smoke_cat, 
                       levels = c("non-smoker", "ex-smoker", "current-smoker"))
  )
saveRDS(dat, "data_new.rds")

c) Examine the effect of smoking on all-cause mortality using strate and stcox. Hint: You may wish to recode smokenum into a smaller number of categories, say: 0 = non-smoker, 1 = ex-smoker, 2 = current smoker.

When mortality rates were examined by smoking category, a graded increase was observed from non-smokers (30.48 deaths per 1,000 person-years) to ex-smokers (39.00 per 1,000 person-years) and current smokers (53.40 per 1,000 person-years).

Poisson regression using non-smokers as the reference group indicated that current smokers had a statistically significant 75% higher mortality rate (rate ratio [RR] = 1.75; 95% CI: 1.09–2.39; p = 0.020), while the increase for ex-smokers was smaller and not statistically significant (RR = 1.28; 95% CI: 0.71–2.10; p = 0.41).

d) Using the rates and rate ratios in part (c), can you see a trend in the relationship between smoking category and all-cause mortality? How would you assess this more formally? How would you present the results from your analyses in parts (c) and (d)?

Based on the crude rates, there is a clear increase in all-cause mortality from non-smokers (30.5 deaths/1,000 person-years) to ex-smokers (39.0/1,000 person-years) and current smokers (53.4/1,000 person-years).

To formally assess this trend, a Poisson regression model was fitted with smoking category (0 = non-smoker, 1 = ex-smoker, 2 = current smoker) and person-time as the offset. This approach yields incidence rate ratios (IRRs) with 95% confidence intervals, allowing hypothesis testing for both overall differences between categories. I also used kaplan meire plots and log rank test to see if there’s a difference in survival probabilities between the categories.

The results were presented as a table showing the Poisson regression IRRs, CIs, and p-values, providing both descriptive and formal statistical evidence of the association. I also supplemented this by a forest plot to visually convey the magnitude and direction of associations.


# --- crude incidence rates (deaths / person-years) by smoking category ---
rates_tab <- dat %>%
  group_by(smoke_cat) %>%
  summarize(
    deaths = sum(death, na.rm=TRUE),
    person_years = sum(y, na.rm=TRUE)
  ) %>%
  mutate(
    rate_per_1000py = deaths / person_years * 1000
  )
head(rates_tab, n=10)

smoke_cat	deaths	person_years	rate_per_1000py
non-smoker	30	984.407939	30.47517
ex-smoker	18	461.593432	38.99536
current-smoker	40	749.015740	53.40342
NA	0	9.522245	0.00000

poisson_crude <- glm(death ~ smoke_cat, offset = log(y),
                     family = poisson(link = "log"), data = dat)

summary(poisson_crude)
#> 
#> Call:
#> glm(formula = death ~ smoke_cat, family = poisson(link = "log"), 
#>     data = dat, offset = log(y))
#> 
#> Coefficients:
#>                         Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)              -3.4908     0.1826 -19.120   <2e-16 ***
#> smoke_catex-smoker        0.2465     0.2981   0.827   0.4083    
#> smoke_catcurrent-smoker   0.5610     0.2415   2.323   0.0202 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for poisson family taken to be 1)
#> 
#>     Null deviance: 395.44  on 316  degrees of freedom
#> Residual deviance: 389.95  on 314  degrees of freedom
#>   (1 observation deleted due to missingness)
#> AIC: 571.95
#> 
#> Number of Fisher Scoring iterations: 7

# Create gtsummary table with bold formatting
tbl_crude <- tbl_regression(
  poisson_crude,  # Make sure this matches your object name
  exponentiate = TRUE,    # Exponentiate to get Rate Ratios
  estimate_fun = ~sprintf("%.2f", .x),  # Format RR to 2 decimals
  pvalue_fun = ~sprintf("%.3f", .x)    # Format p-value to 3 decimals
) %>%
  # Bold formatting for labels
  bold_labels() %>%
  # Modify headers
  modify_header(
    label ~ "**Characteristics**",
    estimate ~ "**IRR**",
    std.error ~ "**SE**",
    p.value ~ "**p-value**"
  ) %>%
  # Add spanning header
  modify_spanning_header(
    c(estimate, std.error, ci) ~ "**Poisson Regression Model Estimates**"
  ) %>%
  # Convert to gt and add title/source note
  as_gt() %>%
  gt::tab_header(
    title = gt::md("**Rate Ratios for All-Cause Mortality by Smoking Status**"),
    subtitle = gt::md("Poisson regression with offset for person-years")
  ) %>%
  gt::tab_source_note(
    source_note = gt::md("Data updated August by Vusi, 2025")
  )

tbl_crude

Rate Ratios for All-Cause Mortality by Smoking Status
Poisson regression with offset for person-years
Characteristics	Poisson Regression Model Estimates		95% CI	p-value
Characteristics	IRR	SE	95% CI	p-value
smoke_cat
non-smoker	—	—	—
ex-smoker	1.28	0.298	0.70, 2.27	0.408
current-smoker	1.75	0.242	1.09, 2.83	0.020
Abbreviations: CI = Confidence Interval, IRR = Incidence Rate Ratio, SE = Standard Error
Data updated August by Vusi, 2025

library(ggplot2)
library(dplyr)
library(glue)

# Create forest plot data with color categories
forest_data <- data.frame(
  term = c("Ex-smoker vs Non-smoker", "Current-smoker vs Non-smoker"),
  estimate = c(0.246530, 0.5610),
  std.error = c(0.298142, 0.2415),
  p.value = c(0.408, 0.0202)
) %>%
  mutate(
    # Calculate RR and CI
    RR = exp(estimate),
    CI_lower = exp(estimate - 1.96 * std.error),
    CI_upper = exp(estimate + 1.96 * std.error),
    
    # Create color groups (using term as color variable)
    color_group = term,
    
    # Format labels
    term = factor(term, levels = rev(term)),
    label = glue("{term}\nRR = {sprintf('%.2f', RR)} (95% CI: {sprintf('%.2f', CI_lower)}-{sprintf('%.2f', CI_upper)})\np = {ifelse(p.value < 0.001, '<0.001', sprintf('%.3f', p.value))}"),
    
    # Position for text labels (right side)
    text_x = 3.5
  )

# Custom color palette
term_colors <- c("Ex-smoker vs Non-smoker" = "#4E79A7",  # Blue
                 "Current-smoker vs Non-smoker" = "#E15759")  # Red

# Create the forest plot
ggplot(forest_data, aes(x = RR, y = term, color = color_group)) +
  # Reference line
  geom_vline(xintercept = 1, linetype = "dashed", color = "gray50", linewidth = 0.8) +
  
  # Point estimates with CIs
  geom_pointrange(aes(xmin = CI_lower, xmax = CI_upper), 
                 size = 1, linewidth = 1.2, fatten = 2.5) +
  
  # Text labels (acting as legend)
  geom_text(aes(x = text_x, label = label, color = color_group),
            hjust = 0, vjust = 0.5, size = 3.8, show.legend = FALSE) +
  
  # Scale adjustments
  scale_x_continuous(
    trans = "log",
    breaks = c(0.5, 0.75, 1, 1.5, 2, 3),
    limits = c(0.4, 5),  # Extended limit for text
    expand = expansion(mult = c(0, 0.1))
  ) +
  
  # Apply custom colors
  scale_color_manual(values = term_colors) +
  
  # Labels and titles
  labs(
    title = "Smoking Status Mortality Risk",
    subtitle = "Poisson regression with person-years offset",
    x = "IRR with 95% Confidence Interval",
    y = NULL,
    caption = glue("Null deviance: 280.09, Residual deviance: 279.37\n",
                  "AIC: 395.37 | 81 observations excluded due to missingness")
  ) +
  
  # Theme customization
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
    plot.subtitle = element_text(hjust = 0.5, size = 11),
    axis.text.y = element_blank(),  # Remove y-axis text since we have labels
    axis.title.x = element_text(size = 11),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.position = "none",  # We're using text as legend
    plot.caption = element_text(size = 9, hjust = 0),
    plot.margin = margin(1, 8, 1, 1, "cm")  # Increased right margin for text
  ) +
  
  # Ensure nothing gets clipped
  coord_cartesian(clip = "off")


# Save with proper dimensions
ggsave("forest_plot.png", width = 11, height = 5, dpi = 300)

Measuring trend using a Kaplan Meier plot

# --- Kaplan-Meier curves and log-rank test (strate analogue) ---
surv_obj <- with(dat, Surv(time = y, event = death))


# KM by smoking
fit_km <- survfit(surv_obj ~ smoke_cat, data = dat)
ggsurvplot(fit_km, data = dat,
           risk.table = F,
           conf.int = T,
           pval = TRUE,
           xlab = "Follow-up time (years)",
           legend.title = "Smoking")

e) Possible confounding effect of current age on the relationship between smoking and mortality

The mean ages at study entry were very similar across smoking groups (64.71–65.60 years), with overlapping age ranges of approximately 60–73 years. This indicates no large differences in baseline age, so adjusting for current age is unlikely to substantially change the smoking–mortality association compared with using follow-up time as the analysis scale. So age is not a possible confounding factor between smoking and and mortality.

# --- Step E: Check age distribution by smoking category ---
dat %>%
  group_by(smoke_cat) %>%
  summarise(
    n = n(),
    mean_age_entry = mean(ageent, na.rm = TRUE),
    sd_age_entry = sd(ageent, na.rm = TRUE),
    min_age_entry = min(ageent, na.rm = TRUE),
    max_age_entry = max(ageent, na.rm = TRUE)
  )

smoke_cat	n	mean_age_entry	sd_age_entry	min_age_entry	max_age_entry
non-smoker	140	65.12610	3.203284	60.077	72.016
ex-smoker	68	65.60484	3.571255	60.093	73.467
current-smoker	109	64.71269	2.902213	60.005	73.930
NA	1	67.24400	NA	67.244	67.244


# Visual check: density plot of baseline age by smoking category using facet_grid
library(ggplot2)
ggplot(dat, aes(x = ageent)) + # Removed color/fill from aes() as faceting separates them
  geom_density(alpha = 0.5, fill = "pink") + # Added a common fill color
  labs(x = "Age at study entry", y = "Density", title = "Age Distribution by Smoking Category") +
  facet_grid(. ~ smoke_cat, scales = "free_y") + # Faceting by smoke_cat in columns, free y-scales
  theme_minimal()

f) What was the age range at study entry (ageent) in each smoking group? Are there large differences in age between the groups?

The age range at study entry was 60–72 years for non-smokers, 60–73 years for ex-smokers, and 60–73 years for current smokers. Mean ages were 65.13, 65.60, and 64.71 years respectively, indicating no large age differences between groups. Since baseline ages are similar, using follow-up time as the analysis scale will not result in comparing men of very different actual ages, although minor age variation still warrants adjustment.

# --- Step F: If follow-up time is the analysis scale,
# are we comparing men of very different ages?
dat %>%
  group_by(smoke_cat) %>%
  summarise(
    min_age = min(ageent, na.rm = TRUE),
    max_age = max(ageent, na.rm = TRUE),
    mean_age = mean(ageent, na.rm = TRUE),
    sd_age = sd(ageent, na.rm = TRUE),
    n = n()
  )

smoke_cat	min_age	max_age	mean_age	sd_age	n
non-smoker	60.077	72.016	65.12610	3.203284	140
ex-smoker	60.093	73.467	65.60484	3.571255	68
current-smoker	60.005	73.930	64.71269	2.902213	109
NA	67.244	67.244	67.24400	NA	1

g) Cox regression and assumptions

After adjusting for age at study entry, there is no statistically significant difference in all-cause mortality risk between ex-smokers or current smokers and non-smokers (HR = 1.05 per year older; p = 0.12).

No strong evidence of PH violation overall (global p = 0.156). Smoking shows borderline evidence of time-varying effects (p = 0.076), and visual inspection of Schoenfeld residual plots confirm this (figures below).

# --- Step G: Age-adjusted Cox model ---
surv_obj <- with(dat, Surv(y, death))

cox_age_adj <- coxph(surv_obj ~ smoke_cat + ageent, data = dat)
summary(cox_age_adj)
#> Call:
#> coxph(formula = surv_obj ~ smoke_cat + ageent, data = dat)
#> 
#>   n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#>                            coef exp(coef) se(coef)     z Pr(>|z|)  
#> smoke_catex-smoker      0.23028   1.25896  0.29904 0.770   0.4413  
#> smoke_catcurrent-smoker 0.57251   1.77271  0.24180 2.368   0.0179 *
#> ageent                  0.05228   1.05367  0.03377 1.548   0.1216  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#>                         exp(coef) exp(-coef) lower .95 upper .95
#> smoke_catex-smoker          1.259     0.7943    0.7006     2.262
#> smoke_catcurrent-smoker     1.773     0.5641    1.1036     2.847
#> ageent                      1.054     0.9491    0.9862     1.126
#> 
#> Concordance= 0.569  (se = 0.029 )
#> Likelihood ratio test= 7.82  on 3 df,   p=0.05
#> Wald test            = 7.75  on 3 df,   p=0.05
#> Score (logrank) test = 7.88  on 3 df,   p=0.05

# --- PH assumption check for adjusted model ---
zph_age_adj <- cox.zph(cox_age_adj)
zph_age_adj
#>           chisq df     p
#> smoke_cat 5.159  2 0.076
#> ageent    0.035  1 0.852
#> GLOBAL    5.225  3 0.156
plot(zph_age_adj)

H to J - Whitehal_new.dta

whitehal <- read_dta("whitehal_new.dta")
head(whitehal, n=10)

id	all	chd	sbp	smok	agein	grade	sbpgrp	timein	timeout	timebth	cholgrp	chol
5001	0	0	120	4	47.48255	1	2	1967-09-13	1987-01-30	1920-03-20	4	273
5019	0	0	118	3	56.10678	0	1	1967-09-13	1987-01-30	1911-08-06	3	234
5038	0	0	147	3	51.44422	0	3	1967-09-13	1987-01-30	1916-04-03	4	295
5039	0	0	92	1	51.11567	0	1	1967-09-13	1987-01-30	1916-08-02	3	210
5042	1	0	128	2	55.48528	0	2	1967-09-13	1983-04-12	1912-03-20	4	287
5052	1	0	109	2	54.43669	0	1	1967-09-13	1984-07-10	1913-04-07	3	209
5064	1	0	145	1	46.37645	0	3	1967-09-15	1984-05-13	1921-04-30	4	262
5078	1	1	144	2	47.72348	0	3	1967-09-15	1983-11-29	1919-12-25	4	268
5089	0	0	154	1	47.92608	0	3	1967-09-15	1987-01-30	1919-10-11	2	187
5090	0	0	114	1	59.45790	0	1	1967-09-15	1987-01-30	1908-03-31	3	222

# packages
library(haven); library(dplyr); library(survival); library(survminer); library(broom)

# assume your data is in `wh` (from read_dta)
# wh <- read_dta("whitehal_new.dta")

# Clean labels → factors, build survival time (years)
wh2 <- whitehal %>%
  mutate(
    grade = haven::as_factor(grade, levels = "labels"),   # 1=high, 2=low
    smok  = haven::as_factor(smok, levels = "labels"),
    sbpgrp= haven::as_factor(sbpgrp, levels = "labels"),
    cholgrp = haven::as_factor(cholgrp, levels = "labels"),
    fu_years = as.numeric(timeout - timein) / 365.25,
    chd = as.integer(chd) # ensure 0/1
  )

# quick checks
summary(dplyr::

          select(wh2, fu_years, chd, grade, agein, sbp, chol, smok))
#>     fu_years            chd                     grade          agein      
#>  Min.   : 0.1506   Min.   :0.00000   High job grade:1194   Min.   :40.05  
#>  1st Qu.:17.1636   1st Qu.:0.00000   Low job grade : 483   1st Qu.:46.77  
#>  Median :17.9602   Median :0.00000                         Median :51.79  
#>  Mean   :16.4612   Mean   :0.09183                         Mean   :52.10  
#>  3rd Qu.:18.5626   3rd Qu.:0.00000                         3rd Qu.:57.48  
#>  Max.   :19.3812   Max.   :1.00000                         Max.   :69.47  
#>                                                                           
#>       sbp             chol                  smok    
#>  Min.   : 87.0   Min.   : 75.0   Never smoked :317  
#>  1st Qu.:120.0   1st Qu.:165.0   Ex-smoker    :646  
#>  Median :133.0   Median :192.0   1-14 cig/day :310  
#>  Mean   :135.8   Mean   :196.4   15-24 cig/day:279  
#>  3rd Qu.:148.0   3rd Qu.:225.0   25+ cig/day  :125  
#>  Max.   :252.0   Max.   :480.0                      
#>                  NA's   :26
table(wh2$grade, useNA = "ifany")
#> 
#> High job grade  Low job grade 
#>           1194            483
head(wh2, n= 10)

id	all	chd	sbp	smok	agein	grade	sbpgrp	timein	timeout	timebth	cholgrp	chol	fu_years
5001	0	0	120	15-24 cig/day	47.48255	Low job grade	120-139	1967-09-13	1987-01-30	1920-03-20	250+	273	19.38123
5019	0	0	118	1-14 cig/day	56.10678	High job grade	<120	1967-09-13	1987-01-30	1911-08-06	200-249	234	19.38123
5038	0	0	147	1-14 cig/day	51.44422	High job grade	140-159	1967-09-13	1987-01-30	1916-04-03	250+	295	19.38123
5039	0	0	92	Never smoked	51.11567	High job grade	<120	1967-09-13	1987-01-30	1916-08-02	200-249	210	19.38123
5042	1	0	128	Ex-smoker	55.48528	High job grade	120-139	1967-09-13	1983-04-12	1912-03-20	250+	287	15.57837
5052	1	0	109	Ex-smoker	54.43669	High job grade	<120	1967-09-13	1984-07-10	1913-04-07	200-249	209	16.82141
5064	1	0	145	Never smoked	46.37645	High job grade	140-159	1967-09-15	1984-05-13	1921-04-30	250+	262	16.66248
5078	1	1	144	Ex-smoker	47.72348	High job grade	140-159	1967-09-15	1983-11-29	1919-12-25	250+	268	16.20801
5089	0	0	154	Never smoked	47.92608	High job grade	140-159	1967-09-15	1987-01-30	1919-10-11	150-199	187	19.37842
5090	0	0	114	Never smoked	59.45790	High job grade	<120	1967-09-15	1987-01-30	1908-03-31	200-249	222	19.37842

i) Cox model: effect of job grade (follow-up time scale)

Using follow-up time as the analysis scale, the unadjusted Cox regression (output below) showed that participants in the low job grade had more than twice the hazard of CHD mortality compared with those in the high job grade (HR = 2.04; 95% CI: 1.48–2.82; p < 0.001). However, after adjusting for age at entry, systolic blood pressure, cholesterol, and smoking, the association attenuated and became non-significant (HR for low vs high = 1.11; equivalent to HR for high vs low = 0.905; 95% CI: 0.637–1.284; p = 0.575).

In the adjusted model, age at entry (HR = 1.10 per year; 95% CI: 1.07–1.13), systolic blood pressure (HR = 1.016 per mmHg; 95% CI: 1.009–1.023), cholesterol (HR = 1.007 per unit; 95% CI: 1.004–1.011), and all categories of smoking were strong independent predictors of CHD mortality. Proportional hazards testing indicated some evidence of violation for grade in the unadjusted model (p = 0.0067) and borderline evidence in the adjusted model (p = 0.031), but the global test for the adjusted model was not significant (p = 0.232).

# survival object
S_chd <- with(wh2, Surv(fu_years, chd))

# Ensure reference category is grade=1 high
wh2 <- wh2 %>% mutate(grade = relevel(factor(grade), ref = "High job grade"))

## Unadjusted Cox: CHD ~ grade
cox_grade <- coxph(S_chd ~ grade, data = wh2)
summary(cox_grade)
#> Call:
#> coxph(formula = S_chd ~ grade, data = wh2)
#> 
#>   n= 1677, number of events= 154 
#> 
#>                      coef exp(coef) se(coef)     z Pr(>|z|)    
#> gradeLow job grade 0.7140    2.0421   0.1637 4.362 1.29e-05 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#>                    exp(coef) exp(-coef) lower .95 upper .95
#> gradeLow job grade     2.042     0.4897     1.482     2.815
#> 
#> Concordance= 0.585  (se = 0.02 )
#> Likelihood ratio test= 17.94  on 1 df,   p=2e-05
#> Wald test            = 19.03  on 1 df,   p=1e-05
#> Score (logrank) test = 19.85  on 1 df,   p=8e-06

# KM curves + log-rank (nice analogue to a simple stratified comparison)
fit_km1 <- survfit(S_chd ~ grade, data = wh2)
ggsurvplot(fit_km1, data = wh2, risk.table = F, pval = TRUE, conf.int = T,
           xlab = "Follow-up (years)", legend.title = "Job grade")

## Adjusted Cox (optional; include if you decide to adjust)
cox_adj <- coxph(S_chd ~ grade + agein + sbp + chol + smok, data = wh2)

summary(cox_adj)
#> Call:
#> coxph(formula = S_chd ~ grade + agein + sbp + chol + smok, data = wh2)
#> 
#>   n= 1651, number of events= 151 
#>    (26 observations deleted due to missingness)
#> 
#>                        coef exp(coef) se(coef)     z Pr(>|z|)    
#> gradeLow job grade 0.100190  1.105381 0.178687 0.561 0.575000    
#> agein              0.095873  1.100620 0.013930 6.882 5.88e-12 ***
#> sbp                0.016100  1.016230 0.003615 4.453 8.45e-06 ***
#> chol               0.007367  1.007394 0.001633 4.513 6.41e-06 ***
#> smokEx-smoker      0.769555  2.158804 0.332196 2.317 0.020527 *  
#> smok1-14 cig/day   1.050483  2.859032 0.349535 3.005 0.002653 ** 
#> smok15-24 cig/day  1.320346  3.744718 0.348945 3.784 0.000154 ***
#> smok25+ cig/day    1.265059  3.543302 0.398100 3.178 0.001484 ** 
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#>                    exp(coef) exp(-coef) lower .95 upper .95
#> gradeLow job grade     1.105     0.9047    0.7788     1.569
#> agein                  1.101     0.9086    1.0710     1.131
#> sbp                    1.016     0.9840    1.0091     1.023
#> chol                   1.007     0.9927    1.0042     1.011
#> smokEx-smoker          2.159     0.4632    1.1258     4.140
#> smok1-14 cig/day       2.859     0.3498    1.4411     5.672
#> smok15-24 cig/day      3.745     0.2670    1.8897     7.421
#> smok25+ cig/day        3.543     0.2822    1.6238     7.732
#> 
#> Concordance= 0.759  (se = 0.018 )
#> Likelihood ratio test= 135.4  on 8 df,   p=<2e-16
#> Wald test            = 126.8  on 8 df,   p=<2e-16
#> Score (logrank) test = 136.6  on 8 df,   p=<2e-16

# PH checks
zph_unadj <- cox.zph(cox_grade);  zph_unadj
#>        chisq df      p
#> grade   7.35  1 0.0067
#> GLOBAL  7.35  1 0.0067

zph_adj   <- cox.zph(cox_adj);    zph_adj
#>         chisq df     p
#> grade   4.644  1 0.031
#> agein   3.597  1 0.058
#> sbp     0.682  1 0.409
#> chol    1.326  1 0.250
#> smok    2.907  4 0.574
#> GLOBAL 10.488  8 0.232

j) Parametric (preferred) vs Frailty; interpretation for categorical & continuous

A Weibull parametric model was fitted as the most suitable choice given the absence of clustering. In the unadjusted model, the shape parameter (k = 1.36; 95% CI: 1.17–1.58) indicated an increasing hazard over time, and men in the high job grade had a 69% longer expected survival time to CHD death compared with those in the low grade (time ratio [TR] = 1.69; 95% CI: 1.32–2.16). After adjustment for age, systolic blood pressure, cholesterol, and smoking, the shape remained >1 (k = 1.45; 95% CI: 1.25–1.68), but the grade effect was no longer significant (TR = 1.07; 95% CI: 0.84–1.36).

For the continuous variable age at entry, the adjusted time ratio was 0.94 per year (95% CI: 0.92–0.96), indicating that each additional year of baseline age shortened the expected time to CHD death by approximately 6%. Similarly, higher systolic blood pressure and cholesterol were associated with shorter survival times (TR = 0.989 and TR = 0.995 per unit, respectively), and all smoking categories had TRs well below 1, indicating shorter survival compared with non-smokers. These parametric findings were consistent with the adjusted Cox model, reinforcing that job grade was not an independent predictor once key cardiovascular risk factors were included, while age, blood pressure, cholesterol, and smoking remained important determinants of CHD mortality.

# install.packages("flexsurv")
library(flexsurv)

# Unadjusted Weibull
wb_unadj <- flexsurvreg(S_chd ~ grade, data = wh2, dist = "weibull")
wb_unadj
#> Call:
#> flexsurvreg(formula = S_chd ~ grade, data = wh2, dist = "weibull")
#> 
#> Estimates: 
#>                     data mean  est      L95%     U95%     se       exp(est)
#> shape                    NA      1.357    1.166    1.581    0.105       NA 
#> scale                    NA    115.375   83.463  159.490   19.060       NA 
#> gradeLow job grade    0.288     -0.525   -0.771   -0.278    0.126    0.592 
#>                     L95%     U95%   
#> shape                    NA       NA
#> scale                    NA       NA
#> gradeLow job grade    0.462    0.757
#> 
#> N = 1677,  Events: 154,  Censored: 1523
#> Total time at risk: 27605.37
#> Log-likelihood = -937.7307, df = 3
#> AIC = 1881.461


# Adjusted Weibull (if mirroring cox_adj)
wb_adj <- flexsurvreg(S_chd ~ grade + agein + sbp + chol + smok,
                      data = wh2, dist = "weibull")
wb_adj
#> Call:
#> flexsurvreg(formula = S_chd ~ grade + agein + sbp + chol + smok, 
#>     data = wh2, dist = "weibull")
#> 
#> Estimates: 
#>                     data mean  est        L95%       U95%       se       
#> shape                      NA   1.45e+00   1.25e+00   1.68e+00   1.11e-01
#> scale                      NA   7.30e+04   1.30e+04   4.11e+05   6.44e+04
#> gradeLow job grade   2.89e-01  -6.96e-02  -3.11e-01   1.72e-01   1.23e-01
#> agein                5.21e+01  -6.56e-02  -8.61e-02  -4.50e-02   1.05e-02
#> sbp                  1.36e+02  -1.11e-02  -1.62e-02  -6.01e-03   2.59e-03
#> chol                 1.96e+02  -5.06e-03  -7.35e-03  -2.77e-03   1.17e-03
#> smokEx-smoker        3.86e-01  -5.29e-01  -9.85e-01  -7.32e-02   2.32e-01
#> smok1-14 cig/day     1.82e-01  -7.24e-01  -1.21e+00  -2.40e-01   2.47e-01
#> smok15-24 cig/day    1.67e-01  -9.07e-01  -1.39e+00  -4.20e-01   2.48e-01
#> smok25+ cig/day      7.45e-02  -8.66e-01  -1.42e+00  -3.15e-01   2.81e-01
#>                     exp(est)   L95%       U95%     
#> shape                      NA         NA         NA
#> scale                      NA         NA         NA
#> gradeLow job grade   9.33e-01   7.33e-01   1.19e+00
#> agein                9.37e-01   9.17e-01   9.56e-01
#> sbp                  9.89e-01   9.84e-01   9.94e-01
#> chol                 9.95e-01   9.93e-01   9.97e-01
#> smokEx-smoker        5.89e-01   3.74e-01   9.29e-01
#> smok1-14 cig/day     4.85e-01   2.99e-01   7.87e-01
#> smok15-24 cig/day    4.04e-01   2.48e-01   6.57e-01
#> smok25+ cig/day      4.21e-01   2.42e-01   7.30e-01
#> 
#> N = 1651,  Events: 151,  Censored: 1500
#> Total time at risk: 27200.02
#> Log-likelihood = -861.7449, df = 10
#> AIC = 1743.49

Missing values

dat <- read_dta("trinmlsh.dta")
head(dat, n=10)

ethgp	ageent	death	smokenum	hdlc	diabp	sysbp	chdstart	days	years	bmi	id	timein	timeout	y	timebth
2	62.984	1	0	0.767	120	198	NA	1409	3.857632	NA	62	1977-01-01	1980-11-10	3.857632	1914-01-07
3	60.553	0	5	NA	66	150	0	1526	4.177960	NA	166	1977-01-01	1981-03-07	4.177960	1916-06-13
5	67.617	1	1	0.563	81	135	0	367	1.004791	20.58	68	1977-01-01	1978-01-03	1.004791	1909-05-20
4	60.424	0	1	0.684	90	170	1	3494	9.566050	26.24	49	1977-01-01	1986-07-27	9.566050	1916-07-30
1	71.723	0	1	1.099	98	158	0	2546	6.970568	23.40	287	1977-01-01	1983-12-22	6.970568	1905-04-12
1	69.051	0	1	NA	64	112	NA	3452	9.451061	NA	69	1977-01-01	1986-06-15	9.451061	1907-12-14
1	67.578	0	0	0.850	80	126	0	3123	8.550308	19.73	139	1977-01-01	1985-07-21	8.550308	1909-06-04
3	61.895	0	0	0.850	80	115	0	3262	8.930869	15.31	109	1977-01-01	1985-12-07	8.930869	1915-02-08
1	62.316	0	0	0.979	91	129	0	3416	9.352498	23.74	81	1977-01-01	1986-05-10	9.352498	1914-09-08
1	68.950	0	0	0.748	83	145	0	3207	8.780288	26.36	153	1977-01-01	1985-10-13	8.780288	1908-01-20


#pre-proccesssed
data_new <- read_rds("data_new.rds")
head(data_new, n=10)

ethgp	ageent	death	smokenum	hdlc	diabp	sysbp	chdstart	days	years	bmi	id	timein	timeout	y	timebth	smoke_cat
2	62.984	1	0	0.767	120	198	NA	1409	3.857632	NA	62	1977-01-01	1980-11-10	3.857632	1914-01-07	non-smoker
3	60.553	0	5	NA	66	150	0	1526	4.177960	NA	166	1977-01-01	1981-03-07	4.177960	1916-06-13	current-smoker
5	67.617	1	1	0.563	81	135	0	367	1.004791	20.58	68	1977-01-01	1978-01-03	1.004791	1909-05-20	ex-smoker
4	60.424	0	1	0.684	90	170	1	3494	9.566050	26.24	49	1977-01-01	1986-07-27	9.566050	1916-07-30	ex-smoker
1	71.723	0	1	1.099	98	158	0	2546	6.970568	23.40	287	1977-01-01	1983-12-22	6.970568	1905-04-12	ex-smoker
1	69.051	0	1	NA	64	112	NA	3452	9.451061	NA	69	1977-01-01	1986-06-15	9.451061	1907-12-14	ex-smoker
1	67.578	0	0	0.850	80	126	0	3123	8.550308	19.73	139	1977-01-01	1985-07-21	8.550308	1909-06-04	non-smoker
3	61.895	0	0	0.850	80	115	0	3262	8.930869	15.31	109	1977-01-01	1985-12-07	8.930869	1915-02-08	non-smoker
1	62.316	0	0	0.979	91	129	0	3416	9.352498	23.74	81	1977-01-01	1986-05-10	9.352498	1914-09-08	non-smoker
1	68.950	0	0	0.748	83	145	0	3207	8.780288	26.36	153	1977-01-01	1985-10-13	8.780288	1908-01-20	non-smoker

data_new1 <- data_new %>% dplyr:: select(-smokenum)
head(data_new1, n=10)

ethgp	ageent	death	hdlc	diabp	sysbp	chdstart	days	years	bmi	id	timein	timeout	y	timebth	smoke_cat
2	62.984	1	0.767	120	198	NA	1409	3.857632	NA	62	1977-01-01	1980-11-10	3.857632	1914-01-07	non-smoker
3	60.553	0	NA	66	150	0	1526	4.177960	NA	166	1977-01-01	1981-03-07	4.177960	1916-06-13	current-smoker
5	67.617	1	0.563	81	135	0	367	1.004791	20.58	68	1977-01-01	1978-01-03	1.004791	1909-05-20	ex-smoker
4	60.424	0	0.684	90	170	1	3494	9.566050	26.24	49	1977-01-01	1986-07-27	9.566050	1916-07-30	ex-smoker
1	71.723	0	1.099	98	158	0	2546	6.970568	23.40	287	1977-01-01	1983-12-22	6.970568	1905-04-12	ex-smoker
1	69.051	0	NA	64	112	NA	3452	9.451061	NA	69	1977-01-01	1986-06-15	9.451061	1907-12-14	ex-smoker
1	67.578	0	0.850	80	126	0	3123	8.550308	19.73	139	1977-01-01	1985-07-21	8.550308	1909-06-04	non-smoker
3	61.895	0	0.850	80	115	0	3262	8.930869	15.31	109	1977-01-01	1985-12-07	8.930869	1915-02-08	non-smoker
1	62.316	0	0.979	91	129	0	3416	9.352498	23.74	81	1977-01-01	1986-05-10	9.352498	1914-09-08	non-smoker
1	68.950	0	0.748	83	145	0	3207	8.780288	26.36	153	1977-01-01	1985-10-13	8.780288	1908-01-20	non-smoker

k) Redo the modelling after administering the appropriate missing values imputation procedures

Multiple imputation using the mice package was performed to address missing values in variables including smoke_cat, hdlc, diabp, sysbp, chdstart, and bmi. Variables were imputed under the assumption of Missing at Random (MAR), meaning the probability of missingness could be explained by the observed data. For categorical variables such as smoke_cat, polytomous logistic regression (polyreg) was used, while continuous variables were imputed using predictive mean matching (pmm). The imputation model excluded outcome and follow-up time variables to avoid bias. Ten imputed datasets were generated, and Cox regression models were fitted within each dataset and pooled to obtain final estimates.

When comparing the Cox regression results before and after multiple imputation, the imputed model shows greater statistical efficiency and retention of sample size, as it avoided the loss of 44 observations due to missingness.

The hazard ratio for ex-smokers was attenuated after imputation (HR 1.12, 95% CI 0.61–2.04, p = 0.71) compared with the complete-case analysis (HR 1.79, 95% CI 0.91–3.55, p = 0.09), suggesting that the apparent excess risk among ex-smokers in the complete-case analysis may have been overestimated due to selection bias from missing data.

For current smokers, the association remained robust in both models, although slightly attenuated after imputation (HR 1.83, 95% CI 1.12–3.00, p = 0.017) versus the complete-case estimate (HR 2.45, 95% CI 1.35–4.46, p = 0.003).

Other covariates showed similar directions of effect in both models, but the imputed model provided narrower confidence intervals for several predictors, indicating improved precision.

Overall, the imputation improved the model by preserving statistical power, reducing bias from case-wise deletion, and yielding more stable effect estimates, particularly clarifying that the elevated risk is strongest and most consistent among current smokers.

# Count missing values for each variable
missing_summary <- sapply(data_new1, function(x) sum(is.na(x)))

# Turn into a data frame for nicer display
missing_df <- data.frame(
  Variable = names(missing_summary),
  Missing_Count = missing_summary,
  Missing_Percent = round(100 * missing_summary / nrow(data_new1), 2)
)

# Show only variables with at least 1 missing value
missing_df <- missing_df[missing_df$Missing_Count > 0, ]

missing_df

	Variable	Missing_Count	Missing_Percent
hdlc	hdlc	17	5.35
diabp	diabp	4	1.26
sysbp	sysbp	4	1.26
chdstart	chdstart	28	8.81
bmi	bmi	35	11.01
smoke_cat	smoke_cat	1	0.31


# Make sure smoke_cat is a factor
data_new1$smoke_cat <- factor(data_new1$smoke_cat)

# Correct the imputation method for categorical variables
my_methods <- make.method(data_new1)
my_methods["smoke_cat"] <- "polyreg"  # unordered categorical
my_methods["hdlc"] <- "pmm"
my_methods["diabp"] <- "pmm"
my_methods["sysbp"] <- "pmm"
my_methods["chdstart"] <- "pmm"
my_methods["bmi"] <- "pmm"


pred_matrix <- make.predictorMatrix(data_new1)
pred_matrix[, c("death", "y")] <- 0  # Do not use outcome/time to impute predictors


print(my_methods)
#>     ethgp    ageent     death   cvdeath       alc      hdlc     diabp     sysbp 
#>        ""        ""        ""        ""        ""     "pmm"     "pmm"     "pmm" 
#>  chdstart      days     years       bmi        id    timein   timeout         y 
#>     "pmm"        ""        ""     "pmm"        ""        ""        ""        "" 
#>   timebth smoke_cat 
#>        "" "polyreg"

set.seed(123)

imp <- mice(data_new, m = 10,
            method = my_methods,
            predictorMatrix = pred_matrix,
            printFlag = TRUE)
#> 
#>  iter imp variable
#>   1   1  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   2  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   3  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   4  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   5  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   6  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   7  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   8  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   9  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   1   10  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   1  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   2  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   3  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   4  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   5  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   6  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   7  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   8  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   9  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   2   10  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   1  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   2  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   3  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   4  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   5  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   6  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   7  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   8  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   9  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   3   10  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   1  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   2  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   3  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   4  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   5  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   6  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   7  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   8  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   9  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   4   10  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   1  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   2  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   3  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   4  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   5  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   6  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   7  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   8  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   9  hdlc*  diabp*  sysbp*  chdstart*  bmi  smoke_cat
#>   5   10  hdlc*  diabp*  sysbp*  chdstart  bmi  smoke_cat

# ===============================
# 7. Check convergence
# ===============================
plot(imp)

fit_cox <- coxph(Surv(y, death) ~ smoke_cat+ageent+sysbp+bmi+diabp+ hdlc+chdstart, data =data_new1)
fit_cox
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart, data = data_new1)
#> 
#>                              coef exp(coef)  se(coef)      z       p
#> smoke_catex-smoker       0.584613  1.794296  0.346474  1.687 0.09154
#> smoke_catcurrent-smoker  0.897908  2.454463  0.297975  3.013 0.00258
#> ageent                   0.035336  1.035968  0.040442  0.874 0.38226
#> sysbp                    0.007397  1.007425  0.006667  1.110 0.26716
#> bmi                     -0.035497  0.965125  0.032282 -1.100 0.27151
#> diabp                    0.001391  1.001392  0.013074  0.106 0.91529
#> hdlc                    -0.360503  0.697326  0.343396 -1.050 0.29380
#> chdstart                 0.608962  1.838522  0.322438  1.889 0.05894
#> 
#> Likelihood ratio test=18.36  on 8 df, p=0.01865
#> n= 274, number of events= 68 
#>    (44 observations deleted due to missingness)

(fit_imp <- with(imp, coxph(Surv(y, death) ~ smoke_cat+ageent+sysbp+bmi+diabp+ hdlc+chdstart)))
#> call :
#> with.mids(data = imp, expr = coxph(Surv(y, death) ~ smoke_cat + 
#>     ageent + sysbp + bmi + diabp + hdlc + chdstart))
#> 
#> call1 :
#> mice(data = data_new, m = 10, method = my_methods, predictorMatrix = pred_matrix, 
#>     printFlag = TRUE)
#> 
#> nmis :
#>  [1]  0  0  0  0  0  1 17  4  4 28  0  0 35  0  0  0  0  0  1
#> 
#> analyses :
#> [[1]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z      p
#> smoke_catex-smoker       0.133415  1.142724  0.300007  0.445 0.6565
#> smoke_catcurrent-smoker  0.571208  1.770404  0.246502  2.317 0.0205
#> ageent                   0.048825  1.050036  0.034659  1.409 0.1589
#> sysbp                    0.014213  1.014315  0.005173  2.748 0.0060
#> bmi                     -0.031616  0.968879  0.028512 -1.109 0.2675
#> diabp                   -0.012251  0.987823  0.009782 -1.252 0.2104
#> hdlc                    -0.172897  0.841224  0.288575 -0.599 0.5491
#> chdstart                 0.618227  1.855634  0.288445  2.143 0.0321
#> 
#> Likelihood ratio test=20.37  on 8 df, p=0.009033
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[2]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z      p
#> smoke_catex-smoker       0.141884  1.152443  0.300017  0.473 0.6363
#> smoke_catcurrent-smoker  0.578078  1.782610  0.247540  2.335 0.0195
#> ageent                   0.050276  1.051562  0.034670  1.450 0.1470
#> sysbp                    0.011320  1.011385  0.005407  2.094 0.0363
#> bmi                     -0.021171  0.979052  0.028069 -0.754 0.4507
#> diabp                   -0.008648  0.991389  0.010112 -0.855 0.3924
#> hdlc                    -0.248338  0.780096  0.292367 -0.849 0.3957
#> chdstart                 0.615726  1.850999  0.288033  2.138 0.0325
#> 
#> Likelihood ratio test=17.48  on 8 df, p=0.0255
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[3]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z      p
#> smoke_catex-smoker       0.116696  1.123777  0.302141  0.386 0.6993
#> smoke_catcurrent-smoker  0.619795  1.858547  0.247450  2.505 0.0123
#> ageent                   0.058941  1.060713  0.034507  1.708 0.0876
#> sysbp                    0.003372  1.003378  0.005857  0.576 0.5649
#> bmi                     -0.010719  0.989338  0.027513 -0.390 0.6968
#> diabp                    0.012465  1.012544  0.010670  1.168 0.2427
#> hdlc                    -0.153901  0.857357  0.282982 -0.544 0.5865
#> chdstart                 0.641471  1.899273  0.286918  2.236 0.0254
#> 
#> Likelihood ratio test=18.47  on 8 df, p=0.01794
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[4]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z        p
#> smoke_catex-smoker       0.089126  1.093219  0.301436  0.296 0.767480
#> smoke_catcurrent-smoker  0.649951  1.915448  0.247782  2.623 0.008714
#> ageent                   0.053973  1.055456  0.034808  1.551 0.120997
#> sysbp                    0.013209  1.013297  0.005086  2.597 0.009404
#> bmi                     -0.011250  0.988813  0.028185 -0.399 0.689777
#> diabp                   -0.011651  0.988417  0.009491 -1.228 0.219596
#> hdlc                    -0.342012  0.710340  0.309381 -1.105 0.268955
#> chdstart                 0.844013  2.325682  0.243671  3.464 0.000533
#> 
#> Likelihood ratio test=26.22  on 8 df, p=0.0009633
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[5]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z       p
#> smoke_catex-smoker       0.080068  1.083361  0.301436  0.266 0.79053
#> smoke_catcurrent-smoker  0.590055  1.804088  0.247735  2.382 0.01723
#> ageent                   0.056250  1.057862  0.034423  1.634 0.10225
#> sysbp                    0.002542  1.002545  0.006048  0.420 0.67426
#> bmi                     -0.034094  0.966481  0.027950 -1.220 0.22253
#> diabp                    0.015026  1.015139  0.010697  1.405 0.16012
#> hdlc                    -0.672523  0.510419  0.322773 -2.084 0.03720
#> chdstart                 0.768687  2.156932  0.274265  2.803 0.00507
#> 
#> Likelihood ratio test=25.06  on 8 df, p=0.001517
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[6]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z       p
#> smoke_catex-smoker       0.083990  1.087618  0.300349  0.280 0.77975
#> smoke_catcurrent-smoker  0.621128  1.861026  0.246596  2.519 0.01178
#> ageent                   0.056893  1.058542  0.034282  1.660 0.09700
#> sysbp                    0.004767  1.004779  0.005817  0.820 0.41250
#> bmi                     -0.024026  0.976261  0.028206 -0.852 0.39433
#> diabp                    0.010908  1.010968  0.010556  1.033 0.30140
#> hdlc                    -0.513054  0.598664  0.320305 -1.602 0.10921
#> chdstart                 0.867019  2.379807  0.254888  3.402 0.00067
#> 
#> Likelihood ratio test=25.52  on 8 df, p=0.00127
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[7]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                             coef exp(coef) se(coef)     z      p
#> smoke_catex-smoker      0.124178  1.132218 0.301129 0.412 0.6801
#> smoke_catcurrent-smoker 0.632753  1.882786 0.246917 2.563 0.0104
#> ageent                  0.060462  1.062327 0.034421 1.757 0.0790
#> sysbp                   0.002638  1.002641 0.005848 0.451 0.6519
#> bmi                     0.001439  1.001440 0.027427 0.052 0.9582
#> diabp                   0.013544  1.013636 0.010651 1.272 0.2035
#> hdlc                    0.019240  1.019426 0.269333 0.071 0.9431
#> chdstart                0.636044  1.888994 0.286768 2.218 0.0266
#> 
#> Likelihood ratio test=18.71  on 8 df, p=0.01649
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[8]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z       p
#> smoke_catex-smoker       0.132729  1.141941  0.300117  0.442 0.65830
#> smoke_catcurrent-smoker  0.572822  1.773265  0.246455  2.324 0.02011
#> ageent                   0.046800  1.047913  0.034900  1.341 0.17993
#> sysbp                    0.015836  1.015962  0.005094  3.109 0.00188
#> bmi                     -0.029831  0.970609  0.028819 -1.035 0.30061
#> diabp                   -0.014591  0.985515  0.009696 -1.505 0.13236
#> hdlc                    -0.029871  0.970571  0.281960 -0.106 0.91563
#> chdstart                 0.596817  1.816329  0.288393  2.069 0.03850
#> 
#> Likelihood ratio test=21.88  on 8 df, p=0.005139
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[9]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z        p
#> smoke_catex-smoker       0.088270  1.092283  0.299646  0.295 0.768315
#> smoke_catcurrent-smoker  0.593003  1.809415  0.246542  2.405 0.016160
#> ageent                   0.058144  1.059867  0.034420  1.689 0.091176
#> sysbp                    0.009854  1.009903  0.005672  1.737 0.082325
#> bmi                     -0.045921  0.955118  0.028693 -1.600 0.109503
#> diabp                    0.004997  1.005010  0.010288  0.486 0.627163
#> hdlc                    -0.747038  0.473768  0.332247 -2.248 0.024548
#> chdstart                 0.872675  2.393305  0.250838  3.479 0.000503
#> 
#> Likelihood ratio test=31.26  on 8 df, p=0.0001265
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)
#> 
#> [[10]]
#> Call:
#> coxph(formula = Surv(y, death) ~ smoke_cat + ageent + sysbp + 
#>     bmi + diabp + hdlc + chdstart)
#> 
#>                              coef exp(coef)  se(coef)      z       p
#> smoke_catex-smoker       0.148782  1.160420  0.299364  0.497 0.61919
#> smoke_catcurrent-smoker  0.617236  1.853797  0.247265  2.496 0.01255
#> ageent                   0.054285  1.055785  0.034400  1.578 0.11456
#> sysbp                    0.011574  1.011641  0.005416  2.137 0.03260
#> bmi                     -0.019960  0.980237  0.028591 -0.698 0.48509
#> diabp                   -0.007853  0.992178  0.010108 -0.777 0.43723
#> hdlc                    -0.649782  0.522160  0.327854 -1.982 0.04749
#> chdstart                 0.807447  2.242176  0.270274  2.988 0.00281
#> 
#> Likelihood ratio test=23.96  on 8 df, p=0.002329
#> n= 317, number of events= 88 
#>    (1 observation deleted due to missingness)

pooled_fit <- pool(fit_imp)
summary(pooled_fit, conf.int = TRUE, exponentiate = TRUE)

term	estimate	std.error	statistic	df	p.value	2.5 %	97.5 %	conf.low	conf.high
smoke_catex-smoker	1.1206554	0.3018151	0.3774289	77.38172	0.7068881	0.6144489	2.043894	0.6144489	2.043894
smoke_catcurrent-smoker	1.8305254	0.2487275	2.4307845	76.92589	0.0173945	1.1155144	3.003837	1.1155144	3.003837
ageent	1.0559965	0.0348788	1.5621195	76.37623	0.1223941	0.9851348	1.131955	0.9851348	1.131955
sysbp	1.0089726	0.0077252	1.1562847	19.69379	0.2613995	0.9928278	1.025380	0.9928278	1.025380
bmi	0.9775411	0.0316092	-0.7186186	48.26172	0.4758409	0.9173552	1.041676	0.9173552	1.041676
diabp	1.0001947	0.0163862	0.0118832	13.42001	0.9906936	0.9655154	1.036120	0.9655154	1.036120
hdlc	0.7039713	0.4211070	-0.8335594	19.87747	0.4144341	0.2923572	1.695103	0.2923572	1.695103
chdstart	2.0684771	0.2992412	2.4288523	54.76648	0.0184571	1.1354879	3.768070	1.1354879	3.768070

Part II


paed <- read_dta("paedsurv_new.dta")
head(paed, n=10)

idno	sex	chiv	dob	date_enr	date_out	ill3m	stunted	cd4cat
1001	1	0	2010-01-11	2010-03-01	2010-04-03	0	1	NA
1002	2	0	2010-01-11	2010-03-01	2010-04-03	0	1	NA
1003	2	1	2003-08-10	2006-05-24	2010-03-12	1	0	3
1004	1	0	2009-01-24	2009-03-09	2010-03-18	0	0	NA
1005	2	0	2004-12-23	2005-04-18	2010-01-25	0	0	NA
1006	1	0	2008-04-05	2008-05-27	2010-01-25	1	0	NA
1007	1	0	2002-06-03	2002-09-24	2010-03-18	1	0	1
1008	1	0	2007-05-02	2007-07-25	2009-10-23	1	0	1
1009	2	0	2007-07-25	2007-10-26	2009-11-03	1	1	1
1010	1	1	2008-11-16	2009-01-09	2010-01-13	1	0	NA

paed <- paed %>%
  mutate(
    sex    = if (inherits(sex, "haven_labelled"))    as_factor(sex)    else as.factor(sex),
    chiv   = if (inherits(chiv, "haven_labelled"))   as_factor(chiv)   else as.factor(chiv),
    ill3m  = if (inherits(ill3m, "haven_labelled"))  as_factor(ill3m)  else as.factor(ill3m),
    cd4cat = if (inherits(cd4cat, "haven_labelled")) as_factor(cd4cat) else as.factor(cd4cat),
    stunted = as.factor(stunted)
  )
 

head(paed, n=10)

idno	sex	chiv	dob	date_enr	date_out	ill3m	stunted	cd4cat
1001	Male	HIV exposed	2010-01-11	2010-03-01	2010-04-03	No	1	NA
1002	Female	HIV exposed	2010-01-11	2010-03-01	2010-04-03	No	1	NA
1003	Female	HIV positive	2003-08-10	2006-05-24	2010-03-12	Yes	0	<15%
1004	Male	HIV exposed	2009-01-24	2009-03-09	2010-03-18	No	0	NA
1005	Female	HIV exposed	2004-12-23	2005-04-18	2010-01-25	No	0	NA
1006	Male	HIV exposed	2008-04-05	2008-05-27	2010-01-25	Yes	0	NA
1007	Male	HIV exposed	2002-06-03	2002-09-24	2010-03-18	Yes	0	25%+
1008	Male	HIV exposed	2007-05-02	2007-07-25	2009-10-23	Yes	0	25%+
1009	Female	HIV exposed	2007-07-25	2007-10-26	2009-11-03	Yes	1	25%+
1010	Male	HIV positive	2008-11-16	2009-01-09	2010-01-13	Yes	0	NA

sapply(paed[, c("sex","chiv","ill3m","stunted","cd4cat","dob","date_enr","date_out","died")], class)
#>       sex      chiv     ill3m   stunted    cd4cat       dob  date_enr  date_out 
#>  "factor"  "factor"  "factor"  "factor"  "factor"    "Date"    "Date"    "Date" 
#>      died 
#> "numeric"

# ---- 2) Derive age at enrolment & person-time ----
paed <- paed %>%
  mutate(
    age_enrol_years = as.numeric(difftime(date_enr, dob, units = "days")) / 365.25,
    overall_survival = as.numeric(difftime(date_out, date_enr, units = "days")),   # follow-up in days
    overall_survival = ifelse(is.na(overall_survival), NA_real_, pmax(overall_survival, 0)),
    person_years = overall_survival / 365.25,
   
    died = as.integer(died)
  )
head(paed, n=10)

idno	sex	chiv	dob	date_enr	date_out	ill3m	stunted	cd4cat	age_enrol_years	overall_survival	person_years
1001	Male	HIV exposed	2010-01-11	2010-03-01	2010-04-03	No	1	NA	0.1341547	33.09961	0.0906218
1002	Female	HIV exposed	2010-01-11	2010-03-01	2010-04-03	No	1	NA	0.1341547	33.09961	0.0906218
1003	Female	HIV positive	2003-08-10	2006-05-24	2010-03-12	Yes	0	<15%	2.7871321	1388.00000	3.8001369
1004	Male	HIV exposed	2009-01-24	2009-03-09	2010-03-18	No	0	NA	0.1204654	374.00000	1.0239562
1005	Female	HIV exposed	2004-12-23	2005-04-18	2010-01-25	No	0	NA	0.3175907	1743.00000	4.7720739
1006	Male	HIV exposed	2008-04-05	2008-05-27	2010-01-25	Yes	0	NA	0.1423682	608.00000	1.6646133
1007	Male	HIV exposed	2002-06-03	2002-09-24	2010-03-18	Yes	0	25%+	0.3093771	2732.00000	7.4798084
1008	Male	HIV exposed	2007-05-02	2007-07-25	2009-10-23	Yes	0	25%+	0.2299795	821.00000	2.2477755
1009	Female	HIV exposed	2007-07-25	2007-10-26	2009-11-03	Yes	1	25%+	0.2546201	739.00000	2.0232717
1010	Male	HIV positive	2008-11-16	2009-01-09	2010-01-13	Yes	0	NA	0.1478439	369.00000	1.0102669

# ---- 3) Basic descriptive rates by HIV status (crude) ----
rates_chiv <- paed %>%
  filter(!is.na(chiv)) %>%
  group_by(chiv) %>%
  summarise(
    deaths = sum(died == 1, na.rm = TRUE),
    person_y = sum(person_years, na.rm = TRUE),
    rate_per_1000py = 1000 * deaths / person_y,
    .groups = "drop"
  )
rates_chiv

chiv	deaths	person_y	rate_per_1000py
HIV exposed	16	1217.7708	13.13876
HIV positive	27	638.3912	42.29381

b) Examine the effect of child’s HIV status (chiv) using strate and stcox. What is the HR and 95% CI?

A crude Cox model showed that HIV-positive children had a hazard ratio (HR) of 3.67 (95% CI: 1.98–6.82, p < 0.001) compared with HIV-exposed children. This indicates that HIV-positive children had over three times the hazard of death when compared on the same follow-up time scale.

c) Consider the possible confounding effect of current age on the relationship between child’s HIV status and mortality. Before doing any analyses, would you expect a Cox model controlling for current age to give different results from the one controlling for time in the study?

Before adjusting for age, there was a marked difference in mean age at study entry between HIV groups. HIV-exposed children entered at an average of 0.32 years, whereas HIV-positive children entered at 4.33 years. Since older age can be protective against mortality in this cohort, controlling for age is expected to alter the estimated effect of HIV status.


cox_dat <- paed %>% filter(!is.na(overall_survival) & !is.na(died) & !is.na(chiv))

# Crude Cox: HIV status only
cox_crude <- coxph(Surv(overall_survival, died) ~ chiv, data = cox_dat)
summary(cox_crude)
#> Call:
#> coxph(formula = Surv(overall_survival, died) ~ chiv, data = cox_dat)
#> 
#>   n= 590, number of events= 43 
#> 
#>                         coef exp(coef) se(coef)     z Pr(>|z|)    
#> chivHIV positive      1.3009    3.6725   0.3158 4.119 3.81e-05 ***
#> chivHIV neg exposed       NA        NA   0.0000    NA       NA    
#> chivHIV neg unexposed     NA        NA   0.0000    NA       NA    
#> chivUnknown               NA        NA   0.0000    NA       NA    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#>                       exp(coef) exp(-coef) lower .95 upper .95
#> chivHIV positive          3.672     0.2723     1.978      6.82
#> chivHIV neg exposed          NA         NA        NA        NA
#> chivHIV neg unexposed        NA         NA        NA        NA
#> chivUnknown                  NA         NA        NA        NA
#> 
#> Concordance= 0.654  (se = 0.038 )
#> Likelihood ratio test= 17.7  on 1 df,   p=3e-05
#> Wald test            = 16.96  on 1 df,   p=4e-05
#> Score (logrank) test = 19.48  on 1 df,   p=1e-05

d) Are there large differences in age at entry between the HIV groups?

Yes, there are large diffrences in age at entry between the HIV groups. The age range for HIV-exposed children was 0–6.80 years, while for HIV-positive children it was 0–13.18 years. These large differences in baseline age also confirm the potential for confounding, making age adjustment necessary.

dat <-cox_dat %>% mutate(age_entry = age_enrol_years)  # just a readable alias
age_tab <- dat %>%
  group_by(chiv) %>%
  summarise(
    n   = sum(!is.na(age_entry)),
    mean = mean(age_entry, na.rm = TRUE),
    sd   = sd(age_entry, na.rm = TRUE),
    min  = min(age_entry, na.rm = TRUE),
    max  = max(age_entry, na.rm = TRUE),
    .groups = "drop"
)
age_tab

chiv	n	mean	sd	min	max
HIV exposed	412	0.3238306	0.6747565	0.0000000	6.795346
HIV positive	178	4.3317106	4.1456232	0.0054757	13.182752

e) Now perform a Cox regression controlling for current age to investigate whether age is a confounder

After adjusting for age at entry, the HR for HIV-positive versus HIV-exposed increased to 5.44 (95% CI: 2.78–10.65, p < 0.001). Age itself was inversely associated with mortality (HR per year = 0.88, 95% CI: 0.78–0.99, p = 0.029). This strengthening of the HIV effect after age adjustment suggests negative confounding by age.

Traditional Cox model

cox_fit <- coxph(Surv(overall_survival, died) ~ chiv + age_entry, data = dat)

summary(cox_fit)
#> Call:
#> coxph(formula = Surv(overall_survival, died) ~ chiv + age_entry, 
#>     data = dat)
#> 
#>   n= 590, number of events= 43 
#> 
#>                           coef exp(coef) se(coef)      z Pr(>|z|)    
#> chivHIV positive       1.69303   5.43591  0.34295  4.937 7.94e-07 ***
#> chivHIV neg exposed         NA        NA  0.00000     NA       NA    
#> chivHIV neg unexposed       NA        NA  0.00000     NA       NA    
#> chivUnknown                 NA        NA  0.00000     NA       NA    
#> age_entry             -0.13096   0.87725  0.06003 -2.181   0.0292 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#>                       exp(coef) exp(-coef) lower .95 upper .95
#> chivHIV positive         5.4359      0.184    2.7756   10.6460
#> chivHIV neg exposed          NA         NA        NA        NA
#> chivHIV neg unexposed        NA         NA        NA        NA
#> chivUnknown                  NA         NA        NA        NA
#> age_entry                0.8773      1.140    0.7799    0.9868
#> 
#> Concordance= 0.632  (se = 0.05 )
#> Likelihood ratio test= 23.41  on 2 df,   p=8e-06
#> Wald test            = 24.45  on 2 df,   p=5e-06
#> Score (logrank) test = 29.05  on 2 df,   p=5e-07

f) With age as the analysis scale, use a Poisson model to assess the relationship between child’s HIV status and mortality. How do the results compare to those from using a Cox model?

Using the Poisson model with log–person-years offset, the estimated rate ratio (RR) for mortality comparing HIV-positive to HIV-exposed children is 5.31 (e^1.669 = 5.31), with a 95% CI of 2.69–10.46 (p = 1.4×10⁻⁶). Age is protective: each additional year at entry is associated with RR = 0.85 (95% CI 0.75–0.96, p = 0.011). These results closely match the age-adjusted Cox model (HR = 5.44, 95% CI 2.78–10.65, p < 0.001; age HR 0.88, 95% CI 0.78–0.99), indicating strong and consistent evidence that HIV-positive status is associated with roughly a five-fold higher mortality, and that older age is associated with lower risk.

Traditional Poisson Model

pois_dat <- dat %>% filter(!is.na(person_years) & !is.na(died) & !is.na(chiv))
Poisson_Trad <- glm(died ~ chiv +age_entry + offset(log(person_years)), family = poisson(link = "log"), data = pois_dat)
summary(Poisson_Trad)
#> 
#> Call:
#> glm(formula = died ~ chiv + age_entry + offset(log(person_years)), 
#>     family = poisson(link = "log"), data = pois_dat)
#> 
#> Coefficients:
#>                  Estimate Std. Error z value Pr(>|z|)    
#> (Intercept)      -4.27294    0.25091 -17.030  < 2e-16 ***
#> chivHIV positive  1.66934    0.34598   4.825  1.4e-06 ***
#> age_entry        -0.16282    0.06408  -2.541   0.0111 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for poisson family taken to be 1)
#> 
#>     Null deviance: 348.17  on 589  degrees of freedom
#> Residual deviance: 325.84  on 587  degrees of freedom
#> AIC: 417.84
#> 
#> Number of Fisher Scoring iterations: 7

g) In order to make a like-for-like comparison with the Cox model, what would be a more appropriate analysis using Poisson regression?

For a like-for-like comparison with the Cox model, the Poisson regression should be parameterised as a piecewise exponential model, with the baseline hazard split into appropriate time intervals (e.g., age bands) and an offset for log person-time. This allows the Poisson model to mimic the Cox model’s handling of time-to-event data.

h) Use the stsplit (Lexis in R) command to create 1-year age bands of current age from 0 to 5 years, and 5-year bands for children aged over 5. Re-run the Poisson model controlling for current age. How do these results compare with those obtained from the Cox model?

When comparing the traditional Cox model with the Poisson regression using 1-year age bands for children 0–5 years and 5-year bands for older children, the results are directionally consistent but differ in magnitude. Both models show that HIV positivity substantially increases mortality, while older age is protective. In the Cox model, the hazard ratio for HIV positive children is approximately 5.44 (95% CI: 2.78–10.65), whereas the Poisson model with age bands gives a higher incidence rate ratio of about 9.47 (95% CI: 8.82–10.17).

head(pois_dat, n=10)

idno	sex	chiv	dob	date_enr	date_out	ill3m	stunted	cd4cat	age_enrol_years	overall_survival	person_years	age_entry
1001	Male	HIV exposed	2010-01-11	2010-03-01	2010-04-03	No	1	NA	0.1341547	33.09961	0.0906218	0.1341547
1002	Female	HIV exposed	2010-01-11	2010-03-01	2010-04-03	No	1	NA	0.1341547	33.09961	0.0906218	0.1341547
1003	Female	HIV positive	2003-08-10	2006-05-24	2010-03-12	Yes	0	<15%	2.7871321	1388.00000	3.8001369	2.7871321
1004	Male	HIV exposed	2009-01-24	2009-03-09	2010-03-18	No	0	NA	0.1204654	374.00000	1.0239562	0.1204654
1005	Female	HIV exposed	2004-12-23	2005-04-18	2010-01-25	No	0	NA	0.3175907	1743.00000	4.7720739	0.3175907
1006	Male	HIV exposed	2008-04-05	2008-05-27	2010-01-25	Yes	0	NA	0.1423682	608.00000	1.6646133	0.1423682
1007	Male	HIV exposed	2002-06-03	2002-09-24	2010-03-18	Yes	0	25%+	0.3093771	2732.00000	7.4798084	0.3093771
1008	Male	HIV exposed	2007-05-02	2007-07-25	2009-10-23	Yes	0	25%+	0.2299795	821.00000	2.2477755	0.2299795
1009	Female	HIV exposed	2007-07-25	2007-10-26	2009-11-03	Yes	1	25%+	0.2546201	739.00000	2.0232717	0.2546201
1010	Male	HIV positive	2008-11-16	2009-01-09	2010-01-13	Yes	0	NA	0.1478439	369.00000	1.0102669	0.1478439

poisson <- Lexis(exit= list(tfe =overall_survival),
exit.status= factor(died,
labels = c("Alive","Dead")),
data = pois_dat)
#> NOTE: entry.status has been set to "Alive" for all.
#> NOTE: entry is assumed to be 0 on the tfe timescale.

# 1. Create age bands (1-year up to 5, then 5-year)
age_breaks <- c(0:5, seq(10, max(poisson$overall_survival, na.rm=TRUE), by=5))

# 2. Split data by current age
poisson_split <- splitMulti(poisson, 
                          tfe = age_breaks,
                          time.scale = "overall_survival")

# 3. Create age band factor
poisson_split$age_band <- cut(poisson_split$age_entry, 
                             breaks = age_breaks,
                             right = FALSE, 
                             include.lowest = TRUE)

# 4. Poisson model
poisson_Annual <- glm(died ~ chiv + age_band + offset(log(lex.dur)),
                    family = "poisreg",
                    data = poisson_split)

Comparing the traditional Cox model with the poisson regression (1-year age bands of current age from 0 to 5 years, and 5-year bands for children aged over 5)

rbind(ci.exp(cox_fit), ci.exp(poisson_Annual, subset = c("chiv", "age_entry")))
#>                       exp(Est.)     2.5%      97.5%
#> chivHIV positive      5.4359084 2.775594 10.6460459
#> chivHIV neg exposed   1.0000000 1.000000  1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000  1.0000000
#> chivUnknown           1.0000000 1.000000  1.0000000
#> age_entry             0.8772548 0.779876  0.9867927
#> chivHIV positive      9.4718927 8.819615 10.1724110

i) Does age confound the relationship between HIV status and mortality?

Yes. Age confounds the HIV–mortality relationship because it is associated with both HIV status and mortality; after adjustment, HIV positive children still had a much higher risk of death (HR ≈ 5.4), but age showed an independent protective effect (HR ≈ 0.88).

j) Redo part (h) and (i) to refit another Poisson model but splitting time at every event?

When comparing the traditional Cox proportional hazards model with the Poisson regression approach (where follow-up time is split at each event to approximate the partial likelihood), the results are nearly identical. In both models, HIV-positive status is strongly associated with an increased risk of mortality, with hazard/IRR estimates around 5.4 (95% CI ≈ 2.8–10.6). Similarly, age at entry is protective, with an effect estimate of about 0.88 (95% CI ≈ 0.78–0.99), meaning each additional year of age is associated with a ~12% reduction in risk.

Spliting data into small intervals

poisson.s <- splitMulti(poisson, tfe = c(0, sort(unique(poisson$overall_survival))))
summary(poisson.s)
#>        
#> Transitions:
#>      To
#> From     Alive Dead  Records:  Events: Risk time:  Persons:
#>   Alive 137287   43    137330       43   677963.2       590


mLs.pois.fc <- glm(cbind(lex.Xst == "Dead", lex.dur) ~ 0 + factor(tfe) + chiv + age_entry, family = "poisreg", data = poisson.s)

Comparing the traditional Cox model with the poisson regression (splitting time at every event)

rbind(ci.exp(cox_fit), ci.exp(mLs.pois.fc, subset = c("chiv", "age_entry")))
#>                       exp(Est.)      2.5%      97.5%
#> chivHIV positive      5.4359084 2.7755939 10.6460459
#> chivHIV neg exposed   1.0000000 1.0000000  1.0000000
#> chivHIV neg unexposed 1.0000000 1.0000000  1.0000000
#> chivUnknown           1.0000000 1.0000000  1.0000000
#> age_entry             0.8772548 0.7798760  0.9867927
#> chivHIV positive      5.4337909 2.7745567 10.6417304
#> age_entry             0.8772983 0.7799259  0.9868274

k) Bonus: Fit an appropriate Parametric model or Frailty model whichever is most suitable and distinguish how you will interpret the results for a single categorical and also a single continuous variable.

Appropriate Parametric Model

Among the candidate models fitted, the Gompertz parametric survival model was the most suitable, as it had the lowest AIC (875.0) and BIC, indicating the best balance of fit and parsimony. Frailty models were not required, as there was no indication of clustering or unmeasured shared risk within subgroups.

Interpretation of a Categorical Variable (HIV Status)

For the categorical predictor HIV status, the Gompertz model estimated that HIV-positive children had a hazard of death approximately 5.46 times higher than HIV-negative children (HR = 5.46, 95% CI: 2.78–10.7, p < 0.001). This interpretation means that, holding other factors constant, HIV infection is associated with a markedly elevated risk of mortality throughout follow-up. In practical terms, HIV infection substantially accelerates progression to death compared with uninfected children.

Interpretation of a Continuous Variable (Age at Enrolment)

For the continuous predictor age at enrolment, the Gompertz model estimated a hazard ratio of 0.88 per year (95% CI: 0.78–0.99, p = 0.03). This indicates that each additional year of age at enrolment is associated with about a 12% reduction in the hazard of death. In other words, older children at the time of enrolment are at lower risk of mortality compared with younger children, suggesting age acts as a protective factor.

Running 6 parameric survival models in flexsurvreg to choose the model that fits the data well

s_object <- Surv(overall_survival, died) ~ chiv + age_enrol_years


models <- list(
  expo   = try(flexsurvreg(s_object, data = paed, dist = "exponential"), silent = TRUE),
  weib   = try(flexsurvreg(s_object, data = paed, dist = "weibull"), silent = TRUE),
  gomp   = try(flexsurvreg(s_object, data = paed, dist = "gompertz"), silent = TRUE),
  lnorm  = try(flexsurvreg(s_object, data = paed, dist = "lnorm"), silent = TRUE),
  llog   = try(flexsurvreg(s_object, data = paed, dist = "llogis"), silent = TRUE),
  ggamma = try(flexsurvreg(s_object, data = paed, dist = "gengamma"), silent = TRUE)
)

Extracting AIC and BIC values for ranking the best model

extract_ic <- function(m, n) {
  if (inherits(m, "try-error")) return(c(AIC = NA, BIC = NA))
  k <- length(m$res[, "est"])      # number of parameters
  ll <- m$loglik
  aic <- m$AIC
  bic <- -2 * ll + k * log(n)
  c(AIC = aic, BIC = bic)
}

n <- nrow(paed)  # sample size

ic_values <- t(sapply(models, extract_ic, n = n))

# Convert to data.frame for easy sorting
ic_df <- as.data.frame(ic_values)
ic_df$Model <- rownames(ic_df)

# Rankings
aic_rank <- ic_df[order(ic_df$AIC), c("Model", "AIC")]
bic_rank <- ic_df[order(ic_df$BIC), c("Model", "BIC")]

print(aic_rank)
#>         Model      AIC
#> gomp     gomp 875.0119
#> ggamma ggamma 880.0386
#> expo     expo       NA
#> weib     weib       NA
#> lnorm   lnorm       NA
#> llog     llog       NA
print(bic_rank)
#>         Model      BIC
#> gomp     gomp 892.5324
#> ggamma ggamma 901.9392
#> expo     expo       NA
#> weib     weib       NA
#> lnorm   lnorm       NA
#> llog     llog       NA


# Define time grid for plotting
time_grid <- seq(0, max(paed$overall_survival, na.rm = TRUE), length.out = 200)

# Function to extract hazard estimates from a fitted flexsurvreg model
get_hazard_df <- function(model, model_name, time_grid) {
  if (inherits(model, "try-error")) return(NULL)
  pred <- summary(model, type = "hazard", t = time_grid, ci = FALSE)
  data.frame(
    time = time_grid,
    hazard = pred[[1]]$est,
    model = model_name
  )
}

# Collect hazards from your models
haz_list <- list(
  get_hazard_df(models$gomp, "Gompertz", time_grid),
  get_hazard_df(models$ggamma, "GenGamma", time_grid)
  # Add others if they converge, e.g. Weibull, Lognormal, Log-logistic
)

haz_df <- bind_rows(haz_list)

# Plot with ggplot
ggplot(haz_df, aes(x = time, y = hazard, colour = model)) +
  geom_line(size = 1.2) +
  labs(
    title = "Hazard Functions from Parametric Models",
    x = "Time (years)",
    y = "Hazard Rate"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

Running the best model based on AIC and BIC

Gompertz_model<- flexsurvreg(s_object, data = paed, dist = "gompertz")
Gompertz_model
#> Call:
#> flexsurvreg(formula = s_object, data = paed, dist = "gompertz")
#> 
#> Estimates: 
#>                   data mean  est        L95%       U95%       se       
#> shape                    NA  -1.68e-03  -2.23e-03  -1.12e-03   2.84e-04
#> rate                     NA   9.78e-05   5.50e-05   1.74e-04   2.87e-05
#> chivHIV positive   3.02e-01   1.70e+00   1.02e+00   2.37e+00   3.43e-01
#> age_enrol_years    1.53e+00  -1.32e-01  -2.50e-01  -1.39e-02   6.03e-02
#>                   exp(est)   L95%       U95%     
#> shape                    NA         NA         NA
#> rate                     NA         NA         NA
#> chivHIV positive   5.46e+00   2.78e+00   1.07e+01
#> age_enrol_years    8.76e-01   7.79e-01   9.86e-01
#> 
#> N = 590,  Events: 43,  Censored: 547
#> Total time at risk: 677963.2
#> Log-likelihood = -433.5059, df = 4
#> AIC = 875.0119

Part III

Across all the modeling frameworks applied, the Cox proportional hazards model on the Lexis dataset, the time-split Cox model, the Poisson regression under a piecewise exponential formulation, the parametric Gompertz survival model, and the flexible additive hazard model (GAMM) using R2BayesX , the results were consistent.

In each case, HIV status emerged as the strongest predictor of mortality, with HIV-positive children having a markedly higher hazard of death compared to exposed or unexposed peers. This was confirmed by hazard ratios greater than 5 in both the Cox and GAMM models. Similarly, age at enrolment consistently showed a protective effect, with older children experiencing reduced mortality risk across all models, regardless of whether age was modeled linearly (Cox, Poisson, Gompertz) or flexibly through smoothing (GAMM).

The flexible GAMM added value by allowing the hazard to vary smoothly with survival time, yet the smooth term did not reveal substantial deviations from proportionality, suggesting that the Cox model’s proportional hazards assumption was adequate. Importantly, the GAMM’s parametric coefficients aligned closely with the Cox and Gompertz estimates, reinforcing the robustness of the conclusions.

The consistency also highlights that the substantive conclusions are not sensitive to the chosen modeling strategy, although the GAMM provided additional reassurance by flexibly checking for non-linearities and time-varying effects.

Comparing the traditionalCox model with the Cox(splitting time at every event), and Cox (Lexis dataset - unsplit)

Cox-Unsplit

Cox_lexis <- coxph(Surv(tfe, tfe+lex.dur,lex.Xst == "Dead") ~ chiv+ age_enrol_years ,
eps = 10^-11,iter.max = 25, data =poisson)

Cox-Split at every time interval

Cox_timeSplit<- coxph(Surv(tfe, tfe+lex.dur, lex.Xst == "Dead")~ chiv+ age_enrol_years ,
eps = 10^-11,iter.max = 25, data =poisson.s)

rbind(ci.exp(cox_fit),
ci.exp(Cox_lexis),
ci.exp(Cox_timeSplit, subset = c("chiv", "age_enrol_years ")))
#>                       exp(Est.)     2.5%      97.5%
#> chivHIV positive      5.4359084 2.775594 10.6460459
#> chivHIV neg exposed   1.0000000 1.000000  1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000  1.0000000
#> chivUnknown           1.0000000 1.000000  1.0000000
#> age_entry             0.8772548 0.779876  0.9867927
#> chivHIV positive      5.4359084 2.775594 10.6460459
#> chivHIV neg exposed   1.0000000 1.000000  1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000  1.0000000
#> chivUnknown           1.0000000 1.000000  1.0000000
#> age_enrol_years       0.8772548 0.779876  0.9867927
#> chivHIV positive      5.4359084 2.775594 10.6460459
#> chivHIV neg exposed   1.0000000 1.000000  1.0000000
#> chivHIV neg unexposed 1.0000000 1.000000  1.0000000
#> chivUnknown           1.0000000 1.000000  1.0000000

A flexible GAMM model


gam_fit <- bayesx(died ~ chiv + age_enrol_years +
    sx(overall_survival, bs = "bl"),
  family = "cox",
  data   = paed,
  method = "REML"
)

summary(gam_fit)
#> Call:
#> bayesx(formula = died ~ chiv + age_enrol_years + sx(overall_survival, 
#>     bs = "bl"), data = paed, family = "cox", method = "REML")
#>  
#> Fixed effects estimation results:
#> 
#> Parametric coefficients:
#>                  Estimate Std. Error  t value Pr(>|t|)    
#> (Intercept)      -11.2382     0.2609 -43.0723   <2e-16 ***
#> chivHIV positive   1.6973     0.3432   4.9452   <2e-16 ***
#> age_enrol_years   -0.1315     0.0602  -2.1843   0.0293 *  
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Smooth terms:
#>                       Variance Smooth Par.        df Stopped
#> sx(overall_survival)    0.0005   1993.4700    1.0742       0
#>  
#> N = 590  df = 4.07418  AIC = 875.595  BIC = 893.441  
#> logLik = -433.7235  method = REML  family = cox

plot(gam_fit,  resid = TRUE, cex.resid = 0.1)

Model equations of the models fitted

1. Traditional Cox Proportional Hazards Model

\[ h(t \mid X) = h_0(t)\exp(\beta^\top X) \]

2. Piecewise Cox model (time split lexis data)

\[ \lambda(t \mid X) = \lambda_0^{(k)} \exp(\beta^\top X), \quad \text{for } t \in (\tau_{k-1}, \tau_k] \]

3. Traditional Poisson model

\[ \mu_i = Y_i \cdot \exp(\alpha_k + \beta^\top X_i) \]

4. Piecewise Poisson model (time split lexis data)

\[ \lambda_i = \frac{\mu_i}{Y_i} = \exp\left(\alpha_{k(i)} + \beta^\top X_i\right) \]

5. GAMM

\[ h(t \mid X) = h_0(t)\exp\left(\beta^\top X + f(t)\right) \]

6. Gompertz model

\[ h(t \mid X) = \lambda \exp(\gamma t)\exp(\beta^\top X) \]

7. Weibull

PH model

\[ h(t \mid X) = \lambda_k t^{k-1} \exp(\beta^\top X) \]

So, Who needs the Cox model anyway? I don’t 🤷‍♂️