Introduction

This report analyses the rats dataset from the survival package in R. The dataset comes from a study where rats were exposed to a carcinogenic agent and monitored for tumour development. We use parametric survival models to understand the time-to-tumour patterns and compare the treated and control groups.

The key variables we work with are:

  • time — observed survival time (days)
  • status / event — tumour occurrence indicator (1 = tumour, 0 = censored)
  • treatment — treatment group (Control vs Treated)
  • sex — sex of the rat (f = female, m = male)
  • litter — litter identifier (used to account for clustering)

Packages

library(survival)
library(flexsurv)
library(survminer)
library(ggplot2)
library(dplyr)
library(knitr)
library(kableExtra)
library(broom)
library(gridExtra)
library(patchwork)

# helper to style all our tables consistently
style_table <- function(df, caption) {
  df %>%
    kbl(caption = caption) %>%
    kable_styling(
      bootstrap_options = c("striped", "hover", "condensed"),
      full_width = FALSE
    ) %>%
    row_spec(0, bold = TRUE, background = "#2E86AB", color = "white")
}

Data Preparation

data(rats)

rats <- rats %>%
  mutate(
    litter    = as.factor(litter),
    event     = as.numeric(status),
    treatment = factor(ifelse(rx == 1, "Treated", "Control"),
                       levels = c("Control", "Treated")),
    sex       = as.factor(sex)
  )

rats_clean <- rats %>%
  select(litter, time, event, treatment, sex) %>%
  na.omit()

Missing values

data.frame(
  Variable = names(rats),
  Missing  = colSums(is.na(rats)),
  Percent  = round(100 * colSums(is.na(rats)) / nrow(rats), 1)
) %>%
  style_table("Table 1: Missing Data Check")
Table 1: Missing Data Check
Variable Missing Percent
litter litter 0 0
rx rx 0 0
time time 0 0
status status 0 0
sex sex 0 0
event event 0 0
treatment treatment 0 0

No missing values — all 300 observations are complete.


Descriptive Summary

# overall + by group
overall <- data.frame(
  Group          = "Overall",
  N              = nrow(rats_clean),
  Events         = sum(rats_clean$event),
  Censored       = sum(rats_clean$event == 0),
  Event_Rate_Pct = round(mean(rats_clean$event) * 100, 1),
  Min_Time       = min(rats_clean$time),
  Max_Time       = max(rats_clean$time),
  Mean_Time      = round(mean(rats_clean$time), 2),
  Median_Time    = median(rats_clean$time),
  SD_Time        = round(sd(rats_clean$time), 2)
)

by_group <- rats_clean %>%
  group_by(Group = treatment) %>%
  summarise(
    N              = n(),
    Events         = sum(event),
    Censored       = sum(event == 0),
    Event_Rate_Pct = round(mean(event) * 100, 1),
    Min_Time       = min(time),
    Max_Time       = max(time),
    Mean_Time      = round(mean(time), 2),
    Median_Time    = median(time),
    SD_Time        = round(sd(time), 2),
    .groups        = "drop"
  ) %>%
  mutate(Group = as.character(Group))

bind_rows(overall, by_group) %>%
  style_table("Table 2: Sample Summary by Group")
Table 2: Sample Summary by Group
Group N Events Censored Event_Rate_Pct Min_Time Max_Time Mean_Time Median_Time SD_Time
Overall 300 42 258 14.0 23 104 90.44 98.0 17.06
Control 200 21 179 10.5 32 104 90.94 99.0 16.58
Treated 100 21 79 21.0 23 104 89.45 93.5 18.01

A few things stand out immediately. The overall event rate is only 14% (42/300), meaning the majority of rats never developed a tumour during the study. The treated group has a noticeably higher event rate (~21%) compared to control (~10.5%), which makes biological sense — the treated rats received a carcinogen.

p1 <- ggplot(rats_clean, aes(x = time, fill = treatment)) +
  geom_histogram(bins = 20, alpha = 0.75, color = "white", position = "identity") +
  scale_fill_manual(values = c("Control" = "#2E86AB", "Treated" = "#F18F01")) +
  labs(title = "Distribution of Survival Times", x = "Time (days)", y = "Count", fill = "") +
  theme_minimal()

p2 <- ggplot(rats_clean, aes(x = treatment, fill = treatment)) +
  geom_bar() +
  scale_fill_manual(values = c("#2E86AB", "#F18F01")) +
  labs(title = "Treatment Group Sizes", x = "", y = "Count") +
  theme_minimal() + theme(legend.position = "none")

p3 <- ggplot(rats_clean, aes(x = sex, fill = sex)) +
  geom_bar() +
  scale_fill_manual(values = c("#A23B72", "#3B7080")) +
  labs(title = "Sex Distribution", x = "", y = "Count") +
  theme_minimal() + theme(legend.position = "none")

p4 <- ggplot(rats_clean, aes(x = treatment, y = time, fill = treatment)) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(values = c("#2E86AB", "#F18F01")) +
  labs(title = "Survival Time by Treatment", x = "", y = "Time (days)") +
  theme_minimal() + theme(legend.position = "none")

grid.arrange(p1, p2, p3, p4, ncol = 2)

The survival time distribution is left-skewed — most rats survived close to the maximum follow-up of 104 days. The 2:1 allocation (200 control, 100 treated) is by design.


Kaplan–Meier Analysis

surv_obj <- Surv(time = rats_clean$time, event = rats_clean$event)

Overall survival curve

km_overall <- survfit(surv_obj ~ 1, data = rats_clean)

ggsurvplot(
  km_overall,
  data      = rats_clean,
  conf.int  = TRUE,
  risk.table = TRUE,
  palette   = "#2E86AB",
  ggtheme   = theme_minimal(),
  title     = "Overall Kaplan–Meier Survival Curve",
  xlab      = "Time (days)",
  ylab      = "Survival Probability S(t)",
  risk.table.title = "Number at Risk"
)

Survival by treatment group

km_treatment <- survfit(surv_obj ~ treatment, data = rats_clean)

ggsurvplot(
  km_treatment,
  data          = rats_clean,
  conf.int      = TRUE,
  risk.table    = TRUE,
  pval          = TRUE,
  pval.method   = TRUE,
  palette       = c("#2E86AB", "#F18F01"),
  legend.labs   = c("Control", "Treated"),
  legend.title  = "Group",
  ggtheme       = theme_minimal(),
  title         = "Kaplan–Meier Curves by Treatment Group",
  subtitle      = "Median survival not reached — event rate < 50%",
  xlab          = "Time (days)",
  ylab          = "Survival Probability S(t)",
  surv.median.line = "none",
  risk.table.title = "Number at Risk",
  tables.theme  = theme_cleantable()
)

Survival by sex

km_sex <- survfit(surv_obj ~ sex, data = rats_clean)

ggsurvplot(
  km_sex,
  data         = rats_clean,
  conf.int     = TRUE,
  risk.table   = TRUE,
  pval         = TRUE,
  pval.method  = TRUE,
  palette      = c("#A23B72", "#3B7080"),
  legend.labs  = c("Female", "Male"),
  legend.title = "Sex",
  ggtheme      = theme_minimal(),
  title        = "Kaplan–Meier Curves by Sex",
  xlab         = "Time (days)",
  ylab         = "Survival Probability S(t)",
  risk.table.title = "Number at Risk",
  tables.theme = theme_cleantable()
)

Sex turns out to be a very strong predictor (p < 0.0001), something we carry into the Cox models below.

Why median survival is NA

Median survival is only defined when at least 50% of subjects experience the event. With only 14% overall event rate, the KM curve never drops below 0.5, so the median is mathematically undefined — this is expected, not an error. We report the Restricted Mean Survival Time (RMST) instead.

rmst_overall <- summary(km_overall)$table
rmst_groups  <- summary(km_treatment)$table

rmst_table <- data.frame(
  Group     = c("Overall", "Control", "Treated"),
  RMST_days = round(c(
    rmst_overall["rmean"],
    rmst_groups["treatment=Control", "rmean"],
    rmst_groups["treatment=Treated", "rmean"]
  ), 2),
  SE = round(c(
    rmst_overall["se(rmean)"],
    rmst_groups["treatment=Control", "se(rmean)"],
    rmst_groups["treatment=Treated", "se(rmean)"]
  ), 4)
)

style_table(rmst_table, "Table 3: Restricted Mean Survival Time by Group")
Table 3: Restricted Mean Survival Time by Group
Group RMST_days SE
Overall 99.78 0.7206
Control 100.38 0.8245
Treated 98.55 1.3952
key_times <- c(25, 50, 75, 100)
sp        <- summary(km_treatment, times = key_times)

data.frame(
  Time      = sp$time,
  Group     = gsub("treatment=", "", sp$strata),
  Survival  = round(sp$surv,  4),
  Lower_95  = round(sp$lower, 4),
  Upper_95  = round(sp$upper, 4),
  N_at_risk = sp$n.risk,
  Events    = sp$n.event
) %>%
  style_table("Table 4: Survival Probabilities at Key Time Points")
Table 4: Survival Probabilities at Key Time Points
Time Group Survival Lower_95 Upper_95 N_at_risk Events
25 Control 1.0000 1.0000 1.0000 200 0
50 Control 0.9848 0.9679 1.0000 195 3
75 Control 0.9419 0.9091 0.9759 168 8
100 Control 0.8909 0.8453 0.9390 99 8
25 Treated 1.0000 1.0000 1.0000 99 0
50 Treated 0.9697 0.9365 1.0000 95 3
75 Treated 0.9254 0.8736 0.9802 83 4
100 Treated 0.7987 0.7131 0.8946 44 9

Log-rank test

lr_test <- survdiff(surv_obj ~ treatment, data = rats_clean)
pval    <- 1 - pchisq(lr_test$chisq, df = 1)

data.frame(
  Statistic   = round(lr_test$chisq, 4),
  df          = 1,
  p_value     = round(pval, 4),
  Conclusion  = ifelse(pval < 0.05,
                       "Significant difference between groups",
                       "No significant difference")
) %>%
  style_table("Table 5: Log-Rank Test Result")
Table 5: Log-Rank Test Result
Statistic df p_value Conclusion
5.5487 1 0.0185 Significant difference between groups

Cox Proportional Hazards Models

We fit three versions of the Cox model: unadjusted (treatment only), adjusted (treatment + sex), and a clustered version that accounts for within-litter correlation using robust standard errors.

Unadjusted Cox model

cox_simple <- coxph(Surv(time, event) ~ treatment, data = rats_clean)

tidy(cox_simple, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~round(.x, 3))) %>%
  style_table("Table 6: Unadjusted Cox Model (Treatment Only)")
Table 6: Unadjusted Cox Model (Treatment Only)
term estimate std.error statistic p.value conf.low conf.high
treatmentTreated 2.042 0.309 2.311 0.021 1.115 3.739

Multivariable Cox model

cox_multi <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean)

tidy(cox_multi, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~round(.x, 3))) %>%
  style_table("Table 7: Multivariable Cox Model (Treatment + Sex)")
Table 7: Multivariable Cox Model (Treatment + Sex)
term estimate std.error statistic p.value conf.low conf.high
treatmentTreated 2.206 0.309 2.557 0.011 1.203 4.044
sexm 0.047 0.725 -4.232 0.000 0.011 0.193

After adjusting for sex, the treatment effect strengthens (HR = 2.21, p = 0.011). Sex is highly significant — male rats have dramatically lower hazard (HR ≈ 0.047, p < 0.001), suggesting a strong sex effect on tumour development.

Cox model with litter clustering

cox_cluster <- coxph(
  Surv(time, event) ~ treatment + sex + cluster(litter),
  data = rats_clean
)

tidy(cox_cluster, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~round(.x, 3))) %>%
  style_table("Table 8: Cox Model with Litter-Clustered Standard Errors")
Table 8: Cox Model with Litter-Clustered Standard Errors
term estimate std.error robust.se statistic p.value conf.low conf.high
treatmentTreated 2.206 0.309 0.293 2.702 0.007 1.243 3.915
sexm 0.047 0.725 0.721 -4.257 0.000 0.011 0.191

Accounting for litter clustering gives us robust standard errors. The treatment effect remains significant (HR = 2.21, p = 0.007), and the results are largely consistent with the unadjusted clustered SEs, confirming our earlier findings are not an artefact of within-litter dependence.

Tie-handling comparison

cox_b <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean, ties = "breslow")
cox_e <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean, ties = "efron")
cox_x <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean, ties = "exact")

data.frame(
  Method       = c("Breslow", "Efron", "Exact"),
  Treatment_HR = round(exp(c(coef(cox_b)["treatmentTreated"],
                              coef(cox_e)["treatmentTreated"],
                              coef(cox_x)["treatmentTreated"])), 3),
  Treatment_p  = round(c(summary(cox_b)$coefficients["treatmentTreated","Pr(>|z|)"],
                          summary(cox_e)$coefficients["treatmentTreated","Pr(>|z|)"],
                          summary(cox_x)$coefficients["treatmentTreated","Pr(>|z|)"]), 4)
) %>%
  style_table("Table 9: Tie-Handling Method Comparison")
Table 9: Tie-Handling Method Comparison
Method Treatment_HR Treatment_p
Breslow 2.193 0.0111
Efron 2.206 0.0106
Exact 2.203 0.0109

All three methods give nearly identical results, so the choice of tie-handling does not affect our conclusions here.

Adjusted survival curves from Cox model

newdata_cox <- data.frame(
  treatment = factor(c("Control", "Treated"), levels = c("Control", "Treated")),
  sex       = factor(c("f", "f"), levels = levels(rats_clean$sex))
)

cox_surv <- survfit(cox_multi, newdata = newdata_cox)

ggsurvplot(
  cox_surv,
  data         = newdata_cox,
  conf.int     = TRUE,
  palette      = c("#2E86AB", "#F18F01"),
  legend.labs  = c("Control", "Treated"),
  legend.title = "Group",
  ggtheme      = theme_minimal(),
  title        = "Adjusted Survival Curves from Cox Model (Female Rats)",
  xlab         = "Time (days)",
  ylab         = "Survival Probability S(t)"
)


Proportional Hazards Assumption Check

ph_test <- cox.zph(cox_multi)

as.data.frame(ph_test$table) %>%
  mutate(Variable = rownames(.), across(where(is.numeric), ~round(.x, 4))) %>%
  select(Variable, everything()) %>%
  style_table("Table 10: Schoenfeld Residuals Test (PH Assumption)")
Table 10: Schoenfeld Residuals Test (PH Assumption)
Variable chisq df p
treatment treatment 5.3927 1 0.0202
sex sex 0.6019 1 0.4379
GLOBAL GLOBAL 6.1450 2 0.0463
ph_test <- cox.zph(cox_multi)

par(mfrow = c(1, 2))

plot(ph_test, var = 1,
     main = "Schoenfeld Residuals — Treatment",
     ylab = "Beta(t)", xlab = "Time (days)", col = "#F18F01", lwd = 2)
abline(h = coef(cox_multi)[1], col = "red", lty = 2, lwd = 2)

plot(ph_test, var = 2,
     main = "Schoenfeld Residuals — Sex",
     ylab = "Beta(t)", xlab = "Time (days)", col = "#A23B72", lwd = 2)
abline(h = coef(cox_multi)[2], col = "red", lty = 2, lwd = 2)

par(mfrow = c(1, 1))

The Schoenfeld test gives p = 0.020 for treatment, suggesting the proportional hazards assumption may be violated — the hazard ratio between groups appears to change over time. This motivates our use of parametric models, which do not require the PH assumption, as the primary inferential framework for this analysis.


Weibull AFT Model

Since the PH assumption is questionable for treatment, we also fit an Accelerated Failure Time (AFT) model, which models how covariates accelerate or decelerate survival time directly.

aft_weibull <- survreg(
  Surv(time, event) ~ treatment + sex,
  data = rats_clean,
  dist = "weibull"
)

tidy(aft_weibull, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~round(.x, 3))) %>%
  style_table("Table 11: Weibull AFT Model Coefficients")
Table 11: Weibull AFT Model Coefficients
term estimate std.error statistic p.value conf.low conf.high
(Intercept) 4.971 0.080 62.124 0.000 4.815 5.128
treatmentTreated -0.212 0.086 -2.455 0.014 -0.381 -0.043
sexm 0.820 0.223 3.671 0.000 0.382 1.257
Log(scale) -1.326 0.141 -9.424 0.000 NA NA
aft_coefs <- coef(aft_weibull)
aft_coefs <- aft_coefs[!names(aft_coefs) %in% c("(Intercept)", "Log(scale)")]

data.frame(
  Variable   = names(aft_coefs),
  Time_Ratio = round(exp(aft_coefs), 3),
  Interpretation = c(
    "Treated rats develop tumours ~19% sooner than control",
    "Male rats survive ~2.3x longer tumour-free than female"
  )
) %>%
  style_table("Table 12: Time Ratios from Weibull AFT Model")
Table 12: Time Ratios from Weibull AFT Model
Variable Time_Ratio Interpretation
treatmentTreated treatmentTreated 0.809 Treated rats develop tumours ~19% sooner than control
sexm sexm 2.270 Male rats survive ~2.3x longer tumour-free than female

The time ratio for treatment (0.809) means treated rats develop tumours about 19% sooner than control rats. The time ratio for sex (2.27) tells us male rats remain tumour-free more than twice as long as females on average.


Parametric Model Fitting

We fit seven parametric distributions to find the best model for survival time.

dist_list <- c(
  "Exponential"       = "exp",
  "Weibull"           = "weibull",
  "Log-Normal"        = "lnorm",
  "Log-Logistic"      = "llogis",
  "Gamma"             = "gamma",
  "Gompertz"          = "gompertz",
  "Generalized Gamma" = "gengamma"
)

fits <- lapply(dist_list, function(d) {
  tryCatch(
    flexsurvreg(surv_obj ~ 1, data = rats_clean, dist = d),
    error = function(e) NULL
  )
})
names(fits) <- names(dist_list)
fits <- Filter(Negate(is.null), fits)

Model comparison table

model_comparison <- data.frame(
  Model         = names(fits),
  Log_Likelihood = sapply(fits, function(m) round(logLik(m)[1], 3)),
  AIC           = sapply(fits, function(m) round(AIC(m), 3)),
  BIC           = sapply(fits, function(m) round(BIC(m), 3)),
  Num_Params    = sapply(fits, function(m) length(m$coefficients))
) %>%
  arrange(AIC)

rownames(model_comparison) <- NULL

best_row <- which.min(model_comparison$AIC)

model_comparison %>%
  kbl(caption = "Table 13: Parametric Model Comparison (sorted by AIC)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  row_spec(0, bold = TRUE, background = "#2E86AB", color = "white") %>%
  row_spec(best_row, bold = TRUE, background = "#F0F8E8")
Table 13: Parametric Model Comparison (sorted by AIC)
Model Log_Likelihood AIC BIC Num_Params
Weibull -287.103 578.205 585.613 2
Log-Logistic -287.163 578.326 585.733 2
Gamma -287.186 578.372 585.779 2
Log-Normal -287.433 578.865 586.273 2
Generalized Gamma -287.066 580.131 591.243 3
Gompertz -288.326 580.651 588.059 2
Exponential -313.774 629.547 633.251 1
best_model_name <- model_comparison$Model[1]
best_model      <- fits[[best_model_name]]

cat("Best model:", best_model_name, "\n")
## Best model: Weibull
cat("AIC:", round(AIC(best_model), 3), "\n")
## AIC: 578.205
cat("BIC:", round(BIC(best_model), 3), "\n")
## BIC: 585.613

AIC comparison plot

ggplot(model_comparison, aes(x = reorder(Model, AIC), y = AIC, fill = Model == best_model_name)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("FALSE" = "#2E86AB", "TRUE" = "#F18F01")) +
  coord_flip() +
  labs(title = "AIC Comparison Across Parametric Models",
       subtitle = paste("Best model highlighted:", best_model_name),
       x = "", y = "AIC") +
  theme_minimal()

All models vs KM curve

colors   <- c("#E53935","#1E88E5","#43A047","#FB8C00","#8E24AA","#00ACC1","#6D4C41")
time_seq <- seq(0.1, max(rats_clean$time), length.out = 300)

plot(
  km_overall, col = "black", lwd = 2.5, conf.int = FALSE,
  xlab = "Time (days)", ylab = "Survival Probability S(t)",
  main = "All Parametric Models vs Kaplan–Meier",
  ylim = c(0, 1)
)

for (i in seq_along(fits)) {
  sp <- summary(fits[[i]], t = time_seq, type = "survival")[[1]]
  lines(sp$time, sp$est, col = colors[i], lwd = 1.8, lty = 2)
}

legend("bottomleft",
       legend = c("Kaplan–Meier", names(fits)),
       col    = c("black", colors[seq_along(fits)]),
       lwd    = 2, lty = c(1, rep(2, length(fits))),
       cex    = 0.75, bty = "n")

MLE parameter estimates for best model

param_table <- as.data.frame(best_model$res)
param_table$Parameter <- rownames(param_table)

all_cols  <- colnames(param_table)
est_col   <- grep("^est$",           all_cols, value = TRUE)
lower_col <- grep("l.*%|lower|L95",  all_cols, value = TRUE, ignore.case = TRUE)[1]
upper_col <- grep("u.*%|upper|U95",  all_cols, value = TRUE, ignore.case = TRUE)[1]
se_col    <- grep("^se$",            all_cols, value = TRUE)

keep <- c("Parameter", est_col, lower_col, upper_col, se_col)
keep <- keep[keep %in% colnames(param_table)]

param_table <- param_table[, keep]
colnames(param_table) <- c("Parameter","Estimate","Lower_95CI","Upper_95CI","SE")[1:ncol(param_table)]
param_table[, -1] <- round(param_table[, -1], 4)
rownames(param_table) <- NULL

style_table(param_table, paste("Table 14: MLE Parameters —", best_model_name, "Model"))
Table 14: MLE Parameters — Weibull Model
Parameter Estimate Lower_95CI Upper_95CI SE
shape 3.6675 2.7737 4.8492 0.5226
scale 160.6253 136.6374 188.8244 13.2554

Life Functions of the Best Model

Overall life functions

S_t <- summary(best_model, t = time_seq, type = "survival")[[1]]$est
h_t <- summary(best_model, t = time_seq, type = "hazard")[[1]]$est
f_t <- h_t * S_t       # f(t) = h(t) * S(t)
F_t <- 1 - S_t

# Mean and variance (Weibull)
if (best_model_name == "Weibull") {
  shape  <- best_model$res["shape", "est"]
  scale  <- best_model$res["scale", "est"]
  mean_T <- scale * gamma(1 + 1/shape)
  var_T  <- scale^2 * (gamma(1 + 2/shape) - (gamma(1 + 1/shape))^2)
}

par(mfrow = c(2, 2), mar = c(4.5, 4.5, 3, 1))

plot(time_seq, S_t, type = "l", col = "#1E88E5", lwd = 2.5,
     xlab = "Time (days)", ylab = "S(t)",
     main = paste("Survival Function —", best_model_name), ylim = c(0,1))
abline(h = 0.5, lty = 2, col = "grey60"); grid()

plot(time_seq, h_t, type = "l", col = "#E53935", lwd = 2.5,
     xlab = "Time (days)", ylab = "h(t)",
     main = paste("Hazard Function —", best_model_name))
grid()

plot(time_seq, f_t, type = "l", col = "#43A047", lwd = 2.5,
     xlab = "Time (days)", ylab = "f(t)",
     main = paste("Density Function —", best_model_name))
grid()

plot(time_seq, F_t, type = "l", col = "#8E24AA", lwd = 2.5,
     xlab = "Time (days)", ylab = "F(t)",
     main = paste("CDF —", best_model_name), ylim = c(0,1))
abline(h = 0.5, lty = 2, col = "grey60"); grid()

par(mfrow = c(1,1))
if (best_model_name == "Weibull") {
  data.frame(
    Quantity = c("Shape (k)", "Scale (λ)", "Mean Survival Time",
                 "Variance", "SD"),
    Value    = c(round(shape, 4), round(scale, 4),
                 paste(round(mean_T, 2), "days"),
                 paste(round(var_T, 2), "days²"),
                 paste(round(sqrt(var_T), 2), "days"))
  ) %>%
    style_table("Table 15: Weibull Model Life Function Summary")
}
Table 15: Weibull Model Life Function Summary
Quantity Value
Shape (k) 3.6675
Scale (λ) 160.6253
Mean Survival Time 144.89 days
Variance 1932.53 days²
SD 43.96 days
data.frame(
  Time_days   = c(10, 25, 50, 75, 100),
  S_t         = round(sapply(c(10,25,50,75,100), function(t)
                  S_t[which.min(abs(time_seq - t))]), 4),
  h_t         = round(sapply(c(10,25,50,75,100), function(t)
                  h_t[which.min(abs(time_seq - t))]), 6),
  F_t         = round(sapply(c(10,25,50,75,100), function(t)
                  F_t[which.min(abs(time_seq - t))]), 4)
) %>%
  style_table("Table 16: Life Function Values at Key Time Points")
Table 16: Life Function Values at Key Time Points
Time_days S_t h_t F_t
10 1.0000 0.000013 0.0000
25 0.9989 0.000162 0.0011
50 0.9861 0.001023 0.0139
75 0.9402 0.003011 0.0598
100 0.8396 0.006421 0.1604

The shape parameter k > 1 confirms an increasing hazard — the longer a rat survives, the higher its risk of developing a tumour. The mean tumour-free survival time is approximately 144.9 days.


Life functions by treatment group

We fit the best model separately for each group to compare the survival patterns directly.

rats_ctrl <- subset(rats_clean, treatment == "Control")
rats_trt  <- subset(rats_clean, treatment == "Treated")

best_dist <- dist_list[[best_model_name]]

fit_ctrl <- flexsurvreg(Surv(time, event) ~ 1, data = rats_ctrl, dist = best_dist)
fit_trt  <- flexsurvreg(Surv(time, event) ~ 1, data = rats_trt,  dist = best_dist)
if (best_model_name == "Weibull") {
  shape_ctrl <- fit_ctrl$res["shape","est"]
  scale_ctrl <- fit_ctrl$res["scale","est"]
  mean_ctrl  <- scale_ctrl * gamma(1 + 1/shape_ctrl)
  var_ctrl   <- scale_ctrl^2 * (gamma(1+2/shape_ctrl) - gamma(1+1/shape_ctrl)^2)

  shape_trt  <- fit_trt$res["shape","est"]
  scale_trt  <- fit_trt$res["scale","est"]
  mean_trt   <- scale_trt * gamma(1 + 1/shape_trt)
  var_trt    <- scale_trt^2 * (gamma(1+2/shape_trt) - gamma(1+1/shape_trt)^2)

  data.frame(
    Group              = c("Control", "Treated"),
    Shape_k            = round(c(shape_ctrl, shape_trt), 4),
    Scale_lambda       = round(c(scale_ctrl, scale_trt), 4),
    Mean_Survival_days = round(c(mean_ctrl, mean_trt), 2),
    Variance_days2     = round(c(var_ctrl, var_trt), 2),
    SD_days            = round(c(sqrt(var_ctrl), sqrt(var_trt)), 2)
  ) %>%
    style_table("Table 17: Weibull Parameters by Treatment Group")
}
Table 17: Weibull Parameters by Treatment Group
Group Shape_k Scale_lambda Mean_Survival_days Variance_days2 SD_days
Control 3.1597 191.3995 171.32 3532.64 59.44
Treated 4.3831 134.2006 122.28 997.09 31.58

Both groups have shape > 1, confirming increasing hazard in both. Control rats have a longer mean tumour-free survival (171.3 days) compared to treated rats (122.3 days) — a difference of 49 days, consistent with the carcinogenic treatment accelerating tumour onset.

time_seq2 <- seq(0.1, max(rats_clean$time), by = 1)

S_ctrl <- summary(fit_ctrl, t = time_seq2, type = "survival")[[1]]$est
h_ctrl <- summary(fit_ctrl, t = time_seq2, type = "hazard")[[1]]$est
f_ctrl <- h_ctrl * S_ctrl
F_ctrl <- 1 - S_ctrl

S_trt  <- summary(fit_trt, t = time_seq2, type = "survival")[[1]]$est
h_trt  <- summary(fit_trt, t = time_seq2, type = "hazard")[[1]]$est
f_trt  <- h_trt * S_trt
F_trt  <- 1 - S_trt

cols2 <- c("Control" = "#2196F3", "Treated" = "#F44336")

par(mfrow = c(2,2), mar = c(4.5, 4.5, 3.5, 1))

plot(time_seq2, S_ctrl, type="l", col=cols2[1], lwd=2.5,
     ylim=c(0,1), xlab="Time (days)", ylab="S(t)",
     main="Survival Function by Group")
lines(time_seq2, S_trt, col=cols2[2], lwd=2.5)
abline(h=0.5, lty=2, col="grey60")
legend("topright", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()

plot(time_seq2, h_ctrl, type="l", col=cols2[1], lwd=2.5,
     xlab="Time (days)", ylab="h(t)", main="Hazard Function by Group")
lines(time_seq2, h_trt, col=cols2[2], lwd=2.5)
legend("topleft", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()

plot(time_seq2, f_ctrl, type="l", col=cols2[1], lwd=2.5,
     xlab="Time (days)", ylab="f(t)", main="Density Function by Group")
lines(time_seq2, f_trt, col=cols2[2], lwd=2.5)
legend("topright", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()

plot(time_seq2, F_ctrl, type="l", col=cols2[1], lwd=2.5,
     ylim=c(0,1), xlab="Time (days)", ylab="F(t)", main="CDF by Group")
lines(time_seq2, F_trt, col=cols2[2], lwd=2.5)
abline(h=0.5, lty=2, col="grey60")
legend("bottomright", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()

par(mfrow=c(1,1))

Goodness of Fit — PP Plot

km_fit     <- survfit(surv_obj ~ 1, data = rats_clean)
km_times   <- km_fit$time
km_surv    <- km_fit$surv
param_surv <- summary(best_model, t = km_times, type = "survival")[[1]]$est

plot(
  1 - km_surv, 1 - param_surv,
  pch = 16, col = "#1E88E5", cex = 0.8,
  xlab = "Empirical CDF (KM)",
  ylab = paste("Fitted CDF (", best_model_name, ")"),
  main = paste("PP-Plot: KM vs", best_model_name),
  xlim = c(0,1), ylim = c(0,1)
)
abline(0, 1, col = "red", lwd = 2, lty = 2)
legend("bottomright", legend = "Perfect fit", col = "red", lty = 2, lwd = 2, bty = "n")
grid()

Points lying close to the 45° diagonal line indicate good agreement between the fitted Weibull model and the empirical KM estimate — the model fits the data well.


Frailty Model — Accounting for Litter Clustering

Rats within the same litter share genetic and environmental factors, making their outcomes correlated. A standard Cox model treats all observations as independent, which would underestimate standard errors. We use a shared frailty model to account for this.

frailty_model <- coxph(
  Surv(time, event) ~ treatment + frailty(litter, distribution = "gamma"),
  data = rats_clean
)

summary(frailty_model)
## Call:
## coxph(formula = Surv(time, event) ~ treatment + frailty(litter, 
##     distribution = "gamma"), data = rats_clean)
## 
##   n= 300, number of events= 42 
## 
##                           coef   se(coef) se2    Chisq DF    p    
## treatmentTreated          0.7271 0.3183   0.3126  5.22  1.00 0.022
## frailty(litter, distribut                        63.05 47.67 0.067
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## treatmentTreated     2.069     0.4833     1.109     3.861
## 
## Iterations: 6 outer, 23 Newton-Raphson
##      Variance of random effect= 2.021201   I-likelihood = -217.5 
## Degrees of freedom for terms=  1.0 47.7 
## Concordance= 0.923  (se = 0.014 )
## Likelihood ratio test= 88.54  on 48.63 df,   p=4e-04
hr    <- exp(coef(frailty_model)["treatmentTreated"])
hr_ci <- exp(confint(frailty_model))

data.frame(
  Model     = "Frailty (gamma)",
  HR        = round(hr, 4),
  Lower_95  = round(hr_ci["treatmentTreated", 1], 4),
  Upper_95  = round(hr_ci["treatmentTreated", 2], 4),
  Direction = ifelse(hr > 1, "Treated group has higher hazard", "Treated group has lower hazard")
) %>%
  style_table("Table 18: Frailty Model — Hazard Ratio for Treatment")
Table 18: Frailty Model — Hazard Ratio for Treatment
Model HR Lower_95 Upper_95 Direction
treatmentTreated Frailty (gamma) 2.069 1.1088 3.8607 Treated group has higher hazard
tryCatch({
  theta <- frailty_model$history[[1]]$theta
  cat(sprintf("Frailty variance (theta): %.4f\n", theta))
  if (theta > 0.1) {
    cat("There is meaningful within-litter clustering.\n")
    cat("The frailty model is the appropriate choice for this data.\n")
  } else {
    cat("Litter clustering appears minimal, but including frailty is still good practice.\n")
  }
}, error = function(e) {
  cat("Could not extract theta automatically. Check: frailty_model$history\n")
})
## Frailty variance (theta): 2.0204
## There is meaningful within-litter clustering.
## The frailty model is the appropriate choice for this data.

After adjusting for litter clustering, treated rats have a significantly higher hazard of tumour development compared to controls (HR ≈ 2.07, p < 0.05), which is consistent with the carcinogenic agent in the treatment.


Final Summary

data.frame(
  Quantity = c(
    "Total observations", "Events (tumours)", "Censored",
    "Overall event rate", "Log-rank p-value",
    "Best parametric model", "Best model AIC",
    if (best_model_name == "Weibull") c("Weibull shape (k)", "Weibull scale (λ)",
      "Mean survival time", "Variance") else NULL,
    "Frailty HR (Treated vs Control)"
  ),
  Value = c(
    nrow(rats_clean), sum(rats_clean$event), sum(rats_clean$event == 0),
    paste0(round(mean(rats_clean$event)*100, 1), "%"),
    round(pval, 4),
    best_model_name,
    round(model_comparison$AIC[1], 3),
    if (best_model_name == "Weibull") c(round(shape,4), round(scale,4),
      paste(round(mean_T,2),"days"), paste(round(var_T,2),"days²")) else NULL,
    round(hr, 4)
  )
) %>%
  style_table("Table 19: Final Results Summary")
Table 19: Final Results Summary
Quantity Value
Total observations 300
Events (tumours) 42
Censored 258
Overall event rate 14%
Log-rank p-value 0.0185
Best parametric model Weibull
Best model AIC 578.205
Weibull shape (k) 3.6675
Weibull scale (λ) 160.6253
Mean survival time 144.89 days
Variance 1932.53 days²
Frailty HR (Treated vs Control) 2.069

Save Plots for GitHub

if (!dir.exists("outputs")) dir.create("outputs")

# KM by treatment
png("outputs/km_by_treatment.png", width = 900, height = 700, res = 120)
print(ggsurvplot(km_treatment, data = rats_clean, conf.int = TRUE,
  risk.table = TRUE, pval = TRUE, palette = c("#2E86AB","#F18F01"),
  legend.labs = c("Control","Treated"), ggtheme = theme_minimal(),
  title = "KM Curves by Treatment", xlab = "Time (days)",
  surv.median.line = "none"))
dev.off()
## png 
##   2
# KM by sex
png("outputs/km_by_sex.png", width = 900, height = 700, res = 120)
print(ggsurvplot(km_sex, data = rats_clean, conf.int = TRUE,
  risk.table = TRUE, pval = TRUE, palette = c("#A23B72","#3B7080"),
  legend.labs = c("Female","Male"), ggtheme = theme_minimal(),
  title = "KM Curves by Sex", xlab = "Time (days)"))
dev.off()
## png 
##   2
# AIC comparison bar chart
png("outputs/aic_comparison.png", width = 900, height = 600, res = 120)
print(ggplot(model_comparison, aes(x = reorder(Model, AIC), y = AIC,
    fill = Model == best_model_name)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("FALSE" = "#2E86AB", "TRUE" = "#F18F01")) +
  coord_flip() +
  labs(title = "AIC Comparison Across Parametric Models", x = "", y = "AIC") +
  theme_minimal())
dev.off()
## png 
##   2
# Adjusted Cox survival curves
png("outputs/cox_adjusted_curves.png", width = 900, height = 700, res = 120)
print(ggsurvplot(cox_surv, data = newdata_cox, conf.int = TRUE,
  palette = c("#2E86AB","#F18F01"), legend.labs = c("Control","Treated"),
  ggtheme = theme_minimal(), title = "Adjusted Survival Curves from Cox Model",
  xlab = "Time (days)"))
dev.off()
## png 
##   2
# Schoenfeld residuals
png("outputs/schoenfeld_residuals.png", width = 900, height = 500, res = 120)
par(mfrow = c(1, 2))
plot(ph_test, var = 1, main = "Schoenfeld — Treatment",
     ylab = "Beta(t)", xlab = "Time (days)", col = "#F18F01", lwd = 2)
abline(h = coef(cox_multi)[1], col = "red", lty = 2, lwd = 2)
plot(ph_test, var = 2, main = "Schoenfeld — Sex",
     ylab = "Beta(t)", xlab = "Time (days)", col = "#A23B72", lwd = 2)
abline(h = coef(cox_multi)[2], col = "red", lty = 2, lwd = 2)
par(mfrow = c(1, 1))
dev.off()
## png 
##   2
# PP plot
png("outputs/pp_plot.png", width = 700, height = 700, res = 120)
plot(1 - km_surv, 1 - param_surv, pch = 16, col = "#1E88E5", cex = 0.8,
     xlab = "Empirical CDF (KM)",
     ylab = paste("Fitted CDF (", best_model_name, ")"),
     main = paste("PP-Plot: KM vs", best_model_name),
     xlim = c(0,1), ylim = c(0,1))
abline(0, 1, col = "red", lwd = 2, lty = 2)
legend("bottomright", legend = "Perfect fit", col = "red", lty = 2, lwd = 2, bty = "n")
grid()
dev.off()
## png 
##   2
# Overall life functions
png("outputs/life_functions_overall.png", width = 900, height = 700, res = 120)
par(mfrow = c(2,2), mar = c(4.5,4.5,3,1))
plot(time_seq, S_t, type="l", col="#1E88E5", lwd=2.5,
     xlab="Time (days)", ylab="S(t)", main="Survival Function", ylim=c(0,1)); grid()
plot(time_seq, h_t, type="l", col="#E53935", lwd=2.5,
     xlab="Time (days)", ylab="h(t)", main="Hazard Function"); grid()
plot(time_seq, f_t, type="l", col="#43A047", lwd=2.5,
     xlab="Time (days)", ylab="f(t)", main="Density Function"); grid()
plot(time_seq, F_t, type="l", col="#8E24AA", lwd=2.5,
     xlab="Time (days)", ylab="F(t)", main="CDF", ylim=c(0,1)); grid()
par(mfrow=c(1,1))
dev.off()
## png 
##   2
# Life functions by group
png("outputs/life_functions_by_group.png", width = 900, height = 700, res = 120)
par(mfrow=c(2,2), mar=c(4.5,4.5,3.5,1))
plot(time_seq2, S_ctrl, type="l", col="#2196F3", lwd=2.5,
     ylim=c(0,1), xlab="Time (days)", ylab="S(t)", main="Survival by Group")
lines(time_seq2, S_trt, col="#F44336", lwd=2.5)
legend("topright", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
plot(time_seq2, h_ctrl, type="l", col="#2196F3", lwd=2.5,
     xlab="Time (days)", ylab="h(t)", main="Hazard by Group")
lines(time_seq2, h_trt, col="#F44336", lwd=2.5)
legend("topleft", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
plot(time_seq2, f_ctrl, type="l", col="#2196F3", lwd=2.5,
     xlab="Time (days)", ylab="f(t)", main="Density by Group")
lines(time_seq2, f_trt, col="#F44336", lwd=2.5)
legend("topright", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
plot(time_seq2, F_ctrl, type="l", col="#2196F3", lwd=2.5,
     ylim=c(0,1), xlab="Time (days)", ylab="F(t)", main="CDF by Group")
lines(time_seq2, F_trt, col="#F44336", lwd=2.5)
legend("bottomright", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
par(mfrow=c(1,1))
dev.off()
## png 
##   2
cat("All 8 plots saved to outputs/\n")
## All 8 plots saved to outputs/