This report analyses the rats dataset from the
survival package in R. The dataset comes from a study where
rats were exposed to a carcinogenic agent and monitored for tumour
development. We use parametric survival models to understand the
time-to-tumour patterns and compare the treated and control groups.
The key variables we work with are:
time — observed survival time (days)status / event — tumour occurrence
indicator (1 = tumour, 0 = censored)treatment — treatment group (Control vs Treated)sex — sex of the rat (f = female, m = male)litter — litter identifier (used to account for
clustering)library(survival)
library(flexsurv)
library(survminer)
library(ggplot2)
library(dplyr)
library(knitr)
library(kableExtra)
library(broom)
library(gridExtra)
library(patchwork)
# helper to style all our tables consistently
style_table <- function(df, caption) {
df %>%
kbl(caption = caption) %>%
kable_styling(
bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE
) %>%
row_spec(0, bold = TRUE, background = "#2E86AB", color = "white")
}data(rats)
rats <- rats %>%
mutate(
litter = as.factor(litter),
event = as.numeric(status),
treatment = factor(ifelse(rx == 1, "Treated", "Control"),
levels = c("Control", "Treated")),
sex = as.factor(sex)
)
rats_clean <- rats %>%
select(litter, time, event, treatment, sex) %>%
na.omit()data.frame(
Variable = names(rats),
Missing = colSums(is.na(rats)),
Percent = round(100 * colSums(is.na(rats)) / nrow(rats), 1)
) %>%
style_table("Table 1: Missing Data Check")| Variable | Missing | Percent | |
|---|---|---|---|
| litter | litter | 0 | 0 |
| rx | rx | 0 | 0 |
| time | time | 0 | 0 |
| status | status | 0 | 0 |
| sex | sex | 0 | 0 |
| event | event | 0 | 0 |
| treatment | treatment | 0 | 0 |
No missing values — all 300 observations are complete.
# overall + by group
overall <- data.frame(
Group = "Overall",
N = nrow(rats_clean),
Events = sum(rats_clean$event),
Censored = sum(rats_clean$event == 0),
Event_Rate_Pct = round(mean(rats_clean$event) * 100, 1),
Min_Time = min(rats_clean$time),
Max_Time = max(rats_clean$time),
Mean_Time = round(mean(rats_clean$time), 2),
Median_Time = median(rats_clean$time),
SD_Time = round(sd(rats_clean$time), 2)
)
by_group <- rats_clean %>%
group_by(Group = treatment) %>%
summarise(
N = n(),
Events = sum(event),
Censored = sum(event == 0),
Event_Rate_Pct = round(mean(event) * 100, 1),
Min_Time = min(time),
Max_Time = max(time),
Mean_Time = round(mean(time), 2),
Median_Time = median(time),
SD_Time = round(sd(time), 2),
.groups = "drop"
) %>%
mutate(Group = as.character(Group))
bind_rows(overall, by_group) %>%
style_table("Table 2: Sample Summary by Group")| Group | N | Events | Censored | Event_Rate_Pct | Min_Time | Max_Time | Mean_Time | Median_Time | SD_Time |
|---|---|---|---|---|---|---|---|---|---|
| Overall | 300 | 42 | 258 | 14.0 | 23 | 104 | 90.44 | 98.0 | 17.06 |
| Control | 200 | 21 | 179 | 10.5 | 32 | 104 | 90.94 | 99.0 | 16.58 |
| Treated | 100 | 21 | 79 | 21.0 | 23 | 104 | 89.45 | 93.5 | 18.01 |
A few things stand out immediately. The overall event rate is only 14% (42/300), meaning the majority of rats never developed a tumour during the study. The treated group has a noticeably higher event rate (~21%) compared to control (~10.5%), which makes biological sense — the treated rats received a carcinogen.
p1 <- ggplot(rats_clean, aes(x = time, fill = treatment)) +
geom_histogram(bins = 20, alpha = 0.75, color = "white", position = "identity") +
scale_fill_manual(values = c("Control" = "#2E86AB", "Treated" = "#F18F01")) +
labs(title = "Distribution of Survival Times", x = "Time (days)", y = "Count", fill = "") +
theme_minimal()
p2 <- ggplot(rats_clean, aes(x = treatment, fill = treatment)) +
geom_bar() +
scale_fill_manual(values = c("#2E86AB", "#F18F01")) +
labs(title = "Treatment Group Sizes", x = "", y = "Count") +
theme_minimal() + theme(legend.position = "none")
p3 <- ggplot(rats_clean, aes(x = sex, fill = sex)) +
geom_bar() +
scale_fill_manual(values = c("#A23B72", "#3B7080")) +
labs(title = "Sex Distribution", x = "", y = "Count") +
theme_minimal() + theme(legend.position = "none")
p4 <- ggplot(rats_clean, aes(x = treatment, y = time, fill = treatment)) +
geom_boxplot(alpha = 0.8) +
scale_fill_manual(values = c("#2E86AB", "#F18F01")) +
labs(title = "Survival Time by Treatment", x = "", y = "Time (days)") +
theme_minimal() + theme(legend.position = "none")
grid.arrange(p1, p2, p3, p4, ncol = 2)The survival time distribution is left-skewed — most rats survived close to the maximum follow-up of 104 days. The 2:1 allocation (200 control, 100 treated) is by design.
km_overall <- survfit(surv_obj ~ 1, data = rats_clean)
ggsurvplot(
km_overall,
data = rats_clean,
conf.int = TRUE,
risk.table = TRUE,
palette = "#2E86AB",
ggtheme = theme_minimal(),
title = "Overall Kaplan–Meier Survival Curve",
xlab = "Time (days)",
ylab = "Survival Probability S(t)",
risk.table.title = "Number at Risk"
)km_treatment <- survfit(surv_obj ~ treatment, data = rats_clean)
ggsurvplot(
km_treatment,
data = rats_clean,
conf.int = TRUE,
risk.table = TRUE,
pval = TRUE,
pval.method = TRUE,
palette = c("#2E86AB", "#F18F01"),
legend.labs = c("Control", "Treated"),
legend.title = "Group",
ggtheme = theme_minimal(),
title = "Kaplan–Meier Curves by Treatment Group",
subtitle = "Median survival not reached — event rate < 50%",
xlab = "Time (days)",
ylab = "Survival Probability S(t)",
surv.median.line = "none",
risk.table.title = "Number at Risk",
tables.theme = theme_cleantable()
)km_sex <- survfit(surv_obj ~ sex, data = rats_clean)
ggsurvplot(
km_sex,
data = rats_clean,
conf.int = TRUE,
risk.table = TRUE,
pval = TRUE,
pval.method = TRUE,
palette = c("#A23B72", "#3B7080"),
legend.labs = c("Female", "Male"),
legend.title = "Sex",
ggtheme = theme_minimal(),
title = "Kaplan–Meier Curves by Sex",
xlab = "Time (days)",
ylab = "Survival Probability S(t)",
risk.table.title = "Number at Risk",
tables.theme = theme_cleantable()
)Sex turns out to be a very strong predictor (p < 0.0001), something we carry into the Cox models below.
Median survival is only defined when at least 50% of subjects experience the event. With only 14% overall event rate, the KM curve never drops below 0.5, so the median is mathematically undefined — this is expected, not an error. We report the Restricted Mean Survival Time (RMST) instead.
rmst_overall <- summary(km_overall)$table
rmst_groups <- summary(km_treatment)$table
rmst_table <- data.frame(
Group = c("Overall", "Control", "Treated"),
RMST_days = round(c(
rmst_overall["rmean"],
rmst_groups["treatment=Control", "rmean"],
rmst_groups["treatment=Treated", "rmean"]
), 2),
SE = round(c(
rmst_overall["se(rmean)"],
rmst_groups["treatment=Control", "se(rmean)"],
rmst_groups["treatment=Treated", "se(rmean)"]
), 4)
)
style_table(rmst_table, "Table 3: Restricted Mean Survival Time by Group")| Group | RMST_days | SE |
|---|---|---|
| Overall | 99.78 | 0.7206 |
| Control | 100.38 | 0.8245 |
| Treated | 98.55 | 1.3952 |
key_times <- c(25, 50, 75, 100)
sp <- summary(km_treatment, times = key_times)
data.frame(
Time = sp$time,
Group = gsub("treatment=", "", sp$strata),
Survival = round(sp$surv, 4),
Lower_95 = round(sp$lower, 4),
Upper_95 = round(sp$upper, 4),
N_at_risk = sp$n.risk,
Events = sp$n.event
) %>%
style_table("Table 4: Survival Probabilities at Key Time Points")| Time | Group | Survival | Lower_95 | Upper_95 | N_at_risk | Events |
|---|---|---|---|---|---|---|
| 25 | Control | 1.0000 | 1.0000 | 1.0000 | 200 | 0 |
| 50 | Control | 0.9848 | 0.9679 | 1.0000 | 195 | 3 |
| 75 | Control | 0.9419 | 0.9091 | 0.9759 | 168 | 8 |
| 100 | Control | 0.8909 | 0.8453 | 0.9390 | 99 | 8 |
| 25 | Treated | 1.0000 | 1.0000 | 1.0000 | 99 | 0 |
| 50 | Treated | 0.9697 | 0.9365 | 1.0000 | 95 | 3 |
| 75 | Treated | 0.9254 | 0.8736 | 0.9802 | 83 | 4 |
| 100 | Treated | 0.7987 | 0.7131 | 0.8946 | 44 | 9 |
lr_test <- survdiff(surv_obj ~ treatment, data = rats_clean)
pval <- 1 - pchisq(lr_test$chisq, df = 1)
data.frame(
Statistic = round(lr_test$chisq, 4),
df = 1,
p_value = round(pval, 4),
Conclusion = ifelse(pval < 0.05,
"Significant difference between groups",
"No significant difference")
) %>%
style_table("Table 5: Log-Rank Test Result")| Statistic | df | p_value | Conclusion |
|---|---|---|---|
| 5.5487 | 1 | 0.0185 | Significant difference between groups |
We fit three versions of the Cox model: unadjusted (treatment only), adjusted (treatment + sex), and a clustered version that accounts for within-litter correlation using robust standard errors.
cox_simple <- coxph(Surv(time, event) ~ treatment, data = rats_clean)
tidy(cox_simple, exponentiate = TRUE, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~round(.x, 3))) %>%
style_table("Table 6: Unadjusted Cox Model (Treatment Only)")| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| treatmentTreated | 2.042 | 0.309 | 2.311 | 0.021 | 1.115 | 3.739 |
cox_multi <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean)
tidy(cox_multi, exponentiate = TRUE, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~round(.x, 3))) %>%
style_table("Table 7: Multivariable Cox Model (Treatment + Sex)")| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| treatmentTreated | 2.206 | 0.309 | 2.557 | 0.011 | 1.203 | 4.044 |
| sexm | 0.047 | 0.725 | -4.232 | 0.000 | 0.011 | 0.193 |
After adjusting for sex, the treatment effect strengthens (HR = 2.21, p = 0.011). Sex is highly significant — male rats have dramatically lower hazard (HR ≈ 0.047, p < 0.001), suggesting a strong sex effect on tumour development.
cox_cluster <- coxph(
Surv(time, event) ~ treatment + sex + cluster(litter),
data = rats_clean
)
tidy(cox_cluster, exponentiate = TRUE, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~round(.x, 3))) %>%
style_table("Table 8: Cox Model with Litter-Clustered Standard Errors")| term | estimate | std.error | robust.se | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|---|
| treatmentTreated | 2.206 | 0.309 | 0.293 | 2.702 | 0.007 | 1.243 | 3.915 |
| sexm | 0.047 | 0.725 | 0.721 | -4.257 | 0.000 | 0.011 | 0.191 |
Accounting for litter clustering gives us robust standard errors. The treatment effect remains significant (HR = 2.21, p = 0.007), and the results are largely consistent with the unadjusted clustered SEs, confirming our earlier findings are not an artefact of within-litter dependence.
cox_b <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean, ties = "breslow")
cox_e <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean, ties = "efron")
cox_x <- coxph(Surv(time, event) ~ treatment + sex, data = rats_clean, ties = "exact")
data.frame(
Method = c("Breslow", "Efron", "Exact"),
Treatment_HR = round(exp(c(coef(cox_b)["treatmentTreated"],
coef(cox_e)["treatmentTreated"],
coef(cox_x)["treatmentTreated"])), 3),
Treatment_p = round(c(summary(cox_b)$coefficients["treatmentTreated","Pr(>|z|)"],
summary(cox_e)$coefficients["treatmentTreated","Pr(>|z|)"],
summary(cox_x)$coefficients["treatmentTreated","Pr(>|z|)"]), 4)
) %>%
style_table("Table 9: Tie-Handling Method Comparison")| Method | Treatment_HR | Treatment_p |
|---|---|---|
| Breslow | 2.193 | 0.0111 |
| Efron | 2.206 | 0.0106 |
| Exact | 2.203 | 0.0109 |
All three methods give nearly identical results, so the choice of tie-handling does not affect our conclusions here.
newdata_cox <- data.frame(
treatment = factor(c("Control", "Treated"), levels = c("Control", "Treated")),
sex = factor(c("f", "f"), levels = levels(rats_clean$sex))
)
cox_surv <- survfit(cox_multi, newdata = newdata_cox)
ggsurvplot(
cox_surv,
data = newdata_cox,
conf.int = TRUE,
palette = c("#2E86AB", "#F18F01"),
legend.labs = c("Control", "Treated"),
legend.title = "Group",
ggtheme = theme_minimal(),
title = "Adjusted Survival Curves from Cox Model (Female Rats)",
xlab = "Time (days)",
ylab = "Survival Probability S(t)"
)ph_test <- cox.zph(cox_multi)
as.data.frame(ph_test$table) %>%
mutate(Variable = rownames(.), across(where(is.numeric), ~round(.x, 4))) %>%
select(Variable, everything()) %>%
style_table("Table 10: Schoenfeld Residuals Test (PH Assumption)")| Variable | chisq | df | p | |
|---|---|---|---|---|
| treatment | treatment | 5.3927 | 1 | 0.0202 |
| sex | sex | 0.6019 | 1 | 0.4379 |
| GLOBAL | GLOBAL | 6.1450 | 2 | 0.0463 |
ph_test <- cox.zph(cox_multi)
par(mfrow = c(1, 2))
plot(ph_test, var = 1,
main = "Schoenfeld Residuals — Treatment",
ylab = "Beta(t)", xlab = "Time (days)", col = "#F18F01", lwd = 2)
abline(h = coef(cox_multi)[1], col = "red", lty = 2, lwd = 2)
plot(ph_test, var = 2,
main = "Schoenfeld Residuals — Sex",
ylab = "Beta(t)", xlab = "Time (days)", col = "#A23B72", lwd = 2)
abline(h = coef(cox_multi)[2], col = "red", lty = 2, lwd = 2)The Schoenfeld test gives p = 0.020 for treatment, suggesting the proportional hazards assumption may be violated — the hazard ratio between groups appears to change over time. This motivates our use of parametric models, which do not require the PH assumption, as the primary inferential framework for this analysis.
Since the PH assumption is questionable for treatment, we also fit an Accelerated Failure Time (AFT) model, which models how covariates accelerate or decelerate survival time directly.
aft_weibull <- survreg(
Surv(time, event) ~ treatment + sex,
data = rats_clean,
dist = "weibull"
)
tidy(aft_weibull, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~round(.x, 3))) %>%
style_table("Table 11: Weibull AFT Model Coefficients")| term | estimate | std.error | statistic | p.value | conf.low | conf.high |
|---|---|---|---|---|---|---|
| (Intercept) | 4.971 | 0.080 | 62.124 | 0.000 | 4.815 | 5.128 |
| treatmentTreated | -0.212 | 0.086 | -2.455 | 0.014 | -0.381 | -0.043 |
| sexm | 0.820 | 0.223 | 3.671 | 0.000 | 0.382 | 1.257 |
| Log(scale) | -1.326 | 0.141 | -9.424 | 0.000 | NA | NA |
aft_coefs <- coef(aft_weibull)
aft_coefs <- aft_coefs[!names(aft_coefs) %in% c("(Intercept)", "Log(scale)")]
data.frame(
Variable = names(aft_coefs),
Time_Ratio = round(exp(aft_coefs), 3),
Interpretation = c(
"Treated rats develop tumours ~19% sooner than control",
"Male rats survive ~2.3x longer tumour-free than female"
)
) %>%
style_table("Table 12: Time Ratios from Weibull AFT Model")| Variable | Time_Ratio | Interpretation | |
|---|---|---|---|
| treatmentTreated | treatmentTreated | 0.809 | Treated rats develop tumours ~19% sooner than control |
| sexm | sexm | 2.270 | Male rats survive ~2.3x longer tumour-free than female |
The time ratio for treatment (0.809) means treated rats develop tumours about 19% sooner than control rats. The time ratio for sex (2.27) tells us male rats remain tumour-free more than twice as long as females on average.
We fit seven parametric distributions to find the best model for survival time.
dist_list <- c(
"Exponential" = "exp",
"Weibull" = "weibull",
"Log-Normal" = "lnorm",
"Log-Logistic" = "llogis",
"Gamma" = "gamma",
"Gompertz" = "gompertz",
"Generalized Gamma" = "gengamma"
)
fits <- lapply(dist_list, function(d) {
tryCatch(
flexsurvreg(surv_obj ~ 1, data = rats_clean, dist = d),
error = function(e) NULL
)
})
names(fits) <- names(dist_list)
fits <- Filter(Negate(is.null), fits)model_comparison <- data.frame(
Model = names(fits),
Log_Likelihood = sapply(fits, function(m) round(logLik(m)[1], 3)),
AIC = sapply(fits, function(m) round(AIC(m), 3)),
BIC = sapply(fits, function(m) round(BIC(m), 3)),
Num_Params = sapply(fits, function(m) length(m$coefficients))
) %>%
arrange(AIC)
rownames(model_comparison) <- NULL
best_row <- which.min(model_comparison$AIC)
model_comparison %>%
kbl(caption = "Table 13: Parametric Model Comparison (sorted by AIC)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
row_spec(0, bold = TRUE, background = "#2E86AB", color = "white") %>%
row_spec(best_row, bold = TRUE, background = "#F0F8E8")| Model | Log_Likelihood | AIC | BIC | Num_Params |
|---|---|---|---|---|
| Weibull | -287.103 | 578.205 | 585.613 | 2 |
| Log-Logistic | -287.163 | 578.326 | 585.733 | 2 |
| Gamma | -287.186 | 578.372 | 585.779 | 2 |
| Log-Normal | -287.433 | 578.865 | 586.273 | 2 |
| Generalized Gamma | -287.066 | 580.131 | 591.243 | 3 |
| Gompertz | -288.326 | 580.651 | 588.059 | 2 |
| Exponential | -313.774 | 629.547 | 633.251 | 1 |
best_model_name <- model_comparison$Model[1]
best_model <- fits[[best_model_name]]
cat("Best model:", best_model_name, "\n")## Best model: Weibull
## AIC: 578.205
## BIC: 585.613
ggplot(model_comparison, aes(x = reorder(Model, AIC), y = AIC, fill = Model == best_model_name)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("FALSE" = "#2E86AB", "TRUE" = "#F18F01")) +
coord_flip() +
labs(title = "AIC Comparison Across Parametric Models",
subtitle = paste("Best model highlighted:", best_model_name),
x = "", y = "AIC") +
theme_minimal()colors <- c("#E53935","#1E88E5","#43A047","#FB8C00","#8E24AA","#00ACC1","#6D4C41")
time_seq <- seq(0.1, max(rats_clean$time), length.out = 300)
plot(
km_overall, col = "black", lwd = 2.5, conf.int = FALSE,
xlab = "Time (days)", ylab = "Survival Probability S(t)",
main = "All Parametric Models vs Kaplan–Meier",
ylim = c(0, 1)
)
for (i in seq_along(fits)) {
sp <- summary(fits[[i]], t = time_seq, type = "survival")[[1]]
lines(sp$time, sp$est, col = colors[i], lwd = 1.8, lty = 2)
}
legend("bottomleft",
legend = c("Kaplan–Meier", names(fits)),
col = c("black", colors[seq_along(fits)]),
lwd = 2, lty = c(1, rep(2, length(fits))),
cex = 0.75, bty = "n")param_table <- as.data.frame(best_model$res)
param_table$Parameter <- rownames(param_table)
all_cols <- colnames(param_table)
est_col <- grep("^est$", all_cols, value = TRUE)
lower_col <- grep("l.*%|lower|L95", all_cols, value = TRUE, ignore.case = TRUE)[1]
upper_col <- grep("u.*%|upper|U95", all_cols, value = TRUE, ignore.case = TRUE)[1]
se_col <- grep("^se$", all_cols, value = TRUE)
keep <- c("Parameter", est_col, lower_col, upper_col, se_col)
keep <- keep[keep %in% colnames(param_table)]
param_table <- param_table[, keep]
colnames(param_table) <- c("Parameter","Estimate","Lower_95CI","Upper_95CI","SE")[1:ncol(param_table)]
param_table[, -1] <- round(param_table[, -1], 4)
rownames(param_table) <- NULL
style_table(param_table, paste("Table 14: MLE Parameters —", best_model_name, "Model"))| Parameter | Estimate | Lower_95CI | Upper_95CI | SE |
|---|---|---|---|---|
| shape | 3.6675 | 2.7737 | 4.8492 | 0.5226 |
| scale | 160.6253 | 136.6374 | 188.8244 | 13.2554 |
S_t <- summary(best_model, t = time_seq, type = "survival")[[1]]$est
h_t <- summary(best_model, t = time_seq, type = "hazard")[[1]]$est
f_t <- h_t * S_t # f(t) = h(t) * S(t)
F_t <- 1 - S_t
# Mean and variance (Weibull)
if (best_model_name == "Weibull") {
shape <- best_model$res["shape", "est"]
scale <- best_model$res["scale", "est"]
mean_T <- scale * gamma(1 + 1/shape)
var_T <- scale^2 * (gamma(1 + 2/shape) - (gamma(1 + 1/shape))^2)
}
par(mfrow = c(2, 2), mar = c(4.5, 4.5, 3, 1))
plot(time_seq, S_t, type = "l", col = "#1E88E5", lwd = 2.5,
xlab = "Time (days)", ylab = "S(t)",
main = paste("Survival Function —", best_model_name), ylim = c(0,1))
abline(h = 0.5, lty = 2, col = "grey60"); grid()
plot(time_seq, h_t, type = "l", col = "#E53935", lwd = 2.5,
xlab = "Time (days)", ylab = "h(t)",
main = paste("Hazard Function —", best_model_name))
grid()
plot(time_seq, f_t, type = "l", col = "#43A047", lwd = 2.5,
xlab = "Time (days)", ylab = "f(t)",
main = paste("Density Function —", best_model_name))
grid()
plot(time_seq, F_t, type = "l", col = "#8E24AA", lwd = 2.5,
xlab = "Time (days)", ylab = "F(t)",
main = paste("CDF —", best_model_name), ylim = c(0,1))
abline(h = 0.5, lty = 2, col = "grey60"); grid()if (best_model_name == "Weibull") {
data.frame(
Quantity = c("Shape (k)", "Scale (λ)", "Mean Survival Time",
"Variance", "SD"),
Value = c(round(shape, 4), round(scale, 4),
paste(round(mean_T, 2), "days"),
paste(round(var_T, 2), "days²"),
paste(round(sqrt(var_T), 2), "days"))
) %>%
style_table("Table 15: Weibull Model Life Function Summary")
}| Quantity | Value |
|---|---|
| Shape (k) | 3.6675 |
| Scale (λ) | 160.6253 |
| Mean Survival Time | 144.89 days |
| Variance | 1932.53 days² |
| SD | 43.96 days |
data.frame(
Time_days = c(10, 25, 50, 75, 100),
S_t = round(sapply(c(10,25,50,75,100), function(t)
S_t[which.min(abs(time_seq - t))]), 4),
h_t = round(sapply(c(10,25,50,75,100), function(t)
h_t[which.min(abs(time_seq - t))]), 6),
F_t = round(sapply(c(10,25,50,75,100), function(t)
F_t[which.min(abs(time_seq - t))]), 4)
) %>%
style_table("Table 16: Life Function Values at Key Time Points")| Time_days | S_t | h_t | F_t |
|---|---|---|---|
| 10 | 1.0000 | 0.000013 | 0.0000 |
| 25 | 0.9989 | 0.000162 | 0.0011 |
| 50 | 0.9861 | 0.001023 | 0.0139 |
| 75 | 0.9402 | 0.003011 | 0.0598 |
| 100 | 0.8396 | 0.006421 | 0.1604 |
The shape parameter k > 1 confirms an increasing hazard — the longer a rat survives, the higher its risk of developing a tumour. The mean tumour-free survival time is approximately 144.9 days.
We fit the best model separately for each group to compare the survival patterns directly.
rats_ctrl <- subset(rats_clean, treatment == "Control")
rats_trt <- subset(rats_clean, treatment == "Treated")
best_dist <- dist_list[[best_model_name]]
fit_ctrl <- flexsurvreg(Surv(time, event) ~ 1, data = rats_ctrl, dist = best_dist)
fit_trt <- flexsurvreg(Surv(time, event) ~ 1, data = rats_trt, dist = best_dist)if (best_model_name == "Weibull") {
shape_ctrl <- fit_ctrl$res["shape","est"]
scale_ctrl <- fit_ctrl$res["scale","est"]
mean_ctrl <- scale_ctrl * gamma(1 + 1/shape_ctrl)
var_ctrl <- scale_ctrl^2 * (gamma(1+2/shape_ctrl) - gamma(1+1/shape_ctrl)^2)
shape_trt <- fit_trt$res["shape","est"]
scale_trt <- fit_trt$res["scale","est"]
mean_trt <- scale_trt * gamma(1 + 1/shape_trt)
var_trt <- scale_trt^2 * (gamma(1+2/shape_trt) - gamma(1+1/shape_trt)^2)
data.frame(
Group = c("Control", "Treated"),
Shape_k = round(c(shape_ctrl, shape_trt), 4),
Scale_lambda = round(c(scale_ctrl, scale_trt), 4),
Mean_Survival_days = round(c(mean_ctrl, mean_trt), 2),
Variance_days2 = round(c(var_ctrl, var_trt), 2),
SD_days = round(c(sqrt(var_ctrl), sqrt(var_trt)), 2)
) %>%
style_table("Table 17: Weibull Parameters by Treatment Group")
}| Group | Shape_k | Scale_lambda | Mean_Survival_days | Variance_days2 | SD_days |
|---|---|---|---|---|---|
| Control | 3.1597 | 191.3995 | 171.32 | 3532.64 | 59.44 |
| Treated | 4.3831 | 134.2006 | 122.28 | 997.09 | 31.58 |
Both groups have shape > 1, confirming increasing hazard in both. Control rats have a longer mean tumour-free survival (171.3 days) compared to treated rats (122.3 days) — a difference of 49 days, consistent with the carcinogenic treatment accelerating tumour onset.
time_seq2 <- seq(0.1, max(rats_clean$time), by = 1)
S_ctrl <- summary(fit_ctrl, t = time_seq2, type = "survival")[[1]]$est
h_ctrl <- summary(fit_ctrl, t = time_seq2, type = "hazard")[[1]]$est
f_ctrl <- h_ctrl * S_ctrl
F_ctrl <- 1 - S_ctrl
S_trt <- summary(fit_trt, t = time_seq2, type = "survival")[[1]]$est
h_trt <- summary(fit_trt, t = time_seq2, type = "hazard")[[1]]$est
f_trt <- h_trt * S_trt
F_trt <- 1 - S_trt
cols2 <- c("Control" = "#2196F3", "Treated" = "#F44336")
par(mfrow = c(2,2), mar = c(4.5, 4.5, 3.5, 1))
plot(time_seq2, S_ctrl, type="l", col=cols2[1], lwd=2.5,
ylim=c(0,1), xlab="Time (days)", ylab="S(t)",
main="Survival Function by Group")
lines(time_seq2, S_trt, col=cols2[2], lwd=2.5)
abline(h=0.5, lty=2, col="grey60")
legend("topright", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()
plot(time_seq2, h_ctrl, type="l", col=cols2[1], lwd=2.5,
xlab="Time (days)", ylab="h(t)", main="Hazard Function by Group")
lines(time_seq2, h_trt, col=cols2[2], lwd=2.5)
legend("topleft", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()
plot(time_seq2, f_ctrl, type="l", col=cols2[1], lwd=2.5,
xlab="Time (days)", ylab="f(t)", main="Density Function by Group")
lines(time_seq2, f_trt, col=cols2[2], lwd=2.5)
legend("topright", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()
plot(time_seq2, F_ctrl, type="l", col=cols2[1], lwd=2.5,
ylim=c(0,1), xlab="Time (days)", ylab="F(t)", main="CDF by Group")
lines(time_seq2, F_trt, col=cols2[2], lwd=2.5)
abline(h=0.5, lty=2, col="grey60")
legend("bottomright", legend=names(cols2), col=cols2, lwd=2, bty="n"); grid()km_fit <- survfit(surv_obj ~ 1, data = rats_clean)
km_times <- km_fit$time
km_surv <- km_fit$surv
param_surv <- summary(best_model, t = km_times, type = "survival")[[1]]$est
plot(
1 - km_surv, 1 - param_surv,
pch = 16, col = "#1E88E5", cex = 0.8,
xlab = "Empirical CDF (KM)",
ylab = paste("Fitted CDF (", best_model_name, ")"),
main = paste("PP-Plot: KM vs", best_model_name),
xlim = c(0,1), ylim = c(0,1)
)
abline(0, 1, col = "red", lwd = 2, lty = 2)
legend("bottomright", legend = "Perfect fit", col = "red", lty = 2, lwd = 2, bty = "n")
grid()Points lying close to the 45° diagonal line indicate good agreement between the fitted Weibull model and the empirical KM estimate — the model fits the data well.
Rats within the same litter share genetic and environmental factors, making their outcomes correlated. A standard Cox model treats all observations as independent, which would underestimate standard errors. We use a shared frailty model to account for this.
frailty_model <- coxph(
Surv(time, event) ~ treatment + frailty(litter, distribution = "gamma"),
data = rats_clean
)
summary(frailty_model)## Call:
## coxph(formula = Surv(time, event) ~ treatment + frailty(litter,
## distribution = "gamma"), data = rats_clean)
##
## n= 300, number of events= 42
##
## coef se(coef) se2 Chisq DF p
## treatmentTreated 0.7271 0.3183 0.3126 5.22 1.00 0.022
## frailty(litter, distribut 63.05 47.67 0.067
##
## exp(coef) exp(-coef) lower .95 upper .95
## treatmentTreated 2.069 0.4833 1.109 3.861
##
## Iterations: 6 outer, 23 Newton-Raphson
## Variance of random effect= 2.021201 I-likelihood = -217.5
## Degrees of freedom for terms= 1.0 47.7
## Concordance= 0.923 (se = 0.014 )
## Likelihood ratio test= 88.54 on 48.63 df, p=4e-04
hr <- exp(coef(frailty_model)["treatmentTreated"])
hr_ci <- exp(confint(frailty_model))
data.frame(
Model = "Frailty (gamma)",
HR = round(hr, 4),
Lower_95 = round(hr_ci["treatmentTreated", 1], 4),
Upper_95 = round(hr_ci["treatmentTreated", 2], 4),
Direction = ifelse(hr > 1, "Treated group has higher hazard", "Treated group has lower hazard")
) %>%
style_table("Table 18: Frailty Model — Hazard Ratio for Treatment")| Model | HR | Lower_95 | Upper_95 | Direction | |
|---|---|---|---|---|---|
| treatmentTreated | Frailty (gamma) | 2.069 | 1.1088 | 3.8607 | Treated group has higher hazard |
tryCatch({
theta <- frailty_model$history[[1]]$theta
cat(sprintf("Frailty variance (theta): %.4f\n", theta))
if (theta > 0.1) {
cat("There is meaningful within-litter clustering.\n")
cat("The frailty model is the appropriate choice for this data.\n")
} else {
cat("Litter clustering appears minimal, but including frailty is still good practice.\n")
}
}, error = function(e) {
cat("Could not extract theta automatically. Check: frailty_model$history\n")
})## Frailty variance (theta): 2.0204
## There is meaningful within-litter clustering.
## The frailty model is the appropriate choice for this data.
After adjusting for litter clustering, treated rats have a significantly higher hazard of tumour development compared to controls (HR ≈ 2.07, p < 0.05), which is consistent with the carcinogenic agent in the treatment.
data.frame(
Quantity = c(
"Total observations", "Events (tumours)", "Censored",
"Overall event rate", "Log-rank p-value",
"Best parametric model", "Best model AIC",
if (best_model_name == "Weibull") c("Weibull shape (k)", "Weibull scale (λ)",
"Mean survival time", "Variance") else NULL,
"Frailty HR (Treated vs Control)"
),
Value = c(
nrow(rats_clean), sum(rats_clean$event), sum(rats_clean$event == 0),
paste0(round(mean(rats_clean$event)*100, 1), "%"),
round(pval, 4),
best_model_name,
round(model_comparison$AIC[1], 3),
if (best_model_name == "Weibull") c(round(shape,4), round(scale,4),
paste(round(mean_T,2),"days"), paste(round(var_T,2),"days²")) else NULL,
round(hr, 4)
)
) %>%
style_table("Table 19: Final Results Summary")| Quantity | Value |
|---|---|
| Total observations | 300 |
| Events (tumours) | 42 |
| Censored | 258 |
| Overall event rate | 14% |
| Log-rank p-value | 0.0185 |
| Best parametric model | Weibull |
| Best model AIC | 578.205 |
| Weibull shape (k) | 3.6675 |
| Weibull scale (λ) | 160.6253 |
| Mean survival time | 144.89 days |
| Variance | 1932.53 days² |
| Frailty HR (Treated vs Control) | 2.069 |
if (!dir.exists("outputs")) dir.create("outputs")
# KM by treatment
png("outputs/km_by_treatment.png", width = 900, height = 700, res = 120)
print(ggsurvplot(km_treatment, data = rats_clean, conf.int = TRUE,
risk.table = TRUE, pval = TRUE, palette = c("#2E86AB","#F18F01"),
legend.labs = c("Control","Treated"), ggtheme = theme_minimal(),
title = "KM Curves by Treatment", xlab = "Time (days)",
surv.median.line = "none"))
dev.off()## png
## 2
# KM by sex
png("outputs/km_by_sex.png", width = 900, height = 700, res = 120)
print(ggsurvplot(km_sex, data = rats_clean, conf.int = TRUE,
risk.table = TRUE, pval = TRUE, palette = c("#A23B72","#3B7080"),
legend.labs = c("Female","Male"), ggtheme = theme_minimal(),
title = "KM Curves by Sex", xlab = "Time (days)"))
dev.off()## png
## 2
# AIC comparison bar chart
png("outputs/aic_comparison.png", width = 900, height = 600, res = 120)
print(ggplot(model_comparison, aes(x = reorder(Model, AIC), y = AIC,
fill = Model == best_model_name)) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values = c("FALSE" = "#2E86AB", "TRUE" = "#F18F01")) +
coord_flip() +
labs(title = "AIC Comparison Across Parametric Models", x = "", y = "AIC") +
theme_minimal())
dev.off()## png
## 2
# Adjusted Cox survival curves
png("outputs/cox_adjusted_curves.png", width = 900, height = 700, res = 120)
print(ggsurvplot(cox_surv, data = newdata_cox, conf.int = TRUE,
palette = c("#2E86AB","#F18F01"), legend.labs = c("Control","Treated"),
ggtheme = theme_minimal(), title = "Adjusted Survival Curves from Cox Model",
xlab = "Time (days)"))
dev.off()## png
## 2
# Schoenfeld residuals
png("outputs/schoenfeld_residuals.png", width = 900, height = 500, res = 120)
par(mfrow = c(1, 2))
plot(ph_test, var = 1, main = "Schoenfeld — Treatment",
ylab = "Beta(t)", xlab = "Time (days)", col = "#F18F01", lwd = 2)
abline(h = coef(cox_multi)[1], col = "red", lty = 2, lwd = 2)
plot(ph_test, var = 2, main = "Schoenfeld — Sex",
ylab = "Beta(t)", xlab = "Time (days)", col = "#A23B72", lwd = 2)
abline(h = coef(cox_multi)[2], col = "red", lty = 2, lwd = 2)
par(mfrow = c(1, 1))
dev.off()## png
## 2
# PP plot
png("outputs/pp_plot.png", width = 700, height = 700, res = 120)
plot(1 - km_surv, 1 - param_surv, pch = 16, col = "#1E88E5", cex = 0.8,
xlab = "Empirical CDF (KM)",
ylab = paste("Fitted CDF (", best_model_name, ")"),
main = paste("PP-Plot: KM vs", best_model_name),
xlim = c(0,1), ylim = c(0,1))
abline(0, 1, col = "red", lwd = 2, lty = 2)
legend("bottomright", legend = "Perfect fit", col = "red", lty = 2, lwd = 2, bty = "n")
grid()
dev.off()## png
## 2
# Overall life functions
png("outputs/life_functions_overall.png", width = 900, height = 700, res = 120)
par(mfrow = c(2,2), mar = c(4.5,4.5,3,1))
plot(time_seq, S_t, type="l", col="#1E88E5", lwd=2.5,
xlab="Time (days)", ylab="S(t)", main="Survival Function", ylim=c(0,1)); grid()
plot(time_seq, h_t, type="l", col="#E53935", lwd=2.5,
xlab="Time (days)", ylab="h(t)", main="Hazard Function"); grid()
plot(time_seq, f_t, type="l", col="#43A047", lwd=2.5,
xlab="Time (days)", ylab="f(t)", main="Density Function"); grid()
plot(time_seq, F_t, type="l", col="#8E24AA", lwd=2.5,
xlab="Time (days)", ylab="F(t)", main="CDF", ylim=c(0,1)); grid()
par(mfrow=c(1,1))
dev.off()## png
## 2
# Life functions by group
png("outputs/life_functions_by_group.png", width = 900, height = 700, res = 120)
par(mfrow=c(2,2), mar=c(4.5,4.5,3.5,1))
plot(time_seq2, S_ctrl, type="l", col="#2196F3", lwd=2.5,
ylim=c(0,1), xlab="Time (days)", ylab="S(t)", main="Survival by Group")
lines(time_seq2, S_trt, col="#F44336", lwd=2.5)
legend("topright", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
plot(time_seq2, h_ctrl, type="l", col="#2196F3", lwd=2.5,
xlab="Time (days)", ylab="h(t)", main="Hazard by Group")
lines(time_seq2, h_trt, col="#F44336", lwd=2.5)
legend("topleft", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
plot(time_seq2, f_ctrl, type="l", col="#2196F3", lwd=2.5,
xlab="Time (days)", ylab="f(t)", main="Density by Group")
lines(time_seq2, f_trt, col="#F44336", lwd=2.5)
legend("topright", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
plot(time_seq2, F_ctrl, type="l", col="#2196F3", lwd=2.5,
ylim=c(0,1), xlab="Time (days)", ylab="F(t)", main="CDF by Group")
lines(time_seq2, F_trt, col="#F44336", lwd=2.5)
legend("bottomright", legend=c("Control","Treated"), col=c("#2196F3","#F44336"), lwd=2, bty="n"); grid()
par(mfrow=c(1,1))
dev.off()## png
## 2
## All 8 plots saved to outputs/