VQUIN Code: Quarto file

library(mgcv)
library(ggplot2)
library(gridExtra)
library(tidyr)
library(patchwork)
# VQUIN cumulative-incidence data by arm (proportions)
dataVQUIN <- data.frame(
  time = c(0, 6, 12, 18, 24, 30, 36, 42, 48, 54),
  control = c(0.00, 0.000799823, 0.001855899, 0.00298935, 0.003962505, 0.005923512, 0.007805757, 0.008778634, 0.008755893, 0.008818015),
  intervention = c(0.00, 0.00, 0.00, 0.00, 0.00, 0.000964834, 0.000946531, 0.000845305, 0.002888125, 0.00278468))

#VQUIN
data_longVQUIN <- pivot_longer(dataVQUIN, cols = c(control, intervention), names_to = "group", values_to = "incidence")
data_longVQUIN$group <- factor(data_longVQUIN$group)

Fitting the GLM and GAM models (four fitted curves across three model types)

#Fit GLM LINEAR VQUIN
glm_controlVQUIN <- glm(control ~ time, data = dataVQUIN, family = gaussian())
glm_interventionVQUIN <- glm(intervention ~ time, data = dataVQUIN, family = gaussian())

#Fit GLM LOGISTIC VQUIN (binomial family on proportion data; R warns about
#non-integer successes, but the fitted curve is still usable here)
glm_control_logVQUIN <- glm(control ~ time, data = dataVQUIN, family = binomial(link = "logit"))
glm_intervention_logVQUIN <- glm(intervention ~ time, data = dataVQUIN, family = binomial(link = "logit"))

#Fit GAM VQUIN
gam_modelVQUIN <- gam(incidence ~ s(time, by = group) + group, data = data_longVQUIN, method = "REML")

Generating predictions - VQUIN

#Generating predictions VQUIN
pred_dataVQUIN <- expand.grid(time = seq(0, max(dataVQUIN$time), length.out = 100),
                         group = levels(data_longVQUIN$group))
pred_dataVQUIN$glm_predVQUIN <- ifelse(pred_dataVQUIN$group == "control",
                             predict(glm_controlVQUIN, newdata = pred_dataVQUIN, type = "response"),
                             predict(glm_interventionVQUIN, newdata = pred_dataVQUIN, type = "response"))
pred_dataVQUIN$glm_pred_logVQUIN <- ifelse(pred_dataVQUIN$group == "control",
                                 predict(glm_control_logVQUIN, newdata = pred_dataVQUIN, type = "response"),
                                 predict(glm_intervention_logVQUIN, newdata = pred_dataVQUIN, type = "response"))
pred_dataVQUIN$gam_predVQUIN <- predict(gam_modelVQUIN, newdata = pred_dataVQUIN, type = "response")
#Calculate RMSE for each model
calculate_rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}
#RMSE for GLM Linear
rmse_glm_controlVQUIN <- calculate_rmse(dataVQUIN$control, predict(glm_controlVQUIN, newdata = dataVQUIN))
rmse_glm_interventionVQUIN <- calculate_rmse(dataVQUIN$intervention, predict(glm_interventionVQUIN, newdata = dataVQUIN))
rmse_glm_linearVQUIN <- mean(c(rmse_glm_controlVQUIN, rmse_glm_interventionVQUIN))

#RMSE for GLM Logistic
rmse_glm_control_logVQUIN <- calculate_rmse(dataVQUIN$control, predict(glm_control_logVQUIN, newdata = dataVQUIN, type = "response"))
rmse_glm_intervention_logVQUIN <- calculate_rmse(dataVQUIN$intervention, predict(glm_intervention_logVQUIN, newdata = dataVQUIN, type = "response"))
rmse_glm_logisticVQUIN <- mean(c(rmse_glm_control_logVQUIN, rmse_glm_intervention_logVQUIN))

#RMSE for GAM
gam_predictionsVQUIN <- predict(gam_modelVQUIN, newdata = data_longVQUIN, type = "response")
rmse_gamVQUIN <- calculate_rmse(data_longVQUIN$incidence, gam_predictionsVQUIN)

#Print RMSE values
cat("RMSE for GLM Linear Model (Control + Intervention):", rmse_glm_linearVQUIN, "\n") #0.0006 
cat("RMSE for GLM Logistic Model (Control + Intervention):", rmse_glm_logisticVQUIN, "\n") #0.00096
cat("RMSE for GAM Model:", rmse_gamVQUIN, "\n") #0.00026

VQUIN prediction plots (Figure 3)

# Create GLM plot (left - shows y-axis only)
glm_plotVQUIN <- ggplot(data_longVQUIN, aes(x = time, y = incidence * 100, color = group)) +
  geom_point() +
  geom_line(data = pred_dataVQUIN, aes(y = glm_predVQUIN * 100)) +
  labs(title = "GLM Regression - Linear",
       x = "", y = "Cumulative Incidence (%)") +
  scale_color_manual(values = c("control" = "red", "intervention" = "blue")) +
  theme_minimal() +
  ylim(0, 6) +
  theme(legend.position = "none")

# GAM plot (middle - no axes)
gam_plotVQUIN <- ggplot(data_longVQUIN, aes(x = time, y = incidence * 100, color = group)) +
  geom_point() +
  geom_line(data = pred_dataVQUIN, aes(y = gam_predVQUIN * 100)) +
  labs(title = "GAM Regression",
       x = "", y = "") +
  scale_color_manual(values = c("control" = "red", "intervention" = "blue")) +
  theme_minimal() +
  ylim(0, 6) +
  theme(legend.position = "none",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

# GLM plot - logistic (right - no axes)
glm_plot_logVQUIN <- ggplot(data_longVQUIN, aes(x = time, y = incidence * 100, color = group)) +
  geom_point() +
  geom_line(data = pred_dataVQUIN, aes(y = glm_pred_logVQUIN * 100)) +
  labs(title = "GLM Regression - Logistic",
       x = "", y = "") +
  scale_color_manual(values = c("control" = "red", "intervention" = "blue")) +
  theme_minimal() +
  ylim(0, 6) +
  theme(legend.position = "none",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())

# Combine plots with shared x-axis label
combined_plotVQUIN <- glm_plotVQUIN + gam_plotVQUIN + glm_plot_logVQUIN +
  plot_layout(guides = "collect") &
  theme(legend.position = "bottom", 
        legend.text = element_text(size = 14))

# Add overall x-axis label
combined_plotVQUIN <- combined_plotVQUIN + 
  plot_annotation(caption = "Time (weeks)", 
                  theme = theme(plot.caption = element_text(hjust = 0.5, size = 12)))

# Display the combined plot
print(combined_plotVQUIN)
#Then, print model summaries
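# A minimal sketch of those summaries (all models fitted earlier in this section):
summary(glm_controlVQUIN)          # linear GLM, control arm
summary(glm_interventionVQUIN)     # linear GLM, intervention arm
summary(glm_control_logVQUIN)      # logistic GLM, control arm
summary(glm_intervention_logVQUIN) # logistic GLM, intervention arm
summary(gam_modelVQUIN)            # GAM: smooth terms, edf, deviance explained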

Goodness of fit of the models

VQUIN GOF

# Deviance-based pseudo R-squared for the linear GLMs
# (1 - residual deviance / null deviance; for a Gaussian GLM this is the ordinary R-squared)
pseudo_r2_controlVQUIN <- 1 - (glm_controlVQUIN$deviance / glm_controlVQUIN$null.deviance)
pseudo_r2_interventionVQUIN <- 1 - (glm_interventionVQUIN$deviance / glm_interventionVQUIN$null.deviance)

# Print pseudo R-squared for the linear GLMs
cat("Pseudo R-squared for Linear GLM (Control):", pseudo_r2_controlVQUIN, "\n")
cat("Pseudo R-squared for Linear GLM (Intervention):", pseudo_r2_interventionVQUIN, "\n")

# Pseudo R-squared for Logistic GLMs
pseudo_r2_control_logVQUIN <- 1 - (glm_control_logVQUIN$deviance / glm_control_logVQUIN$null.deviance)
pseudo_r2_intervention_logVQUIN <- 1 - (glm_intervention_logVQUIN$deviance / glm_intervention_logVQUIN$null.deviance)

# Print Pseudo R-squared for Logistic GLMs
cat("Pseudo R-squared for Logistic GLM (Control):", pseudo_r2_control_logVQUIN, "\n")
cat("Pseudo R-squared for Logistic GLM (Intervention):", pseudo_r2_intervention_logVQUIN, "\n")

#AIC for the GAM model
aic_valueVQUIN <- AIC(gam_modelVQUIN)
print(aic_valueVQUIN)

#Pseudo R-squared for Linear GLM (Control): 0.96 
#Pseudo R-squared for Linear GLM (Intervention): 0.75
#Pseudo R-squared for Logistic GLM (Control): 0.80 
#Pseudo R-squared for Logistic GLM (Intervention): 0.84
#AIC VQUIN : -250.5508
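# Cross-model comparison on a common scale (a sketch: AIC() works for both glm
# and gam objects, but the GAM is fit to the long-format data, so treat the
# comparison as indicative only):
aic_tableVQUIN <- data.frame(
  model = c("GLM linear (control)", "GLM linear (intervention)",
            "GLM logistic (control)", "GLM logistic (intervention)", "GAM"),
  AIC = c(AIC(glm_controlVQUIN), AIC(glm_interventionVQUIN),
          AIC(glm_control_logVQUIN), AIC(glm_intervention_logVQUIN),
          AIC(gam_modelVQUIN))
)
print(aic_tableVQUIN)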

GAM Cumulative Incidence Curves with 95% UI (Figure 4)

Using the final model choice: basis dimension k = 5 with GCV smoothing

library(mgcv)
library(ggplot2)
library(dplyr)

# Helper to add a (0,0) anchor point with matching column structure
add_zero_point <- function(data) {
  # Create a zero point with the same structure as the input data
  zero_point <- data[1, ]  # Take first row as template
  zero_point[1, ] <- NA    # Clear all values
  
  # Set the specific values we want
  zero_point$time <- 0
  zero_point$incidence <- 0
  zero_point$group <- unique(data$group)[1]  # Take first unique group value
  
  # Add weight column if it doesn't exist
  if(!"weight" %in% names(data)) {
    data$weight <- 1  # Default weight for existing data
    zero_point$weight <- 100  # High weight for zero point
  } else {
    zero_point$weight <- 100
  }
  
  # Fill any remaining NA values with appropriate defaults
  for(col in names(zero_point)) {
    if(is.na(zero_point[[col]]) && col != "time" && col != "incidence" && col != "group" && col != "weight") {
      if(is.numeric(data[[col]])) {
        zero_point[[col]] <- 0
      } else if(is.character(data[[col]]) || is.factor(data[[col]])) {
        zero_point[[col]] <- data[[col]][1]
      }
    }
  }
  
  # Combine the data
  rbind(data, zero_point)
}

# Prepare data with zero points
control_data_VQUIN <- data_longVQUIN %>% 
  filter(group == "control") %>%
  add_zero_point()

intervention_data_VQUIN <- data_longVQUIN %>% 
  filter(group == "intervention") %>%
  add_zero_point()
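# Quick sanity check (a sketch): each augmented frame should carry the
# (time = 0, incidence = 0) anchor row and the weight column added above
stopifnot(any(control_data_VQUIN$time == 0 & control_data_VQUIN$incidence == 0),
          "weight" %in% names(intervention_data_VQUIN))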

# Fit optimal GAM models (k=5, GCV method) with weights for zero constraint
control_gam_k5_GCV_VQUIN <- gam(
  incidence ~ s(time, k = 5), 
  data = control_data_VQUIN, 
  weights = ifelse(control_data_VQUIN$time == 0, 100, 1),
  method = "GCV.Cp"
)

intervention_gam_k5_GCV_VQUIN <- gam(
  incidence ~ s(time, k = 5), 
  data = intervention_data_VQUIN, 
  weights = ifelse(intervention_data_VQUIN$time == 0, 100, 1),
  method = "GCV.Cp"
)
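# Note on the two fits above: the weights= arguments rebuild the 100-vs-1
# weighting with ifelse() rather than reusing the weight column created by
# add_zero_point(); both up-weight the (time = 0, incidence = 0) rows, which
# is what pins the fitted curves near zero at baseline.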

# Generate predictions for all time points
original_times <- sort(unique(data_longVQUIN$time))
new_data <- data.frame(time = original_times)

# Function to get predictions with 95% intervals, converted to percentages
get_predictions <- function(model, group_name) {
  pred <- predict(model, newdata = new_data, se.fit = TRUE)
  data.frame(
    time = original_times,
    group = group_name,
    incidence = as.numeric(pred$fit) * 100,  # Convert to percentage
    se = as.numeric(pred$se.fit) * 100,      # Convert SE to percentage
    stringsAsFactors = FALSE
  ) %>%
    mutate(
      ci_lower = pmax(0, incidence - 1.96 * se),
      ci_upper = incidence + 1.96 * se,
      ci_width = ci_upper - ci_lower
    )
}
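# Note: these are normal-approximation (Wald) bands, fit +/- 1.96 * SE on the
# percentage scale, with the lower limit truncated at zero via pmax().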

# Get predictions for both groups
control_predictions <- get_predictions(control_gam_k5_GCV_VQUIN, "control")
intervention_predictions <- get_predictions(intervention_gam_k5_GCV_VQUIN, "intervention")

# Combine predictions
gam_predictions_k5_VQUIN <- rbind(control_predictions, intervention_predictions)

# Verify zero constraint
print("=== ZERO CONSTRAINT VERIFICATION ===")
print("Predictions at time 0:")
time_zero_preds <- gam_predictions_k5_VQUIN %>% filter(time == 0)
print(time_zero_preds)

# Display CI width comparison
control_ci_width <- mean(control_predictions$ci_width)
intervention_ci_width <- mean(intervention_predictions$ci_width)

print("=== CI WIDTH COMPARISON ===")
print(paste("Control mean CI width:", format(control_ci_width, scientific = TRUE)))
print(paste("Intervention mean CI width:", format(intervention_ci_width, scientific = TRUE)))
print(paste("Ratio (intervention/control):", round(intervention_ci_width / control_ci_width, 1)))

# Plot cumulative incidence on the percentage scale
p_vquin <- ggplot(gam_predictions_k5_VQUIN, aes(x = time, y = incidence, color = group, fill = group)) +
  geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper), alpha = 0.4, color = NA) +
  geom_line(size = 1.2) +
  scale_color_manual(values = c("control" = "red", "intervention" = "blue"),
                     labels = c("Control group", "Intervention group")) +
  scale_fill_manual(values = c("control" = "red", "intervention" = "blue"),
                    labels = c("Control group", "Intervention group")) +
  labs(
    x = "Time since enrolment (weeks)",
    y = "Cumulative Incidence (%)",  # Updated label
    title = "VQUIN GAM Model (k=5, GCV)"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank()) +
  coord_cartesian(ylim = c(0, 5))  # Updated to percentage scale (0-5%)

print(p_vquin)
# Week 26 predictions
week_26_data <- data.frame(time = 26)

# Get week 26 predictions for both groups
control_pred_26 <- predict(control_gam_k5_GCV_VQUIN, newdata = week_26_data, se.fit = TRUE)
intervention_pred_26 <- predict(intervention_gam_k5_GCV_VQUIN, newdata = week_26_data, se.fit = TRUE)

# Create week 26 results table
week_26_results <- data.frame(
  Group = c("Control", "Intervention"),
  Week = 26,
  Predicted_Incidence = c(as.numeric(control_pred_26$fit), as.numeric(intervention_pred_26$fit)),
  Standard_Error = c(as.numeric(control_pred_26$se.fit), as.numeric(intervention_pred_26$se.fit))
) %>%
  mutate(
    CI_Lower_95 = pmax(0, Predicted_Incidence - 1.96 * Standard_Error),
    CI_Upper_95 = Predicted_Incidence + 1.96 * Standard_Error,
    CI_Width = CI_Upper_95 - CI_Lower_95
  )

print("=== WEEK 26 GAM PREDICTIONS ===")
print(week_26_results)

Curves with delay scenarios (Figure 5)

library(mgcv)

# —————————————————————————————————————————————————————
# A) Fit‐based cumulative‐incidence at t = 0…54, with p0‐spike

time0 <- 0:54
p0    <- 61/3948   # p0 spike at screening (61 of 3948 VQUIN contacts)

# Use the k=5 GCV GAM models
Fc0 <- predict(control_gam_k5_GCV_VQUIN,
               newdata = data.frame(time = time0))
Fi0 <- predict(intervention_gam_k5_GCV_VQUIN,
               newdata = data.frame(time = time0))

Fc0 <- p0 + (1 - p0)*Fc0
Fi0 <- p0 + (1 - p0)*Fi0

# Discrete hazards: h_t = (F_t - F_{t-1}) / (1 - F_{t-1});
# head(x, -1) drops the final element so numerator and denominator align
h_ctrl <- diff(c(0, Fc0)) / (1 - head(c(0, Fc0), -1))
h_int  <- diff(c(0, Fi0)) / (1 - head(c(0, Fi0), -1))
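# Quick sanity check of this construction (a sketch with a made-up cumulative
# curve, not trial data): survival rebuilt from the hazards should reproduce F(t).
F_toy <- c(0.01, 0.02, 0.035, 0.05)
h_toy <- diff(c(0, F_toy)) / (1 - head(c(0, F_toy), -1))
stopifnot(isTRUE(all.equal(1 - cumprod(1 - h_toy), F_toy)))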

# —————————————————————————————————————————————————————
# B) "Pre‐screen" ramp length and nominal post‐screen delays

back_wk <- 13            # show 3 months before screen
nominal_delays <- c(-13, -10, -5, 5, 10, 20) # weeks of *additional* delay after screen

# —————————————————————————————————————————————————————
# C) build a time axis from –back_wk … +54

time_ext <- seq(-back_wk, 54, by = 1)

# —————————————————————————————————————————————————————
# D) single‐curve builder: √ ramp on [–back_wk,0], then hazard‐switch at d

build_curve <- function(d) {
  n    <- length(time_ext)
  Fcum <- numeric(n)

  # find index of screen‐time t0 = 0
  i0   <- which(time_ext == 0)

  # 1) PRE‐SCREEN: √ ramp from (–back_wk → 0) up to p0
  s_up <- time_ext[1:(i0-1)] + back_wk
  Fcum[1:(i0-1)] <- p0 * sqrt( s_up / back_wk )

  # 2) AT SCREEN
  Fcum[i0] <- p0
  Sprev    <- 1 - p0

  # 3) POST‐SCREEN: control hazard until t0 ≤ d, then intervention hazard
  for(i in (i0+1):n) {
    t0 <- time_ext[i]
    if (d > 0 && t0 <= d) {
      h_t <- h_ctrl[t0 + 1]
    } else {
      idx <- if (d > 0) (t0 - d + 1) else (t0 + 1)
      h_t <- h_int[ min(idx, length(h_int)) ]
    }
    Snew     <- Sprev * (1 - h_t)
    Fcum[i]  <- 1 - Snew
    Sprev    <- Snew
  }

  Fcum
}

# —————————————————————————————————————————————————————
# E) build all the curves

# 1) Control & Instant curves
ctrl_ext <- build_curve( Inf)   # pure control
inst_ext <- build_curve(-Inf)   # pure intervention

#   and the √ ramp function for pre‐screen:
ramp <- p0 * sqrt( (time_ext + back_wk) / back_wk )

# 2) separate positive and negative delays
pos_delays <- nominal_delays[nominal_delays >= 0]
neg_delays <- nominal_delays[nominal_delays < 0]

# 3) positive delays:
pos_mat <- if(length(pos_delays) > 0) sapply(pos_delays, function(d) build_curve(d)) else NULL

# 4) negative delays: ramp until delay point, then copy instant TPT curve
neg_mat <- if(length(neg_delays) > 0) {
  sapply(neg_delays, function(d) {
    n    <- length(time_ext)
    Fcum <- numeric(n)
    
    # find index of delay point d
    id_match <- which(time_ext == d)
    if(length(id_match) == 0) {
      # If exact match not found, find closest index
      id <- which.min(abs(time_ext - d))
    } else {
      id <- id_match[1]
    }
    
    i0   <- which(time_ext == 0)  # screen time index
    
    # Before delay point: follow the ramp
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0  # ensure non-negative
      Fcum[1:(id-1)] <- p0 * sqrt( s_up / back_wk )
    }
    
    # At delay point: set starting value from ramp
    s_delay <- max(0, d + back_wk)
    start_value <- p0 * sqrt( s_delay / back_wk )
    Fcum[id] <- start_value
    
    # After delay point: copy the instant TPT curve pattern
    if(id < n) {
      # Find the baseline instant TPT value at time 0 (our reference point)
      inst_at_zero <- inst_ext[i0]
      
      for(i in (id+1):n) {
        # Calculate how many steps from delay point
        steps_from_delay <- i - id
        
        # Find corresponding point in instant TPT curve (relative to time 0)
        inst_index <- i0 + steps_from_delay
        
        if(inst_index <= length(inst_ext) && inst_index > 0) {
          # Get the instant TPT value at this point
          inst_value <- inst_ext[inst_index]
          
          # Calculate the change from instant TPT baseline
          inst_change <- inst_value - inst_at_zero
          
          # Apply the same change to our starting value
          Fcum[i] <- start_value + inst_change
        } else if(inst_index > length(inst_ext)) {
          # If we've gone beyond available instant TPT data, 
          # use the last available change and keep it constant
          last_inst_value <- inst_ext[length(inst_ext)]
          last_change <- last_inst_value - inst_at_zero
          Fcum[i] <- start_value + last_change
        } else {
          # If we somehow get a negative index, keep the start value
          Fcum[i] <- start_value
        }
      }
    }
    
    Fcum
  })
} else NULL

# Combine all delay matrices
if(!is.null(neg_mat) && !is.null(pos_mat)) {
  delay_mat <- cbind(neg_mat, pos_mat)
  colnames(delay_mat) <- c(
    paste0(neg_delays, " wk delay"),
    paste0(pos_delays, " wk delay"))
} else if(!is.null(neg_mat)) {
  delay_mat <- neg_mat
  colnames(delay_mat) <- paste0(neg_delays, " wk delay")
} else if(!is.null(pos_mat)) {
  delay_mat <- pos_mat
  colnames(delay_mat) <- paste0(pos_delays, " wk delay")
} else {
  delay_mat <- NULL
}

# —————————————————————————————————————————————————————
# F') compressed x‐axis plot, up to +30 wk, now including the negative delay curves

# how many "plot units" to devote to the negative side?
neg_width <- 30

# remap time_ext → x_plot:
x_plot <- ifelse(
  time_ext < 0,
  neg_width * (time_ext + back_wk) / back_wk,  # -back_wk…0 -> 0…30
  neg_width + pmin(time_ext, 54)               # 0…54 -> 30…84 (only t <= 26 is plotted below)
)

# bind together Control, Instant, and only your chosen delays:
if(!is.null(delay_mat)) {
  all_mat <- cbind(
    Control       = ctrl_ext,
    `Instant TPT` = inst_ext,
    delay_mat
  )
} else {
  all_mat <- cbind(
    Control       = ctrl_ext,
    `Instant TPT` = inst_ext
  )
}

# subset to t ≤ 26 wk 
keep   <- which(time_ext <= 26)
x_sub   <- x_plot[keep]
mat_sub <- all_mat[keep, , drop=FALSE]

# Convert to percentages for plotting
mat_sub_pct <- mat_sub * 100

# Gradient colors from better (green) to worse (red) delays
n_delay_cols <- if(!is.null(delay_mat)) ncol(delay_mat) else 0

if(n_delay_cols > 0) {
  # Create a gradient from green (better/earlier) to red (worse/later)
  delay_colors <- colorRampPalette(c("#00AA00", "#FFAA00", "#FF0000"))(n_delay_cols)
  cols <- c("red", "blue", delay_colors)
} else {
  cols <- c("red", "blue")
}

ltys <- c(1, 1, rep(2, n_delay_cols))
yr   <- range(mat_sub_pct)

# Plot with matplot on the percentage scale, with larger axis text
matplot(
  x_sub, mat_sub_pct,
  type="l", col=cols, lty=ltys, lwd=2,
  xaxt="n", ylim=yr,
  xlab="Weeks since randomisation",
  ylab="Cumulative incidence of TB (%)",  # Updated label
  main="VQUIN Delayed-TPT scenarios",
  cex.axis = 1.0,  # axis number text size
  cex.lab = 1.0    # axis label text size
)

# custom x-axis ticks:
neg_ticks <- seq(-back_wk, 0, by=2)     
neg_at    <- neg_width * (neg_ticks + back_wk) / back_wk
pos_ticks <- seq(0, 26, by=2) 
pos_at    <- neg_width + pos_ticks

axis(1,
     at     = c(neg_at, pos_at),
     labels = c(neg_ticks, pos_ticks),
     cex.axis = 1.0)  # axis tick label size

# vertical line at t=0
abline(v=neg_width, lty=3, col="gray50")

# legend with larger text
legend("topleft",
       legend    = colnames(all_mat),
       col       = cols,
       lty       = ltys,
       lwd       = 2,
       bty       = "n",
       ncol      = 1,        
       cex       = 1.2,      # larger legend text
       y.intersp = 0.8,      
       x.intersp = 0.5,      
       inset     = c(0.02, 0.02))
# --------------------------------------------------------
# G) Print cumulative incidence at 26 weeks for all curves
# --------------------------------------------------------

t_target <- 26
idx_26 <- which(time_ext == t_target)

if (length(idx_26) > 0) {
  ci_26 <- all_mat[idx_26, ] * 100   # convert to %
  cat("Cumulative incidence at", t_target, "weeks (%):\n")
  print(round(ci_26, 2))
} else {
  warning("No exact match for 26 weeks in time_ext")
}

Appendix A1: Table of all 2-week delay scenario statistics with UIs

library(mgcv)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(scales)

# ——————————————————————————————————————————————————————
# Enhanced TB delays analysis with 2-week intervals, using the fitted GAM models
# VQUIN ANALYSIS (Appendix A1 table)
# ——————————————————————————————————————————————————————

# Input parameters
n_control <- 897
n_intervention <- 907
py_control <- 938
py_intervention <- 952
n_simulations <- 5000
set.seed(123)

cat("Enhanced Delays Analysis Parameters (TBCHAMP):\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Control person-years:", py_control, "\n")
cat("Intervention person-years:", py_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) Monte Carlo simulation function
# ——————————————————————————————————————————————————————

simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  # Covariance matrix of the GAM coefficients (unconditional = TRUE requests
  # the smoothing-parameter-corrected version where mgcv can provide it)
  Vp <- vcov(gam_model, unconditional = TRUE)
  
  # Linear predictor matrix mapping coefficients to fitted values at newdata
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  
  # Draw coefficient vectors from their approximate posterior N(coef, Vp)
  # (requires the MASS package)
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  
  # One column per simulated curve (identity link, so this is the response scale)
  pred_sims <- tcrossprod(Xp, coef_sims)
  
  return(pred_sims)
}
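# Usage sketch (illustrative sizes): each column of the returned matrix is one
# simulated curve on the response scale, e.g.
# sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = 0:26), 100)
# dim(sims)  # 27 rows (time points) x 100 columns (posterior draws)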

# ——————————————————————————————————————————————————————
# 2) Bootstrap Scenario Builder (exactly from your working code)
# ——————————————————————————————————————————————————————

bootstrap_scenario <- function(sim_idx, control_sims, intervention_sims, delay_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply the p0 transformation, drawing p0 from a beta posterior to propagate its uncertainty
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 (61/3948); TBCHAMP would use 32 and 1120
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  # Enforce bounds and monotonicity: simulated cumulative curves can wiggle, which would make discrete hazards negative
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  
  # Calculate hazards for this simulation
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / (1 - head(c(0, Fc0_sim), -1))
  h_int_sim <- diff(c(0, Fi0_sim)) / (1 - head(c(0, Fi0_sim), -1))
  
  # Build curve for this delay scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 13
  
  # Special handling for pure scenarios
  if(delay_weeks == Inf) {
    # Pure control - FIXED: use p0_draw instead of p0
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention (Instant TPT) - FIXED: use p0_draw instead of p0
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # CORRECTED: Negative delays (pre-screen intervention) - FIXED: use p0_draw instead of p0
    # This means intervention starts BEFORE time 0, providing earlier protection
    
    # Find delay point index
    id_match <- which(time_ext == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    # Before delay point: ramp (no intervention yet)
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At delay point: start intervention
    s_delay <- max(0, delay_weeks + back_wk)
    Fcum[id] <- p0_draw * sqrt(s_delay / back_wk)  # FIXED
    Sprev <- 1 - Fcum[id]
    
    # CORRECTED: After delay point, use intervention hazards consistently
    # Map the post-delay timeline to intervention hazards starting from time 0
    for(i in (id+1):n) {
      current_time <- time_ext[i]
      
      # Calculate how long intervention has been active
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        # Use intervention hazard corresponding to this duration
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        # Use last available intervention hazard
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        # This shouldn't happen with negative delays, but use first intervention hazard as fallback
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Positive delays (post-screen delays) - FIXED: use p0_draw instead of p0
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At screen
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    # Post-screen: control until delay point, then intervention
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= delay_weeks) {
        # Use control hazard until delay point
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        # Use intervention hazard after delay point
        # Map to intervention timeline: intervention has been active for (t0 - delay_weeks) weeks
        intervention_duration <- t0 - delay_weeks
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}

# ——————————————————————————————————————————————————————
# 3) Run the analysis with 2-week delay intervals using the fitted GAM models
# ——————————————————————————————————————————————————————

cat("Running enhanced Monte Carlo simulations with 2-week intervals...\n")

# Reset seed right before simulations for exact reproducibility
set.seed(42)

# Set up time vectors
time0 <- 0:26
back_wk <- 13
time_ext <- seq(-back_wk, 26, by = 1)

# Create 2-week interval delays from -13 to +26
delays_2wk <- c(seq(-13, -1, by = 2), 0, seq(2, 26, by = 2))
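# i.e. -13, -11, ..., -1, then 0, 2, 4, ..., 26 (weeks)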

# Generate simulated predictions from the fitted GAM models
cat("Generating GAM simulations from the fitted models...\n")
# NOTE: requires control_gam_k5_GCV_VQUIN and intervention_gam_k5_GCV_VQUIN in the workspace
control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

cat("Generated", ncol(control_sims), "control simulations and", ncol(intervention_sims), "intervention simulations\n")

# Define the scenarios: delay scenarios only, with "0 wk" serving as the baseline
all_delays <- setNames(delays_2wk, paste0(delays_2wk, " wk"))

# Run simulations for all scenarios
cat("Running scenario simulations...\n")
simulation_results <- list()

for(scenario_name in names(all_delays)) {
  cat("Processing scenario:", scenario_name, "\n")
  delay_val <- all_delays[scenario_name]
  
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_scenario(i, control_sims, intervention_sims, delay_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Completed all simulations!\n\n")

# Check if "0 wk" exists and compare a few values
print(simulation_results[["0 wk"]][1:5])  # First 5 simulation results
print(mean(simulation_results[["0 wk"]]))   # Mean for "0 wk" scenario

# ——————————————————————————————————————————————————————
# 4) Calculate excess cases per 100,000 contacts
# ——————————————————————————————————————————————————————

# Get baseline for relative calculations
baseline_sims <- simulation_results[["0 wk"]]  # Uses "0 wk" as baseline
baseline_mean <- mean(baseline_sims)

# Calculate metrics for each scenario
results_list <- list()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  
  # Basic statistics
  mean_incidence <- mean(scenario_sims)
  median_incidence <- median(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  # Determine person-years (Control uses control PY, others use intervention PY)
  py <- ifelse(scenario_name == "Control", py_control, py_intervention)
  sample_size <- ifelse(scenario_name == "Control", n_control, n_intervention)
  
  # 1. Incidence rate per 100 person-yrs (95% CI)
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  rate_ci_lower <- (ci_lower * sample_size / py) * 100
  rate_ci_upper <- (ci_upper * sample_size / py) * 100
  
  # 2. Mean Expected Cases (95% CI) - absolute numbers in original cohort
  mean_cases <- mean_incidence * sample_size
  cases_ci_lower <- ci_lower * sample_size
  cases_ci_upper <- ci_upper * sample_size
  
  # 3. Excess calculations relative to the baseline ("0 wk" here; the
  # "Instant TPT" branch below is inherited from the Figure 6 version and
  # never triggers in this section, so every scenario takes the excess path,
  # and "0 wk" has excess exactly 0 by construction)
  if(scenario_name != "Instant TPT") {
    # Excess incidence simulations
    excess_sims <- scenario_sims - baseline_sims
    excess_mean <- mean(excess_sims)
    excess_ci_lower <- quantile(excess_sims, 0.025)
    excess_ci_upper <- quantile(excess_sims, 0.975)
    
    # Excess rate per 100 person-yrs
    excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
    excess_rate_ci_lower <- (excess_ci_lower * sample_size / py) * 100
    excess_rate_ci_upper <- (excess_ci_upper * sample_size / py) * 100
    
    # MODIFIED: Excess cases per 100,000 contacts (not absolute numbers)
    excess_cases_per_100k <- excess_mean * 100000
    excess_cases_per_100k_ci_lower <- excess_ci_lower * 100000
    excess_cases_per_100k_ci_upper <- excess_ci_upper * 100000
  } else {
    # Instant TPT is the baseline, so excess = 0
    excess_rate_per_100py <- 0
    excess_rate_ci_lower <- 0
    excess_rate_ci_upper <- 0
    excess_cases_per_100k <- 0
    excess_cases_per_100k_ci_lower <- 0
    excess_cases_per_100k_ci_upper <- 0
  }
  
  # 4. Relative Risk
  relative_risk <- mean_incidence / baseline_mean
  
  # Store results
  results_list[[scenario_name]] <- list(
    scenario = scenario_name,
    incidence_rate_per_100py = rate_per_100py,
    incidence_rate_ci_lower = rate_ci_lower,
    incidence_rate_ci_upper = rate_ci_upper,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_rate_ci_lower = excess_rate_ci_lower,
    excess_rate_ci_upper = excess_rate_ci_upper,
    mean_cases = mean_cases,
    cases_ci_lower = cases_ci_lower,
    cases_ci_upper = cases_ci_upper,
    excess_cases_per_100k = excess_cases_per_100k,
    excess_cases_per_100k_ci_lower = excess_cases_per_100k_ci_lower,
    excess_cases_per_100k_ci_upper = excess_cases_per_100k_ci_upper,
    relative_risk = relative_risk
  )
}

# ——————————————————————————————————————————————————————
# 5) Prepare plot data (per 100,000 scale)
# ——————————————————————————————————————————————————————

plot_data <- data.frame()

for(scenario_name in names(results_list)) {
  # Keep all scenarios, including the 0 wk baseline, on the plot
  
  r <- results_list[[scenario_name]]
  
  # Extract delay value for ordering
  if(scenario_name == "26 wk") {  # CHANGED: Check for "26 wk" instead of "Control"
    delay_weeks <- 999  # Put at the end
    delay_label <- "No TPT"
    delay_type <- "No Treatment"
  } else {
    delay_weeks <- as.numeric(sub(" wk", "", scenario_name))
    delay_label <- as.character(delay_weeks)
    
    # Assign delay types
    if(delay_weeks == 0) {
      delay_type <- "Baseline (0 wk)"  # Special case for baseline
    } else if(delay_weeks < 0) {
      delay_type <- "Pre-screen"
    } else {
      delay_type <- "Post-screen"
    }
  }
  
  plot_data <- rbind(plot_data, data.frame(
    scenario = scenario_name,
    delay_weeks = delay_weeks,
    delay_label = delay_label,
    excess_cases_per_100k = r$excess_cases_per_100k,
    ci_lower = r$excess_cases_per_100k_ci_lower,
    ci_upper = r$excess_cases_per_100k_ci_upper,
    delay_type = delay_type,
    stringsAsFactors = FALSE
  ))
}

# Order by delay weeks
plot_data <- plot_data[order(plot_data$delay_weeks), ]

# ——————————————————————————————————————————————————————
# 6) Color-blind-friendly bar plot
# ——————————————————————————————————————————————————————

# Set up RColorBrewer Paired palette
paired_colors <- brewer.pal(12, "Paired")

# Assign colors to delay types (RColorBrewer "Paired"; a "Baseline (0 wk)"
# entry is included so the baseline bar is not dropped to the NA fill)
color_mapping <- c(
  "Baseline (0 wk)" = paired_colors[4],  # dark green
  "Pre-screen" = paired_colors[2],       # dark blue
  "Post-screen" = paired_colors[6],      # red
  "No Treatment" = paired_colors[8]      # orange
)

# Create the enhanced bar plot
enhanced_plot <- ggplot(plot_data, aes(x = reorder(delay_label, delay_weeks), 
                                      y = excess_cases_per_100k, 
                                      fill = delay_type)) +
  geom_col(width = 0.8, alpha = 0.8, color = "white", size = 0.3) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), 
                width = 0.4, color = "black", size = 0.6, alpha = 0.8) +
  
  # Add value labels on bars with smart positioning
geom_text(aes(label = sprintf("%.0f", excess_cases_per_100k),
              y = ifelse(excess_cases_per_100k >= 0, 
                        pmax(ci_upper + 150, excess_cases_per_100k + 150),  # Above positive bars
                        pmin(ci_lower - 150, excess_cases_per_100k - 150))), # Below negative bars
          size = 4.2, fontface = "bold", color = "black") +
  
  # Styling
  scale_fill_manual(values = color_mapping, name = "Treatment Timing") +
  scale_y_continuous(labels = number_format(accuracy = 1), 
                     expand = expansion(mult = c(0.1, 0.15))) +
  
  # Reference line at zero
  geom_hline(yintercept = 0, linetype = "solid", color = "black", size = 0.8) +
  
  # UPDATED Labels and theme
  labs(
    title = "VQUIN: Excess TB Cases per 100,000 Contacts by Treatment Delay",
    subtitle = paste0("Compared to Instant TPT (0 week delay) (95% UI from ", 
                     format(n_simulations, big.mark = ","), " Monte Carlo simulations)"),
    x = "Treatment Delay (weeks)",
    y = "Excess TB Cases per 100,000 Contacts (26 weeks)",
    caption = "Negative delays = pre-screen treatment | Positive delays = post-screen treatment"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11, color = "black"),
    plot.caption = element_text(size = 9, color = "black"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 13, color = "black"),
    axis.text.y = element_text(size = 13, color = "black"),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "top",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, size = 0.5)
  )

# Display the plot
print(enhanced_plot)
# ——————————————————————————————————————————————————————
# 7) Summary statistics table (per 100,000 scale)
# ——————————————————————————————————————————————————————

cat("\n=== ENHANCED DELAY ANALYSIS SUMMARY - TBCHAMP (Excess per 100,000 contacts) ===\n")
cat("Delay intervals: Every 2 weeks from -26 to +26 weeks\n")
cat("Monte Carlo simulations:", format(n_simulations, big.mark = ","), "\n")
cat("Results standardized to per 100,000 contacts over 26 weeks\n\n")

# Print key statistics
summary_table <- plot_data %>%
  select(delay_label, delay_type, excess_cases_per_100k, ci_lower, ci_upper) %>%
  mutate(
    excess_cases_ci = sprintf("%.0f (%.0f to %.0f)", excess_cases_per_100k, ci_lower, ci_upper)
  ) %>%
  select(delay_label, delay_type, excess_cases_ci)

colnames(summary_table) <- c("Delay (weeks)", "Type", "Excess Cases per 100,000 (95% UI)")

print(summary_table)

cat("\nNote: Negative delays indicate treatment started before screening\n")
cat("Positive delays indicate treatment started after screening\n")
cat("Results calculated using TBCHAMP GAM models and Monte Carlo approach\n")
cat("Units: Excess TB cases per 100,000 contacts over 26 weeks\n")

Delay scenarios bar plot with UIs (Figure 6)

library(mgcv)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(scales)

# ——————————————————————————————————————————————————————
# Enhanced TB Delays Analysis with 2-week intervals - Using Real GAM Models
# ——————————————————————————————————————————————————————

# Input parameters (VQUIN analysis)
n_control <- 897
n_intervention <- 907
py_control <- 938
py_intervention <- 952
n_simulations <- 5000
set.seed(123)

cat("Enhanced Delays Analysis Parameters:\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Control person-years:", py_control, "\n")
cat("Intervention person-years:", py_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) Monte Carlo simulation function
# ——————————————————————————————————————————————————————

simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  # Covariance matrix of the GAM coefficients (unconditional = TRUE requests
  # the smoothing-parameter-corrected version where mgcv can provide it)
  Vp <- vcov(gam_model, unconditional = TRUE)
  
  # Linear predictor matrix mapping coefficients to fitted values at newdata
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  
  # Draw coefficient vectors from their approximate posterior N(coef, Vp)
  # (requires the MASS package)
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  
  # One column per simulated curve (identity link, so this is the response scale)
  pred_sims <- tcrossprod(Xp, coef_sims)
  
  return(pred_sims)
}

# ——————————————————————————————————————————————————————
# 2) Bootstrap scenario builder
# ——————————————————————————————————————————————————————

bootstrap_scenario <- function(sim_idx, control_sims, intervention_sims, delay_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply the p0 transformation, drawing p0 from a beta posterior to propagate its uncertainty
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 (61/3948); TBCHAMP would use 32 and 1120
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  # Enforce bounds and monotonicity: simulated cumulative curves can wiggle, which would make discrete hazards negative
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  
  # Calculate hazards for this simulation
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / (1 - head(c(0, Fc0_sim), -1))
  h_int_sim <- diff(c(0, Fi0_sim)) / (1 - head(c(0, Fi0_sim), -1))
  
  # Build curve for this delay scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 13   # pre-screen ramp length; must match time_ext (seq(-13, 26, by = 1)) set up below
  
  # Special handling for pure scenarios
  if(delay_weeks == Inf) {
    # Pure control - FIXED: use p0_draw instead of p0
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention (Instant TPT) - FIXED: use p0_draw instead of p0
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # CORRECTED: Negative delays (pre-screen intervention) - FIXED: use p0_draw instead of p0
    # This means intervention starts BEFORE time 0, providing earlier protection
    
    # Find delay point index
    id_match <- which(time_ext == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    # Before delay point: ramp (no intervention yet)
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At delay point: start intervention
    s_delay <- max(0, delay_weeks + back_wk)
    Fcum[id] <- p0_draw * sqrt(s_delay / back_wk)  # FIXED
    Sprev <- 1 - Fcum[id]
    
    # CORRECTED: After delay point, use intervention hazards consistently
    # Map the post-delay timeline to intervention hazards starting from time 0
    for(i in (id+1):n) {
      current_time <- time_ext[i]
      
      # Calculate how long intervention has been active
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        # Use intervention hazard corresponding to this duration
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        # Use last available intervention hazard
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        # This shouldn't happen with negative delays, but use first intervention hazard as fallback
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Positive delays (post-screen delays) - FIXED: use p0_draw instead of p0
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At screen
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    # Post-screen: control until delay point, then intervention
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= delay_weeks) {
        # Use control hazard until delay point
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        # Use intervention hazard after delay point
        # Map to intervention timeline: intervention has been active for (t0 - delay_weeks) weeks
        intervention_duration <- t0 - delay_weeks
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}

# ——————————————————————————————————————————————————————
# 3) Run the analysis with 2-week delay intervals using the fitted GAM models
# ——————————————————————————————————————————————————————

cat("Running enhanced Monte Carlo simulations with 2-week intervals...\n")

# Set up time vectors
time0 <- 0:26
back_wk <- 13
time_ext <- seq(-back_wk, 26, by = 1)

# Delay scenarios every 2 weeks from -13 to +23 (Instant TPT and Control are added separately below)
delays_2wk <- seq(-13, 24, by = 2)

# Generate simulated predictions from the fitted GAM models
cat("Generating GAM simulations from the fitted models...\n")
# NOTE: requires control_gam_k5_GCV_VQUIN and intervention_gam_k5_GCV_VQUIN in the workspace
control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

cat("Generated", ncol(control_sims), "control simulations and", ncol(intervention_sims), "intervention simulations\n")

# Define all scenarios including baseline
all_delays <- c("Instant TPT" = -Inf, setNames(delays_2wk, paste0(delays_2wk, " wk")), "Control" = Inf)

# Run simulations for all scenarios
cat("Running scenario simulations...\n")
simulation_results <- list()

for(scenario_name in names(all_delays)) {
  cat("Processing scenario:", scenario_name, "\n")
  delay_val <- all_delays[scenario_name]
  
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_scenario(i, control_sims, intervention_sims, delay_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Completed all simulations!\n\n")

# ——————————————————————————————————————————————————————
# 4) Calculate excess cases per 100,000 contacts
# ——————————————————————————————————————————————————————

# Get baseline (Instant TPT) for relative calculations
baseline_sims <- simulation_results[["Instant TPT"]]
baseline_mean <- mean(baseline_sims)

# Calculate metrics for each scenario
results_list <- list()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  
  # Basic statistics
  mean_incidence <- mean(scenario_sims)
  median_incidence <- median(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  # Determine person-years (Control uses control PY, others use intervention PY)
  py <- ifelse(scenario_name == "Control", py_control, py_intervention)
  sample_size <- ifelse(scenario_name == "Control", n_control, n_intervention)
  
  # 1. Incidence rate per 100 person-yrs (95% CI)
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  rate_ci_lower <- (ci_lower * sample_size / py) * 100
  rate_ci_upper <- (ci_upper * sample_size / py) * 100
  
  # 2. Mean Expected Cases (95% CI) - absolute numbers in original cohort
  mean_cases <- mean_incidence * sample_size
  cases_ci_lower <- ci_lower * sample_size
  cases_ci_upper <- ci_upper * sample_size
  
  # 3. Excess calculations (compared to Instant TPT)
  if(scenario_name != "Instant TPT") {
    # Excess incidence simulations
    excess_sims <- scenario_sims - baseline_sims
    excess_mean <- mean(excess_sims)
    excess_ci_lower <- quantile(excess_sims, 0.025)
    excess_ci_upper <- quantile(excess_sims, 0.975)
    
    # Excess rate per 100 person-yrs
    excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
    excess_rate_ci_lower <- (excess_ci_lower * sample_size / py) * 100
    excess_rate_ci_upper <- (excess_ci_upper * sample_size / py) * 100
    
    # MODIFIED: Excess cases per 100,000 contacts (not absolute numbers)
    excess_cases_per_100k <- excess_mean * 100000
    excess_cases_per_100k_ci_lower <- excess_ci_lower * 100000
    excess_cases_per_100k_ci_upper <- excess_ci_upper * 100000
  } else {
    # Instant TPT is the baseline, so excess = 0
    excess_rate_per_100py <- 0
    excess_rate_ci_lower <- 0
    excess_rate_ci_upper <- 0
    excess_cases_per_100k <- 0
    excess_cases_per_100k_ci_lower <- 0
    excess_cases_per_100k_ci_upper <- 0
  }
  
  # 4. Relative Risk
  relative_risk <- mean_incidence / baseline_mean
  
  # Store results
  results_list[[scenario_name]] <- list(
    scenario = scenario_name,
    incidence_rate_per_100py = rate_per_100py,
    incidence_rate_ci_lower = rate_ci_lower,
    incidence_rate_ci_upper = rate_ci_upper,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_rate_ci_lower = excess_rate_ci_lower,
    excess_rate_ci_upper = excess_rate_ci_upper,
    mean_cases = mean_cases,
    cases_ci_lower = cases_ci_lower,
    cases_ci_upper = cases_ci_upper,
    excess_cases_per_100k = excess_cases_per_100k,
    excess_cases_per_100k_ci_lower = excess_cases_per_100k_ci_lower,
    excess_cases_per_100k_ci_upper = excess_cases_per_100k_ci_upper,
    relative_risk = relative_risk
  )
}

# ——————————————————————————————————————————————————————
# 5) Prepare plot data (per 100,000 scale)
# ——————————————————————————————————————————————————————

plot_data <- data.frame()

for(scenario_name in names(results_list)) {
  if(scenario_name == "Instant TPT") next  # Skip baseline
  
  r <- results_list[[scenario_name]]
  
  # Extract delay value for ordering
  if(scenario_name == "Control") {
    delay_weeks <- 999  # Put at the end
    delay_label <- "No TPT"
    delay_type <- "No Treatment"
  } else {
    delay_weeks <- as.numeric(sub(" wk", "", scenario_name))
    delay_label <- as.character(delay_weeks)
    delay_type <- ifelse(delay_weeks < 0, "Pre-screen", "Post-screen")
  }
  
  plot_data <- rbind(plot_data, data.frame(
    scenario = scenario_name,
    delay_weeks = delay_weeks,
    delay_label = delay_label,
    excess_cases_per_100k = r$excess_cases_per_100k,
    ci_lower = r$excess_cases_per_100k_ci_lower,
    ci_upper = r$excess_cases_per_100k_ci_upper,
    delay_type = delay_type,
    stringsAsFactors = FALSE
  ))
}

# Order by delay weeks
plot_data <- plot_data[order(plot_data$delay_weeks), ]

# ——————————————————————————————————————————————————————
# 6) Color-blind-friendly bar plot
# ——————————————————————————————————————————————————————

# Set up RColorBrewer Paired palette
paired_colors <- brewer.pal(12, "Paired")

# Assign colors to delay types (RColorBrewer "Paired")
color_mapping <- c(
  "Pre-screen" = paired_colors[2],    # dark blue
  "Post-screen" = paired_colors[6],   # red
  "No Treatment" = paired_colors[8]   # orange
)

# Create the enhanced bar plot
enhanced_plot <- ggplot(plot_data, aes(x = reorder(delay_label, delay_weeks), 
                                      y = excess_cases_per_100k, 
                                      fill = delay_type)) +
  geom_col(width = 0.8, alpha = 0.8, color = "white", size = 0.3) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), 
                width = 0.4, color = "black", size = 0.6, alpha = 0.8) +
  
  # Add value labels on bars with smart positioning
geom_text(aes(label = sprintf("%.0f", excess_cases_per_100k),
              y = ifelse(excess_cases_per_100k >= 0, 
                        pmax(ci_upper + 150, excess_cases_per_100k + 150),  # Above positive bars
                        pmin(ci_lower - 150, excess_cases_per_100k - 150))), # Below negative bars
          size = 4.2, fontface = "bold", color = "black") +
  
  # Styling
  scale_fill_manual(values = color_mapping, name = "Treatment Timing") +
  scale_y_continuous(labels = number_format(accuracy = 1), 
                     expand = expansion(mult = c(0.1, 0.15))) +
  
  # Reference line at zero
  geom_hline(yintercept = 0, linetype = "solid", color = "black", size = 0.8) +
  
  # UPDATED Labels and theme
  labs(
    title = "VQUIN: Excess TB Cases per 100,000 Contacts by Treatment Delay",
    subtitle = paste0("Compared to Instant TPT (0 week delay) (95% UI from ", 
                     format(n_simulations, big.mark = ","), " Monte Carlo simulations)"),
    x = "Treatment Delay (weeks)",
    y = "Excess TB Cases per 100,000 Contacts (26 weeks)",
    caption = "Negative delays = pre-screen treatment | Positive delays = post-screen treatment"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11, color = "black"),
    plot.caption = element_text(size = 9, color = "black"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 13, color = "black"),
    axis.text.y = element_text(size = 13, color = "black"),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "top",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, size = 0.5)
  )

# Display the plot
print(enhanced_plot)
# ——————————————————————————————————————————————————————
# 7) Summary statistics table (per 100,000 scale)
# ——————————————————————————————————————————————————————

cat("\n=== ENHANCED DELAY ANALYSIS SUMMARY (Excess per 100,000 contacts) ===\n")
cat("Delay intervals: Every 2 weeks from -26 to +26 weeks\n")
cat("Monte Carlo simulations:", format(n_simulations, big.mark = ","), "\n")
cat("Results standardized to per 100,000 contacts over 26 weeks\n\n")

# Print key statistics
summary_table <- plot_data %>%
  select(delay_label, delay_type, excess_cases_per_100k, ci_lower, ci_upper) %>%
  mutate(
    excess_cases_ci = sprintf("%.0f (%.0f to %.0f)", excess_cases_per_100k, ci_lower, ci_upper)
  ) %>%
  select(delay_label, delay_type, excess_cases_ci)

colnames(summary_table) <- c("Delay (weeks)", "Type", "Excess Cases per 100,000 (95% UI)")

print(summary_table)

cat("\nNote: Negative delays indicate treatment started before screening\n")
cat("Positive delays indicate treatment started after screening\n")
cat("Results calculated using GAM models and Monte Carlo approach\n")
cat("Units: Excess TB cases per 100,000 contacts over 26 weeks\n")

Shortening

Curves with shortened scenarios (Figure 7)

library(mgcv)

# —————————————————————————————————————————————————————
# A) Fit‐based cumulative‐incidence at t = 0…54, with p0‐spike
# Uses the k = 5 GCV GAM models fitted above

time0 <- 0:54
p0    <- 61/3948

# Use the new k=5 GCV GAM models
Fc0 <- predict(control_gam_k5_GCV_VQUIN,
               newdata = data.frame(time = time0))
Fi0 <- predict(intervention_gam_k5_GCV_VQUIN,
               newdata = data.frame(time = time0))

Fc0 <- p0 + (1 - p0)*Fc0
Fi0 <- p0 + (1 - p0)*Fi0

# Discrete hazards: h_t = (F_t - F_{t-1}) / (1 - F_{t-1});
# head(x, -1) drops the final element so numerator and denominator align
h_ctrl <- diff(c(0, Fc0)) / (1 - head(c(0, Fc0), -1))
h_int  <- diff(c(0, Fi0)) / (1 - head(c(0, Fi0), -1))

# —————————————————————————————————————————————————————
# B) "Pre‐screen" ramp length and treatment parameters

back_wk        <- 13            # show 3 months before screen
shorten_weeks  <- c(6, 13, 20)       # weeks at which to shorten treatment

# —————————————————————————————————————————————————————
# C) build a time axis from –back_wk … +54

time_ext <- seq(-back_wk, 54, by = 1)

# —————————————————————————————————————————————————————
# D) single‐curve builder: √ ramp on [–back_wk,0], then hazard‐switch at d

build_curve <- function(d) {
  n    <- length(time_ext)
  Fcum <- numeric(n)

  # find index of screen‐time t0 = 0
  i0   <- which(time_ext == 0)

  # 1) PRE‐SCREEN: √ ramp from (–back_wk → 0) up to p0
  s_up <- time_ext[1:(i0-1)] + back_wk
  Fcum[1:(i0-1)] <- p0 * sqrt( s_up / back_wk )

  # 2) AT SCREEN
  Fcum[i0] <- p0
  Sprev    <- 1 - p0

  # 3) POST‐SCREEN: control hazard until t0 ≤ d, then intervention hazard
  for(i in (i0+1):n) {
    t0 <- time_ext[i]
    if (d > 0 && t0 <= d) {
      h_t <- h_ctrl[t0 + 1]
    } else {
      idx <- if (d > 0) (t0 - d + 1) else (t0 + 1)
      h_t <- h_int[ min(idx, length(h_int)) ]
    }
    Snew     <- Sprev * (1 - h_t)
    Fcum[i]  <- 1 - Snew
    Sprev    <- Snew
  }

  Fcum
}
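# Minimal usage sketch (the 4-week delay value is illustrative): the builder
# returns one value per week of time_ext, pinned to p0 at the screen.
delay4 <- build_curve(4)   # control hazard to week 4, intervention hazard after
stopifnot(length(delay4) == length(time_ext),
          abs(delay4[time_ext == 0] - p0) < 1e-12)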

# —————————————————————————————————————————————————————
# E) UPDATED: shortened treatment curve builder using time-specific control slope

build_shortened_curve <- function(shorten_at) {
  n    <- length(time_ext)
  Fcum <- numeric(n)

  # find index of screen‐time t0 = 0
  i0   <- which(time_ext == 0)

  # 1) PRE‐SCREEN: √ ramp from (–back_wk → 0) up to p0
  s_up <- time_ext[1:(i0-1)] + back_wk
  Fcum[1:(i0-1)] <- p0 * sqrt( s_up / back_wk )

  # 2) AT SCREEN
  Fcum[i0] <- p0

  # 3) POST‐SCREEN: follow intervention until shorten_at, then follow control pattern from shortening timepoint
  for(i in (i0+1):n) {
    t0 <- time_ext[i]
    if (t0 <= shorten_at) {
      # Follow intervention curve
      Fcum[i] <- Fi0[min(t0 + 1, length(Fi0))]
    } else {
      # After shortening point: take the value at shortening point and add 
      # the same increment that control would have from the shortening timepoint
      value_at_shorten <- Fi0[min(shorten_at + 1, length(Fi0))]
      
      # NEW: Use control increment from shortening timepoint, not from time 0
      control_at_shorten <- Fc0[min(shorten_at + 1, length(Fc0))]
      control_at_current <- Fc0[min(t0 + 1, length(Fc0))]
      control_increment <- control_at_current - control_at_shorten
      
      Fcum[i] <- value_at_shorten + control_increment
    }
  }

  Fcum
}
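# Invariant check (a sketch using the first shortening point, 6 weeks): up to the
# shortening week the curve coincides with the intervention curve by construction.
# Week 0 is excluded because Fcum is pinned to p0 there, while Fi0[1] carries the
# GAM's fitted value at t = 0.
short6 <- build_shortened_curve(6)
stopifnot(all(abs(short6[time_ext %in% 1:6] - Fi0[2:7]) < 1e-12))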

# —————————————————————————————————————————————————————
# F) build all the curves

# 1) explicitly build Control & Instant
ctrl_ext <- build_curve(Inf)    # pure control
inst_ext <- build_curve(-Inf)   # pure intervention (instant TPT)

# 2) shortened treatment curves
shorten_mat <- if(length(shorten_weeks) > 0) {
  sapply(shorten_weeks, build_shortened_curve)
} else NULL

# Add column names for shortened treatments
if(!is.null(shorten_mat)) {
  colnames(shorten_mat) <- paste0("TPT ", shorten_weeks, " wk")
}

# —————————————————————————————————————————————————————
# G) compressed x‐axis plot, up to +54 wk, now including shortened treatment curves

# how many "plot units" to devote to the negative side?
neg_width <- 30

# remap time_ext → x_plot:
x_plot <- ifelse(
  time_ext < 0,
  neg_width * (time_ext + back_wk) / back_wk,  # -back_wk…0 -> 0…neg_width
  neg_width + pmin(time_ext, 54)               # 0…54 -> neg_width…neg_width+54
)

# bind together Control, Instant, and shortened treatments only:
if(!is.null(shorten_mat)) {
  all_mat <- cbind(
    Control       = ctrl_ext,
    `Instant TPT` = inst_ext,
    shorten_mat
  )
} else {
  all_mat <- cbind(
    Control       = ctrl_ext,
    `Instant TPT` = inst_ext
  )
}

# subset to t ≤ 26 wk
keep   <- which(time_ext <= 26)
x_sub   <- x_plot[keep]
mat_sub <- all_mat[keep, , drop=FALSE]

# CORRECTED: Convert to percentages in the data
mat_sub_pct <- mat_sub * 100

# MODIFIED: Create gradient colors where shorter treatment is worse (red) and longer is better (green)
n_shorten_cols <- if(!is.null(shorten_mat)) ncol(shorten_mat) else 0

if(n_shorten_cols > 0) {
  # Create a gradient from red (shorter/worse) to green (longer/better)
  # Since shorten_weeks is ordered from shortest to longest: 6, 13, 20
  shorten_colors <- colorRampPalette(c("#FF0000", "#FF7F00", "#00AA00"))(n_shorten_cols)
  cols <- c("red", "blue", shorten_colors)
} else {
  cols <- c("red", "blue")
}

ltys <- c(1, 1, rep(3, n_shorten_cols))
yr   <- range(mat_sub_pct)

# CORRECTED: Use matplot with percentage data and larger axis text
matplot(
  x_sub, mat_sub_pct,
  type="l", col=cols, lty=ltys, lwd=2,
  xaxt="n", ylim=yr,
  xlab="Weeks since randomisation",
  ylab="Cumulative incidence of TB (%)",  # Updated label
  main="VQUIN Shortened-TPT scenarios",
  cex.axis = 1.0,  # default-size axis numbers
  cex.lab = 1.0    # default-size axis labels
)

# custom x-axis ticks:
neg_ticks <- seq(-back_wk, 0, by=2)     
neg_at    <- neg_width * (neg_ticks + back_wk) / back_wk
pos_ticks <- seq(0, 54, by=5)
pos_at    <- neg_width + pos_ticks

axis(1,
     at     = c(neg_at, pos_at),
     labels = c(neg_ticks, pos_ticks),
     cex.axis = 1.0)  # default-size tick labels

# vertical line at t=0
abline(v=neg_width, lty=3, col="gray50")

# Add vertical lines to show shortening points
if(!is.null(shorten_mat)) {
  for(sw in shorten_weeks) {
    if(sw <= 54) {  # only show if visible on plot
      shorten_x <- neg_width + sw
      abline(v=shorten_x, lty=4, col="gray70", lwd=1)
    }
  }
}

# legend with larger text
legend("topleft",
       legend    = colnames(all_mat),
       col       = cols,
       lty       = ltys,
       lwd       = 2,
       bty       = "n",
       ncol      = 1,        
       cex       = 1.2,      # Increased from 0.8 to 1.2 for larger legend
       y.intersp = 0.8,      
       x.intersp = 0.5,      
       inset     = c(0.02, 0.02))
t_target <- 26
idx_26 <- which(time_ext == t_target)

if (length(idx_26) > 0) {
  ci_26 <- all_mat[idx_26, ] * 100   # convert to %
  cat("Cumulative incidence at", t_target, "weeks (%):\n")
  print(round(ci_26, 2))
} else {
  warning("No exact match for 26 weeks in time_ext")
}
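# Follow-on sketch: the same row of all_mat gives each scenario's excess over
# Instant TPT at 26 weeks, in percentage points.
if (length(idx_26) > 0) {
  excess_26 <- (all_mat[idx_26, ] - all_mat[idx_26, "Instant TPT"]) * 100
  cat("Excess over Instant TPT at", t_target, "weeks (percentage points):\n")
  print(round(excess_26, 2))
}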

Appendix Table A2 of all 2-week delay scenario statistics with 95% UIs

(main analysis: after the delay, the intervention hazard is indexed from the delay point, not from t = 0)

library(mgcv)
library(ggplot2)
library(dplyr)
library(parallel)

# ——————————————————————————————————————————————————————
# 0) Input Parameters - VQUIN data
# ——————————————————————————————————————————————————————

# Sample sizes (VQUIN)
n_control <- 897      # VQUIN control sample size
n_intervention <- 907 # VQUIN intervention sample size

# Person-years of follow-up (VQUIN)
py_control <- 938     # VQUIN control person-years
py_intervention <- 952 # VQUIN intervention person-years

# Monte Carlo parameters
n_simulations <- 5000  # Number of bootstrap simulations
set.seed(123)         # For reproducibility

cat("Analysis Parameters:\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Control person-years:", py_control, "\n")
cat("Intervention person-years:", py_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) Monte Carlo Simulation Function
# ——————————————————————————————————————————————————————

simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  # Get mean predictions and covariance matrix
  pred_mean <- predict(gam_model, newdata = newdata, type = "response")
  
  # Get the covariance matrix of the GAM coefficients
  Vp <- vcov(gam_model, unconditional = TRUE)
  
  # Get the linear predictor matrix
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  
  # Simulate from multivariate normal distribution
  # Generate random coefficients
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  
  # Calculate predictions for each simulation
  pred_sims <- tcrossprod(Xp, coef_sims)
  
  return(pred_sims)
}
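# The returned matrix is on the link scale (identity for Gaussian GAMs such as
# these): one row per prediction time, one column per coefficient draw from
# N(beta_hat, Vp). A 10-draw illustration (assumes control_gam_k5_GCV_VQUIN is
# loaded and the MASS package is installed):
demo_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN,
                                      data.frame(time = 0:26), n_sim = 10)
dim(demo_sims)  # 27 time points x 10 draws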

# ——————————————————————————————————————————————————————
# 2) Bootstrap Scenario Builder (FIXED p0 inconsistency)
# ——————————————————————————————————————————————————————

bootstrap_scenario <- function(sim_idx, control_sims, intervention_sims, delay_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply p0 transformation with a beta draw for p0 to consider uncertainty in p0
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 values 61 and 3948
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  # Enforce bounds and monotonicity: simulated cumulative curves can wiggle,
  # which would make some discrete hazards negative
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  
  # Calculate hazards for this simulation
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / head(1 - c(0, Fc0_sim), -1)
  h_int_sim  <- diff(c(0, Fi0_sim)) / head(1 - c(0, Fi0_sim), -1)
  
  # Build curve for this delay scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 13
  
  # Special handling for pure scenarios
  if(delay_weeks == Inf) {
    # Pure control - FIXED: use p0_draw instead of p0
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention (Instant TPT) - FIXED: use p0_draw instead of p0
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # CORRECTED: Negative delays (pre-screen intervention) - FIXED: use p0_draw instead of p0
    # This means intervention starts BEFORE time 0, providing earlier protection
    
    # Find delay point index
    id_match <- which(time_ext == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    # Before delay point: ramp (no intervention yet)
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At delay point: start intervention
    s_delay <- max(0, delay_weeks + back_wk)
    Fcum[id] <- p0_draw * sqrt(s_delay / back_wk)  # FIXED
    Sprev <- 1 - Fcum[id]
    
    # CORRECTED: After delay point, use intervention hazards consistently
    # Map the post-delay timeline to intervention hazards starting from time 0
    for(i in (id+1):n) {
      current_time <- time_ext[i]
      
      # Calculate how long intervention has been active
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        # Use intervention hazard corresponding to this duration
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        # Use last available intervention hazard
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        # This shouldn't happen with negative delays, but use first intervention hazard as fallback
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Positive delays (post-screen delays) - FIXED: use p0_draw instead of p0
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At screen
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    # Post-screen: control until delay point, then intervention
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= delay_weeks) {
        # Use control hazard until delay point
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        # Use intervention hazard after delay point
        # Map to intervention timeline: intervention has been active for (t0 - delay_weeks) weeks
        intervention_duration <- t0 - delay_weeks
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}

# ——————————————————————————————————————————————————————
# 3) Run Monte Carlo Simulations - Every 2 Week Delays
# ——————————————————————————————————————————————————————

cat("Running Monte Carlo simulations for 2-week delay intervals...\n")

# Reset seed right before simulations for exact reproducibility
set.seed(42)

# Set up time vectors
time0 <- 0:26
back_wk <- 13
time_ext <- seq(-back_wk, 26, by = 1)

# 2-week delay grid: -13 to -1, 0 (baseline), and 2 to 26 weeks
delays_2wk <- c(seq(-13, -1, by = 2), 0, seq(2, 26, by = 2))

# Generate simulated predictions for both models
cat("Generating GAM simulations...\n")
control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

cat("Generated", ncol(control_sims), "control simulations and", ncol(intervention_sims), "intervention simulations\n")

# Define all delay scenarios (including baseline at week 0 = no delay)
all_delays <- setNames(delays_2wk, paste0(delays_2wk, " wk"))

# Run simulations for all scenarios
cat("Running scenario simulations...\n")
simulation_results <- list()

for(scenario_name in names(all_delays)) {
  cat("Processing scenario:", scenario_name, "\n")
  delay_val <- all_delays[scenario_name]
  
  # Run simulations for this scenario
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_scenario(i, control_sims, intervention_sims, delay_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Completed all simulations!\n\n")

# Check if "0 wk" exists and compare a few values
print(simulation_results[["0 wk"]][1:5])  # First 5 simulation results
print(mean(simulation_results[["0 wk"]]))   # Mean for "0 wk" scenario

# ——————————————————————————————————————————————————————
# 4) Calculate Key Metrics with Week 0 as Baseline
# ——————————————————————————————————————————————————————

# Get baseline (Week 0 = no delay) for relative calculations
baseline_sims <- simulation_results[["0 wk"]]
baseline_mean <- mean(baseline_sims)

# Calculate metrics for each scenario
results_list <- list()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  delay_value <- as.numeric(sub(" wk", "", scenario_name))
  
  # Basic statistics
  mean_incidence <- mean(scenario_sims)
  median_incidence <- median(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  # Use intervention PY and sample size for all scenarios
  py <- py_intervention
  sample_size <- n_intervention
  
  # 1. Incidence rate per 100 person-yrs (95% CI)
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  rate_ci_lower <- (ci_lower * sample_size / py) * 100
  rate_ci_upper <- (ci_upper * sample_size / py) * 100
  
  # 2. Mean Expected Cases per 100,000 (95% CI)
  cases_per_100k <- mean_incidence * 100000
  cases_per_100k_ci_lower <- ci_lower * 100000
  cases_per_100k_ci_upper <- ci_upper * 100000
  
  # 3. Excess calculations (compared to Week 0)
  if(delay_value != 0) {
    # Excess incidence simulations
    excess_sims <- scenario_sims - baseline_sims
    excess_mean <- mean(excess_sims)
    excess_ci_lower <- quantile(excess_sims, 0.025)
    excess_ci_upper <- quantile(excess_sims, 0.975)
    
    # Excess rate per 100 person-yrs
    excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
    excess_rate_ci_lower <- (excess_ci_lower * sample_size / py) * 100
    excess_rate_ci_upper <- (excess_ci_upper * sample_size / py) * 100
    
    # Excess cases per 100,000
    excess_cases_per_100k <- excess_mean * 100000
    excess_cases_per_100k_ci_lower <- excess_ci_lower * 100000
    excess_cases_per_100k_ci_upper <- excess_ci_upper * 100000
  } else {
    # Week 0 is the baseline, so excess = 0
    excess_rate_per_100py <- 0
    excess_rate_ci_lower <- 0
    excess_rate_ci_upper <- 0
    excess_cases_per_100k <- 0
    excess_cases_per_100k_ci_lower <- 0
    excess_cases_per_100k_ci_upper <- 0
  }
  
  # 4. Relative Risk
  relative_risk <- mean_incidence / baseline_mean
  
  # Store results
  results_list[[scenario_name]] <- list(
    delay_weeks = delay_value,
    incidence_rate_per_100py = rate_per_100py,
    rate_ci_lower = rate_ci_lower,
    rate_ci_upper = rate_ci_upper,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_rate_ci_lower = excess_rate_ci_lower,
    excess_rate_ci_upper = excess_rate_ci_upper,
    cases_per_100k = cases_per_100k,
    cases_per_100k_ci_lower = cases_per_100k_ci_lower,
    cases_per_100k_ci_upper = cases_per_100k_ci_upper,
    excess_cases_per_100k = excess_cases_per_100k,
    excess_cases_per_100k_ci_lower = excess_cases_per_100k_ci_lower,
    excess_cases_per_100k_ci_upper = excess_cases_per_100k_ci_upper,
    relative_risk = relative_risk
  )
}

# ——————————————————————————————————————————————————————
# 5) Create Export Table for Word
# ——————————————————————————————————————————————————————

# Build the export table
export_table <- data.frame()

for(scenario_name in names(results_list)) {
  r <- results_list[[scenario_name]]
  
  # Format all confidence intervals
  incidence_rate_ci <- sprintf("%.2f (%.2f-%.2f)", 
                              r$incidence_rate_per_100py, r$rate_ci_lower, r$rate_ci_upper)
  
  if(r$delay_weeks == 0) {
    excess_rate_ci <- "BASELINE"
    excess_cases_ci <- "BASELINE"
  } else {
    excess_rate_ci <- sprintf("%+.2f (%+.2f to %+.2f)", 
                             r$excess_rate_per_100py, r$excess_rate_ci_lower, r$excess_rate_ci_upper)
    excess_cases_ci <- sprintf("%+.0f (%+.0f to %+.0f)", 
                              r$excess_cases_per_100k, r$excess_cases_per_100k_ci_lower, r$excess_cases_per_100k_ci_upper)
  }
  
  cases_per_100k_ci <- sprintf("%.0f (%.0f-%.0f)", 
                              r$cases_per_100k, r$cases_per_100k_ci_lower, r$cases_per_100k_ci_upper)
  
  relative_risk_formatted <- sprintf("%.2f", r$relative_risk)
  
  # Add row to export table
  export_table <- rbind(export_table, data.frame(
    `Delay (weeks)` = r$delay_weeks,
    `Incidence Rate per 100 Person-Yrs (95% CI)` = incidence_rate_ci,
    `Excess Incidence Rate per 100 Person-Yrs (95% CI)` = excess_rate_ci,
    `Mean Expected Cases per 100,000 (95% CI)` = cases_per_100k_ci,
    `Excess Cases per 100,000 (95% CI)` = excess_cases_ci,
    `Relative Risk` = relative_risk_formatted,
    stringsAsFactors = FALSE,
    check.names = FALSE
  ))
}

# Sort by delay weeks
export_table <- export_table[order(export_table$`Delay (weeks)`), ]

# Display the table
cat("\n=== EXPORT TABLE FOR WORD ===\n\n")
print(export_table, row.names = FALSE)

# ——————————————————————————————————————————————————————
# 6) Export Options
# ——————————————————————————————————————————————————————

# Option 1: Save as CSV for easy import to Word
write.csv(export_table, "NEW_VQUIN__tb_delays_analysis_table.csv", row.names = FALSE)
cat("\nTable saved as 'NEW_VQUIN__tb_delays_analysis_table.csv'\n")

# Option 2: Create a formatted version for direct copy-paste
cat("\n=== FORMATTED FOR COPY-PASTE TO WORD ===\n")
cat("Delay (weeks)\tIncidence Rate per 100 Person-Yrs (95% CI)\tExcess Incidence Rate per 100 Person-Yrs (95% CI)\tMean Expected Cases per 100,000 (95% CI)\tExcess Cases per 100,000 (95% CI)\tRelative Risk\n")

for(i in 1:nrow(export_table)) {
  cat(paste(export_table[i, ], collapse = "\t"), "\n")
}
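# A further option: write a Word-ready table directly (a sketch; assumes the
# flextable package is installed, and the file name is illustrative).
library(flextable)
ft <- flextable(export_table)
save_as_docx(ft, path = "NEW_VQUIN_tb_delays_analysis_table.docx")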

# Option 3: Summary statistics
cat("\n=== ANALYSIS SUMMARY ===\n")
cat("Monte Carlo simulations:", n_simulations, "\n")
cat("Delay intervals: Every 2 weeks from -26 to +26 weeks\n")
cat("Baseline scenario: Week 0 (no delay)\n")
cat("Sample size:", n_intervention, "\n")
cat("Person-years:", py_intervention, "\n")
cat("Total scenarios analyzed:", nrow(export_table), "\n")

# Option 4: Key findings
best_scenario <- export_table[which.min(export_table$`Delay (weeks)`), ]
worst_scenario <- export_table[which.max(export_table$`Delay (weeks)`), ]

cat("\nBest scenario (most negative delay):", best_scenario$`Delay (weeks)`, "weeks\n")
cat("Worst scenario (most positive delay):", worst_scenario$`Delay (weeks)`, "weeks\n")

cat("\nNote: Negative delays = treatment starts before screening\n")
cat("Positive delays = treatment starts after screening\n")
cat("All excess calculations are relative to Week 0 (no delay)\n")

Bar plot of shortened scenarios (Figure 8)

library(mgcv)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(scales)

# ——————————————————————————————————————————————————————
# Enhanced TB Shortened Treatment Analysis - Using Real GAM Models
# VQUIN ANALYSIS
# ——————————————————————————————————————————————————————

# Input Parameters (VQUIN data)
n_control <- 897
n_intervention <- 907
py_control <- 938
py_intervention <- 952
n_simulations <- 5000
set.seed(123)

cat("Enhanced Shortened Treatment Analysis Parameters (VQUIN):\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Control person-years:", py_control, "\n")
cat("Intervention person-years:", py_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) Monte Carlo Simulation Function (same as in the delay analysis above)
# ——————————————————————————————————————————————————————

simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  # Get mean predictions and covariance matrix
  pred_mean <- predict(gam_model, newdata = newdata, type = "response")
  
  # Get the covariance matrix of the GAM coefficients
  Vp <- vcov(gam_model, unconditional = TRUE)
  
  # Get the linear predictor matrix
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  
  # Simulate from multivariate normal distribution
  # Generate random coefficients
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  
  # Calculate predictions for each simulation
  pred_sims <- tcrossprod(Xp, coef_sims)
  
  return(pred_sims)
}

# ——————————————————————————————————————————————————————
# 2) Bootstrap Shortened Treatment Scenario Builder (adapted from the delay-scenario builder)
# ——————————————————————————————————————————————————————

bootstrap_shortened_scenario <- function(sim_idx, control_sims, intervention_sims, shorten_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply p0 transformation with a beta draw for p0 to consider uncertainty in p0
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 values 61 and 3948
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  
  # Enforce monotonicity
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  # Calculate hazards for this simulation
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / head(1 - c(0, Fc0_sim), -1)
  h_int_sim  <- diff(c(0, Fi0_sim)) / head(1 - c(0, Fi0_sim), -1)
  
  # Build curve for this shortened treatment scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 13  # VQUIN: 3 months (13 weeks) pre-period
  
  # Special handling for reference scenarios
  if(shorten_weeks == Inf) {
    # Pure control - no treatment at all
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(shorten_weeks == -Inf) {
    # Full treatment (26 weeks) - equivalent to full TPT
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Shortened treatment scenarios
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    
    # At screen
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    # Post-screen: intervention until shorten_weeks, then control
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= shorten_weeks) {
        # Use intervention hazard until shortening point
        h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      } else {
        # After shortening, use control hazard at CURRENT time point
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}

# ——————————————————————————————————————————————————————
# 3) Run Enhanced Analysis with extended treatment durations
# ——————————————————————————————————————————————————————

cat("Running enhanced Monte Carlo simulations for shortened treatments...\n")

# Set up time vectors
time0 <- 0:26
back_wk <- 13  # VQUIN: 3 months pre-period
time_ext <- seq(-back_wk, 26, by = 1)

# Treatment durations: every 2 weeks from 2 to 24 weeks
treatment_durations <- c(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24)

# Generate simulated predictions from the fitted VQUIN GAM models
cat("Generating GAM simulations from the fitted VQUIN GAM models...\n")
# NOTE: requires control_gam_k5_GCV_VQUIN and intervention_gam_k5_GCV_VQUIN to be loaded
control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

cat("Generated", ncol(control_sims), "control simulations and", ncol(intervention_sims), "intervention simulations\n")

# Define all scenarios including baseline and control
all_scenarios <- c("Control" = Inf, 
                   setNames(treatment_durations, paste0("TPT ", treatment_durations, " wk")),
                   "Full TPT (26 wk)" = -Inf)

# Run simulations for all scenarios
cat("Running shortened treatment scenario simulations...\n")
simulation_results <- list()

for(scenario_name in names(all_scenarios)) {
  cat("Processing scenario:", scenario_name, "\n")
  shorten_val <- all_scenarios[scenario_name]
  
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_shortened_scenario(i, control_sims, intervention_sims, shorten_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Completed all simulations!\n\n")

# ——————————————————————————————————————————————————————
# 4) Calculate Excess Cases per 100,000 (same methodology as the delay analysis)
# ——————————————————————————————————————————————————————

# Get baseline (Full TPT 26 weeks) for relative calculations
baseline_sims <- simulation_results[["Full TPT (26 wk)"]]
baseline_mean <- mean(baseline_sims)

# Calculate metrics for each scenario
results_list <- list()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  
  # Basic statistics
  mean_incidence <- mean(scenario_sims)
  median_incidence <- median(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  # Determine person-years (Control uses control PY, others use intervention PY)
  py <- ifelse(scenario_name == "Control", py_control, py_intervention)
  sample_size <- ifelse(scenario_name == "Control", n_control, n_intervention)
  
  # 1. Incidence rate per 100 person-yrs (95% CI)
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  rate_ci_lower <- (ci_lower * sample_size / py) * 100
  rate_ci_upper <- (ci_upper * sample_size / py) * 100
  
  # 2. Mean Expected Cases (95% CI) - absolute numbers in original cohort
  mean_cases <- mean_incidence * sample_size
  cases_ci_lower <- ci_lower * sample_size
  cases_ci_upper <- ci_upper * sample_size
  
  # 3. Excess calculations (compared to Full TPT 26 weeks)
  if(scenario_name != "Full TPT (26 wk)") {
    # Excess incidence simulations
    excess_sims <- scenario_sims - baseline_sims
    excess_mean <- mean(excess_sims)
    excess_ci_lower <- quantile(excess_sims, 0.025)
    excess_ci_upper <- quantile(excess_sims, 0.975)
    
    # Excess rate per 100 person-yrs
    excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
    excess_rate_ci_lower <- (excess_ci_lower * sample_size / py) * 100
    excess_rate_ci_upper <- (excess_ci_upper * sample_size / py) * 100
    
    # MODIFIED: Excess cases per 100,000 contacts (not absolute numbers)
    excess_cases_per_100k <- excess_mean * 100000
    excess_cases_per_100k_ci_lower <- excess_ci_lower * 100000
    excess_cases_per_100k_ci_upper <- excess_ci_upper * 100000
    
    # Keep absolute numbers for compatibility
    excess_cases_mean <- excess_mean * sample_size
    excess_cases_ci_lower <- excess_ci_lower * sample_size
    excess_cases_ci_upper <- excess_ci_upper * sample_size
  } else {
    # Full TPT is the baseline, so excess = 0
    excess_rate_per_100py <- 0
    excess_rate_ci_lower <- 0
    excess_rate_ci_upper <- 0
    excess_cases_per_100k <- 0
    excess_cases_per_100k_ci_lower <- 0
    excess_cases_per_100k_ci_upper <- 0
    excess_cases_mean <- 0
    excess_cases_ci_lower <- 0
    excess_cases_ci_upper <- 0
  }
  
  # 4. Relative Risk
  relative_risk <- mean_incidence / baseline_mean
  
  # Store results
  results_list[[scenario_name]] <- list(
    scenario = scenario_name,
    incidence_rate_per_100py = rate_per_100py,
    incidence_rate_ci_lower = rate_ci_lower,
    incidence_rate_ci_upper = rate_ci_upper,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_rate_ci_lower = excess_rate_ci_lower,
    excess_rate_ci_upper = excess_rate_ci_upper,
    mean_cases = mean_cases,
    cases_ci_lower = cases_ci_lower,
    cases_ci_upper = cases_ci_upper,
    excess_cases_per_100k = excess_cases_per_100k,
    excess_cases_per_100k_ci_lower = excess_cases_per_100k_ci_lower,
    excess_cases_per_100k_ci_upper = excess_cases_per_100k_ci_upper,
    excess_cases_mean = excess_cases_mean,
    excess_cases_ci_lower = excess_cases_ci_lower,
    excess_cases_ci_upper = excess_cases_ci_upper,
    relative_risk = relative_risk
  )
}

# ——————————————————————————————————————————————————————
# 5) Prepare Plot Data (MODIFIED for per 100,000)
# ——————————————————————————————————————————————————————

plot_data <- data.frame()

for(scenario_name in names(results_list)) {
  if(scenario_name == "Full TPT (26 wk)") next  # Skip baseline
  
  r <- results_list[[scenario_name]]
  
  # Extract treatment duration for ordering
  if(scenario_name == "Control") {
    treatment_weeks <- 0  # No treatment
    treatment_label <- "0"
    treatment_type <- "No Treatment"
  } else {
    treatment_weeks <- as.numeric(sub("TPT (\\d+) wk", "\\1", scenario_name))
    treatment_label <- as.character(treatment_weeks)
    treatment_type <- ifelse(treatment_weeks <= 12, "Short Treatment", "Moderate Treatment")
  }
  
  plot_data <- rbind(plot_data, data.frame(
    scenario = scenario_name,
    treatment_weeks = treatment_weeks,
    treatment_label = treatment_label,
    excess_cases_per_100k = r$excess_cases_per_100k,
    ci_lower = r$excess_cases_per_100k_ci_lower,
    ci_upper = r$excess_cases_per_100k_ci_upper,
    treatment_type = treatment_type,
    stringsAsFactors = FALSE
  ))
}

# Order by treatment duration
plot_data <- plot_data[order(plot_data$treatment_weeks), ]

# ——————————————————————————————————————————————————————
# 6) Create Enhanced Color-Blind Friendly Bar Plot (UPDATED LABELS)
# ——————————————————————————————————————————————————————

# Set up RColorBrewer Paired palette
paired_colors <- brewer.pal(12, "Paired")

# Assign colors to treatment types
color_mapping <- c(
  "No Treatment" = paired_colors[8],     # Light red
  "Short Treatment" = paired_colors[6],  # Light orange
  "Moderate Treatment" = paired_colors[2] # Light blue
)

# Create the enhanced bar plot
enhanced_plot <- ggplot(plot_data, aes(x = reorder(treatment_label, treatment_weeks), 
                                      y = excess_cases_per_100k, 
                                      fill = treatment_type)) +
  geom_col(width = 0.8, alpha = 0.8, color = "white", size = 0.3) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), 
                width = 0.4, color = "black", size = 0.6, alpha = 0.8) +
  
  # Add value labels on bars with smart positioning
  geom_text(aes(label = sprintf("%.0f", excess_cases_per_100k),
                y = ifelse(excess_cases_per_100k >= 0, 
                          pmax(ci_upper + 150, excess_cases_per_100k + 150),  # Above positive bars
                          pmin(ci_lower - 150, excess_cases_per_100k - 150))), # Below negative bars
            size = 4.2, fontface = "bold", color = "black") +
  
  # Styling
  scale_fill_manual(values = color_mapping, name = "Treatment Duration") +
  scale_y_continuous(labels = number_format(accuracy = 1), 
                     expand = expansion(mult = c(0.1, 0.15))) +
  
  # Reference line at zero
  geom_hline(yintercept = 0, linetype = "solid", color = "black", size = 0.8) +
  
  # UPDATED Labels and theme for VQUIN
  labs(
    title = "VQUIN: Excess TB Cases per 100,000 Contacts by Treatment Duration",
    subtitle = paste0("Compared to Full Treatment (26 weeks) - 95% UI from ", 
                     format(n_simulations, big.mark = ","), " Monte Carlo simulations"),
    x = "Treatment Duration (weeks)",
    y = "Excess TB Cases per 100,000 Contacts (at 26 weeks)",
    caption = "0 weeks = no treatment | Negative values indicate fewer cases than full treatment"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11, color = "black"),
    plot.caption = element_text(size = 9, color = "black"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 13, color = "black"),
    axis.text.y = element_text(size = 13, color = "black"),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "top",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, size = 0.5)
  )

# Display the plot
print(enhanced_plot)
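# Optionally persist the figure to disk (file name and dimensions illustrative):
ggsave("VQUIN_shortened_excess_per_100k.png", enhanced_plot,
       width = 10, height = 6, dpi = 300)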
# ——————————————————————————————————————————————————————
# 7) Summary Statistics Table (UPDATED for per 100,000)
# ——————————————————————————————————————————————————————

cat("\n=== ENHANCED SHORTENED TREATMENT ANALYSIS SUMMARY - VQUIN (Excess per 100,000 contacts) ===\n")
cat("Treatment durations: Every 2 weeks from 2 to 24 weeks, plus no treatment (0 weeks)\n")
cat("Monte Carlo simulations:", format(n_simulations, big.mark = ","), "\n")
cat("Results standardized to per 100,000 contacts over 26 weeks\n")
cat("Baseline: Full TPT (26 weeks)\n\n")

# Print key statistics
summary_table <- plot_data %>%
  select(treatment_label, treatment_type, excess_cases_per_100k, ci_lower, ci_upper) %>%
  mutate(
    excess_cases_ci = sprintf("%.0f (%.0f to %.0f)", excess_cases_per_100k, ci_lower, ci_upper)
  ) %>%
  select(treatment_label, treatment_type, excess_cases_ci)

colnames(summary_table) <- c("Duration (weeks)", "Type", "Excess Cases per 100,000 (95% CI)")

print(summary_table)

cat("\nNote: All values are excess cases per 100,000 contacts compared to full 26-week treatment\n")
cat("Results calculated using VQUIN GAM models and Monte Carlo approach\n")
cat("Units: Excess TB cases per 100,000 contacts over 26 weeks\n")

# ——————————————————————————————————————————————————————
# 8) Key Findings Summary (UPDATED for per 100,000)
# ——————————————————————————————————————————————————————

cat("\n=== KEY FINDINGS (per 100,000 contacts) ===\n")
best_case <- plot_data[which.min(plot_data$excess_cases_per_100k), ]
worst_case <- plot_data[which.max(plot_data$excess_cases_per_100k), ]

cat("Best shortened treatment:", best_case$treatment_label, "weeks ->", 
    sprintf("%.0f excess cases per 100,000", best_case$excess_cases_per_100k), 
    sprintf("(95%% CI: %.0f to %.0f)", best_case$ci_lower, best_case$ci_upper), "\n")

cat("Worst case (No treatment):", worst_case$treatment_label, "weeks ->", 
    sprintf("%.0f excess cases per 100,000", worst_case$excess_cases_per_100k), 
    sprintf("(95%% CI: %.0f to %.0f)", worst_case$ci_lower, worst_case$ci_upper), "\n")

# Calculate relative risks compared to full treatment
full_tpt_mean <- mean(simulation_results[["Full TPT (26 wk)"]])
cat("\nRelative Risks (compared to Full TPT 26 weeks):\n")
for(scenario in names(simulation_results)) {
  if(scenario != "Full TPT (26 wk)") {
    rr <- mean(simulation_results[[scenario]]) / full_tpt_mean
    cat(sprintf("%s: RR = %.2f\n", scenario, rr))
  }
}

Sensitivity analysis

Method 1 – After the delay, the intervention hazard is indexed from time 0 rather than from the delay point – makes a table of delay scenarios with 95% UIs

# ——————————————————————————————————————————————————————
# SENSITIVITY ANALYSIS: Modified Bootstrap Scenario Function
# Change: After delay, use intervention rate from time 0 (not from delay point)
# Output: Excess cases per 100,000 contacts (not absolute cases)
# ——————————————————————————————————————————————————————

bootstrap_scenario_sensitivity <- function(sim_idx, control_sims, intervention_sims, delay_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply p0 transformation with a beta draw for p0 to consider uncertainty in p0
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 values 61 and 3948
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  # Enforce bounds and monotonicity: simulated cumulative curves can wiggle,
  # which would make some discrete hazards negative
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  
  # Calculate hazards for this simulation
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / head(1 - c(0, Fc0_sim), -1)
  h_int_sim  <- diff(c(0, Fi0_sim)) / head(1 - c(0, Fi0_sim), -1)
  
  # Build curve for this delay scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 13
  
  # Special handling for pure scenarios
  if(delay_weeks == Inf) {
    # Pure control - FIXED: use p0_draw instead of p0
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention (Instant TPT) - FIXED: use p0_draw instead of p0
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # CORRECTED: Negative delays (pre-screen intervention) - FIXED: use p0_draw instead of p0
    # This means intervention starts BEFORE time 0, providing earlier protection
    
    # Find delay point index
    id_match <- which(time_ext == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    # Before delay point: ramp (no intervention yet)
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At delay point: start intervention
    s_delay <- max(0, delay_weeks + back_wk)
    Fcum[id] <- p0_draw * sqrt(s_delay / back_wk)  # FIXED
    Sprev <- 1 - Fcum[id]
    
    # CORRECTED: After delay point, use intervention hazards consistently
    # Map the post-delay timeline to intervention hazards starting from time 0
    for(i in (id+1):n) {
      current_time <- time_ext[i]
      
      # Calculate how long intervention has been active
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        # Use intervention hazard corresponding to this duration
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        # Use last available intervention hazard
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        # This shouldn't happen with negative delays, but use first intervention hazard as fallback
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # ——————————————————————————————————————————————————————
    # SENSITIVITY ANALYSIS MODIFICATION HERE:
    # Positive delays (post-screen delays) - MODIFIED LOGIC
    # ——————————————————————————————————————————————————————
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # FIXED
    }
    
    # At screen
    Fcum[i0] <- p0_draw  # FIXED
    Sprev <- 1 - p0_draw  # FIXED
    
    # Post-screen: control until delay point, then intervention
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= delay_weeks) {
        # Use control hazard until delay point
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        # SENSITIVITY ANALYSIS CHANGE:
        # Use intervention hazard corresponding to time from baseline (t0)
        # NOT from delay point (t0 - delay_weeks)
        intervention_time_from_baseline <- t0  # CHANGED: was t0 - delay_weeks
        h_idx <- min(intervention_time_from_baseline + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}
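# The entire sensitivity change reduces to one indexing rule. For a hypothetical
# 6-week delay evaluated at week 10:
delay_demo <- 6; week_demo <- 10
main_idx <- (week_demo - delay_demo) + 1  # main analysis: hazard 4 weeks into treatment (index 5)
sens_idx <- week_demo + 1                 # sensitivity: hazard as if treated since week 0 (index 11)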

# ——————————————————————————————————————————————————————
# Run Sensitivity Analysis with Modified Logic
# ——————————————————————————————————————————————————————

cat("Running SENSITIVITY ANALYSIS with modified intervention timing logic...\n")
cat("Change: After delay, use intervention rate from time 0 (not from delay point)\n")
cat("Output: Excess cases per 100,000 contacts\n\n")

# Set up time vectors (same as before)
time0 <- 0:26
back_wk <- 13
time_ext <- seq(-back_wk, 26, by = 1)

# 2-week delay intervals from -13 to +23 weeks
delays_2wk <- seq(-13, 24, by = 2)

# Generate simulated predictions using your ACTUAL GAM models (same as before)
cat("Using existing GAM simulations...\n")
# NOTE: Assumes control_sims and intervention_sims are already generated

# Define all scenarios including baseline (same as before)
all_delays <- c("Instant TPT" = -Inf, setNames(delays_2wk, paste0(delays_2wk, " wk")), "Control" = Inf)

# Run simulations for all scenarios with MODIFIED FUNCTION
cat("Running SENSITIVITY scenario simulations...\n")
sensitivity_results <- list()

for(scenario_name in names(all_delays)) {
  cat("Processing sensitivity scenario:", scenario_name, "\n")
  delay_val <- all_delays[scenario_name]
  
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_scenario_sensitivity(i, control_sims, intervention_sims, delay_val)  # MODIFIED FUNCTION
  })
  
  sensitivity_results[[scenario_name]] <- scenario_sims
}

cat("Completed SENSITIVITY simulations!\n\n")

# ——————————————————————————————————————————————————————
# Calculate Metrics for Sensitivity Analysis (PER 100,000)
# ——————————————————————————————————————————————————————

# Get baseline (Instant TPT) for relative calculations
sensitivity_baseline_sims <- sensitivity_results[["Instant TPT"]]
sensitivity_baseline_mean <- mean(sensitivity_baseline_sims)

# Calculate metrics for each scenario using per 100,000 conversion
sensitivity_results_list <- list()

for(scenario_name in names(sensitivity_results)) {
  scenario_sims <- sensitivity_results[[scenario_name]]
  
  # Basic statistics
  mean_incidence <- mean(scenario_sims)
  median_incidence <- median(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  # Determine person-years (Control uses control PY, others use intervention PY)
  py <- ifelse(scenario_name == "Control", py_control, py_intervention)
  sample_size <- ifelse(scenario_name == "Control", n_control, n_intervention)
  
  # 1. Incidence rate per 100 person-yrs (95% CI)
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  rate_ci_lower <- (ci_lower * sample_size / py) * 100
  rate_ci_upper <- (ci_upper * sample_size / py) * 100
  
  # 2. Mean Expected Cases per 100,000 contacts (95% CI) - MODIFIED
  mean_cases_per_100k <- mean_incidence * 100000  # CHANGED: was * sample_size
  cases_ci_lower_per_100k <- ci_lower * 100000     # CHANGED: was * sample_size
  cases_ci_upper_per_100k <- ci_upper * 100000     # CHANGED: was * sample_size
  
  # 3. Excess calculations (compared to Instant TPT) - MODIFIED FOR PER 100,000
  if(scenario_name != "Instant TPT") {
    # Excess incidence simulations
    excess_sims <- scenario_sims - sensitivity_baseline_sims
    excess_mean <- mean(excess_sims)
    excess_ci_lower <- quantile(excess_sims, 0.025)
    excess_ci_upper <- quantile(excess_sims, 0.975)
    
    # Excess rate per 100 person-yrs
    excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
    excess_rate_ci_lower <- (excess_ci_lower * sample_size / py) * 100
    excess_rate_ci_upper <- (excess_ci_upper * sample_size / py) * 100
    
    # Excess cases per 100,000 contacts - MODIFIED
    excess_cases_per_100k <- excess_mean * 100000        # CHANGED: was * sample_size
    excess_cases_ci_lower_per_100k <- excess_ci_lower * 100000  # CHANGED: was * sample_size
    excess_cases_ci_upper_per_100k <- excess_ci_upper * 100000  # CHANGED: was * sample_size
  } else {
    # Instant TPT is the baseline, so excess = 0
    excess_rate_per_100py <- 0
    excess_rate_ci_lower <- 0
    excess_rate_ci_upper <- 0
    excess_cases_per_100k <- 0
    excess_cases_ci_lower_per_100k <- 0
    excess_cases_ci_upper_per_100k <- 0
  }
  
  # 4. Relative Risk
  relative_risk <- mean_incidence / sensitivity_baseline_mean
  
  # Store results - UPDATED VARIABLE NAMES
  sensitivity_results_list[[scenario_name]] <- list(
    scenario = scenario_name,
    incidence_rate_per_100py = rate_per_100py,
    incidence_rate_ci_lower = rate_ci_lower,
    incidence_rate_ci_upper = rate_ci_upper,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_rate_ci_lower = excess_rate_ci_lower,
    excess_rate_ci_upper = excess_rate_ci_upper,
    mean_cases_per_100k = mean_cases_per_100k,                     # UPDATED
    cases_ci_lower_per_100k = cases_ci_lower_per_100k,             # UPDATED
    cases_ci_upper_per_100k = cases_ci_upper_per_100k,             # UPDATED
    excess_cases_per_100k = excess_cases_per_100k,                 # UPDATED
    excess_cases_ci_lower_per_100k = excess_cases_ci_lower_per_100k, # UPDATED
    excess_cases_ci_upper_per_100k = excess_cases_ci_upper_per_100k, # UPDATED
    relative_risk = relative_risk
  )
}

# ——————————————————————————————————————————————————————
# Prepare Sensitivity Analysis Plot Data (PER 100,000)
# ——————————————————————————————————————————————————————

sensitivity_plot_data <- data.frame()

for(scenario_name in names(sensitivity_results_list)) {
  if(scenario_name == "Instant TPT") next  # Skip baseline
  
  r <- sensitivity_results_list[[scenario_name]]
  
  # Extract delay value for ordering
  if(scenario_name == "Control") {
    delay_weeks <- 999  # Put at the end
    delay_label <- "No Tx"
    delay_type <- "No Treatment"
  } else {
    delay_weeks <- as.numeric(sub(" wk", "", scenario_name))
    delay_label <- as.character(delay_weeks)
    delay_type <- ifelse(delay_weeks < 0, "Pre-screen", "Post-screen")
  }
  
  sensitivity_plot_data <- rbind(sensitivity_plot_data, data.frame(
    scenario = scenario_name,
    delay_weeks = delay_weeks,
    delay_label = delay_label,
    excess_cases_per_100k = r$excess_cases_per_100k,                     # UPDATED
    ci_lower_per_100k = r$excess_cases_ci_lower_per_100k,                # UPDATED
    ci_upper_per_100k = r$excess_cases_ci_upper_per_100k,                # UPDATED
    delay_type = delay_type,
    stringsAsFactors = FALSE
  ))
}

# Order by delay weeks
sensitivity_plot_data <- sensitivity_plot_data[order(sensitivity_plot_data$delay_weeks), ]

# ——————————————————————————————————————————————————————
# Create Sensitivity Analysis Plot (PER 100,000)
# ——————————————————————————————————————————————————————

# Set up RColorBrewer Paired palette
paired_colors <- brewer.pal(12, "Paired")

# Assign colors to delay types
color_mapping <- c(
  "Pre-screen" = paired_colors[2],    # Light blue
  "Post-screen" = paired_colors[6],   # Light orange  
  "No Treatment" = paired_colors[8]   # Light red
)

# Create the sensitivity analysis bar plot - UPDATED LABELS
sensitivity_plot <- ggplot(sensitivity_plot_data, aes(x = reorder(delay_label, delay_weeks), 
                                      y = excess_cases_per_100k,     # UPDATED
                                      fill = delay_type)) +
  geom_col(width = 0.8, alpha = 0.8, color = "white", size = 0.3) +
  geom_errorbar(aes(ymin = ci_lower_per_100k, ymax = ci_upper_per_100k),  # UPDATED
                width = 0.4, color = "black", size = 0.6, alpha = 0.8) +
  
  # Add value labels on bars with smart positioning - UPDATED
  geom_text(aes(label = sprintf("%.0f", excess_cases_per_100k),           # UPDATED - rounded to nearest whole
                y = ifelse(excess_cases_per_100k >= 0, 
                          pmax(ci_upper_per_100k + 50, excess_cases_per_100k + 50),  # UPDATED
                          pmin(ci_lower_per_100k - 50, excess_cases_per_100k - 50))), # UPDATED
            size = 3.2, fontface = "bold", color = "black") +
  
  # Styling
  scale_fill_manual(values = color_mapping, name = "Treatment Timing") +
  scale_y_continuous(labels = number_format(accuracy = 1),      # UPDATED - whole numbers
                     expand = expansion(mult = c(0.1, 0.15))) +
  
  # Reference line at zero
  geom_hline(yintercept = 0, linetype = "solid", color = "black", size = 0.8) +
  
  # Labels and theme - UPDATED
  labs(
    title = "SENSITIVITY ANALYSIS: TBCHAMP Excess TB Cases by Treatment Delay",
    subtitle = paste0("After delay, intervention follows rate from time 0 (not delay point) | 95% CI from ", 
                     format(n_simulations, big.mark = ","), " Monte Carlo simulations"),
    x = "Treatment Delay (weeks)",
    y = "Excess TB Cases per 100,000 Contacts",  # UPDATED
    caption = "Negative delays = pre-screen treatment | Compared to instant TPT baseline"  # UPDATED
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11, color = "gray40"),
    plot.caption = element_text(size = 9, color = "gray50"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "top",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, size = 0.5)
  )

# Display the sensitivity plot
print(sensitivity_plot)

Sensitivity analysis method 1 for shortened treatments

library(mgcv)
library(ggplot2)
library(dplyr)
library(parallel)

# ——————————————————————————————————————————————————————
# 0) Input Parameters - VQUIN Data
# ——————————————————————————————————————————————————————

# Sample sizes (VQUIN)
n_control <- 897      # VQUIN control sample size
n_intervention <- 907 # VQUIN intervention sample size

# Person-years of follow-up (VQUIN)
py_control <- 938     # VQUIN control person-years
py_intervention <- 952 # VQUIN intervention person-years

# Monte Carlo parameters
n_simulations <- 5000  # Number of bootstrap simulations
set.seed(123)         # For reproducibility

cat("Shortened Treatment Analysis Parameters (TBCHAMP):\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Control person-years:", py_control, "\n")
cat("Intervention person-years:", py_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) Monte Carlo Simulation Function
# ——————————————————————————————————————————————————————

simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  # Get mean predictions and covariance matrix
  pred_mean <- predict(gam_model, newdata = newdata, type = "response")
  
  # Get the covariance matrix of the GAM coefficients
  Vp <- vcov(gam_model, unconditional = TRUE)
  
  # Get the linear predictor matrix
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  
  # Simulate from multivariate normal distribution
  # Generate random coefficients
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  
  # Calculate predictions for each simulation
  pred_sims <- tcrossprod(Xp, coef_sims)
  
  return(pred_sims)
}

# ——————————————————————————————————————————————————————
# 2) Bootstrap Shortened Treatment Scenario Builder
# ——————————————————————————————————————————————————————

bootstrap_shortened_scenario <- function(sim_idx, control_sims, intervention_sims, shorten_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply p0 transformation with a beta draw for p0 to consider uncertainty in p0
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 values 61 and 3948
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  
  # Enforce bounds and monotonicity: simulated cumulative curves can wiggle,
  # which would make some discrete hazards negative
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  # Calculate hazards for this simulation
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / head(1 - c(0, Fc0_sim), -1)
  h_int_sim  <- diff(c(0, Fi0_sim)) / head(1 - c(0, Fi0_sim), -1)
  
  # Build curve for this shortened treatment scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 13
  
  # Special handling for reference scenarios
  if(shorten_weeks == Inf) {
    # Pure control - no treatment at all
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(shorten_weeks == -Inf) {
    # Full treatment (26 weeks) - equivalent to instant TPT
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Shortened treatment scenarios
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    
    # At screen
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    # Post-screen: intervention until shorten_weeks, then control
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= shorten_weeks) {
        # Use intervention hazard until shortening point
        h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      } else {
        # After shortening: use control hazard
        # Map time to control hazard based on time since shortening
        time_since_shorten <- t0 - shorten_weeks
        h_idx <- min(time_since_shorten + 1, length(h_ctrl_sim))
        h_t <- h_ctrl_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}

# ——————————————————————————————————————————————————————
# 3) Run Monte Carlo Simulations
# ——————————————————————————————————————————————————————

cat("Running Monte Carlo simulations for shortened treatments...\n")

# Set up time vectors
time0 <- 0:26
back_wk <- 13
time_ext <- seq(-back_wk, 26, by = 1)
shortened_weeks <- seq(2, 24, by = 2)  # shortened treatment durations: every 2 weeks from 2 to 24 weeks

# Generate simulated predictions for both models
cat("Generating GAM simulations...\n")
control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

cat("Generated", ncol(control_sims), "control simulations and", ncol(intervention_sims), "intervention simulations\n")

# Define all scenarios
all_scenarios <- c("Control" = Inf, "Full TPT (26 wk)" = -Inf, setNames(shortened_weeks, paste0("TPT ", shortened_weeks, " wk")))
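# Scenario coding: Inf = never treat (pure control); -Inf = full 26-week course;
# finite values = TPT given until that week, then control hazards resume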

# Run simulations for all scenarios
cat("Running shortened treatment scenario simulations...\n")
simulation_results <- list()

for(scenario_name in names(all_scenarios)) {
  cat("Processing scenario:", scenario_name, "\n")
  shorten_val <- all_scenarios[scenario_name]
  
  # Run simulations for this scenario
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_shortened_scenario(i, control_sims, intervention_sims, shorten_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Completed all simulations!\n\n")

# ——————————————————————————————————————————————————————
# 4) Calculate Summary Statistics
# ——————————————————————————————————————————————————————

summary_stats <- data.frame(
  Scenario = names(simulation_results),
  Mean_Incidence = sapply(simulation_results, mean),
  Median_Incidence = sapply(simulation_results, median),
  SD_Incidence = sapply(simulation_results, sd),
  CI_Lower = sapply(simulation_results, function(x) quantile(x, 0.025)),
  CI_Upper = sapply(simulation_results, function(x) quantile(x, 0.975)),
  stringsAsFactors = FALSE
)

cat("Summary Statistics (Final Incidence at Week 26):\n")
print(summary_stats)
cat("\n")

# ——————————————————————————————————————————————————————
# 5) Calculate Excess Incidence per 100,000 Contacts
# ——————————————————————————————————————————————————————

# Get baseline simulations (Full TPT 26 weeks)
baseline_sims <- simulation_results[["Full TPT (26 wk)"]]

# Calculate excess for each scenario
excess_stats <- summary_stats %>%
  filter(Scenario != "Full TPT (26 wk)") %>%
  mutate(
    # Point estimates
    Excess_Mean = Mean_Incidence - mean(baseline_sims),
    Excess_Median = Median_Incidence - median(baseline_sims),
    
    # Extract treatment duration information
    Treatment_Weeks = case_when(
      Scenario == "Control" ~ "No Treatment",
      grepl("TPT", Scenario) ~ as.character(as.numeric(sub("TPT (\\d+) wk", "\\1", Scenario))),
      TRUE ~ NA_character_
    ),
    Order = case_when(
      Scenario == "Control" ~ 999,
      TRUE ~ as.numeric(sub("TPT (\\d+) wk", "\\1", Scenario))
    )
  ) %>%
  arrange(Order)

# Calculate bootstrap confidence intervals for excess
for(i in 1:nrow(excess_stats)) {
  scenario_name <- excess_stats$Scenario[i]
  scenario_sims <- simulation_results[[scenario_name]]
  
  # Calculate excess for each simulation
  excess_sims <- scenario_sims - baseline_sims
  
  # Get percentiles
  excess_stats$Excess_CI_Lower[i] <- quantile(excess_sims, 0.025)
  excess_stats$Excess_CI_Upper[i] <- quantile(excess_sims, 0.975)
}

# Convert to per 100,000 contacts
excess_stats <- excess_stats %>%
  mutate(
    Excess_per_100k = Excess_Mean * 100000,
    Excess_per_100k_CI_Lower = Excess_CI_Lower * 100000,
    Excess_per_100k_CI_Upper = Excess_CI_Upper * 100000
  )
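
# e.g. an excess cumulative risk of 0.005 corresponds to 0.005 * 100000 = 500
# excess TB cases per 100,000 contacts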

# ——————————————————————————————————————————————————————
# 6) Final Results Output
# ——————————————————————————————————————————————————————

cat("=== SHORTENED TREATMENT ANALYSIS RESULTS ===\n")
cat("Excess TB Cases per 100,000 Contacts at 26 weeks (compared to Full TPT):\n\n")

# Create final results table
final_results <- excess_stats %>%
  select(Treatment_Weeks, Excess_per_100k, Excess_per_100k_CI_Lower, Excess_per_100k_CI_Upper) %>%
  arrange(as.numeric(ifelse(Treatment_Weeks == "No Treatment", 999, Treatment_Weeks)))

# Print results in a formatted table
cat(sprintf("%-15s %-15s %-20s\n", "Treatment", "Excess Cases", "95% Confidence Interval"))
cat(sprintf("%-15s %-15s %-20s\n", "(weeks)", "per 100,000", "(per 100,000)"))
cat(paste(rep("-", 50), collapse = ""), "\n")

for(i in 1:nrow(final_results)) {
  row <- final_results[i, ]
  cat(sprintf("%-15s %-15.0f (%-6.0f to %-6.0f)\n", 
              row$Treatment_Weeks, 
              row$Excess_per_100k,
              row$Excess_per_100k_CI_Lower,
              row$Excess_per_100k_CI_Upper))
}

cat("\n")
cat("Method: Monte Carlo Bootstrap with", n_simulations, "simulations\n")
cat("Baseline: Full TPT (26 weeks)\n")
cat("Sample size basis:", n_intervention, "intervention participants\n")

# ——————————————————————————————————————————————————————
# 7) Summary Statistics
# ——————————————————————————————————————————————————————

cat("\n=== KEY FINDINGS ===\n")

# Find minimum excess (best shortened treatment)
best_shortened <- final_results %>%
  filter(Treatment_Weeks != "No Treatment") %>%
  slice_min(Excess_per_100k, n = 1)

# Find maximum excess among shortened treatments
worst_shortened <- final_results %>%
  filter(Treatment_Weeks != "No Treatment") %>%
  slice_max(Excess_per_100k, n = 1)

# Control scenario
control_result <- final_results %>%
  filter(Treatment_Weeks == "No Treatment")

cat(sprintf("Best shortened treatment: %s weeks -> %.0f excess cases per 100,000 (95%% CI: %.0f to %.0f)\n",
            best_shortened$Treatment_Weeks, 
            best_shortened$Excess_per_100k,
            best_shortened$Excess_per_100k_CI_Lower,
            best_shortened$Excess_per_100k_CI_Upper))

cat(sprintf("Worst shortened treatment: %s weeks -> %.0f excess cases per 100,000 (95%% CI: %.0f to %.0f)\n",
            worst_shortened$Treatment_Weeks,
            worst_shortened$Excess_per_100k,
            worst_shortened$Excess_per_100k_CI_Lower,
            worst_shortened$Excess_per_100k_CI_Upper))

cat(sprintf("No treatment (Control): %.0f excess cases per 100,000 (95%% CI: %.0f to %.0f)\n",
            control_result$Excess_per_100k,
            control_result$Excess_per_100k_CI_Lower,
            control_result$Excess_per_100k_CI_Upper))

# Final results table for further use (return() is only valid inside a function)
final_results

Getting the control value for the sense check against the 26-week delay scenario: if the two agree, the model is internally consistent

library(mgcv)
library(ggplot2)
library(dplyr)
library(parallel)

# ——————————————————————————————————————————————————————
# 0) Input Parameters - VQUIN DATA
# ——————————————————————————————————————————————————————

# Sample sizes (VQUIN values)
n_control <- 897      
n_intervention <- 907 

# Person-years of follow-up (VQUIN values)
py_control <- 938     
py_intervention <- 952 

# Monte Carlo parameters
n_simulations <- 5000  # Number of bootstrap simulations
set.seed(123)         # For reproducibility

cat("Analysis Parameters (VQUIN):\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Control person-years:", py_control, "\n")
cat("Intervention person-years:", py_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) Monte Carlo Simulation Function
# ——————————————————————————————————————————————————————

simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  # Get mean predictions and covariance matrix
  pred_mean <- predict(gam_model, newdata = newdata, type = "response")
  
  # Get the covariance matrix of the GAM coefficients
  Vp <- vcov(gam_model, unconditional = TRUE)
  
  # Get the linear predictor matrix
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  
  # Simulate from multivariate normal distribution
  # Generate random coefficients
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  
  # Calculate predictions for each simulation
  pred_sims <- tcrossprod(Xp, coef_sims)
  
  return(pred_sims)
}

# ——————————————————————————————————————————————————————
# 2) Bootstrap Scenario Builder (VQUIN p0 values)
# ——————————————————————————————————————————————————————

bootstrap_scenario <- function(sim_idx, control_sims, intervention_sims, delay_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply p0 transformation with VQUIN p0 values
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 values
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  # Clamp to [0, 0.999] and enforce monotonicity: simulated cumulative curves
  # can wiggle, which would make the discrete hazards negative
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  # Discrete hazards: h_t = (F_t - F_{t-1}) / (1 - F_{t-1}), as above
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / (1 - head(c(0, Fc0_sim), -1))
  h_int_sim <- diff(c(0, Fi0_sim)) / (1 - head(c(0, Fi0_sim), -1))
  
  # Build curve for this delay scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 13  # VQUIN back_wk value
  
  # Special handling for pure scenarios
  if(delay_weeks == Inf) {
    # Pure control: pre-screen ramp, then control hazards throughout
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention (instant TPT): pre-screen ramp, then intervention hazards
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # Negative delays (pre-screen intervention):
    # the intervention starts BEFORE time 0, providing earlier protection
    
    # Find delay point index
    id_match <- which(time_ext == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    # Before delay point: ramp (no intervention yet)
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    
    # At delay point: start intervention
    s_delay <- max(0, delay_weeks + back_wk)
    Fcum[id] <- p0_draw * sqrt(s_delay / back_wk)
    Sprev <- 1 - Fcum[id]
    
    # After the delay point, use intervention hazards consistently:
    # map the post-delay timeline to intervention hazards starting from time 0
    for(i in (id+1):n) {
      current_time <- time_ext[i]
      
      # Calculate how long intervention has been active
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        # Use intervention hazard corresponding to this duration
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        # Use last available intervention hazard
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        # This shouldn't happen with negative delays, but use first intervention hazard as fallback
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Positive delays (post-screen delays)
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)
    }
    
    # At screen
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    # Post-screen: control until delay point, then intervention
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= delay_weeks) {
        # Use control hazard until delay point
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        # Use intervention hazard after delay point
        # Map to intervention timeline: intervention has been active for (t0 - delay_weeks) weeks
        intervention_duration <- t0 - delay_weeks
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}
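
# Scenario coding used below: delay_weeks = Inf -> pure control (never treat);
# -Inf -> instant TPT; negative -> TPT starts before screening; positive -> after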

# ——————————————————————————————————————————————————————
# 3) Run Monte Carlo Simulations - Every 2 Week Delays
# ——————————————————————————————————————————————————————

cat("Running Monte Carlo simulations for 2-week delay intervals (VQUIN)...\n")

# Reset seed right before simulations for exact reproducibility
set.seed(42)

# Set up time vectors (VQUIN parameters)
time0 <- 0:26
back_wk <- 13  # VQUIN back_wk
time_ext <- seq(-back_wk, 26, by = 1)

# Create 2-week delay intervals from -13 to +26 weeks (VQUIN range)
delays_2wk <- c(seq(-13, -1, by = 2), 0, seq(2, 26, by = 2))

# Generate simulated predictions for VQUIN models
cat("Generating GAM simulations (VQUIN models)...\n")
control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

cat("Generated", ncol(control_sims), "control simulations and", ncol(intervention_sims), "intervention simulations\n")

# Define all delay scenarios (including baseline at week 0 = no delay)
all_delays <- c(setNames(delays_2wk, paste0(delays_2wk, " wk")), 
                "Control" = Inf)  # Add pure control scenario

# Run simulations for all scenarios
cat("Running scenario simulations...\n")
simulation_results <- list()

for(scenario_name in names(all_delays)) {
  cat("Processing scenario:", scenario_name, "\n")
  delay_val <- all_delays[scenario_name]
  
  # Run simulations for this scenario
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_scenario(i, control_sims, intervention_sims, delay_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Completed all simulations!\n\n")

# Check if "0 wk" exists and compare a few values
print(simulation_results[["0 wk"]][1:5])  # First 5 simulation results
print(mean(simulation_results[["0 wk"]]))   # Mean for "0 wk" scenario
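
# Sense check referenced above (sketch): with a 26-week delay, TPT never starts
# inside the 26-week window, so the "26 wk" scenario should match the pure control
# scenario up to Monte Carlo noise (both apply control hazards throughout)
if(all(c("26 wk", "Control") %in% names(simulation_results))) {
  cat("Mean incidence, 26-week delay:", mean(simulation_results[["26 wk"]]), "\n")
  cat("Mean incidence, pure control:", mean(simulation_results[["Control"]]), "\n")
}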

# ——————————————————————————————————————————————————————
# 4) Calculate Key Metrics with Week 0 as Baseline
# ——————————————————————————————————————————————————————

# Get baseline (Week 0 = no delay) for relative calculations
baseline_sims <- simulation_results[["0 wk"]]
baseline_mean <- mean(baseline_sims)

# Calculate metrics for each scenario
results_list <- list()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  
  # Handle special case for Control scenario
  if(scenario_name == "Control") {
    delay_value <- Inf  # Or 999 for sorting purposes
  } else {
    delay_value <- as.numeric(sub(" wk", "", scenario_name))
  }
  
  # Basic statistics
  mean_incidence <- mean(scenario_sims)
  median_incidence <- median(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  # Use intervention PY and sample size for all scenarios
  py <- py_intervention
  sample_size <- n_intervention
  
  # 1. Incidence rate per 100 person-yrs (95% CI)
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  rate_ci_lower <- (ci_lower * sample_size / py) * 100
  rate_ci_upper <- (ci_upper * sample_size / py) * 100
  
  # 2. Mean Expected Cases per 100,000 (95% CI)
  cases_per_100k <- mean_incidence * 100000
  cases_per_100k_ci_lower <- ci_lower * 100000
  cases_per_100k_ci_upper <- ci_upper * 100000
  
  # 3. Excess calculations (compared to Week 0)
  if(delay_value != 0) {
      # Excess incidence simulations
      excess_sims <- scenario_sims - baseline_sims
      excess_mean <- mean(excess_sims)
      excess_ci_lower <- quantile(excess_sims, 0.025)
      excess_ci_upper <- quantile(excess_sims, 0.975)
      
      # Excess rate per 100 person-yrs
      excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
      excess_rate_ci_lower <- (excess_ci_lower * sample_size / py) * 100
      excess_rate_ci_upper <- (excess_ci_upper * sample_size / py) * 100
      
      # Excess cases per 100,000
      excess_cases_per_100k <- excess_mean * 100000
      excess_cases_per_100k_ci_lower <- excess_ci_lower * 100000
      excess_cases_per_100k_ci_upper <- excess_ci_upper * 100000
  } else {
      # Week 0 is the baseline, so excess = 0
      excess_rate_per_100py <- 0
      excess_rate_ci_lower <- 0
      excess_rate_ci_upper <- 0
      excess_cases_per_100k <- 0
      excess_cases_per_100k_ci_lower <- 0
      excess_cases_per_100k_ci_upper <- 0
  }
  
  # 4. Relative Risk
  relative_risk <- mean_incidence / baseline_mean
  
  # Store results
  results_list[[scenario_name]] <- list(
    delay_weeks = delay_value,
    incidence_rate_per_100py = rate_per_100py,
    rate_ci_lower = rate_ci_lower,
    rate_ci_upper = rate_ci_upper,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_rate_ci_lower = excess_rate_ci_lower,
    excess_rate_ci_upper = excess_rate_ci_upper,
    cases_per_100k = cases_per_100k,
    cases_per_100k_ci_lower = cases_per_100k_ci_lower,
    cases_per_100k_ci_upper = cases_per_100k_ci_upper,
    excess_cases_per_100k = excess_cases_per_100k,
    excess_cases_per_100k_ci_lower = excess_cases_per_100k_ci_lower,
    excess_cases_per_100k_ci_upper = excess_cases_per_100k_ci_upper,
    relative_risk = relative_risk
  )
}

# ——————————————————————————————————————————————————————
# 5) Create Export Table for Word
# ——————————————————————————————————————————————————————

# Build the export table
export_table <- data.frame()

for(scenario_name in names(results_list)) {
  r <- results_list[[scenario_name]]
  
  # Format all confidence intervals
  incidence_rate_ci <- sprintf("%.2f (%.2f-%.2f)", 
                              r$incidence_rate_per_100py, r$rate_ci_lower, r$rate_ci_upper)
  
  if(r$delay_weeks == 0) {
    excess_rate_ci <- "BASELINE"
    excess_cases_ci <- "BASELINE"
  } else {
    excess_rate_ci <- sprintf("%+.2f (%+.2f to %+.2f)", 
                             r$excess_rate_per_100py, r$excess_rate_ci_lower, r$excess_rate_ci_upper)
    excess_cases_ci <- sprintf("%+.0f (%+.0f to %+.0f)", 
                              r$excess_cases_per_100k, r$excess_cases_per_100k_ci_lower, r$excess_cases_per_100k_ci_upper)
  }
  
  cases_per_100k_ci <- sprintf("%.0f (%.0f-%.0f)", 
                              r$cases_per_100k, r$cases_per_100k_ci_lower, r$cases_per_100k_ci_upper)
  
  relative_risk_formatted <- sprintf("%.2f", r$relative_risk)
  
  # Add row to export table
  export_table <- rbind(export_table, data.frame(
    `Delay (weeks)` = r$delay_weeks,
    `Incidence Rate per 100 Person-Yrs (95% CI)` = incidence_rate_ci,
    `Excess Incidence Rate per 100 Person-Yrs (95% CI)` = excess_rate_ci,
    `Mean Expected Cases per 100,000 (95% CI)` = cases_per_100k_ci,
    `Excess Cases per 100,000 (95% CI)` = excess_cases_ci,
    `Relative Risk` = relative_risk_formatted,
    stringsAsFactors = FALSE,
    check.names = FALSE
  ))
}

# Sort by delay weeks
export_table <- export_table[order(export_table$`Delay (weeks)`), ]

# Display the table
cat("\n=== EXPORT TABLE FOR WORD (VQUIN) ===\n\n")
print(export_table, row.names = FALSE)

# ——————————————————————————————————————————————————————
# 6) Export Options
# ——————————————————————————————————————————————————————

# Option 1: Save as CSV for easy import to Word
write.csv(export_table, "VQUIN_tb_delays_analysis_table.csv", row.names = FALSE)
cat("\nTable saved as 'VQUIN_tb_delays_analysis_table.csv'\n")

# Option 2: Create a formatted version for direct copy-paste
cat("\n=== FORMATTED FOR COPY-PASTE TO WORD ===\n")
cat("Delay (weeks)\tIncidence Rate per 100 Person-Yrs (95% CI)\tExcess Incidence Rate per 100 Person-Yrs (95% CI)\tMean Expected Cases per 100,000 (95% CI)\tExcess Cases per 100,000 (95% CI)\tRelative Risk\n")

for(i in 1:nrow(export_table)) {
  cat(paste(export_table[i, ], collapse = "\t"), "\n")
}

# Option 3: Summary statistics
cat("\n=== ANALYSIS SUMMARY (VQUIN) ===\n")
cat("Monte Carlo simulations:", n_simulations, "\n")
cat("Delay intervals: Every 2 weeks from -13 to +26 weeks\n")
cat("Baseline scenario: Week 0 (no delay)\n")
cat("Sample size:", n_intervention, "\n")
cat("Person-years:", py_intervention, "\n")
cat("Total scenarios analyzed:", nrow(export_table), "\n")

# Option 4: Key findings
best_scenario <- export_table[which.min(as.numeric(gsub("[^-0-9.]", "", export_table$`Delay (weeks)`))), ]
worst_scenario <- export_table[which.max(as.numeric(gsub("[^-0-9.]", "", export_table$`Delay (weeks)`))), ]

cat("\nBest scenario (most negative delay):", best_scenario$`Delay (weeks)`, "weeks\n")
cat("Worst scenario (most positive delay):", worst_scenario$`Delay (weeks)`, "weeks\n")

cat("\nNote: Negative delays = treatment starts before screening\n")
cat("Positive delays = treatment starts after screening\n")
cat("All excess calculations are relative to Week 0 (no delay)\n")

Method 2 – varying the time since diagnosis parameter (Figure 9)

Delays with L = 6 months, i.e. a 26-week pre-curve (the main analysis already uses L = 3 months, a 13-week pre-curve)
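
To see what Method 2 changes, here is a small sketch (hypothetical p0 = 0.015, for illustration only) of the square-root pre-screen ramp p0 * sqrt(s / back_wk) under the two pre-curve lengths: doubling the assumed time since household diagnosis spreads the prevalent burden over a longer window, so less of it accrues in any given week before screening.

# Pre-curve ramp comparison (sketch; the p0 value is illustrative only)
p0_demo <- 0.015
s <- seq(0, 13, by = 3.25)  # weeks since household diagnosis
round(p0_demo * sqrt(s / 13), 5)  # L = 3 months (13-week pre-curve, main analysis)
round(p0_demo * sqrt(s / 26), 5)  # L = 6 months (26-week pre-curve, this analysis)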

library(mgcv)
library(ggplot2)
library(dplyr)
library(readr)
library(tidyr)

# ——————————————————————————————————————————————————————
# VQUIN Pre-curve Comparison: Auto-detect CSV delays
# ——————————————————————————————————————————————————————

# Input Parameters - VQUIN
n_control <- 897       # VQUIN control sample size (as in the main analysis)
n_intervention <- 907  # VQUIN intervention sample size
py_control <- 938      # VQUIN control person-years
py_intervention <- 952 # VQUIN intervention person-years
n_simulations <- 5000
set.seed(123)

cat("VQUIN Pre-curve Length Comparison:\n")
cat("Auto-detecting delays from CSV file for perfect alignment\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) First, load CSV and detect what delays are actually present
# ——————————————————————————————————————————————————————

cat("=== STEP 1: DETECTING CSV DELAYS ===\n")

if(file.exists("VQUIN_tb_delays_analysis_table.csv")) {
  # Load CSV and inspect structure
  vquin_csv_raw <- read_csv("VQUIN_tb_delays_analysis_table.csv")
  
  cat("CSV loaded successfully\n")
  cat("CSV columns found:\n")
  print(colnames(vquin_csv_raw))
  cat("\nFirst few delay values:\n")
  print(head(vquin_csv_raw$`Delay (weeks)`, 10))
  
  # Extract numeric delays from the CSV
  parse_ci_value <- function(ci_string) {
    as.numeric(sub(" \\(.*\\)", "", ci_string))
  }
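  # e.g. parse_ci_value("2.88 (1.20-4.50)") returns 2.88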
  
  vquin_26wk_clean <- vquin_csv_raw %>%
    mutate(
      delay_weeks = case_when(
        `Delay (weeks)` %in% c("Instant TPT", "-Inf") ~ -999,
        `Delay (weeks)` %in% c("Control", "Inf") ~ 999,  # the CSV stores the pure-control row as Inf
        TRUE ~ suppressWarnings(as.numeric(`Delay (weeks)`))
      ),
      vquin_26wk_excess_cases_per_100k = parse_ci_value(`Excess Cases per 100,000 (95% CI)`),
      vquin_26wk_incidence_rate = parse_ci_value(`Incidence Rate per 100 Person-Yrs (95% CI)`),
      vquin_26wk_excess_rate = parse_ci_value(`Excess Incidence Rate per 100 Person-Yrs (95% CI)`),
      vquin_26wk_relative_risk = `Relative Risk`
    ) %>%
    select(delay_weeks, vquin_26wk_excess_cases_per_100k, vquin_26wk_incidence_rate, vquin_26wk_excess_rate, vquin_26wk_relative_risk) %>%
    filter(!is.na(delay_weeks))  # Remove rows where delay parsing failed
  
  # Extract the actual delays present in CSV (excluding baselines)
  csv_delays <- vquin_26wk_clean %>%
    filter(delay_weeks != -999 & delay_weeks != 999) %>%
    arrange(delay_weeks) %>%
    pull(delay_weeks)
  
  cat("\nACTUAL DELAYS FOUND IN CSV:\n")
  cat("Delays:", paste(csv_delays, collapse = ", "), "\n")
  cat("Number of delay scenarios:", length(csv_delays), "\n\n")
  
} else {
  stop("ERROR: VQUIN_tb_delays_analysis_table.csv not found in working directory")
}

# ——————————————————————————————————————————————————————
# 2) Run NEW analysis using EXACTLY the same delays as CSV
# ——————————————————————————————————————————————————————

cat("=== STEP 2: RUNNING NEW ANALYSIS WITH MATCHED DELAYS ===\n")

# Monte Carlo Simulation Function
simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  pred_mean <- predict(gam_model, newdata = newdata, type = "response")
  Vp <- vcov(gam_model, unconditional = TRUE)
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  pred_sims <- tcrossprod(Xp, coef_sims)
  return(pred_sims)
}

# Bootstrap Scenario Builder - SQUARE ROOT PRE-CURVE (26-week)
bootstrap_scenario_linear <- function(sim_idx, control_sims, intervention_sims, delay_weeks) {
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Beta draw for p0 - VQUIN values (events = 61, N = 3948), as in the main analysis
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  
  # Enforce monotonicity
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  # Calculate hazards
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / (1 - head(c(0, Fc0_sim), -1))
  h_int_sim <- diff(c(0, Fi0_sim)) / (1 - head(c(0, Fi0_sim), -1))
  
  # Build curve
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  i0 <- which(time_ext == 0)
  back_wk <- 26  # 26-week pre-curve (the main analysis uses 13)
  
  if(delay_weeks == Inf) {
    # Pure control - SQUARE ROOT PRE-CURVE
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # SQUARE ROOT
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention - SQUARE ROOT PRE-CURVE
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # SQUARE ROOT
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # Negative delays - SQUARE ROOT PRE-CURVE
    id_match <- which(time_ext == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_draw * sqrt(s_up / back_wk)  # SQUARE ROOT
    }
    
    s_delay <- max(0, delay_weeks + back_wk)
    Fcum[id] <- p0_draw * sqrt(s_delay / back_wk)  # SQUARE ROOT
    Sprev <- 1 - Fcum[id]
    
    for(i in (id+1):n) {
      current_time <- time_ext[i]
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Positive delays - SQUARE ROOT PRE-CURVE
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * sqrt(s_up / back_wk)  # SQUARE ROOT
    }
    
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= delay_weeks) {
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        intervention_duration <- t0 - delay_weeks
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}

# Set up time vectors
time0 <- 0:26
back_wk <- 26  # 26-week pre-curve (the main analysis uses 13)
time_ext <- seq(-back_wk, 26, by = 1)

cat("Using EXACT CSV delays:", paste(csv_delays, collapse = ", "), "\n")

# Check GAM models
if(!exists("control_gam_k5_GCV_VQUIN") || !exists("intervention_gam_k5_GCV_VQUIN")) {
  stop("ERROR: VQUIN GAM models not found. Please load:\n  - control_gam_k5_GCV_VQUIN\n  - intervention_gam_k5_GCV_VQUIN")
}

# Generate simulations
control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

# Define scenarios using CSV delays
all_delays <- c("Instant TPT" = -Inf, setNames(csv_delays, paste0(csv_delays, " wk")), "Control" = Inf)

# Run simulations
cat("Running", length(all_delays), "scenarios...\n")
simulation_results <- list()

for(scenario_name in names(all_delays)) {
  delay_val <- all_delays[scenario_name]
  
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_scenario_linear(i, control_sims, intervention_sims, delay_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Monte Carlo simulations completed!\n\n")

# ——————————————————————————————————————————————————————
# 3) Calculate metrics and create results for comparison
# ——————————————————————————————————————————————————————

baseline_sims <- simulation_results[["Instant TPT"]]
baseline_mean <- mean(baseline_sims)

results_list <- list()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  
  mean_incidence <- mean(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  py <- ifelse(scenario_name == "Control", py_control, py_intervention)
  sample_size <- ifelse(scenario_name == "Control", n_control, n_intervention)
  
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  
  if(scenario_name != "Instant TPT") {
    excess_sims <- scenario_sims - baseline_sims
    excess_mean <- mean(excess_sims)
    excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
    # Convert excess cases to per 100,000 population
    excess_cases_per_100k <- excess_mean * 100000
  } else {
    excess_rate_per_100py <- 0
    excess_cases_per_100k <- 0
  }
  
  relative_risk <- mean_incidence / baseline_mean
  
  results_list[[scenario_name]] <- list(
    scenario = scenario_name,
    incidence_rate_per_100py = rate_per_100py,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_cases_per_100k = excess_cases_per_100k,
    relative_risk = relative_risk
  )
}

# Convert to comparable format
vquin_26wk_results <- data.frame()

for(scenario_name in names(results_list)) {
  r <- results_list[[scenario_name]]
  
  if(scenario_name == "Instant TPT") {
    delay_weeks <- -999
  } else if(scenario_name == "Control") {
    delay_weeks <- 999
  } else {
    delay_weeks <- as.numeric(sub(" wk", "", scenario_name))
  }
  
  vquin_26wk_results <- rbind(vquin_26wk_results, data.frame(
    scenario = scenario_name,
    delay_weeks = delay_weeks,
    vquin_26wk_excess_cases_per_100k = r$excess_cases_per_100k,
    vquin_26wk_incidence_rate = r$incidence_rate_per_100py,
    vquin_26wk_excess_rate = r$excess_rate_per_100py,
    vquin_26wk_relative_risk = r$relative_risk,
    stringsAsFactors = FALSE
  ))
}

# ——————————————————————————————————————————————————————
# 4) Perfect Comparison
# ——————————————————————————————————————————————————————

cat("=== STEP 3: PERFECT COMPARISON (NO NAs) ===\n")

# Merge datasets
comparison_data <- merge(vquin_26wk_clean, vquin_26wk_results, by = "delay_weeks", all = FALSE)  # all=FALSE ensures only matching rows

# Focus on early benefit scenarios
early_benefit_comparison <- comparison_data %>%
  filter(delay_weeks < 0 & delay_weeks != -999) %>%
  select(delay_weeks, scenario, 
         vquin_26wk_excess_cases_per_100k.x, vquin_26wk_excess_cases_per_100k.y,
         vquin_26wk_excess_rate.x, vquin_26wk_excess_rate.y,
         vquin_26wk_relative_risk.x, vquin_26wk_relative_risk.y) %>%
  rename(
    csv_excess_cases_per_100k = vquin_26wk_excess_cases_per_100k.x,
    new_excess_cases_per_100k = vquin_26wk_excess_cases_per_100k.y,
    csv_excess_rate = vquin_26wk_excess_rate.x,
    new_excess_rate = vquin_26wk_excess_rate.y,
    csv_relative_risk = vquin_26wk_relative_risk.x,
    new_relative_risk = vquin_26wk_relative_risk.y
  ) %>%
  mutate(
    diff_excess_cases_per_100k = new_excess_cases_per_100k - csv_excess_cases_per_100k,
    diff_excess_rate = new_excess_rate - csv_excess_rate,
    diff_relative_risk = new_relative_risk - csv_relative_risk,
    pct_diff_excess_cases = ifelse(!is.na(csv_excess_cases_per_100k) & csv_excess_cases_per_100k != 0,
                                  (diff_excess_cases_per_100k / abs(csv_excess_cases_per_100k)) * 100, NA),
    pct_diff_excess_rate = ifelse(!is.na(csv_excess_rate) & csv_excess_rate != 0,
                                 (diff_excess_rate / abs(csv_excess_rate)) * 100, NA)
  ) %>%
  arrange(delay_weeks)

cat("COMPARISON DETAILS:\n")
cat("CSV data: -26 week pre-curve (square-root function)\n")
cat("New analysis: -26 week pre-curve (square-root function)\n")  
cat("Population: Adults, South Africa (VQUIN)\n")
cat("Perfect delay alignment: NO NAs!\n\n")

if(nrow(early_benefit_comparison) > 0) {
  cat("EARLY BENEFIT SCENARIOS COMPARISON:\n")
  cat("===================================\n")
  cat("Number of matched scenarios:", nrow(early_benefit_comparison), "\n")
  cat("Delay range:", min(early_benefit_comparison$delay_weeks), "to", max(early_benefit_comparison$delay_weeks), "weeks\n\n")
  
  # Create detailed comparison table with clear per 100,000 labeling
  detailed_comparison <- early_benefit_comparison %>%
    mutate(
      `Delay (weeks)` = delay_weeks,
      `CSV Excess Cases per 100k` = sprintf("%.1f", csv_excess_cases_per_100k),
      `New Excess Cases per 100k` = sprintf("%.1f", new_excess_cases_per_100k),
      `Difference per 100k` = sprintf("%+.1f", diff_excess_cases_per_100k),
      `% Change` = sprintf("%+.1f%%", pct_diff_excess_cases),
      `CSV Rate` = sprintf("%.3f", csv_excess_rate),
      `New Rate` = sprintf("%.3f", new_excess_rate),
      `Rate Diff` = sprintf("%+.3f", diff_excess_rate)
    ) %>%
    select(`Delay (weeks)`, `CSV Excess Cases per 100k`, `New Excess Cases per 100k`, `Difference per 100k`, `% Change`,
           `CSV Rate`, `New Rate`, `Rate Diff`)
  
  print(detailed_comparison)
  
  # Summary statistics
  finite_cases_diff <- early_benefit_comparison$diff_excess_cases_per_100k[is.finite(early_benefit_comparison$diff_excess_cases_per_100k)]
  finite_pct_diff <- early_benefit_comparison$pct_diff_excess_cases[is.finite(early_benefit_comparison$pct_diff_excess_cases)]
  
  if(length(finite_cases_diff) > 0) {
    cat("\nSUMMARY STATISTICS:\n")
    cat("==================\n")
    cat("Mean difference (New - CSV):", sprintf("%.1f", mean(finite_cases_diff)), "excess cases per 100,000\n")
    cat("Range:", sprintf("%.1f to %.1f", min(finite_cases_diff), max(finite_cases_diff)), "excess cases per 100,000\n")
    
    if(length(finite_pct_diff) > 0) {
      cat("Mean percentage change:", sprintf("%.1f%%", mean(finite_pct_diff)), "\n")
      cat("Range:", sprintf("%.1f%% to %.1f%%", min(finite_pct_diff), max(finite_pct_diff)), "\n")
    }
    
    cat("\nINTERPRETATION:\n")
    mean_diff <- mean(finite_cases_diff)
    if(abs(mean_diff) < 50) {
      cat("• MINIMAL impact: Both analyses use -26 week pre-curve, showing <50 cases per 100,000 average difference\n")
    } else if(abs(mean_diff) < 200) {
      cat("• MODERATE difference: Both analyses use -26 week pre-curve, showing", sprintf("%.0f", abs(mean_diff)), "cases per 100,000 average difference\n")
    } else {
      cat("• SUBSTANTIAL difference: Both analyses use -26 week pre-curve, showing", sprintf("%.0f", abs(mean_diff)), "cases per 100,000 average difference\n")
    }
    
    if(abs(mean_diff) < 10) {
      cat("• Method consistency: Results are nearly identical, confirming methodological alignment\n")
    } else {
      cat("• Method differences: Some variation suggests potential methodological differences beyond pre-curve length\n")
    }
  }
  
} else {
  cat("No early benefit scenarios found for comparison.\n")
}

# ——————————————————————————————————————————————————————
# 5) Create VQUIN Sensitivity Analysis Barplot (like original format)
# ——————————————————————————————————————————————————————

cat("=== STEP 4: CREATING SENSITIVITY ANALYSIS BARPLOT ===\n")

# Calculate confidence intervals for all scenarios from simulation results
plot_results <- data.frame()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  baseline_sims <- simulation_results[["Instant TPT"]]
  
  if(scenario_name == "Instant TPT") {
    delay_weeks <- -999
    excess_cases_per_100k <- 0
    ci_lower <- 0
    ci_upper <- 0
  } else if(scenario_name == "Control") {
    delay_weeks <- 999
    excess_sims <- scenario_sims - baseline_sims
    excess_cases_per_100k <- mean(excess_sims) * 100000
    ci_lower <- quantile(excess_sims, 0.025) * 100000
    ci_upper <- quantile(excess_sims, 0.975) * 100000
  } else {
    delay_weeks <- as.numeric(sub(" wk", "", scenario_name))
    excess_sims <- scenario_sims - baseline_sims
    excess_cases_per_100k <- mean(excess_sims) * 100000
    ci_lower <- quantile(excess_sims, 0.025) * 100000
    ci_upper <- quantile(excess_sims, 0.975) * 100000
  }
  
  # Determine treatment timing category
  treatment_timing <- case_when(
    delay_weeks == 999 ~ "No Treatment",
    delay_weeks > 0 ~ "Post-screen",
    delay_weeks <= 0 & delay_weeks != -999 ~ "Pre-screen",
    delay_weeks == -999 ~ "Instant TPT"
  )
  
  plot_results <- rbind(plot_results, data.frame(
    delay_weeks = delay_weeks,
    scenario_name = scenario_name,
    excess_cases_per_100k = excess_cases_per_100k,
    ci_lower = ci_lower,
    ci_upper = ci_upper,
    treatment_timing = treatment_timing,
    stringsAsFactors = FALSE
  ))
}

# Filter and prepare for plotting: exclude the instant-TPT baseline and the
# 0-week and 26-week delays, and keep delays within the -26 to +26 range
plot_data <- plot_results %>%
  filter(delay_weeks != -999 & delay_weeks >= -26 & delay_weeks != 0 & delay_weeks != 26 & delay_weeks != Inf) %>%
  arrange(delay_weeks) %>%
  mutate(
    delay_label = ifelse(delay_weeks == 999, "No TPT", as.character(delay_weeks)),
    delay_order = ifelse(delay_weeks == 999, max(delay_weeks[delay_weeks != 999]) + 5, delay_weeks)
  ) %>%
  arrange(delay_order)

# Create sensitivity analysis barplot
sensitivity_plot <- ggplot(plot_data, aes(x = reorder(delay_label, delay_order), y = excess_cases_per_100k, fill = treatment_timing)) +
  geom_col(alpha = 0.8, width = 0.7) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.2, color = "black", alpha = 0.7) +
  scale_fill_manual(values = c("Pre-screen" = "#2E86AB", "Post-screen" = "#E74C3C", "No Treatment" = "#F39C12")) +
  geom_text(aes(label = sprintf("%.0f", excess_cases_per_100k),
              y = ifelse(excess_cases_per_100k >= 0, 
                        pmax(ci_upper + 150, excess_cases_per_100k + 150),  # Above positive bars
                        pmin(ci_lower - 150, excess_cases_per_100k - 150))), # Below negative bars
          size = 4.2, fontface = "bold", color = "black") +
  labs(
    title = "SENSITIVITY ANALYSIS Method 2: VQUIN Excess TB Cases by Treatment Delay",
    subtitle = "26-week pre-curve assumption (square root) | Excess cases per 100,000 contacts at 26 weeks",
    x = "Treatment Delay (weeks)",
    y = "Excess TB Cases per 100,000 Contacts",
    fill = "Treatment Timing",
    caption = paste0("Sample size: ", n_intervention, " | Negative delays = pre-screen treatment")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 11, color = "black"),
    plot.caption = element_text(size = 9, color = "black"),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 13, color = "black"),
    axis.text.y = element_text(size = 13, color = "black"),
    axis.title = element_text(size = 11, face = "bold"),
    legend.position = "top",
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5, color = "gray50")

print(sensitivity_plot)
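
# Optionally save the figure to disk (sketch; file name and dimensions are illustrative):
# ggsave("VQUIN_sensitivity_method2_barplot.png", sensitivity_plot,
#        width = 10, height = 6, dpi = 300)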
cat("\nSensitivity analysis barplot created!\n")
cat("Format matches original VQUIN analysis\n")
cat("26-week pre-curve with square root function\n")
cat("All values shown as excess cases per 100,000 population\n")
cat("Error bars show 95% confidence intervals\n\n")

cat("\n=== PERFECT ALIGNMENT ACHIEVED ===\n")
cat("No NAs in comparison - all delays perfectly matched!\n")
cat("All excess cases displayed as per 100,000 population\n")
cat("Barplot comparison included!\n")

Method 3 - trying a linear pre-curve with original trial L length

For delays:
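
As a quick illustration of what Method 3 changes, the linear ramp p0 * (s / back_wk) sits strictly below the concave square-root ramp p0 * sqrt(s / back_wk) between diagnosis and screening, so it attributes less of the prevalent burden to the early pre-screen period. A minimal sketch (hypothetical p0 = 0.015, for illustration only):

# Linear vs square-root pre-curve (sketch; the p0 value is illustrative only)
p0_demo <- 0.015; back_demo <- 26
s <- seq(0, back_demo, by = back_demo / 4)  # weeks since household diagnosis
cbind(s,
      linear = p0_demo * (s / back_demo),
      sqrt   = p0_demo * sqrt(s / back_demo))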

library(mgcv)
library(ggplot2)
library(dplyr)
library(RColorBrewer)
library(scales)

# ——————————————————————————————————————————————————————
# Enhanced TB Delays Analysis with ALIGNED intervals - Using Real GAM Models
# LINEAR PRE-CURVE VERSION
# ——————————————————————————————————————————————————————

# Input Parameters (from your data) - VQUIN
n_control <- 897
n_intervention <- 907  
py_control <- 938
py_intervention <- 952
n_simulations <- 5000
set.seed(123)

cat("Enhanced Delays Analysis Parameters (VQUIN - LINEAR pre-curve):\n")
cat("Control sample size:", n_control, "\n")
cat("Intervention sample size:", n_intervention, "\n")
cat("Control person-years:", py_control, "\n")
cat("Intervention person-years:", py_intervention, "\n")
cat("Number of simulations:", n_simulations, "\n\n")

# ——————————————————————————————————————————————————————
# 1) Monte Carlo Simulation Function (from your working code)
# ——————————————————————————————————————————————————————

simulate_gam_predictions <- function(gam_model, newdata, n_sim = n_simulations) {
  # Get mean predictions and covariance matrix
  pred_mean <- predict(gam_model, newdata = newdata, type = "response")
  
  # Get the covariance matrix of the GAM coefficients
  Vp <- vcov(gam_model, unconditional = TRUE)
  
  # Get the linear predictor matrix
  Xp <- predict(gam_model, newdata = newdata, type = "lpmatrix")
  
  # Simulate from multivariate normal distribution
  # Generate random coefficients
  coef_sims <- MASS::mvrnorm(n_sim, coef(gam_model), Vp)
  
  # Calculate predictions for each simulation
  pred_sims <- tcrossprod(Xp, coef_sims)
  
  return(pred_sims)
}

# ——————————————————————————————————————————————————————
# 2) Bootstrap Scenario Builder - LINEAR PRE-CURVE VERSION
# ——————————————————————————————————————————————————————

bootstrap_scenario_linear <- function(sim_idx, control_sims, intervention_sims, delay_weeks) {
  # Extract simulated predictions for this iteration
  Fc0_sim <- control_sims[, sim_idx]
  Fi0_sim <- intervention_sims[, sim_idx]
  
  # Apply p0 transformation with a beta draw for p0 - VQUIN values
  p0_draw <- rbeta(1, 61 + 0.5, 3948 - 61 + 0.5)  # VQUIN p0 values 61 and 3948
  Fc0_sim <- p0_draw + (1 - p0_draw) * Fc0_sim
  Fi0_sim <- p0_draw + (1 - p0_draw) * Fi0_sim
  
  # Enforce monotonicity
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  # Calculate hazards for this simulation
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / (1 - head(c(0, Fc0_sim), -1))
  h_int_sim <- diff(c(0, Fi0_sim)) / (1 - head(c(0, Fi0_sim), -1))
  
  # Build curve for this delay scenario
  n <- length(time_ext)
  Fcum <- numeric(n)
  
  # Find indices
  i0 <- which(time_ext == 0)
  back_wk <- 26
  
  # Special handling for pure scenarios
  if(delay_weeks == Inf) {
    # Pure control - LINEAR PRE-CURVE
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * (s_up / back_wk)  # LINEAR: no sqrt()
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention (Instant TPT) - LINEAR PRE-CURVE
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * (s_up / back_wk)  # LINEAR: no sqrt()
    }
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # Negative delays (pre-screen intervention) - LINEAR PRE-CURVE
    
    # Find delay point index
    id_match <- which(time_ext == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    # Before delay point: ramp (no intervention yet)
    if(id > 1) {
      s_up <- time_ext[1:(id-1)] + back_wk
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_draw * (s_up / back_wk)  # LINEAR: no sqrt()
    }
    
    # At delay point: start intervention
    s_delay <- max(0, delay_weeks + back_wk)
    Fcum[id] <- p0_draw * (s_delay / back_wk)  # LINEAR: no sqrt()
    Sprev <- 1 - Fcum[id]
    
    # After delay point, use intervention hazards consistently
    for(i in (id+1):n) {
      current_time <- time_ext[i]
      
      # Calculate how long intervention has been active
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        # Use intervention hazard corresponding to this duration
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        # Use last available intervention hazard
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        # This shouldn't happen with negative delays, but use first intervention hazard as fallback
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Positive delays (post-screen delays) - LINEAR PRE-CURVE
    # Pre-screen ramp
    if(i0 > 1) {
      s_up <- time_ext[1:(i0-1)] + back_wk
      Fcum[1:(i0-1)] <- p0_draw * (s_up / back_wk)  # LINEAR: no sqrt()
    }
    
    # At screen
    Fcum[i0] <- p0_draw
    Sprev <- 1 - p0_draw
    
    # Post-screen: control until delay point, then intervention
    for(i in (i0+1):n) {
      t0 <- time_ext[i]
      
      if(t0 <= delay_weeks) {
        # Use control hazard until delay point
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        # Use intervention hazard after delay point
        # Map to intervention timeline: intervention has been active for (t0 - delay_weeks) weeks
        intervention_duration <- t0 - delay_weeks
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  # Return final value at week 26
  week_26_idx <- which(time_ext == 26)
  return(Fcum[week_26_idx])
}

# ——————————————————————————————————————————————————————
# 3) Run Analysis with ALIGNED delays to match original dataset
# ——————————————————————————————————————————————————————

cat("Running Monte Carlo simulations with ALIGNED intervals to match original dataset...\n")

# Set up time vectors
time0 <- 0:26
back_wk <- 26
time_ext <- seq(-back_wk, 26, by = 1)

# ALIGNED DELAYS to match your original VQUIN dataset
delays_aligned <- c(-13, -11, -9, -7, -5, -3, -1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24)

cat("Using aligned delays:", paste(delays_aligned, collapse = ", "), "\n\n")

# Generate simulated predictions using your ACTUAL GAM models
cat("Generating GAM simulations using your trained VQUIN models...\n")
# NOTE: Make sure your GAM models are loaded: control_gam_k5_GCV_VQUIN and intervention_gam_k5_GCV_VQUIN

# Check if GAM models exist
if(!exists("control_gam_k5_GCV_VQUIN") || !exists("intervention_gam_k5_GCV_VQUIN")) {
  stop("ERROR: VQUIN GAM models not found. Please load:\n  - control_gam_k5_GCV_VQUIN\n  - intervention_gam_k5_GCV_VQUIN")
}

control_sims <- simulate_gam_predictions(control_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)
intervention_sims <- simulate_gam_predictions(intervention_gam_k5_GCV_VQUIN, data.frame(time = time0), n_simulations)

cat("Generated", ncol(control_sims), "control simulations and", ncol(intervention_sims), "intervention simulations\n")

# Define all scenarios including baseline
all_delays <- c("Instant TPT" = -Inf, setNames(delays_aligned, paste0(delays_aligned, " wk")), "Control" = Inf)

# Run simulations for all scenarios
cat("Running scenario simulations...\n")
simulation_results <- list()

for(scenario_name in names(all_delays)) {
  cat("Processing scenario:", scenario_name, "\n")
  delay_val <- all_delays[scenario_name]
  
  scenario_sims <- sapply(1:n_simulations, function(i) {
    bootstrap_scenario_linear(i, control_sims, intervention_sims, delay_val)
  })
  
  simulation_results[[scenario_name]] <- scenario_sims
}

cat("Completed all simulations!\n\n")

# ——————————————————————————————————————————————————————
# 4) Calculate Metrics and Create results_list in the expected format
# ——————————————————————————————————————————————————————

# Get baseline (Instant TPT) for relative calculations
baseline_sims <- simulation_results[["Instant TPT"]]
baseline_mean <- mean(baseline_sims)

# Calculate metrics for each scenario using your exact approach
results_list <- list()

for(scenario_name in names(simulation_results)) {
  scenario_sims <- simulation_results[[scenario_name]]
  
  # Basic statistics
  mean_incidence <- mean(scenario_sims)
  median_incidence <- median(scenario_sims)
  ci_lower <- quantile(scenario_sims, 0.025)
  ci_upper <- quantile(scenario_sims, 0.975)
  
  # Determine person-years (Control uses control PY, others use intervention PY)
  py <- ifelse(scenario_name == "Control", py_control, py_intervention)
  sample_size <- ifelse(scenario_name == "Control", n_control, n_intervention)
  
  # 1. Incidence rate per 100 person-yrs (95% CI)
  rate_per_100py <- (mean_incidence * sample_size / py) * 100
  rate_ci_lower <- (ci_lower * sample_size / py) * 100
  rate_ci_upper <- (ci_upper * sample_size / py) * 100
  
  # 2. Mean Expected Cases (95% CI)
  mean_cases <- mean_incidence * sample_size
  cases_ci_lower <- ci_lower * sample_size
  cases_ci_upper <- ci_upper * sample_size
  
  # 3. Excess calculations (compared to Instant TPT)
  if(scenario_name != "Instant TPT") {
    # Excess incidence simulations
    excess_sims <- scenario_sims - baseline_sims
    excess_mean <- mean(excess_sims)
    excess_ci_lower <- quantile(excess_sims, 0.025)
    excess_ci_upper <- quantile(excess_sims, 0.975)
    
    # Excess rate per 100 person-yrs
    excess_rate_per_100py <- (excess_mean * sample_size / py) * 100
    excess_rate_ci_lower <- (excess_ci_lower * sample_size / py) * 100
    excess_rate_ci_upper <- (excess_ci_upper * sample_size / py) * 100
    
    # Excess cases
    excess_cases_mean <- excess_mean * sample_size
    excess_cases_ci_lower <- excess_ci_lower * sample_size
    excess_cases_ci_upper <- excess_ci_upper * sample_size
  } else {
    # Instant TPT is the baseline, so excess = 0
    excess_rate_per_100py <- 0
    excess_rate_ci_lower <- 0
    excess_rate_ci_upper <- 0
    excess_cases_mean <- 0
    excess_cases_ci_lower <- 0
    excess_cases_ci_upper <- 0
  }
  
  # 4. Relative Risk
  relative_risk <- mean_incidence / baseline_mean
  
  # Store results in the expected format
  results_list[[scenario_name]] <- list(
    scenario = scenario_name,
    incidence_rate_per_100py = rate_per_100py,
    incidence_rate_ci_lower = rate_ci_lower,
    incidence_rate_ci_upper = rate_ci_upper,
    excess_rate_per_100py = excess_rate_per_100py,
    excess_rate_ci_lower = excess_rate_ci_lower,
    excess_rate_ci_upper = excess_rate_ci_upper,
    mean_cases = mean_cases,
    cases_ci_lower = cases_ci_lower,
    cases_ci_upper = cases_ci_upper,
    excess_cases_mean = excess_cases_mean,
    excess_cases_ci_lower = excess_cases_ci_lower,
    excess_cases_ci_upper = excess_cases_ci_upper,
    relative_risk = relative_risk
  )
}

# ——————————————————————————————————————————————————————
# 5) Verification and Summary
# ——————————————————————————————————————————————————————

cat("=== LINEAR PRE-CURVE ANALYSIS COMPLETE ===\n")
cat("results_list created with", length(results_list), "scenarios\n")
cat("Scenarios:", paste(names(results_list), collapse = ", "), "\n\n")

cat("ALIGNED DELAYS USED:\n")
cat("Negative delays (early benefit):", paste(delays_aligned[delays_aligned < 0], collapse = ", "), "\n")
cat("Positive delays (post-screen):", paste(delays_aligned[delays_aligned >= 0], collapse = ", "), "\n\n")

cat("Sample results verification:\n")
for(i in 1:min(3, length(results_list))) {
  scenario_name <- names(results_list)[i]
  r <- results_list[[scenario_name]]
  cat(sprintf("  %s: excess_cases_mean = %.2f, relative_risk = %.4f\n", 
              scenario_name, r$excess_cases_mean, r$relative_risk))
}

cat("\nNow you can run your comparison code to compare square-root vs linear pre-curves!\n")
cat("The results_list object is ready and uses the same delays as your original dataset.\n")

Comparison of the linear pre-curve to main results:

# ——————————————————————————————————————————————————————
# Fixed Early Benefit Analysis: Square-root vs Linear Pre-curve
# ——————————————————————————————————————————————————————

library(dplyr)
library(readr)

# ——————————————————————————————————————————————————————
# 1) Load Original Results (Square-root pre-curve) from CSV
# ——————————————————————————————————————————————————————

# Load the original results from your CSV file
original_results <- read_csv("VQUIN_tb_delays_analysis_table.csv")

# Clean and parse the original results
parse_ci_value <- function(ci_string) {
  # Extract the main value from "X.X (Y.Y to Z.Z)" format
  as.numeric(sub(" \\(.*\\)", "", ci_string))
}
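A one-line check confirms the regex keeps only the point estimate (the string below is illustrative):

parse_ci_value("1.23 (0.98 to 1.55)")  # returns 1.23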

original_clean <- original_results %>%
  mutate(
    delay_weeks = case_when(
      `Delay (weeks)` == "Instant TPT" ~ -999,
      `Delay (weeks)` == "Control" ~ 999,
      TRUE ~ suppressWarnings(as.numeric(`Delay (weeks)`))  # arm labels coerce to NA here but are caught above
    ),
    orig_excess_cases = parse_ci_value(`Excess Cases per 100,000 (95% CI)`),
    orig_incidence_rate = parse_ci_value(`Incidence Rate per 100 Person-Yrs (95% CI)`),
    orig_excess_rate = parse_ci_value(`Excess Incidence Rate per 100 Person-Yrs (95% CI)`),
    orig_relative_risk = `Relative Risk`
  ) %>%
  select(delay_weeks, orig_excess_cases, orig_incidence_rate, orig_excess_rate, orig_relative_risk)

cat("Original results (square-root pre-curve) loaded successfully\n")
cat("Number of scenarios:", nrow(original_clean), "\n\n")

# ——————————————————————————————————————————————————————
# 2) Extract Linear Pre-curve Results from Current Analysis
# ——————————————————————————————————————————————————————

# Check if results_list exists
if(!exists("results_list")) {
  stop("Error: results_list not found. Please run your linear pre-curve analysis first.")
}

cat("Checking results_list structure...\n")
cat("Number of scenarios in results_list:", length(results_list), "\n")
cat("Scenario names:", paste(names(results_list), collapse = ", "), "\n\n")

# Convert results_list to a data frame with error checking
linear_results <- data.frame()

for(scenario_name in names(results_list)) {
  cat("Processing scenario:", scenario_name, "\n")
  
  r <- results_list[[scenario_name]]
  
  # Check if r is a list and has the expected elements
  if(!is.list(r)) {
    cat("  Warning: Scenario", scenario_name, "is not a list. Skipping.\n")
    next
  }
  
  # Check for required elements
  required_elements <- c("excess_cases_mean", "incidence_rate_per_100py", "excess_rate_per_100py", "relative_risk")
  missing_elements <- required_elements[!required_elements %in% names(r)]
  
  if(length(missing_elements) > 0) {
    cat("  Warning: Scenario", scenario_name, "missing elements:", paste(missing_elements, collapse = ", "), ". Skipping.\n")
    next
  }
  
  # Extract delay value
  if(scenario_name == "Instant TPT") {
    delay_weeks <- -999
  } else if(scenario_name == "Control") {
    delay_weeks <- 999
  } else {
    # as.numeric() returns NA (with a warning) on unparseable strings,
    # so an explicit NA check is enough to skip bad scenario names
    delay_weeks <- suppressWarnings(as.numeric(sub(" wk", "", scenario_name)))
    if(is.na(delay_weeks)) {
      cat("  Warning: Could not parse delay weeks from", scenario_name, ". Skipping.\n")
      next
    }
  }
  
  # Check that all values are scalar (length 1)
  values_to_check <- list(
    excess_cases_mean = r$excess_cases_mean,
    incidence_rate_per_100py = r$incidence_rate_per_100py,
    excess_rate_per_100py = r$excess_rate_per_100py,
    relative_risk = r$relative_risk
  )
  
  # Check each value
  all_ok <- TRUE
  for(val_name in names(values_to_check)) {
    val <- values_to_check[[val_name]]
    if(is.null(val) || length(val) != 1 || !is.numeric(val)) {
      cat("  Warning: Invalid", val_name, "for scenario", scenario_name, 
          "(length:", length(val), ", class:", class(val), "). Skipping.\n")
      all_ok <- FALSE
      break
    }
  }
  
  if(!all_ok) next
  
  # Convert to per 100,000 to match original scale
  tryCatch({
    new_row <- data.frame(
      scenario = scenario_name,
      delay_weeks = delay_weeks,
      linear_excess_cases = r$excess_cases_mean * (100000/907), # Scale counts to per 100,000 (denominator cohort of 907)
      linear_incidence_rate = r$incidence_rate_per_100py,
      linear_excess_rate = r$excess_rate_per_100py,
      linear_relative_risk = r$relative_risk,
      stringsAsFactors = FALSE
    )
    
    linear_results <- rbind(linear_results, new_row)
    cat("  Successfully processed scenario:", scenario_name, "\n")
    
  }, error = function(e) {
    cat("  Error creating data frame for scenario", scenario_name, ":", e$message, "\n")
  })
}

cat("\nLinear pre-curve results extraction complete\n")
cat("Successfully processed scenarios:", nrow(linear_results), "\n")

if(nrow(linear_results) == 0) {
  stop("Error: No scenarios were successfully processed. Please check your results_list structure.")
}

cat("Processed scenarios:", paste(linear_results$scenario, collapse = ", "), "\n\n")

# ——————————————————————————————————————————————————————
# 3) Robust Percentage Change Calculation
# ——————————————————————————————————————————————————————

# Function to calculate robust percentage change
robust_pct_change <- function(new_val, orig_val, threshold = 0.1) {
  # If original value is too small (close to zero), return absolute difference instead
  if(is.na(orig_val) || is.na(new_val) || abs(orig_val) < threshold) {
    return(new_val - orig_val)  # Absolute difference
  } else {
    pct_change <- ((new_val - orig_val) / abs(orig_val)) * 100
    # Cap extreme percentage changes
    if(!is.finite(pct_change) || abs(pct_change) > 1000) {
      return(new_val - orig_val)  # Return absolute difference for extreme cases
    } else {
      return(pct_change)
    }
  }
}
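For example (toy values): a baseline well above the threshold yields a percentage, while a near-zero baseline falls back to the absolute difference:

robust_pct_change(12, 10)      # 20: percentage change, |baseline| >= 0.1
robust_pct_change(0.05, 0.02)  # 0.03: absolute difference, |baseline| < 0.1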

# Merge the datasets
comparison_data <- merge(original_clean, linear_results, by = "delay_weeks", all = TRUE)

# Calculate robust percentage changes
comparison_data <- comparison_data %>%
  mutate(
    # Use robust percentage change function
    pct_change_excess_cases = mapply(robust_pct_change, linear_excess_cases, orig_excess_cases),
    pct_change_incidence_rate = mapply(robust_pct_change, linear_incidence_rate, orig_incidence_rate),
    pct_change_excess_rate = mapply(robust_pct_change, linear_excess_rate, orig_excess_rate),
    
    # Change in relative risk (absolute difference)
    change_relative_risk = linear_relative_risk - orig_relative_risk,
    
    # Flag whether percentage or absolute difference was used
    excess_cases_is_pct = abs(orig_excess_cases) >= 0.1,
    excess_rate_is_pct = abs(orig_excess_rate) >= 0.1,
    incidence_rate_is_pct = abs(orig_incidence_rate) >= 0.1,
    
    # Classify delay type
    delay_type = case_when(
      delay_weeks == -999 ~ "Instant TPT",
      delay_weeks == 999 ~ "Control", 
      delay_weeks < 0 ~ "Pre-screen (Early Benefit)",
      delay_weeks >= 0 ~ "Post-screen"
    )
  ) %>%
  arrange(delay_weeks)

# ——————————————————————————————————————————————————————
# 4) Focus on Early Benefit Scenarios (Negative Delays)
# ——————————————————————————————————————————————————————

early_benefit_scenarios <- comparison_data %>%
  filter(delay_weeks < 0 & delay_weeks != -999) %>%  # Exclude Instant TPT baseline
  select(delay_weeks, scenario, 
         orig_excess_cases, linear_excess_cases, pct_change_excess_cases, excess_cases_is_pct,
         orig_excess_rate, linear_excess_rate, pct_change_excess_rate, excess_rate_is_pct,
         orig_relative_risk, linear_relative_risk, change_relative_risk)

cat("=== EARLY BENEFIT ANALYSIS: Square-root vs Linear Pre-curve ===\n\n")

cat("EARLY BENEFIT SCENARIOS (Pre-screen Treatment):\n")
cat("Delay scenarios where treatment starts BEFORE screening (negative weeks)\n\n")

# Check data quality first
cat("DATA QUALITY CHECK:\n")
cat("===================\n")
cat("Original excess cases range:", sprintf("%.3f to %.3f", 
                                           min(early_benefit_scenarios$orig_excess_cases, na.rm = TRUE),
                                           max(early_benefit_scenarios$orig_excess_cases, na.rm = TRUE)), "\n")
cat("Linear excess cases range:", sprintf("%.3f to %.3f", 
                                         min(early_benefit_scenarios$linear_excess_cases, na.rm = TRUE),
                                         max(early_benefit_scenarios$linear_excess_cases, na.rm = TRUE)), "\n")
cat("Number using percentage change:", sum(early_benefit_scenarios$excess_cases_is_pct, na.rm = TRUE), "\n")
cat("Number using absolute difference:", sum(!early_benefit_scenarios$excess_cases_is_pct, na.rm = TRUE), "\n\n")

# Summary statistics for early benefit changes
if(nrow(early_benefit_scenarios) > 0) {
  
  # Filter out non-finite values for statistics
  finite_excess_changes <- early_benefit_scenarios$pct_change_excess_cases[is.finite(early_benefit_scenarios$pct_change_excess_cases)]
  finite_rate_changes <- early_benefit_scenarios$pct_change_excess_rate[is.finite(early_benefit_scenarios$pct_change_excess_rate)]
  finite_rr_changes <- early_benefit_scenarios$change_relative_risk[is.finite(early_benefit_scenarios$change_relative_risk)]
  
  # Overall summary
  cat("SUMMARY STATISTICS FOR EARLY BENEFIT SCENARIOS:\n")
  cat("================================================\n")
  cat("Number of early benefit scenarios:", nrow(early_benefit_scenarios), "\n")
  cat("Delay range:", min(early_benefit_scenarios$delay_weeks), "to", max(early_benefit_scenarios$delay_weeks), "weeks\n\n")
  
  # Excess cases changes
  if(length(finite_excess_changes) > 0) {
    cat("EXCESS CASES CHANGES:\n")
    cat("Mean change:", sprintf("%.2f", mean(finite_excess_changes)), 
        ifelse(all(early_benefit_scenarios$excess_cases_is_pct), "%", " (absolute)"), "\n")
    cat("Range:", sprintf("%.2f to %.2f", min(finite_excess_changes), max(finite_excess_changes)), 
        ifelse(all(early_benefit_scenarios$excess_cases_is_pct), "%", " (absolute)"), "\n")
  } else {
    cat("EXCESS CASES CHANGES: No finite values available\n")
  }
  
  # Excess rate changes  
  if(length(finite_rate_changes) > 0) {
    cat("\nEXCESS INCIDENCE RATE CHANGES:\n")
    cat("Mean change:", sprintf("%.2f", mean(finite_rate_changes)), 
        ifelse(all(early_benefit_scenarios$excess_rate_is_pct), "%", " (absolute)"), "\n")
    cat("Range:", sprintf("%.2f to %.2f", min(finite_rate_changes), max(finite_rate_changes)), 
        ifelse(all(early_benefit_scenarios$excess_rate_is_pct), "%", " (absolute)"), "\n")
  } else {
    cat("\nEXCESS INCIDENCE RATE CHANGES: No finite values available\n")
  }
  
  # Relative risk changes
  if(length(finite_rr_changes) > 0) {
    cat("\nRELATIVE RISK CHANGES (absolute):\n")
    cat("Mean change:", sprintf("%.4f", mean(finite_rr_changes)), "\n")
    cat("Range:", sprintf("%.4f to %.4f", min(finite_rr_changes), max(finite_rr_changes)), "\n\n")
  } else {
    cat("\nRELATIVE RISK CHANGES: No finite values available\n\n")
  }
  
} else {
  cat("No early benefit scenarios found in the data.\n\n")
}

# ——————————————————————————————————————————————————————
# 5) Detailed Results Table
# ——————————————————————————————————————————————————————

cat("DETAILED COMPARISON TABLE:\n")
cat("==========================\n\n")

# Create detailed comparison table with robust formatting
detailed_table <- early_benefit_scenarios %>%
  mutate(
    `Delay (weeks)` = delay_weeks,
    `Original Excess Cases` = sprintf("%.2f", orig_excess_cases),
    `Linear Excess Cases` = sprintf("%.2f", linear_excess_cases),
    `Change Cases` = ifelse(excess_cases_is_pct, 
                           sprintf("%+.1f%%", pct_change_excess_cases),
                           sprintf("%+.2f", pct_change_excess_cases)),
    `Original Excess Rate` = sprintf("%.3f", orig_excess_rate),
    `Linear Excess Rate` = sprintf("%.3f", linear_excess_rate),
    `Change Rate` = ifelse(excess_rate_is_pct,
                          sprintf("%+.1f%%", pct_change_excess_rate),
                          sprintf("%+.3f", pct_change_excess_rate)),
    `Change in RR` = sprintf("%+.4f", change_relative_risk)
  ) %>%
  select(`Delay (weeks)`, `Original Excess Cases`, `Linear Excess Cases`, `Change Cases`,
         `Original Excess Rate`, `Linear Excess Rate`, `Change Rate`, `Change in RR`)

print(detailed_table)
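If a rendered table is preferred in the Quarto output, the same data frame can also be passed to knitr::kable (assuming knitr is available, as it normally is when Quarto renders R chunks):

# Optional: render the comparison as a formatted table
knitr::kable(detailed_table, align = "r")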

# ——————————————————————————————————————————————————————
# 6) Key Findings Summary
# ——————————————————————————————————————————————————————

cat("\n=== KEY FINDINGS ===\n")

if(nrow(early_benefit_scenarios) > 0 && length(finite_excess_changes) > 0) {
  
  # Find scenarios with largest changes
  max_cases_change_idx <- which.max(abs(finite_excess_changes))
  
  cat("\nLARGEST CHANGES IN EARLY BENEFIT:\n")
  cat("• Excess cases: ", sprintf("%+.2f", finite_excess_changes[max_cases_change_idx]),
      ifelse(all(early_benefit_scenarios$excess_cases_is_pct), "%", " (absolute)"), "\n", sep="")
  
  # Overall interpretation
  mean_change <- mean(finite_excess_changes)
  
  cat("\nINTERPRETATION:\n")
  if(all(early_benefit_scenarios$excess_cases_is_pct)) {
    # Percentage-based interpretation
    if(abs(mean_change) < 5) {
      cat("• Minimal impact: Linear vs square-root pre-curve shows <5% average change\n")
    } else if(abs(mean_change) < 15) {
      cat("• Moderate impact: Linear vs square-root pre-curve shows ", sprintf("%.1f%%", abs(mean_change)), " average change\n", sep="")
    } else {
      cat("• Substantial impact: Linear vs square-root pre-curve shows ", sprintf("%.1f%%", abs(mean_change)), " average change\n", sep="")
    }
  } else {
    # Absolute difference interpretation
    cat("• Changes shown as absolute differences due to small baseline values\n")
    cat("• Mean absolute change: ", sprintf("%.3f", mean_change), " cases per 100,000\n", sep="")
  }
  
  if(mean_change > 0) {
    cat("• Direction: Linear pre-curve generally shows HIGHER excess cases (worse outcomes)\n")
  } else if(mean_change < 0) {
    cat("• Direction: Linear pre-curve generally shows LOWER excess cases (better outcomes)\n")
  } else {
    cat("• Direction: Minimal difference between curve shapes\n")
  }
  
} else {
  cat("No early benefit scenarios with finite values available for comparison.\n")
}

cat("\n=== ANALYSIS COMPLETE ===\n")

Plot of curves with linear pre-curve (Figure 10):

# ——————————————————————————————————————————————————————
# 6) CREATE PLOTTING CURVES - Full time series for visualization
# ——————————————————————————————————————————————————————

# Modified function to return the full curve instead of just the week-26 endpoint
build_full_curve_linear <- function(delay_weeks, control_sims, intervention_sims) {
  # Average across simulation draws; the mean curve is what gets plotted
  Fc0_sim <- rowMeans(control_sims)
  Fi0_sim <- rowMeans(intervention_sims)
  
  # Use mean p0 for plotting and fold in baseline prevalence:
  # F_adj = p0 + (1 - p0) * F (prevalent cases at screening plus incident cases)
  p0_mean <- 61/3948
  Fc0_sim <- p0_mean + (1 - p0_mean) * Fc0_sim
  Fi0_sim <- p0_mean + (1 - p0_mean) * Fi0_sim
  
  # Enforce monotonicity
  Fc0_sim <- pmin(pmax(Fc0_sim, 0), 0.999)
  Fi0_sim <- pmin(pmax(Fi0_sim, 0), 0.999)
  Fc0_sim <- cummax(Fc0_sim); Fi0_sim <- cummax(Fi0_sim)

  # Discrete weekly hazards: h_t = (F_t - F_{t-1}) / (1 - F_{t-1});
  # head(..., -1) drops the final element so numerator and denominator align
  h_ctrl_sim <- diff(c(0, Fc0_sim)) / (1 - head(c(0, Fc0_sim), -1))
  h_int_sim <- diff(c(0, Fi0_sim)) / (1 - head(c(0, Fi0_sim), -1))
  
  # Build full curve
  n <- length(time_ext_plot)
  Fcum <- numeric(n)
  i0 <- which(time_ext_plot == 0)
  back_wk_plot <- 13  # Look-back window before randomisation (weeks)
  
  # Special handling for pure scenarios
  if(delay_weeks == Inf) {
    # Pure control - LINEAR PRE-CURVE
    if(i0 > 1) {
      s_up <- time_ext_plot[1:(i0-1)] + back_wk_plot
      Fcum[1:(i0-1)] <- p0_mean * (s_up / back_wk_plot)  # LINEAR
    }
    Fcum[i0] <- p0_mean
    Sprev <- 1 - p0_mean
    
    for(i in (i0+1):n) {
      t0 <- time_ext_plot[i]
      h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks == -Inf) {
    # Pure intervention - LINEAR PRE-CURVE
    if(i0 > 1) {
      s_up <- time_ext_plot[1:(i0-1)] + back_wk_plot
      Fcum[1:(i0-1)] <- p0_mean * (s_up / back_wk_plot)  # LINEAR
    }
    Fcum[i0] <- p0_mean
    Sprev <- 1 - p0_mean
    
    for(i in (i0+1):n) {
      t0 <- time_ext_plot[i]
      h_t <- h_int_sim[min(t0 + 1, length(h_int_sim))]
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else if(delay_weeks < 0) {
    # Negative delays - similar logic as before
    id_match <- which(time_ext_plot == delay_weeks)
    if(length(id_match) == 0) {
      id <- which.min(abs(time_ext_plot - delay_weeks))
    } else {
      id <- id_match[1]
    }
    
    if(id > 1) {
      s_up <- time_ext_plot[1:(id-1)] + back_wk_plot
      s_up[s_up < 0] <- 0
      Fcum[1:(id-1)] <- p0_mean * (s_up / back_wk_plot)  # LINEAR
    }
    
    s_delay <- max(0, delay_weeks + back_wk_plot)
    Fcum[id] <- p0_mean * (s_delay / back_wk_plot)  # LINEAR
    Sprev <- 1 - Fcum[id]
    
    for(i in (id+1):n) {
      current_time <- time_ext_plot[i]
      intervention_duration <- current_time - delay_weeks
      
      if(intervention_duration >= 0 && intervention_duration <= 54) {
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      } else if(intervention_duration > 54) {
        h_t <- h_int_sim[length(h_int_sim)]
      } else {
        h_t <- h_int_sim[1]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
    
  } else {
    # Positive delays
    if(i0 > 1) {
      s_up <- time_ext_plot[1:(i0-1)] + back_wk_plot
      Fcum[1:(i0-1)] <- p0_mean * (s_up / back_wk_plot)  # LINEAR
    }
    
    Fcum[i0] <- p0_mean
    Sprev <- 1 - p0_mean
    
    for(i in (i0+1):n) {
      t0 <- time_ext_plot[i]
      
      if(t0 <= delay_weeks) {
        h_t <- h_ctrl_sim[min(t0 + 1, length(h_ctrl_sim))]
      } else {
        intervention_duration <- t0 - delay_weeks
        h_idx <- min(intervention_duration + 1, length(h_int_sim))
        h_t <- h_int_sim[max(1, h_idx)]
      }
      
      Snew <- Sprev * (1 - h_t)
      Fcum[i] <- 1 - Snew
      Sprev <- Snew
    }
  }
  
  return(Fcum)
}
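The weekly recursion inside build_full_curve_linear (S_new = S_prev * (1 - h_t), F = 1 - S) can be checked in isolation with toy hazards:

# Minimal check of the survival recursion (toy hazards, not trial estimates)
h_toy <- c(0.01, 0.02, 0.015)
1 - cumprod(1 - h_toy)  # cumulative incidence: 0.010000 0.029800 0.044353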

# Parameters for plotting
delays_to_plot <- c(-13, -10, -5, 5, 10, 20)  # Delay scenarios (weeks) to display
back_wk_plot <- 13  # Look-back window before randomisation (weeks)
time_ext_plot <- seq(-back_wk_plot, 26, by = 1)  # Weekly grid from -13 to +26

cat("Building curves for visualization with selected delays...\n")

# Build all curves
ctrl_ext <- build_full_curve_linear(Inf, control_sims, intervention_sims)
inst_ext <- build_full_curve_linear(-Inf, control_sims, intervention_sims)

# Build delay curves for selected delays only
delay_curves <- sapply(delays_to_plot, function(d) {
  build_full_curve_linear(d, control_sims, intervention_sims)
})

colnames(delay_curves) <- paste0(delays_to_plot, " wk delay")

# ——————————————————————————————————————————————————————
# 7) CREATE VISUALIZATION
# ——————————————————————————————————————————————————————

# Combine all curves
all_mat <- cbind(
  Control = ctrl_ext,
  `Instant TPT` = inst_ext,
  delay_curves
)

# Set up compressed x-axis: the 13 pre-randomisation weeks share a fixed width
neg_width <- 30

x_plot <- ifelse(
  time_ext_plot < 0,
  neg_width * (time_ext_plot + back_wk_plot) / back_wk_plot,  # −13…0 → 0…30
  neg_width + time_ext_plot                                   #   0…26 → 30…56
)
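A spot check of the piecewise mapping, using a hypothetical helper x_of that mirrors the ifelse above: week -13 should land at 0, week 0 at 30, and week 26 at 56.

# Hypothetical helper mirroring the axis compression, for a quick check
x_of <- function(t) ifelse(t < 0,
                           neg_width * (t + back_wk_plot) / back_wk_plot,
                           neg_width + t)
x_of(c(-13, 0, 26))  # 0 30 56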

# Subset to desired range
keep <- which(time_ext_plot <= 26)
x_sub <- x_plot[keep]
mat_sub <- all_mat[keep, , drop=FALSE]

# Convert to percentages
mat_sub_pct <- mat_sub * 100

# Create gradient colors from better (green) to worse (red) delays
n_delay_cols <- ncol(delay_curves)
delay_colors <- colorRampPalette(c("#00AA00", "#FFAA00", "#FF0000"))(n_delay_cols)
cols <- c("red", "blue", delay_colors)
ltys <- c(1, 1, rep(2, n_delay_cols))
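Note that colorRampPalette returns a function: calling it with an integer n interpolates n hex colours along the green-amber-red ramp, e.g.:

colorRampPalette(c("#00AA00", "#FFAA00", "#FF0000"))(3)  # with n = 3, returns the three anchor colours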

# Create the plot
matplot(
  x_sub, mat_sub_pct,
  type="l", col=cols, lty=ltys, lwd=2,
  xaxt="n", 
  xlab="Weeks since randomisation",
  ylab="Cumulative incidence of TB (%)",
  main="Sensitivity Analysis: VQUIN Delayed-TPT scenarios with linear pre-curve",  # Updated title
  cex.axis = 1.0,  # Size 12 for axis numbers
  cex.lab = 1.0    # Size 12 for axis labels
)

# Custom x-axis ticks on the compressed scale (-13 to +26)
neg_ticks <- seq(-back_wk_plot, 0, by=2)     
neg_at <- neg_width * (neg_ticks + back_wk_plot) / back_wk_plot
pos_ticks <- seq(0, 26, by=2) 
pos_at <- neg_width + pos_ticks

axis(1,
     at = c(neg_at, pos_at),
     labels = c(neg_ticks, pos_ticks),
     cex.axis = 1.0)

# Vertical line at t=0
abline(v=neg_width, lty=3, col="gray50")

# Legend with larger text
legend("topleft",
       legend = colnames(all_mat),
       col = cols,
       lty = ltys,
       lwd = 2,
       bty = "n",
       ncol = 1,        
       cex = 1.2,      # Larger legend text
       y.intersp = 0.8,      
       x.intersp = 0.5,      
       inset = c(0.02, 0.02))
cat("Plotting complete!\n")