1 Introduction

1.1 Research Question

Among Phase 2 solid tumor oncology trials initiated during the early COVID-19 period (March 2019 through March 2021), does sponsor type (commercial, academic, or government) predict time to study completion?

1.2 Methodological Approach

This analysis employs Generalized Linear Models (GLMs) to examine the relationship between sponsor type and trial completion time. The dependent variable (time to completion in days) is continuous, strictly positive, and expected to be right-skewed with variance increasing with the mean. These characteristics suggest distributions from the exponential family that accommodate positive, skewed outcomes:

  • Gamma distribution: Appropriate for positive continuous data where variance is proportional to the square of the mean
  • Inverse Gaussian distribution: Appropriate for positive continuous data with heavier right tails, where variance is proportional to the cube of the mean

Both models use a log link function, which ensures predicted values remain positive and allows interpretation of coefficients as multiplicative effects on completion time.

1.3 Covariates

In addition to sponsor type, the following covariates are included:

  • Number of clinical trial sites: Proxy for trial complexity and geographic scope
  • Enrollment: Target sample size as an indicator of trial scale
  • Number of arms: Reflects study design complexity
  • Months from COVID start: Continuous measure of timing relative to March 2020 (WHO pandemic declaration)

2 Data Acquisition

2.1 Load Required Packages

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(patchwork)
library(statmod)
library(MASS)
library(car)
library(knitr)
library(kableExtra)

theme_set(theme_bw(base_size = 12))

2.2 Load Previously Downloaded Data

load("phase2_solid_tumor_trials.RData")
cat("Loaded dataset with", nrow(trials), "trials\n")
## Loaded dataset with 2125 trials

3 Data Cleaning

3.1 Initial Processing

covid_start <- as.Date("2020-03-01")

trials_processed <- trials %>%
  mutate(
    sponsor_type_clean = case_when(
      sponsor_type == "INDUSTRY" ~ "Commercial",
      sponsor_type %in% c("NIH", "FED", "OTHER_GOV") ~ "Government",
      sponsor_type %in% c("OTHER", "NETWORK", "UNKNOWN", "INDIV", "AMBIG") ~ "Academic",
      TRUE ~ "Academic"
    ),
    start_date = as.Date(start_date),
    completion_date = as.Date(completion_date),
    time_to_final = as.numeric(completion_date - start_date),
    months_from_covid = as.numeric(start_date - covid_start) / 30.44,
    num_sites = as.numeric(num_sites),
    sponsor_type_clean = factor(sponsor_type_clean, 
                                 levels = c("Commercial", "Academic", "Government"))
  )

cat("Initial dataset:", nrow(trials_processed), "trials\n")
## Initial dataset: 2125 trials

3.2 Apply Inclusion Criteria

trials_clean <- trials_processed %>%
  filter(overall_status == "COMPLETED") %>%
  filter(time_to_final > 0) %>%
  filter(!is.na(time_to_final)) %>%
  filter(!is.na(num_sites)) %>%
  filter(!is.na(enrollment)) %>%
  filter(!is.na(number_of_arms))

cat("After filtering to COMPLETED trials:", nrow(trials_clean), "trials\n")
## After filtering to COMPLETED trials: 444 trials

3.3 Create Scaled Variables for GLM

trials_scaled <- trials_clean %>%
  mutate(
    num_sites_scaled = scale(num_sites)[,1],
    enrollment_scaled = scale(enrollment)[,1],
    number_of_arms_scaled = scale(number_of_arms)[,1],
    months_from_covid_scaled = scale(months_from_covid)[,1]
  )

# Store scaling parameters for interpretation
scale_params <- data.frame(
  Variable = c("num_sites", "enrollment", "number_of_arms", "months_from_covid"),
  Mean = c(mean(trials_clean$num_sites), mean(trials_clean$enrollment),
           mean(trials_clean$number_of_arms), mean(trials_clean$months_from_covid)),
  SD = c(sd(trials_clean$num_sites), sd(trials_clean$enrollment),
         sd(trials_clean$number_of_arms), sd(trials_clean$months_from_covid))
)

kable(scale_params, digits = 2, caption = "Scaling Parameters (for back-transformation)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Scaling Parameters (for back-transformation)
Variable Mean SD
num_sites 15.93 69.59
enrollment 63.48 71.68
number_of_arms 1.62 1.18
months_from_covid 0.71 7.63

3.4 Cleaning Summary

cleaning_summary <- data.frame(
  Step = c("Raw dataset from AACT",
           "After removing non-COMPLETED status",
           "After removing invalid completion times",
           "After removing missing covariates"),
  N = c(nrow(trials_processed),
        sum(trials_processed$overall_status == "COMPLETED"),
        sum(trials_processed$overall_status == "COMPLETED" & 
            trials_processed$time_to_final > 0, na.rm = TRUE),
        nrow(trials_clean))
)

kable(cleaning_summary, caption = "Data Cleaning Steps") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Data Cleaning Steps
Step N
Raw dataset from AACT 2125
After removing non-COMPLETED status 445
After removing invalid completion times 444
After removing missing covariates 444

4 Exploratory Data Analysis

4.1 Sample Characteristics

4.1.2 Descriptive Statistics for Continuous Variables

desc_stats <- data.frame(
  Variable = c("Time to Completion (days)", "Number of Sites", 
               "Enrollment", "Number of Arms", "Months from COVID"),
  N = rep(nrow(trials_clean), 5),
  Mean = c(mean(trials_clean$time_to_final), mean(trials_clean$num_sites), 
           mean(trials_clean$enrollment), mean(trials_clean$number_of_arms), 
           mean(trials_clean$months_from_covid)),
  SD = c(sd(trials_clean$time_to_final), sd(trials_clean$num_sites), 
         sd(trials_clean$enrollment), sd(trials_clean$number_of_arms), 
         sd(trials_clean$months_from_covid)),
  Median = c(median(trials_clean$time_to_final), median(trials_clean$num_sites), 
             median(trials_clean$enrollment), median(trials_clean$number_of_arms), 
             median(trials_clean$months_from_covid)),
  Min = c(min(trials_clean$time_to_final), min(trials_clean$num_sites), 
          min(trials_clean$enrollment), min(trials_clean$number_of_arms), 
          min(trials_clean$months_from_covid)),
  Max = c(max(trials_clean$time_to_final), max(trials_clean$num_sites), 
          max(trials_clean$enrollment), max(trials_clean$number_of_arms), 
          max(trials_clean$months_from_covid))
)

kable(desc_stats, digits = 2, caption = "Descriptive Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Descriptive Statistics
Variable N Mean SD Median Min Max
Time to Completion (days) 444 1211.34 452.68 1222.00 13.00 2313.00
Number of Sites 444 15.93 69.59 1.00 1.00 950.00
Enrollment 444 63.48 71.68 42.00 1.00 627.00
Number of Arms 444 1.62 1.18 1.00 1.00 15.00
Months from COVID 444 0.71 7.63 1.92 -12.02 12.98

4.1.3 Time to Completion by Sponsor Type

completion_by_sponsor <- trials_clean %>%
  group_by(sponsor_type_clean) %>%
  summarise(
    N = n(),
    `Mean (days)` = mean(time_to_final),
    `SD` = sd(time_to_final),
    `Median (days)` = median(time_to_final),
    `Mean (months)` = mean(time_to_final) / 30.44,
    `Median (months)` = median(time_to_final) / 30.44
  ) %>%
  rename(`Sponsor Type` = sponsor_type_clean)

kable(completion_by_sponsor, digits = 1, 
      caption = "Time to Completion by Sponsor Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Time to Completion by Sponsor Type
Sponsor Type N Mean (days) SD Median (days) Mean (months) Median (months)
Commercial 114 1176.2 429.6 1148.5 38.6 37.7
Academic 318 1229.3 460.2 1236.0 40.4 40.6
Government 12 1069.8 457.3 1115.5 35.1 36.6

4.2 Visualizations

4.2.1 Distribution of Dependent Variable

p_hist <- ggplot(trials_clean, aes(x = time_to_final)) +
  geom_histogram(aes(y = after_stat(density)), bins = 40, 
                 fill = "#2E86AB", alpha = 0.7, color = "white") +
  geom_density(color = "#A23B72", linewidth = 1.2) +
  labs(
    title = "Distribution of Time to Final Completion",
    subtitle = "Right-skewed distribution supports Gamma or Inverse Gaussian GLM",
    x = "Time to Completion (days)",
    y = "Density"
  ) +
  scale_x_continuous(labels = comma)

p_hist

4.2.2 Box Plots: Completion Time by Sponsor Type

p_box <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = time_to_final, 
                                   fill = sponsor_type_clean)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.4) +
  scale_fill_manual(values = c("Commercial" = "#2E86AB", 
                                "Academic" = "#A23B72", 
                                "Government" = "#F18F01")) +
  labs(
    title = "Time to Completion by Sponsor Type",
    x = "Sponsor Type",
    y = "Time to Completion (days)"
  ) +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)

p_box

4.2.3 Violin Plots: Distribution Shape by Sponsor Type

p_violin <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = time_to_final, 
                                      fill = sponsor_type_clean)) +
  geom_violin(alpha = 0.5, trim = FALSE) +
  geom_boxplot(width = 0.15, alpha = 0.8, outlier.size = 1) +
  scale_fill_manual(values = c("Commercial" = "#2E86AB", 
                                "Academic" = "#A23B72", 
                                "Government" = "#F18F01")) +
  labs(
    title = "Distribution of Completion Time by Sponsor Type",
    x = "Sponsor Type",
    y = "Time to Completion (days)"
  ) +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)

p_violin

4.2.4 Covariate Relationships

p_sites <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = num_sites, 
                                     fill = sponsor_type_clean)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("Commercial" = "#2E86AB", 
                                "Academic" = "#A23B72", 
                                "Government" = "#F18F01")) +
  labs(title = "Number of Sites by Sponsor Type", x = "", y = "Number of Sites") +
  theme(legend.position = "none") +
  coord_cartesian(ylim = c(0, quantile(trials_clean$num_sites, 0.95)))

p_sites_time <- ggplot(trials_clean, aes(x = num_sites, y = time_to_final, 
                                          color = sponsor_type_clean)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(title = "Completion Time vs Number of Sites", 
       x = "Number of Sites", y = "Time to Completion (days)", color = "Sponsor") +
  coord_cartesian(xlim = c(0, quantile(trials_clean$num_sites, 0.95)))

p_enroll_time <- ggplot(trials_clean, aes(x = enrollment, y = time_to_final, 
                                           color = sponsor_type_clean)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(title = "Completion Time vs Enrollment", 
       x = "Enrollment", y = "Time to Completion (days)", color = "Sponsor") +
  coord_cartesian(xlim = c(0, quantile(trials_clean$enrollment, 0.95)))

p_covid <- ggplot(trials_clean, aes(x = months_from_covid, y = time_to_final, 
                                     color = sponsor_type_clean)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(title = "Completion Time vs COVID Timing", 
       subtitle = "Dashed line = March 2020 (WHO pandemic declaration)",
       x = "Months from COVID Start", y = "Time to Completion (days)", color = "Sponsor")

(p_sites | p_sites_time) / (p_enroll_time | p_covid)

4.2.5 Mean-Variance Relationship

mv_check <- trials_clean %>%
  mutate(bin = cut(time_to_final, breaks = 15)) %>%
  group_by(bin) %>%
  summarise(
    bin_mean = mean(time_to_final),
    bin_var = var(time_to_final),
    n = n()
  ) %>%
  filter(n >= 5)

p_mv <- ggplot(mv_check, aes(x = bin_mean, y = bin_var)) +
  geom_point(size = 3, color = "#2E86AB") +
  geom_smooth(method = "lm", se = TRUE, color = "#A23B72") +
  labs(
    title = "Mean-Variance Relationship",
    subtitle = "Positive relationship supports Gamma/Inverse Gaussian over Normal",
    x = "Bin Mean (days)",
    y = "Bin Variance"
  ) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma)

p_mv

4.2.6 Correlation Matrix

cor_vars <- trials_clean %>%
  dplyr::select(time_to_final, num_sites, enrollment, number_of_arms, months_from_covid)

cor_matrix <- cor(cor_vars, use = "complete.obs")

kable(round(cor_matrix, 3), caption = "Correlation Matrix") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Correlation Matrix
time_to_final num_sites enrollment number_of_arms months_from_covid
time_to_final 1.000 0.170 -0.038 0.015 -0.227
num_sites 0.170 1.000 0.121 0.037 -0.100
enrollment -0.038 0.121 1.000 0.235 0.009
number_of_arms 0.015 0.037 0.235 1.000 0.017
months_from_covid -0.227 -0.100 0.009 0.017 1.000

5 Generalized Linear Models

5.1 Model Specification

The GLM takes the form:

\[\log(\mu_i) = \beta_0 + \beta_1 \cdot \text{Academic}_i + \beta_2 \cdot \text{Government}_i + \beta_3 \cdot \text{NumSites}_i + \beta_4 \cdot \text{Enrollment}_i + \beta_5 \cdot \text{NumArms}_i + \beta_6 \cdot \text{MonthsFromCOVID}_i\]

where \(\mu_i = E(Y_i)\) is the expected completion time for trial \(i\).

Note: Continuous predictors are standardized (mean-centered and scaled by SD) to improve model convergence and allow comparison of effect sizes.


6 Model Comparison

6.1 Side-by-Side Fit Statistics

comparison <- data.frame(
  Metric = c("AIC", "BIC", "Residual Deviance", "Null Deviance", 
             "Deviance Explained (%)", "Dispersion Parameter"),
  Gamma = c(
    AIC(model_gamma),
    BIC(model_gamma),
    model_gamma$deviance,
    model_gamma$null.deviance,
    (1 - model_gamma$deviance / model_gamma$null.deviance) * 100,
    summary(model_gamma)$dispersion
  ),
  Inverse_Gaussian = c(
    AIC(model_ig),
    BIC(model_ig),
    model_ig$deviance,
    model_ig$null.deviance,
    (1 - model_ig$deviance / model_ig$null.deviance) * 100,
    summary(model_ig)$dispersion
  )
)

comparison$Difference <- comparison$Gamma - comparison$Inverse_Gaussian
comparison$Better <- ifelse(comparison$Difference > 0, "Inverse Gaussian", "Gamma")
comparison$Better[comparison$Metric == "Deviance Explained (%)"] <- 
  ifelse(comparison$Difference[comparison$Metric == "Deviance Explained (%)"] < 0, 
         "Inverse Gaussian", "Gamma")

kable(comparison, digits = 2, 
      caption = "Model Comparison: Gamma vs Inverse Gaussian") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  column_spec(5, bold = TRUE)
Model Comparison: Gamma vs Inverse Gaussian
Metric Gamma Inverse_Gaussian Difference Better
AIC 6765.58 4.573167e+04 -3.896609e+04 Gamma
BIC 6798.35 4.576444e+04 -3.896609e+04 Gamma
Residual Deviance 83.81 1.026203e+37 -1.026203e+37 Gamma
Null Deviance 88.44 1.800000e-01 8.826000e+01 Inverse Gaussian
Deviance Explained (%) 5.24 -5.674058e+39 5.674058e+39 Gamma
Dispersion Parameter 0.13 1.458158e+53 -1.458158e+53 Gamma

6.2 Coefficient Comparison

coef_compare <- data.frame(
  Term = names(coef(model_gamma)),
  Gamma_Est = coef(model_gamma),
  Gamma_Exp = exp(coef(model_gamma)),
  IG_Est = coef(model_ig),
  IG_Exp = exp(coef(model_ig))
)
rownames(coef_compare) <- NULL

kable(coef_compare, digits = 4, col.names = c("Term", "Gamma Est", "Gamma exp(Est)", 
                                                "IG Est", "IG exp(Est)"),
      caption = "Coefficient Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Coefficient Comparison
Term Gamma Est Gamma exp(Est) IG Est IG exp(Est)
(Intercept) 7.0869 1196.2471 -1994.2730 0.000000e+00
sponsor_type_cleanAcademic 0.0152 1.0154 178.3788 2.943907e+77
sponsor_type_cleanGovernment -0.1331 0.8753 356.2027 4.976036e+154
num_sites_scaled 0.0502 1.0515 -408.6918 0.000000e+00
enrollment_scaled -0.0221 0.9782 193.0600 6.996602e+83
number_of_arms_scaled 0.0108 1.0109 -1824.0322 0.000000e+00
months_from_covid_scaled -0.0788 0.9242 -597.6835 0.000000e+00

6.3 Residual Comparison

resid_compare <- data.frame(
  Model = rep(c("Gamma", "Inverse Gaussian"), each = nrow(trials_scaled)),
  Fitted = c(gamma_fitted, ig_fitted),
  Deviance_Residuals = c(gamma_deviance_resid, ig_deviance_resid)
)

p_qq_compare <- ggplot(resid_compare, aes(sample = Deviance_Residuals, color = Model)) +
  stat_qq(alpha = 0.5) +
  stat_qq_line() +
  scale_color_manual(values = c("Gamma" = "#2E86AB", "Inverse Gaussian" = "#A23B72")) +
  labs(title = "Q-Q Plot Comparison",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  facet_wrap(~Model)

p_resid_compare <- ggplot(resid_compare, aes(x = Fitted, y = Deviance_Residuals, color = Model)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(method = "loess", se = TRUE) +
  scale_color_manual(values = c("Gamma" = "#2E86AB", "Inverse Gaussian" = "#A23B72")) +
  labs(title = "Residuals vs Fitted Comparison",
       x = "Fitted Values", y = "Deviance Residuals") +
  facet_wrap(~Model)

p_qq_compare / p_resid_compare

6.4 Model Selection Summary

aic_winner <- ifelse(AIC(model_gamma) < AIC(model_ig), "Gamma", "Inverse Gaussian")
bic_winner <- ifelse(BIC(model_gamma) < BIC(model_ig), "Gamma", "Inverse Gaussian")

cat("=== MODEL SELECTION SUMMARY ===\n\n")
## === MODEL SELECTION SUMMARY ===
cat("AIC prefers:", aic_winner, "\n")
## AIC prefers: Gamma
cat("  Gamma AIC:", round(AIC(model_gamma), 2), "\n")
##   Gamma AIC: 6765.58
cat("  Inverse Gaussian AIC:", round(AIC(model_ig), 2), "\n")
##   Inverse Gaussian AIC: 45731.67
cat("  Difference:", round(abs(AIC(model_gamma) - AIC(model_ig)), 2), "\n\n")
##   Difference: 38966.09
cat("BIC prefers:", bic_winner, "\n")
## BIC prefers: Gamma
cat("  Gamma BIC:", round(BIC(model_gamma), 2), "\n")
##   Gamma BIC: 6798.35
cat("  Inverse Gaussian BIC:", round(BIC(model_ig), 2), "\n")
##   Inverse Gaussian BIC: 45764.44
cat("  Difference:", round(abs(BIC(model_gamma) - BIC(model_ig)), 2), "\n")
##   Difference: 38966.09

7 Interaction Model: Sponsor Type x Number of Sites

Based on the model comparison above, we fit an interaction model using the better-fitting distribution.

best_family <- if(AIC(model_gamma) <= AIC(model_ig)) {
  cat("Using Gamma distribution (lower AIC)\n\n")
  Gamma(link = "log")
} else {
  cat("Using Inverse Gaussian distribution (lower AIC)\n\n")
  inverse.gaussian(link = "log")
}
## Using Gamma distribution (lower AIC)
model_interaction <- glm(time_to_final ~ sponsor_type_clean * num_sites_scaled + enrollment_scaled + 
                           number_of_arms_scaled + months_from_covid_scaled,
                         data = trials_scaled,
                         family = best_family,
                         control = glm.control(maxit = 100))

summary(model_interaction)
## 
## Call:
## glm(formula = time_to_final ~ sponsor_type_clean * num_sites_scaled + 
##     enrollment_scaled + number_of_arms_scaled + months_from_covid_scaled, 
##     family = best_family, data = trials_scaled, control = glm.control(maxit = 100))
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                    7.063732   0.036011 196.154
## sponsor_type_cleanAcademic                     0.036410   0.042219   0.862
## sponsor_type_cleanGovernment                   0.699054   0.676852   1.033
## num_sites_scaled                               0.269275   0.089623   3.005
## enrollment_scaled                             -0.027583   0.018308  -1.507
## number_of_arms_scaled                          0.006153   0.017763   0.346
## months_from_covid_scaled                      -0.079975   0.017357  -4.608
## sponsor_type_cleanAcademic:num_sites_scaled   -0.229341   0.091221  -2.514
## sponsor_type_cleanGovernment:num_sites_scaled  3.774063   3.257933   1.158
##                                               Pr(>|t|)    
## (Intercept)                                    < 2e-16 ***
## sponsor_type_cleanAcademic                     0.38893    
## sponsor_type_cleanGovernment                   0.30227    
## num_sites_scaled                               0.00281 ** 
## enrollment_scaled                              0.13264    
## number_of_arms_scaled                          0.72921    
## months_from_covid_scaled                      5.36e-06 ***
## sponsor_type_cleanAcademic:num_sites_scaled    0.01229 *  
## sponsor_type_cleanGovernment:num_sites_scaled  0.24733    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1277541)
## 
##     Null deviance: 88.442  on 443  degrees of freedom
## Residual deviance: 82.764  on 435  degrees of freedom
## AIC: 6763.8
## 
## Number of Fisher Scoring iterations: 5

7.1 Interaction Model Coefficients

int_coefs <- data.frame(
  Term = names(coef(model_interaction)),
  Estimate = coef(model_interaction),
  SE = summary(model_interaction)$coefficients[, 2],
  z_value = summary(model_interaction)$coefficients[, 3],
  p_value = summary(model_interaction)$coefficients[, 4],
  Exp_Estimate = exp(coef(model_interaction))
)
rownames(int_coefs) <- NULL

kable(int_coefs, digits = 4, 
      caption = "Interaction Model Coefficients") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Interaction Model Coefficients
Term Estimate SE z_value p_value Exp_Estimate
(Intercept) 7.0637 0.0360 196.1543 0.0000 1168.7985
sponsor_type_cleanAcademic 0.0364 0.0422 0.8624 0.3889 1.0371
sponsor_type_cleanGovernment 0.6991 0.6769 1.0328 0.3023 2.0118
num_sites_scaled 0.2693 0.0896 3.0045 0.0028 1.3090
enrollment_scaled -0.0276 0.0183 -1.5066 0.1326 0.9728
number_of_arms_scaled 0.0062 0.0178 0.3464 0.7292 1.0062
months_from_covid_scaled -0.0800 0.0174 -4.6077 0.0000 0.9231
sponsor_type_cleanAcademic:num_sites_scaled -0.2293 0.0912 -2.5141 0.0123 0.7951
sponsor_type_cleanGovernment:num_sites_scaled 3.7741 3.2579 1.1584 0.2473 43.5567

7.2 Test Interaction Significance

if(AIC(model_gamma) <= AIC(model_ig)) {
  main_model <- model_gamma
} else {
  main_model <- model_ig
}

lr_test <- anova(main_model, model_interaction, test = "F")
print(lr_test)
## Analysis of Deviance Table
## 
## Model 1: time_to_final ~ sponsor_type_clean + num_sites_scaled + enrollment_scaled + 
##     number_of_arms_scaled + months_from_covid_scaled
## Model 2: time_to_final ~ sponsor_type_clean * num_sites_scaled + enrollment_scaled + 
##     number_of_arms_scaled + months_from_covid_scaled
##   Resid. Df Resid. Dev Df Deviance      F  Pr(>F)  
## 1       437     83.807                             
## 2       435     82.764  2   1.0431 4.0825 0.01752 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n=== Interaction Test ===\n")
## 
## === Interaction Test ===
cat("Does adding the sponsor x sites interaction significantly improve fit?\n")
## Does adding the sponsor x sites interaction significantly improve fit?
cat("F-test p-value:", round(lr_test$`Pr(>F)`[2], 4), "\n")
## F-test p-value: 0.0175
if(lr_test$`Pr(>F)`[2] < 0.05) {
  cat("Conclusion: YES - Interaction is statistically significant\n")
} else {
  cat("Conclusion: NO - Interaction is not statistically significant\n")
}
## Conclusion: YES - Interaction is statistically significant

7.3 Visualize Interaction

pred_data <- expand.grid(
  sponsor_type_clean = levels(trials_scaled$sponsor_type_clean),
  num_sites_scaled = seq(min(trials_scaled$num_sites_scaled), 
                          quantile(trials_scaled$num_sites_scaled, 0.90), 
                          length.out = 50),
  enrollment_scaled = 0,
  number_of_arms_scaled = 0,
  months_from_covid_scaled = 0
)

pred_data$predicted <- predict(model_interaction, newdata = pred_data, type = "response")

# Back-transform num_sites for x-axis
pred_data$num_sites_original <- pred_data$num_sites_scaled * scale_params$SD[1] + scale_params$Mean[1]

ggplot(pred_data, aes(x = num_sites_original, y = predicted, color = sponsor_type_clean)) +
  geom_line(linewidth = 1.2) +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(
    title = "Predicted Completion Time by Number of Sites and Sponsor Type",
    subtitle = "Interaction Model (other covariates held at mean)",
    x = "Number of Clinical Sites",
    y = "Predicted Time to Completion (days)",
    color = "Sponsor Type"
  ) +
  scale_y_continuous(labels = comma) +
  theme(legend.position = "bottom")


8 Conclusions

8.1 Summary of Findings

cat("=== FINAL SUMMARY ===\n\n")
## === FINAL SUMMARY ===
cat("1. RESEARCH QUESTION\n")
## 1. RESEARCH QUESTION
cat("   Does sponsor type predict Phase 2 oncology trial completion time during COVID-19?\n\n")
##    Does sponsor type predict Phase 2 oncology trial completion time during COVID-19?
cat("2. SAMPLE\n")
## 2. SAMPLE
cat("   N =", nrow(trials_clean), "completed Phase 2 solid tumor trials\n")
##    N = 444 completed Phase 2 solid tumor trials
cat("   Period: March 2019 - March 2021\n\n")
##    Period: March 2019 - March 2021
cat("3. MODEL SELECTION\n")
## 3. MODEL SELECTION
cat("   Best fitting distribution:", aic_winner, "\n")
##    Best fitting distribution: Gamma
cat("   AIC difference:", round(abs(AIC(model_gamma) - AIC(model_ig)), 2), "\n\n")
##    AIC difference: 38966.09
cat("4. KEY FINDINGS (from", aic_winner, "model)\n")
## 4. KEY FINDINGS (from Gamma model)
if(aic_winner == "Gamma") {
  best_model <- model_gamma
} else {
  best_model <- model_ig
}

p_academic <- summary(best_model)$coefficients["sponsor_type_cleanAcademic", 4]
p_gov <- summary(best_model)$coefficients["sponsor_type_cleanGovernment", 4]

cat("   Academic vs Commercial: ", 
    round((exp(coef(best_model)["sponsor_type_cleanAcademic"]) - 1) * 100, 1), 
    "% difference (p = ", round(p_academic, 4), ")\n", sep = "")
##    Academic vs Commercial: 1.5% difference (p = 0.7159)
cat("   Government vs Commercial: ", 
    round((exp(coef(best_model)["sponsor_type_cleanGovernment"]) - 1) * 100, 1), 
    "% difference (p = ", round(p_gov, 4), ")\n\n", sep = "")
##    Government vs Commercial: -12.5% difference (p = 0.2265)
cat("5. SIGNIFICANT COVARIATES\n")
## 5. SIGNIFICANT COVARIATES
coef_summary <- summary(best_model)$coefficients
sig_covars <- rownames(coef_summary)[coef_summary[, 4] < 0.05]
sig_covars <- sig_covars[sig_covars != "(Intercept)"]
for(cv in sig_covars) {
  cat("   ", cv, ": exp(b) = ", round(exp(coef(best_model)[cv]), 4), 
      ", p = ", round(coef_summary[cv, 4], 4), "\n", sep = "")
}
##    num_sites_scaled: exp(b) = 1.0515, p = 0.0039
##    months_from_covid_scaled: exp(b) = 0.9242, p = 0

8.2 Why One Distribution Fits Better

cat("\n=== DISTRIBUTION CHOICE RATIONALE ===\n\n")
## 
## === DISTRIBUTION CHOICE RATIONALE ===
if(aic_winner == "Inverse Gaussian") {
  cat("The Inverse Gaussian distribution provided better fit, likely because:\n\n")
  cat("1. HEAVIER RIGHT TAIL: Clinical trial completion times often have extreme\n")
  cat("   outliers where a few trials run much longer than expected. The Inverse\n")
  cat("   Gaussian's heavier tail better accommodates these observations.\n\n")
  cat("2. VARIANCE-MEAN RELATIONSHIP: The Inverse Gaussian assumes variance is\n")
  cat("   proportional to mu^3 (vs mu^2 for Gamma). This may better reflect the\n")
  cat("   reality that uncertainty compounds more dramatically for longer trials.\n\n")
  cat("3. PROCESS INTERPRETATION: The Inverse Gaussian arises naturally as the\n")
  cat("   first passage time distribution, which aligns conceptually with trial\n")
  cat("   completion as a 'time to reach a threshold' process.\n")
} else {
  cat("The Gamma distribution provided better fit, likely because:\n\n")
  cat("1. MODERATE SKEWNESS: The completion time distribution shows right skew\n")
  cat("   but not extreme outliers, which Gamma handles well.\n\n")
  cat("2. VARIANCE-MEAN RELATIONSHIP: The Gamma's assumption that variance is\n")
  cat("   proportional to mu^2 appears to match the observed pattern in the data.\n\n")
  cat("3. ROBUSTNESS: Gamma GLMs are generally more stable and less sensitive\n")
  cat("   to influential observations than Inverse Gaussian models.\n")
}
## The Gamma distribution provided better fit, likely because:
## 
## 1. MODERATE SKEWNESS: The completion time distribution shows right skew
##    but not extreme outliers, which Gamma handles well.
## 
## 2. VARIANCE-MEAN RELATIONSHIP: The Gamma's assumption that variance is
##    proportional to mu^2 appears to match the observed pattern in the data.
## 
## 3. ROBUSTNESS: Gamma GLMs are generally more stable and less sensitive
##    to influential observations than Inverse Gaussian models.

9 Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 knitr_1.50       car_3.1-3        carData_3.0-5   
##  [5] MASS_7.3-65      statmod_1.5.1    patchwork_1.3.2  scales_1.4.0    
##  [9] ggplot2_4.0.0    tidyr_1.3.1      dplyr_1.1.4     
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.4.0         lattice_0.22-7    
##  [5] stringi_1.8.7      digest_0.6.37      magrittr_2.0.3     evaluate_1.0.5    
##  [9] grid_4.5.1         RColorBrewer_1.1-3 fastmap_1.2.0      Matrix_1.7-3      
## [13] jsonlite_2.0.0     Formula_1.2-5      mgcv_1.9-3         purrr_1.1.0       
## [17] viridisLite_0.4.2  textshaping_1.0.3  jquerylib_0.1.4    abind_1.4-8       
## [21] cli_3.6.5          rlang_1.1.6        splines_4.5.1      withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.10        tools_4.5.1        vctrs_0.6.5       
## [29] R6_2.6.1           lifecycle_1.0.4    stringr_1.5.1      pkgconfig_2.0.3   
## [33] pillar_1.11.0      bslib_0.9.0        gtable_0.3.6       glue_1.8.0        
## [37] systemfonts_1.3.1  xfun_0.52          tibble_3.3.0       tidyselect_1.2.1  
## [41] rstudioapi_0.17.1  farver_2.1.2       nlme_3.1-168       htmltools_0.5.8.1 
## [45] rmarkdown_2.30     svglite_2.2.1      labeling_0.4.3     compiler_4.5.1    
## [49] S7_0.2.0