Among Phase 2 solid tumor oncology trials initiated during the early COVID-19 period (March 2019 through March 2021), does sponsor type (commercial, academic, or government) predict time to study completion?
This analysis employs Generalized Linear Models (GLMs) to examine the relationship between sponsor type and trial completion time. The dependent variable (time to completion in days) is continuous, strictly positive, and expected to be right-skewed with variance increasing with the mean. These characteristics suggest distributions from the exponential family that accommodate positive, skewed outcomes:
Both models use a log link function, which ensures predicted values remain positive and allows interpretation of coefficients as multiplicative effects on completion time.
In addition to sponsor type, the following covariates are included:
covid_start <- as.Date("2020-03-01")
trials_processed <- trials %>%
mutate(
sponsor_type_clean = case_when(
sponsor_type == "INDUSTRY" ~ "Commercial",
sponsor_type %in% c("NIH", "FED", "OTHER_GOV") ~ "Government",
sponsor_type %in% c("OTHER", "NETWORK", "UNKNOWN", "INDIV", "AMBIG") ~ "Academic",
TRUE ~ "Academic"
),
start_date = as.Date(start_date),
completion_date = as.Date(completion_date),
time_to_final = as.numeric(completion_date - start_date),
months_from_covid = as.numeric(start_date - covid_start) / 30.44,
num_sites = as.numeric(num_sites),
sponsor_type_clean = factor(sponsor_type_clean,
levels = c("Commercial", "Academic", "Government"))
)
cat("Initial dataset:", nrow(trials_processed), "trials\n")## Initial dataset: 2125 trials
trials_clean <- trials_processed %>%
filter(overall_status == "COMPLETED") %>%
filter(time_to_final > 0) %>%
filter(!is.na(time_to_final)) %>%
filter(!is.na(num_sites)) %>%
filter(!is.na(enrollment)) %>%
filter(!is.na(number_of_arms))
cat("After filtering to COMPLETED trials:", nrow(trials_clean), "trials\n")## After filtering to COMPLETED trials: 444 trials
trials_scaled <- trials_clean %>%
mutate(
num_sites_scaled = scale(num_sites)[,1],
enrollment_scaled = scale(enrollment)[,1],
number_of_arms_scaled = scale(number_of_arms)[,1],
months_from_covid_scaled = scale(months_from_covid)[,1]
)
# Store scaling parameters for interpretation
scale_params <- data.frame(
Variable = c("num_sites", "enrollment", "number_of_arms", "months_from_covid"),
Mean = c(mean(trials_clean$num_sites), mean(trials_clean$enrollment),
mean(trials_clean$number_of_arms), mean(trials_clean$months_from_covid)),
SD = c(sd(trials_clean$num_sites), sd(trials_clean$enrollment),
sd(trials_clean$number_of_arms), sd(trials_clean$months_from_covid))
)
kable(scale_params, digits = 2, caption = "Scaling Parameters (for back-transformation)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Variable | Mean | SD |
|---|---|---|
| num_sites | 15.93 | 69.59 |
| enrollment | 63.48 | 71.68 |
| number_of_arms | 1.62 | 1.18 |
| months_from_covid | 0.71 | 7.63 |
cleaning_summary <- data.frame(
Step = c("Raw dataset from AACT",
"After removing non-COMPLETED status",
"After removing invalid completion times",
"After removing missing covariates"),
N = c(nrow(trials_processed),
sum(trials_processed$overall_status == "COMPLETED"),
sum(trials_processed$overall_status == "COMPLETED" &
trials_processed$time_to_final > 0, na.rm = TRUE),
nrow(trials_clean))
)
kable(cleaning_summary, caption = "Data Cleaning Steps") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Step | N |
|---|---|
| Raw dataset from AACT | 2125 |
| After removing non-COMPLETED status | 445 |
| After removing invalid completion times | 444 |
| After removing missing covariates | 444 |
sponsor_table <- trials_clean %>%
group_by(sponsor_type_clean) %>%
summarise(
N = n(),
Percent = round(n() / nrow(trials_clean) * 100, 1)
) %>%
rename(`Sponsor Type` = sponsor_type_clean)
kable(sponsor_table, caption = "Distribution of Trials by Sponsor Type") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Sponsor Type | N | Percent |
|---|---|---|
| Commercial | 114 | 25.7 |
| Academic | 318 | 71.6 |
| Government | 12 | 2.7 |
desc_stats <- data.frame(
Variable = c("Time to Completion (days)", "Number of Sites",
"Enrollment", "Number of Arms", "Months from COVID"),
N = rep(nrow(trials_clean), 5),
Mean = c(mean(trials_clean$time_to_final), mean(trials_clean$num_sites),
mean(trials_clean$enrollment), mean(trials_clean$number_of_arms),
mean(trials_clean$months_from_covid)),
SD = c(sd(trials_clean$time_to_final), sd(trials_clean$num_sites),
sd(trials_clean$enrollment), sd(trials_clean$number_of_arms),
sd(trials_clean$months_from_covid)),
Median = c(median(trials_clean$time_to_final), median(trials_clean$num_sites),
median(trials_clean$enrollment), median(trials_clean$number_of_arms),
median(trials_clean$months_from_covid)),
Min = c(min(trials_clean$time_to_final), min(trials_clean$num_sites),
min(trials_clean$enrollment), min(trials_clean$number_of_arms),
min(trials_clean$months_from_covid)),
Max = c(max(trials_clean$time_to_final), max(trials_clean$num_sites),
max(trials_clean$enrollment), max(trials_clean$number_of_arms),
max(trials_clean$months_from_covid))
)
kable(desc_stats, digits = 2, caption = "Descriptive Statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Variable | N | Mean | SD | Median | Min | Max |
|---|---|---|---|---|---|---|
| Time to Completion (days) | 444 | 1211.34 | 452.68 | 1222.00 | 13.00 | 2313.00 |
| Number of Sites | 444 | 15.93 | 69.59 | 1.00 | 1.00 | 950.00 |
| Enrollment | 444 | 63.48 | 71.68 | 42.00 | 1.00 | 627.00 |
| Number of Arms | 444 | 1.62 | 1.18 | 1.00 | 1.00 | 15.00 |
| Months from COVID | 444 | 0.71 | 7.63 | 1.92 | -12.02 | 12.98 |
completion_by_sponsor <- trials_clean %>%
group_by(sponsor_type_clean) %>%
summarise(
N = n(),
`Mean (days)` = mean(time_to_final),
`SD` = sd(time_to_final),
`Median (days)` = median(time_to_final),
`Mean (months)` = mean(time_to_final) / 30.44,
`Median (months)` = median(time_to_final) / 30.44
) %>%
rename(`Sponsor Type` = sponsor_type_clean)
kable(completion_by_sponsor, digits = 1,
caption = "Time to Completion by Sponsor Type") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Sponsor Type | N | Mean (days) | SD | Median (days) | Mean (months) | Median (months) |
|---|---|---|---|---|---|---|
| Commercial | 114 | 1176.2 | 429.6 | 1148.5 | 38.6 | 37.7 |
| Academic | 318 | 1229.3 | 460.2 | 1236.0 | 40.4 | 40.6 |
| Government | 12 | 1069.8 | 457.3 | 1115.5 | 35.1 | 36.6 |
p_hist <- ggplot(trials_clean, aes(x = time_to_final)) +
geom_histogram(aes(y = after_stat(density)), bins = 40,
fill = "#2E86AB", alpha = 0.7, color = "white") +
geom_density(color = "#A23B72", linewidth = 1.2) +
labs(
title = "Distribution of Time to Final Completion",
subtitle = "Right-skewed distribution supports Gamma or Inverse Gaussian GLM",
x = "Time to Completion (days)",
y = "Density"
) +
scale_x_continuous(labels = comma)
p_histp_box <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = time_to_final,
fill = sponsor_type_clean)) +
geom_boxplot(alpha = 0.7, outlier.alpha = 0.4) +
scale_fill_manual(values = c("Commercial" = "#2E86AB",
"Academic" = "#A23B72",
"Government" = "#F18F01")) +
labs(
title = "Time to Completion by Sponsor Type",
x = "Sponsor Type",
y = "Time to Completion (days)"
) +
theme(legend.position = "none") +
scale_y_continuous(labels = comma)
p_boxp_violin <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = time_to_final,
fill = sponsor_type_clean)) +
geom_violin(alpha = 0.5, trim = FALSE) +
geom_boxplot(width = 0.15, alpha = 0.8, outlier.size = 1) +
scale_fill_manual(values = c("Commercial" = "#2E86AB",
"Academic" = "#A23B72",
"Government" = "#F18F01")) +
labs(
title = "Distribution of Completion Time by Sponsor Type",
x = "Sponsor Type",
y = "Time to Completion (days)"
) +
theme(legend.position = "none") +
scale_y_continuous(labels = comma)
p_violinp_sites <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = num_sites,
fill = sponsor_type_clean)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("Commercial" = "#2E86AB",
"Academic" = "#A23B72",
"Government" = "#F18F01")) +
labs(title = "Number of Sites by Sponsor Type", x = "", y = "Number of Sites") +
theme(legend.position = "none") +
coord_cartesian(ylim = c(0, quantile(trials_clean$num_sites, 0.95)))
p_sites_time <- ggplot(trials_clean, aes(x = num_sites, y = time_to_final,
color = sponsor_type_clean)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
scale_color_manual(values = c("Commercial" = "#2E86AB",
"Academic" = "#A23B72",
"Government" = "#F18F01")) +
labs(title = "Completion Time vs Number of Sites",
x = "Number of Sites", y = "Time to Completion (days)", color = "Sponsor") +
coord_cartesian(xlim = c(0, quantile(trials_clean$num_sites, 0.95)))
p_enroll_time <- ggplot(trials_clean, aes(x = enrollment, y = time_to_final,
color = sponsor_type_clean)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
scale_color_manual(values = c("Commercial" = "#2E86AB",
"Academic" = "#A23B72",
"Government" = "#F18F01")) +
labs(title = "Completion Time vs Enrollment",
x = "Enrollment", y = "Time to Completion (days)", color = "Sponsor") +
coord_cartesian(xlim = c(0, quantile(trials_clean$enrollment, 0.95)))
p_covid <- ggplot(trials_clean, aes(x = months_from_covid, y = time_to_final,
color = sponsor_type_clean)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
scale_color_manual(values = c("Commercial" = "#2E86AB",
"Academic" = "#A23B72",
"Government" = "#F18F01")) +
labs(title = "Completion Time vs COVID Timing",
subtitle = "Dashed line = March 2020 (WHO pandemic declaration)",
x = "Months from COVID Start", y = "Time to Completion (days)", color = "Sponsor")
(p_sites | p_sites_time) / (p_enroll_time | p_covid)mv_check <- trials_clean %>%
mutate(bin = cut(time_to_final, breaks = 15)) %>%
group_by(bin) %>%
summarise(
bin_mean = mean(time_to_final),
bin_var = var(time_to_final),
n = n()
) %>%
filter(n >= 5)
p_mv <- ggplot(mv_check, aes(x = bin_mean, y = bin_var)) +
geom_point(size = 3, color = "#2E86AB") +
geom_smooth(method = "lm", se = TRUE, color = "#A23B72") +
labs(
title = "Mean-Variance Relationship",
subtitle = "Positive relationship supports Gamma/Inverse Gaussian over Normal",
x = "Bin Mean (days)",
y = "Bin Variance"
) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma)
p_mvcor_vars <- trials_clean %>%
dplyr::select(time_to_final, num_sites, enrollment, number_of_arms, months_from_covid)
cor_matrix <- cor(cor_vars, use = "complete.obs")
kable(round(cor_matrix, 3), caption = "Correlation Matrix") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| time_to_final | num_sites | enrollment | number_of_arms | months_from_covid | |
|---|---|---|---|---|---|
| time_to_final | 1.000 | 0.170 | -0.038 | 0.015 | -0.227 |
| num_sites | 0.170 | 1.000 | 0.121 | 0.037 | -0.100 |
| enrollment | -0.038 | 0.121 | 1.000 | 0.235 | 0.009 |
| number_of_arms | 0.015 | 0.037 | 0.235 | 1.000 | 0.017 |
| months_from_covid | -0.227 | -0.100 | 0.009 | 0.017 | 1.000 |
The GLM takes the form:
\[\log(\mu_i) = \beta_0 + \beta_1 \cdot \text{Academic}_i + \beta_2 \cdot \text{Government}_i + \beta_3 \cdot \text{NumSites}_i + \beta_4 \cdot \text{Enrollment}_i + \beta_5 \cdot \text{NumArms}_i + \beta_6 \cdot \text{MonthsFromCOVID}_i\]
where \(\mu_i = E(Y_i)\) is the expected completion time for trial \(i\).
Note: Continuous predictors are standardized (mean-centered and scaled by SD) to improve model convergence and allow comparison of effect sizes.
model_gamma <- glm(time_to_final ~ sponsor_type_clean + num_sites_scaled + enrollment_scaled +
number_of_arms_scaled + months_from_covid_scaled,
data = trials_scaled,
family = Gamma(link = "log"))
summary(model_gamma)##
## Call:
## glm(formula = time_to_final ~ sponsor_type_clean + num_sites_scaled +
## enrollment_scaled + number_of_arms_scaled + months_from_covid_scaled,
## family = Gamma(link = "log"), data = trials_scaled)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.08694 0.03534 200.545 < 2e-16 ***
## sponsor_type_cleanAcademic 0.01524 0.04185 0.364 0.71591
## sponsor_type_cleanGovernment -0.13313 0.10992 -1.211 0.22648
## num_sites_scaled 0.05021 0.01731 2.900 0.00392 **
## enrollment_scaled -0.02209 0.01822 -1.212 0.22608
## number_of_arms_scaled 0.01079 0.01777 0.607 0.54399
## months_from_covid_scaled -0.07882 0.01737 -4.538 7.37e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.1290495)
##
## Null deviance: 88.442 on 443 degrees of freedom
## Residual deviance: 83.807 on 437 degrees of freedom
## AIC: 6765.6
##
## Number of Fisher Scoring iterations: 5
gamma_coefs <- data.frame(
Term = names(coef(model_gamma)),
Estimate = coef(model_gamma),
SE = summary(model_gamma)$coefficients[, 2],
z_value = summary(model_gamma)$coefficients[, 3],
p_value = summary(model_gamma)$coefficients[, 4],
Exp_Estimate = exp(coef(model_gamma))
)
rownames(gamma_coefs) <- NULL
kable(gamma_coefs, digits = 4,
caption = "Gamma GLM Coefficients (Log Link)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | z_value | p_value | Exp_Estimate |
|---|---|---|---|---|---|
| (Intercept) | 7.0869 | 0.0353 | 200.5454 | 0.0000 | 1196.2471 |
| sponsor_type_cleanAcademic | 0.0152 | 0.0419 | 0.3642 | 0.7159 | 1.0154 |
| sponsor_type_cleanGovernment | -0.1331 | 0.1099 | -1.2112 | 0.2265 | 0.8753 |
| num_sites_scaled | 0.0502 | 0.0173 | 2.8997 | 0.0039 | 1.0515 |
| enrollment_scaled | -0.0221 | 0.0182 | -1.2122 | 0.2261 | 0.9782 |
| number_of_arms_scaled | 0.0108 | 0.0178 | 0.6073 | 0.5440 | 1.0109 |
| months_from_covid_scaled | -0.0788 | 0.0174 | -4.5375 | 0.0000 | 0.9242 |
## Multiplicative Effects (Gamma Model):
academic_effect <- exp(coef(model_gamma)["sponsor_type_cleanAcademic"])
gov_effect <- exp(coef(model_gamma)["sponsor_type_cleanGovernment"])
cat("Academic vs Commercial: ", round((academic_effect - 1) * 100, 1),
"% change in completion time\n", sep = "")## Academic vs Commercial: 1.5% change in completion time
cat("Government vs Commercial: ", round((gov_effect - 1) * 100, 1),
"% change in completion time\n\n", sep = "")## Government vs Commercial: -12.5% change in completion time
cat("Per 1 SD increase in num_sites: ", round((exp(coef(model_gamma)["num_sites_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in num_sites: 5.15% change
cat("Per 1 SD increase in enrollment: ", round((exp(coef(model_gamma)["enrollment_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in enrollment: -2.18% change
cat("Per 1 SD increase in number_of_arms: ", round((exp(coef(model_gamma)["number_of_arms_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in number_of_arms: 1.09% change
cat("Per 1 SD increase in months_from_covid: ", round((exp(coef(model_gamma)["months_from_covid_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in months_from_covid: -7.58% change
gamma_deviance_resid <- residuals(model_gamma, type = "deviance")
gamma_pearson_resid <- residuals(model_gamma, type = "pearson")
gamma_fitted <- fitted(model_gamma)
gamma_diag <- data.frame(
fitted = gamma_fitted,
deviance_resid = gamma_deviance_resid,
pearson_resid = gamma_pearson_resid,
observed = trials_scaled$time_to_final
)
p1_gamma <- ggplot(gamma_diag, aes(x = fitted, y = deviance_resid)) +
geom_point(alpha = 0.4, color = "#2E86AB") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
labs(title = "Gamma: Deviance Residuals vs Fitted",
x = "Fitted Values", y = "Deviance Residuals")
p2_gamma <- ggplot(gamma_diag, aes(sample = deviance_resid)) +
stat_qq(color = "#2E86AB", alpha = 0.5) +
stat_qq_line(color = "#A23B72", linewidth = 1) +
labs(title = "Gamma: Q-Q Plot of Deviance Residuals",
x = "Theoretical Quantiles", y = "Sample Quantiles")
p3_gamma <- ggplot(gamma_diag, aes(x = fitted, y = sqrt(abs(deviance_resid)))) +
geom_point(alpha = 0.4, color = "#2E86AB") +
geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
labs(title = "Gamma: Scale-Location Plot",
x = "Fitted Values", y = "sqrt|Deviance Residuals|")
p4_gamma <- ggplot(gamma_diag, aes(x = fitted, y = observed)) +
geom_point(alpha = 0.4, color = "#2E86AB") +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Gamma: Observed vs Fitted",
x = "Fitted Values", y = "Observed Values") +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma)
(p1_gamma | p2_gamma) / (p3_gamma | p4_gamma)gamma_stats <- data.frame(
Metric = c("Null Deviance", "Residual Deviance", "AIC", "BIC",
"Dispersion Parameter", "Degrees of Freedom (Residual)"),
Value = c(
model_gamma$null.deviance,
model_gamma$deviance,
AIC(model_gamma),
BIC(model_gamma),
summary(model_gamma)$dispersion,
model_gamma$df.residual
)
)
kable(gamma_stats, digits = 2, caption = "Gamma GLM Fit Statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| Null Deviance | 88.44 |
| Residual Deviance | 83.81 |
| AIC | 6765.58 |
| BIC | 6798.35 |
| Dispersion Parameter | 0.13 |
| Degrees of Freedom (Residual) | 437.00 |
model_ig <- glm(time_to_final ~ sponsor_type_clean + num_sites_scaled + enrollment_scaled +
number_of_arms_scaled + months_from_covid_scaled,
data = trials_scaled,
family = inverse.gaussian(link = "log"),
control = glm.control(maxit = 100))
summary(model_ig)##
## Call:
## glm(formula = time_to_final ~ sponsor_type_clean + num_sites_scaled +
## enrollment_scaled + number_of_arms_scaled + months_from_covid_scaled,
## family = inverse.gaussian(link = "log"), data = trials_scaled,
## control = glm.control(maxit = 100))
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.994e+03 5.762e+17 0 1
## sponsor_type_cleanAcademic 1.784e+02 6.856e+17 0 1
## sponsor_type_cleanGovernment 3.562e+02 1.813e+18 0 1
## num_sites_scaled -4.087e+02 2.766e+17 0 1
## enrollment_scaled 1.931e+02 3.507e+17 0 1
## number_of_arms_scaled -1.824e+03 2.874e+17 0 1
## months_from_covid_scaled -5.977e+02 2.968e+17 0 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 1.458158e+53)
##
## Null deviance: 1.8086e-01 on 443 degrees of freedom
## Residual deviance: 1.0262e+37 on 437 degrees of freedom
## AIC: 45732
##
## Number of Fisher Scoring iterations: 7
ig_coefs <- data.frame(
Term = names(coef(model_ig)),
Estimate = coef(model_ig),
SE = summary(model_ig)$coefficients[, 2],
z_value = summary(model_ig)$coefficients[, 3],
p_value = summary(model_ig)$coefficients[, 4],
Exp_Estimate = exp(coef(model_ig))
)
rownames(ig_coefs) <- NULL
kable(ig_coefs, digits = 4,
caption = "Inverse Gaussian GLM Coefficients (Log Link)") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | z_value | p_value | Exp_Estimate |
|---|---|---|---|---|---|
| (Intercept) | -1994.2730 | 5.762382e+17 | 0 | 1 | 0.000000e+00 |
| sponsor_type_cleanAcademic | 178.3788 | 6.855617e+17 | 0 | 1 | 2.943907e+77 |
| sponsor_type_cleanGovernment | 356.2027 | 1.812901e+18 | 0 | 1 | 4.976036e+154 |
| num_sites_scaled | -408.6918 | 2.766446e+17 | 0 | 1 | 0.000000e+00 |
| enrollment_scaled | 193.0600 | 3.507351e+17 | 0 | 1 | 6.996602e+83 |
| number_of_arms_scaled | -1824.0322 | 2.873803e+17 | 0 | 1 | 0.000000e+00 |
| months_from_covid_scaled | -597.6835 | 2.967745e+17 | 0 | 1 | 0.000000e+00 |
## Multiplicative Effects (Inverse Gaussian Model):
academic_effect_ig <- exp(coef(model_ig)["sponsor_type_cleanAcademic"])
gov_effect_ig <- exp(coef(model_ig)["sponsor_type_cleanGovernment"])
cat("Academic vs Commercial: ", round((academic_effect_ig - 1) * 100, 1),
"% change in completion time\n", sep = "")## Academic vs Commercial: 2.943907e+79% change in completion time
cat("Government vs Commercial: ", round((gov_effect_ig - 1) * 100, 1),
"% change in completion time\n\n", sep = "")## Government vs Commercial: 4.976036e+156% change in completion time
cat("Per 1 SD increase in num_sites: ", round((exp(coef(model_ig)["num_sites_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in num_sites: -100% change
cat("Per 1 SD increase in enrollment: ", round((exp(coef(model_ig)["enrollment_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in enrollment: 6.996602e+85% change
cat("Per 1 SD increase in number_of_arms: ", round((exp(coef(model_ig)["number_of_arms_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in number_of_arms: -100% change
cat("Per 1 SD increase in months_from_covid: ", round((exp(coef(model_ig)["months_from_covid_scaled"]) - 1) * 100, 2),
"% change\n", sep = "")## Per 1 SD increase in months_from_covid: -100% change
ig_deviance_resid <- residuals(model_ig, type = "deviance")
ig_pearson_resid <- residuals(model_ig, type = "pearson")
ig_fitted <- fitted(model_ig)
ig_diag <- data.frame(
fitted = ig_fitted,
deviance_resid = ig_deviance_resid,
pearson_resid = ig_pearson_resid,
observed = trials_scaled$time_to_final
)
p1_ig <- ggplot(ig_diag, aes(x = fitted, y = deviance_resid)) +
geom_point(alpha = 0.4, color = "#2E86AB") +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
labs(title = "Inverse Gaussian: Deviance Residuals vs Fitted",
x = "Fitted Values", y = "Deviance Residuals")
p2_ig <- ggplot(ig_diag, aes(sample = deviance_resid)) +
stat_qq(color = "#2E86AB", alpha = 0.5) +
stat_qq_line(color = "#A23B72", linewidth = 1) +
labs(title = "Inverse Gaussian: Q-Q Plot of Deviance Residuals",
x = "Theoretical Quantiles", y = "Sample Quantiles")
p3_ig <- ggplot(ig_diag, aes(x = fitted, y = sqrt(abs(deviance_resid)))) +
geom_point(alpha = 0.4, color = "#2E86AB") +
geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
labs(title = "Inverse Gaussian: Scale-Location Plot",
x = "Fitted Values", y = "sqrt|Deviance Residuals|")
p4_ig <- ggplot(ig_diag, aes(x = fitted, y = observed)) +
geom_point(alpha = 0.4, color = "#2E86AB") +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Inverse Gaussian: Observed vs Fitted",
x = "Fitted Values", y = "Observed Values") +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma)
(p1_ig | p2_ig) / (p3_ig | p4_ig)ig_stats <- data.frame(
Metric = c("Null Deviance", "Residual Deviance", "AIC", "BIC",
"Dispersion Parameter", "Degrees of Freedom (Residual)"),
Value = c(
model_ig$null.deviance,
model_ig$deviance,
AIC(model_ig),
BIC(model_ig),
summary(model_ig)$dispersion,
model_ig$df.residual
)
)
kable(ig_stats, digits = 2, caption = "Inverse Gaussian GLM Fit Statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Metric | Value |
|---|---|
| Null Deviance | 1.800000e-01 |
| Residual Deviance | 1.026203e+37 |
| AIC | 4.573167e+04 |
| BIC | 4.576444e+04 |
| Dispersion Parameter | 1.458158e+53 |
| Degrees of Freedom (Residual) | 4.370000e+02 |
comparison <- data.frame(
Metric = c("AIC", "BIC", "Residual Deviance", "Null Deviance",
"Deviance Explained (%)", "Dispersion Parameter"),
Gamma = c(
AIC(model_gamma),
BIC(model_gamma),
model_gamma$deviance,
model_gamma$null.deviance,
(1 - model_gamma$deviance / model_gamma$null.deviance) * 100,
summary(model_gamma)$dispersion
),
Inverse_Gaussian = c(
AIC(model_ig),
BIC(model_ig),
model_ig$deviance,
model_ig$null.deviance,
(1 - model_ig$deviance / model_ig$null.deviance) * 100,
summary(model_ig)$dispersion
)
)
comparison$Difference <- comparison$Gamma - comparison$Inverse_Gaussian
comparison$Better <- ifelse(comparison$Difference > 0, "Inverse Gaussian", "Gamma")
comparison$Better[comparison$Metric == "Deviance Explained (%)"] <-
ifelse(comparison$Difference[comparison$Metric == "Deviance Explained (%)"] < 0,
"Inverse Gaussian", "Gamma")
kable(comparison, digits = 2,
caption = "Model Comparison: Gamma vs Inverse Gaussian") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
column_spec(5, bold = TRUE)| Metric | Gamma | Inverse_Gaussian | Difference | Better |
|---|---|---|---|---|
| AIC | 6765.58 | 4.573167e+04 | -3.896609e+04 | Gamma |
| BIC | 6798.35 | 4.576444e+04 | -3.896609e+04 | Gamma |
| Residual Deviance | 83.81 | 1.026203e+37 | -1.026203e+37 | Gamma |
| Null Deviance | 88.44 | 1.800000e-01 | 8.826000e+01 | Inverse Gaussian |
| Deviance Explained (%) | 5.24 | -5.674058e+39 | 5.674058e+39 | Gamma |
| Dispersion Parameter | 0.13 | 1.458158e+53 | -1.458158e+53 | Gamma |
coef_compare <- data.frame(
Term = names(coef(model_gamma)),
Gamma_Est = coef(model_gamma),
Gamma_Exp = exp(coef(model_gamma)),
IG_Est = coef(model_ig),
IG_Exp = exp(coef(model_ig))
)
rownames(coef_compare) <- NULL
kable(coef_compare, digits = 4, col.names = c("Term", "Gamma Est", "Gamma exp(Est)",
"IG Est", "IG exp(Est)"),
caption = "Coefficient Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Gamma Est | Gamma exp(Est) | IG Est | IG exp(Est) |
|---|---|---|---|---|
| (Intercept) | 7.0869 | 1196.2471 | -1994.2730 | 0.000000e+00 |
| sponsor_type_cleanAcademic | 0.0152 | 1.0154 | 178.3788 | 2.943907e+77 |
| sponsor_type_cleanGovernment | -0.1331 | 0.8753 | 356.2027 | 4.976036e+154 |
| num_sites_scaled | 0.0502 | 1.0515 | -408.6918 | 0.000000e+00 |
| enrollment_scaled | -0.0221 | 0.9782 | 193.0600 | 6.996602e+83 |
| number_of_arms_scaled | 0.0108 | 1.0109 | -1824.0322 | 0.000000e+00 |
| months_from_covid_scaled | -0.0788 | 0.9242 | -597.6835 | 0.000000e+00 |
resid_compare <- data.frame(
Model = rep(c("Gamma", "Inverse Gaussian"), each = nrow(trials_scaled)),
Fitted = c(gamma_fitted, ig_fitted),
Deviance_Residuals = c(gamma_deviance_resid, ig_deviance_resid)
)
p_qq_compare <- ggplot(resid_compare, aes(sample = Deviance_Residuals, color = Model)) +
stat_qq(alpha = 0.5) +
stat_qq_line() +
scale_color_manual(values = c("Gamma" = "#2E86AB", "Inverse Gaussian" = "#A23B72")) +
labs(title = "Q-Q Plot Comparison",
x = "Theoretical Quantiles", y = "Sample Quantiles") +
facet_wrap(~Model)
p_resid_compare <- ggplot(resid_compare, aes(x = Fitted, y = Deviance_Residuals, color = Model)) +
geom_point(alpha = 0.3) +
geom_hline(yintercept = 0, linetype = "dashed") +
geom_smooth(method = "loess", se = TRUE) +
scale_color_manual(values = c("Gamma" = "#2E86AB", "Inverse Gaussian" = "#A23B72")) +
labs(title = "Residuals vs Fitted Comparison",
x = "Fitted Values", y = "Deviance Residuals") +
facet_wrap(~Model)
p_qq_compare / p_resid_compareaic_winner <- ifelse(AIC(model_gamma) < AIC(model_ig), "Gamma", "Inverse Gaussian")
bic_winner <- ifelse(BIC(model_gamma) < BIC(model_ig), "Gamma", "Inverse Gaussian")
cat("=== MODEL SELECTION SUMMARY ===\n\n")## === MODEL SELECTION SUMMARY ===
## AIC prefers: Gamma
## Gamma AIC: 6765.58
## Inverse Gaussian AIC: 45731.67
## Difference: 38966.09
## BIC prefers: Gamma
## Gamma BIC: 6798.35
## Inverse Gaussian BIC: 45764.44
## Difference: 38966.09
Based on the model comparison above, we fit an interaction model using the better-fitting distribution.
best_family <- if(AIC(model_gamma) <= AIC(model_ig)) {
cat("Using Gamma distribution (lower AIC)\n\n")
Gamma(link = "log")
} else {
cat("Using Inverse Gaussian distribution (lower AIC)\n\n")
inverse.gaussian(link = "log")
}## Using Gamma distribution (lower AIC)
model_interaction <- glm(time_to_final ~ sponsor_type_clean * num_sites_scaled + enrollment_scaled +
number_of_arms_scaled + months_from_covid_scaled,
data = trials_scaled,
family = best_family,
control = glm.control(maxit = 100))
summary(model_interaction)##
## Call:
## glm(formula = time_to_final ~ sponsor_type_clean * num_sites_scaled +
## enrollment_scaled + number_of_arms_scaled + months_from_covid_scaled,
## family = best_family, data = trials_scaled, control = glm.control(maxit = 100))
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 7.063732 0.036011 196.154
## sponsor_type_cleanAcademic 0.036410 0.042219 0.862
## sponsor_type_cleanGovernment 0.699054 0.676852 1.033
## num_sites_scaled 0.269275 0.089623 3.005
## enrollment_scaled -0.027583 0.018308 -1.507
## number_of_arms_scaled 0.006153 0.017763 0.346
## months_from_covid_scaled -0.079975 0.017357 -4.608
## sponsor_type_cleanAcademic:num_sites_scaled -0.229341 0.091221 -2.514
## sponsor_type_cleanGovernment:num_sites_scaled 3.774063 3.257933 1.158
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## sponsor_type_cleanAcademic 0.38893
## sponsor_type_cleanGovernment 0.30227
## num_sites_scaled 0.00281 **
## enrollment_scaled 0.13264
## number_of_arms_scaled 0.72921
## months_from_covid_scaled 5.36e-06 ***
## sponsor_type_cleanAcademic:num_sites_scaled 0.01229 *
## sponsor_type_cleanGovernment:num_sites_scaled 0.24733
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.1277541)
##
## Null deviance: 88.442 on 443 degrees of freedom
## Residual deviance: 82.764 on 435 degrees of freedom
## AIC: 6763.8
##
## Number of Fisher Scoring iterations: 5
int_coefs <- data.frame(
Term = names(coef(model_interaction)),
Estimate = coef(model_interaction),
SE = summary(model_interaction)$coefficients[, 2],
z_value = summary(model_interaction)$coefficients[, 3],
p_value = summary(model_interaction)$coefficients[, 4],
Exp_Estimate = exp(coef(model_interaction))
)
rownames(int_coefs) <- NULL
kable(int_coefs, digits = 4,
caption = "Interaction Model Coefficients") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| Term | Estimate | SE | z_value | p_value | Exp_Estimate |
|---|---|---|---|---|---|
| (Intercept) | 7.0637 | 0.0360 | 196.1543 | 0.0000 | 1168.7985 |
| sponsor_type_cleanAcademic | 0.0364 | 0.0422 | 0.8624 | 0.3889 | 1.0371 |
| sponsor_type_cleanGovernment | 0.6991 | 0.6769 | 1.0328 | 0.3023 | 2.0118 |
| num_sites_scaled | 0.2693 | 0.0896 | 3.0045 | 0.0028 | 1.3090 |
| enrollment_scaled | -0.0276 | 0.0183 | -1.5066 | 0.1326 | 0.9728 |
| number_of_arms_scaled | 0.0062 | 0.0178 | 0.3464 | 0.7292 | 1.0062 |
| months_from_covid_scaled | -0.0800 | 0.0174 | -4.6077 | 0.0000 | 0.9231 |
| sponsor_type_cleanAcademic:num_sites_scaled | -0.2293 | 0.0912 | -2.5141 | 0.0123 | 0.7951 |
| sponsor_type_cleanGovernment:num_sites_scaled | 3.7741 | 3.2579 | 1.1584 | 0.2473 | 43.5567 |
if(AIC(model_gamma) <= AIC(model_ig)) {
main_model <- model_gamma
} else {
main_model <- model_ig
}
lr_test <- anova(main_model, model_interaction, test = "F")
print(lr_test)## Analysis of Deviance Table
##
## Model 1: time_to_final ~ sponsor_type_clean + num_sites_scaled + enrollment_scaled +
## number_of_arms_scaled + months_from_covid_scaled
## Model 2: time_to_final ~ sponsor_type_clean * num_sites_scaled + enrollment_scaled +
## number_of_arms_scaled + months_from_covid_scaled
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 437 83.807
## 2 435 82.764 2 1.0431 4.0825 0.01752 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## === Interaction Test ===
## Does adding the sponsor x sites interaction significantly improve fit?
## F-test p-value: 0.0175
if(lr_test$`Pr(>F)`[2] < 0.05) {
cat("Conclusion: YES - Interaction is statistically significant\n")
} else {
cat("Conclusion: NO - Interaction is not statistically significant\n")
}## Conclusion: YES - Interaction is statistically significant
pred_data <- expand.grid(
sponsor_type_clean = levels(trials_scaled$sponsor_type_clean),
num_sites_scaled = seq(min(trials_scaled$num_sites_scaled),
quantile(trials_scaled$num_sites_scaled, 0.90),
length.out = 50),
enrollment_scaled = 0,
number_of_arms_scaled = 0,
months_from_covid_scaled = 0
)
pred_data$predicted <- predict(model_interaction, newdata = pred_data, type = "response")
# Back-transform num_sites for x-axis
pred_data$num_sites_original <- pred_data$num_sites_scaled * scale_params$SD[1] + scale_params$Mean[1]
ggplot(pred_data, aes(x = num_sites_original, y = predicted, color = sponsor_type_clean)) +
geom_line(linewidth = 1.2) +
scale_color_manual(values = c("Commercial" = "#2E86AB",
"Academic" = "#A23B72",
"Government" = "#F18F01")) +
labs(
title = "Predicted Completion Time by Number of Sites and Sponsor Type",
subtitle = "Interaction Model (other covariates held at mean)",
x = "Number of Clinical Sites",
y = "Predicted Time to Completion (days)",
color = "Sponsor Type"
) +
scale_y_continuous(labels = comma) +
theme(legend.position = "bottom")## === FINAL SUMMARY ===
## 1. RESEARCH QUESTION
## Does sponsor type predict Phase 2 oncology trial completion time during COVID-19?
## 2. SAMPLE
## N = 444 completed Phase 2 solid tumor trials
## Period: March 2019 - March 2021
## 3. MODEL SELECTION
## Best fitting distribution: Gamma
## AIC difference: 38966.09
## 4. KEY FINDINGS (from Gamma model)
if(aic_winner == "Gamma") {
best_model <- model_gamma
} else {
best_model <- model_ig
}
p_academic <- summary(best_model)$coefficients["sponsor_type_cleanAcademic", 4]
p_gov <- summary(best_model)$coefficients["sponsor_type_cleanGovernment", 4]
cat(" Academic vs Commercial: ",
round((exp(coef(best_model)["sponsor_type_cleanAcademic"]) - 1) * 100, 1),
"% difference (p = ", round(p_academic, 4), ")\n", sep = "")## Academic vs Commercial: 1.5% difference (p = 0.7159)
cat(" Government vs Commercial: ",
round((exp(coef(best_model)["sponsor_type_cleanGovernment"]) - 1) * 100, 1),
"% difference (p = ", round(p_gov, 4), ")\n\n", sep = "")## Government vs Commercial: -12.5% difference (p = 0.2265)
## 5. SIGNIFICANT COVARIATES
coef_summary <- summary(best_model)$coefficients
sig_covars <- rownames(coef_summary)[coef_summary[, 4] < 0.05]
sig_covars <- sig_covars[sig_covars != "(Intercept)"]
for(cv in sig_covars) {
cat(" ", cv, ": exp(b) = ", round(exp(coef(best_model)[cv]), 4),
", p = ", round(coef_summary[cv, 4], 4), "\n", sep = "")
}## num_sites_scaled: exp(b) = 1.0515, p = 0.0039
## months_from_covid_scaled: exp(b) = 0.9242, p = 0
##
## === DISTRIBUTION CHOICE RATIONALE ===
if(aic_winner == "Inverse Gaussian") {
cat("The Inverse Gaussian distribution provided better fit, likely because:\n\n")
cat("1. HEAVIER RIGHT TAIL: Clinical trial completion times often have extreme\n")
cat(" outliers where a few trials run much longer than expected. The Inverse\n")
cat(" Gaussian's heavier tail better accommodates these observations.\n\n")
cat("2. VARIANCE-MEAN RELATIONSHIP: The Inverse Gaussian assumes variance is\n")
cat(" proportional to mu^3 (vs mu^2 for Gamma). This may better reflect the\n")
cat(" reality that uncertainty compounds more dramatically for longer trials.\n\n")
cat("3. PROCESS INTERPRETATION: The Inverse Gaussian arises naturally as the\n")
cat(" first passage time distribution, which aligns conceptually with trial\n")
cat(" completion as a 'time to reach a threshold' process.\n")
} else {
cat("The Gamma distribution provided better fit, likely because:\n\n")
cat("1. MODERATE SKEWNESS: The completion time distribution shows right skew\n")
cat(" but not extreme outliers, which Gamma handles well.\n\n")
cat("2. VARIANCE-MEAN RELATIONSHIP: The Gamma's assumption that variance is\n")
cat(" proportional to mu^2 appears to match the observed pattern in the data.\n\n")
cat("3. ROBUSTNESS: Gamma GLMs are generally more stable and less sensitive\n")
cat(" to influential observations than Inverse Gaussian models.\n")
}## The Gamma distribution provided better fit, likely because:
##
## 1. MODERATE SKEWNESS: The completion time distribution shows right skew
## but not extreme outliers, which Gamma handles well.
##
## 2. VARIANCE-MEAN RELATIONSHIP: The Gamma's assumption that variance is
## proportional to mu^2 appears to match the observed pattern in the data.
##
## 3. ROBUSTNESS: Gamma GLMs are generally more stable and less sensitive
## to influential observations than Inverse Gaussian models.
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] kableExtra_1.4.0 knitr_1.50 car_3.1-3 carData_3.0-5
## [5] MASS_7.3-65 statmod_1.5.1 patchwork_1.3.2 scales_1.4.0
## [9] ggplot2_4.0.0 tidyr_1.3.1 dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 xml2_1.4.0 lattice_0.22-7
## [5] stringi_1.8.7 digest_0.6.37 magrittr_2.0.3 evaluate_1.0.5
## [9] grid_4.5.1 RColorBrewer_1.1-3 fastmap_1.2.0 Matrix_1.7-3
## [13] jsonlite_2.0.0 Formula_1.2-5 mgcv_1.9-3 purrr_1.1.0
## [17] viridisLite_0.4.2 textshaping_1.0.3 jquerylib_0.1.4 abind_1.4-8
## [21] cli_3.6.5 rlang_1.1.6 splines_4.5.1 withr_3.0.2
## [25] cachem_1.1.0 yaml_2.3.10 tools_4.5.1 vctrs_0.6.5
## [29] R6_2.6.1 lifecycle_1.0.4 stringr_1.5.1 pkgconfig_2.0.3
## [33] pillar_1.11.0 bslib_0.9.0 gtable_0.3.6 glue_1.8.0
## [37] systemfonts_1.3.1 xfun_0.52 tibble_3.3.0 tidyselect_1.2.1
## [41] rstudioapi_0.17.1 farver_2.1.2 nlme_3.1-168 htmltools_0.5.8.1
## [45] rmarkdown_2.30 svglite_2.2.1 labeling_0.4.3 compiler_4.5.1
## [49] S7_0.2.0