library(dplyr)
library(ggplot2)
library(statmod) # For Inverse Gaussian
load("trials_analysis.RData")
cat("Loaded", nrow(mydata), "trials\n")
## Loaded 127 trials
mydata %>%
group_by(sponsor) %>%
summarise(
N = n(),
Percent = round(n() / nrow(mydata) * 100, 1)
)
## # A tibble: 2 × 3
## sponsor N Percent
## <fct> <int> <dbl>
## 1 Commercial 83 65.4
## 2 Non-Commercial 44 34.6
mean_time <- mean(mydata$completion_time)
median_time <- median(mydata$completion_time)
ggplot(mydata, aes(x = completion_time)) +
geom_histogram(bins = 30, fill = "steelblue", color = "white") +
geom_vline(xintercept = mean_time, color = "red", linetype = "dashed", linewidth = 1) +
geom_vline(xintercept = median_time, color = "darkgreen", linewidth = 1) +
labs(title = "Distribution of Trial Completion Time",
subtitle = "Red dashed = Mean | Green = Median",
x = "Completion Time (days)",
y = "Count") +
theme_minimal()
cat("Mean:", round(mean_time, 0), "days\n")
## Mean: 736 days
cat("Median:", round(median_time, 0), "days\n")
## Median: 637 days
cat("Mean > Median confirms right skew\n")
## Mean > Median confirms right skew
ggplot(mydata, aes(x = sponsor, y = completion_time, fill = sponsor)) +
geom_boxplot() +
scale_fill_manual(values = c("steelblue", "coral")) +
labs(title = "Completion Time by Sponsor Type",
x = "Sponsor Type",
y = "Completion Time (days)") +
theme_minimal() +
theme(legend.position = "none")
mydata %>%
group_by(sponsor) %>%
summarise(
N = n(),
Mean = round(mean(completion_time), 0),
Median = round(median(completion_time), 0),
SD = round(sd(completion_time), 0),
Min = min(completion_time),
Max = max(completion_time)
)
## # A tibble: 2 × 7
## sponsor N Mean Median SD Min Max
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Commercial 83 646 609 307 89 1646
## 2 Non-Commercial 44 906 834 473 288 1917
For Gamma GLM, we expect variance to increase with the mean.
simple_fit <- lm(completion_time ~ sponsor + num_sites + enrollment, data = mydata)
plot_data <- data.frame(
fitted = fitted(simple_fit),
abs_resid = abs(residuals(simple_fit))
)
ggplot(plot_data, aes(x = fitted, y = abs_resid)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Variance Check",
subtitle = "Upward slope supports Gamma GLM",
x = "Fitted Values",
y = "Absolute Residuals") +
theme_minimal()
Regular linear regression assumes normal distribution and constant variance. Completion time violates both:
gamma_model <- glm(
completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
data = mydata,
family = Gamma(link = "log")
)
summary(gamma_model)
##
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z +
## arms_z + covid_timing_z, family = Gamma(link = "log"), data = mydata)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.43633 0.04888 131.671 < 2e-16 ***
## sponsorNon-Commercial 0.35112 0.09065 3.874 0.000175 ***
## num_sites_z 0.11667 0.06446 1.810 0.072759 .
## enrollment_z 0.05245 0.06180 0.849 0.397685
## arms_z -0.03267 0.04021 -0.813 0.418095
## covid_timing_z -0.19290 0.03888 -4.961 2.32e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Gamma family taken to be 0.1782071)
##
## Null deviance: 38.083 on 126 degrees of freedom
## Residual deviance: 27.201 on 121 degrees of freedom
## AIC: 1821.8
##
## Number of Fisher Scoring iterations: 6
With a log link, we exponentiate coefficients to get multiplicative effects.
coefs <- coef(gamma_model)
pvals <- summary(gamma_model)$coefficients[, 4]
results <- data.frame(
Variable = names(coefs),
Coefficient = round(coefs, 4),
Exp_Coef = round(exp(coefs), 3),
Pct_Change = round((exp(coefs) - 1) * 100, 1),
P_Value = round(pvals, 4)
)
rownames(results) <- NULL
print(results)
## Variable Coefficient Exp_Coef Pct_Change P_Value
## 1 (Intercept) 6.4363 624.114 62311.4 0.0000
## 2 sponsorNon-Commercial 0.3511 1.421 42.1 0.0002
## 3 num_sites_z 0.1167 1.124 12.4 0.0728
## 4 enrollment_z 0.0525 1.054 5.4 0.3977
## 5 arms_z -0.0327 0.968 -3.2 0.4181
## 6 covid_timing_z -0.1929 0.825 -17.5 0.0000
par(mfrow = c(1, 2))
plot(fitted(gamma_model), residuals(gamma_model, type = "deviance"),
xlab = "Fitted Values", ylab = "Deviance Residuals",
main = "Residuals vs Fitted", col = "steelblue", pch = 16)
abline(h = 0, col = "red", lty = 2)
qqnorm(residuals(gamma_model, type = "deviance"),
main = "Q-Q Plot", col = "steelblue", pch = 16)
qqline(residuals(gamma_model, type = "deviance"), col = "red")
par(mfrow = c(1, 1))
data.frame(
Metric = c("AIC", "BIC", "Null Deviance", "Residual Deviance", "Deviance Explained"),
Value = c(
round(AIC(gamma_model), 1),
round(BIC(gamma_model), 1),
round(gamma_model$null.deviance, 2),
round(gamma_model$deviance, 2),
paste0(round((1 - gamma_model$deviance/gamma_model$null.deviance) * 100, 1), "%")
)
)
## Metric Value
## 1 AIC 1821.8
## 2 BIC 1841.7
## 3 Null Deviance 38.08
## 4 Residual Deviance 27.2
## 5 Deviance Explained 28.6%
ig_model <- glm(
completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
data = mydata,
family = inverse.gaussian(link = "log")
)
summary(ig_model)
##
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z +
## arms_z + covid_timing_z, family = inverse.gaussian(link = "log"),
## data = mydata)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.45296 0.04682 137.839 < 2e-16 ***
## sponsorNon-Commercial 0.31233 0.09147 3.415 0.00087 ***
## num_sites_z 0.09709 0.06911 1.405 0.16263
## enrollment_z 0.10994 0.08059 1.364 0.17505
## arms_z -0.02851 0.03954 -0.721 0.47220
## covid_timing_z -0.20436 0.03852 -5.305 5.16e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for inverse.gaussian family taken to be 0.000256135)
##
## Null deviance: 0.068246 on 126 degrees of freedom
## Residual deviance: 0.052925 on 121 degrees of freedom
## AIC: 1843.8
##
## Number of Fisher Scoring iterations: 9
data.frame(
Metric = c("AIC", "BIC"),
Gamma = c(round(AIC(gamma_model), 1), round(BIC(gamma_model), 1)),
Inverse_Gaussian = c(round(AIC(ig_model), 1), round(BIC(ig_model), 1))
)
## Metric Gamma Inverse_Gaussian
## 1 AIC 1821.8 1843.8
## 2 BIC 1841.7 1863.7
Lower AIC and BIC indicate better fit.
# Use better-fitting model
best_model <- gamma_model
sponsor_coef <- coef(best_model)["sponsorNon-Commercial"]
sponsor_pval <- summary(best_model)$coefficients["sponsorNon-Commercial", 4]
sponsor_pct <- round((exp(sponsor_coef) - 1) * 100, 0)
cat("=== KEY FINDING ===\n")
## === KEY FINDING ===
cat("Non-commercial trials take", sponsor_pct, "% longer than commercial trials\n")
## Non-commercial trials take 42 % longer than commercial trials
cat("P-value:", round(sponsor_pval, 4), "\n")
## P-value: 2e-04
cat("Statistically significant:", ifelse(sponsor_pval < 0.05, "Yes", "No"), "\n")
## Statistically significant: Yes
coefs <- coef(best_model)
pvals <- summary(best_model)$coefficients[, 4]
data.frame(
Variable = c("Sponsor (Non-Commercial)", "Number of Sites", "Enrollment",
"Number of Arms", "COVID Timing"),
Pct_Change = paste0(round((exp(coefs[-1]) - 1) * 100, 1), "%"),
P_Value = round(pvals[-1], 4),
Significant = ifelse(pvals[-1] < 0.05, "Yes", "No")
)
## Variable Pct_Change P_Value Significant
## sponsorNon-Commercial Sponsor (Non-Commercial) 42.1% 0.0002 Yes
## num_sites_z Number of Sites 12.4% 0.0728 No
## enrollment_z Enrollment 5.4% 0.3977 No
## arms_z Number of Arms -3.2% 0.4181 No
## covid_timing_z COVID Timing -17.5% 0.0000 Yes