1 Load Packages and Data

library(dplyr)
library(ggplot2)
library(statmod)  # For Inverse Gaussian

load("trials_analysis.RData")
cat("Loaded", nrow(mydata), "trials\n")

## Loaded 127 trials

2 Exploratory Data Analysis

2.1 Sample Composition

mydata %>%
  group_by(sponsor) %>%
  summarise(
    N = n(),
    Percent = round(n() / nrow(mydata) * 100, 1)
  )

## # A tibble: 2 × 3
##   sponsor            N Percent
##   <fct>          <int>   <dbl>
## 1 Commercial        83    65.4
## 2 Non-Commercial    44    34.6

2.2 Completion Time Distribution

mean_time <- mean(mydata$completion_time)
median_time <- median(mydata$completion_time)

ggplot(mydata, aes(x = completion_time)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  geom_vline(xintercept = mean_time, color = "red", linetype = "dashed", linewidth = 1) +
  geom_vline(xintercept = median_time, color = "darkgreen", linewidth = 1) +
  labs(title = "Distribution of Trial Completion Time",
       subtitle = "Red dashed = Mean | Green = Median",
       x = "Completion Time (days)",
       y = "Count") +
  theme_minimal()

cat("Mean:", round(mean_time, 0), "days\n")

## Mean: 736 days

cat("Median:", round(median_time, 0), "days\n")

## Median: 637 days

cat("Mean > Median confirms right skew\n")

## Mean > Median confirms right skew

2.3 Completion Time by Sponsor Type

ggplot(mydata, aes(x = sponsor, y = completion_time, fill = sponsor)) +
  geom_boxplot() +
  scale_fill_manual(values = c("steelblue", "coral")) +
  labs(title = "Completion Time by Sponsor Type",
       x = "Sponsor Type",
       y = "Completion Time (days)") +
  theme_minimal() +
  theme(legend.position = "none")

mydata %>%
  group_by(sponsor) %>%
  summarise(
    N = n(),
    Mean = round(mean(completion_time), 0),
    Median = round(median(completion_time), 0),
    SD = round(sd(completion_time), 0),
    Min = min(completion_time),
    Max = max(completion_time)
  )

## # A tibble: 2 × 7
##   sponsor            N  Mean Median    SD   Min   Max
##   <fct>          <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Commercial        83   646    609   307    89  1646
## 2 Non-Commercial    44   906    834   473   288  1917

2.4 Heteroscedasticity Check

For Gamma GLM, we expect variance to increase with the mean.

simple_fit <- lm(completion_time ~ sponsor + num_sites + enrollment, data = mydata)

plot_data <- data.frame(
  fitted = fitted(simple_fit),
  abs_resid = abs(residuals(simple_fit))
)

ggplot(plot_data, aes(x = fitted, y = abs_resid)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Variance Check",
       subtitle = "Upward slope supports Gamma GLM",
       x = "Fitted Values",
       y = "Absolute Residuals") +
  theme_minimal()

3 Model Fitting

3.1 Why GLM?

Regular linear regression assumes normal distribution and constant variance. Completion time violates both:

Always positive and right-skewed
Variance increases with mean

3.2 Gamma GLM

gamma_model <- glm(
  completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
  data = mydata,
  family = Gamma(link = "log")
)

summary(gamma_model)

## 
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, family = Gamma(link = "log"), data = mydata)
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.43633    0.04888 131.671  < 2e-16 ***
## sponsorNon-Commercial  0.35112    0.09065   3.874 0.000175 ***
## num_sites_z            0.11667    0.06446   1.810 0.072759 .  
## enrollment_z           0.05245    0.06180   0.849 0.397685    
## arms_z                -0.03267    0.04021  -0.813 0.418095    
## covid_timing_z        -0.19290    0.03888  -4.961 2.32e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1782071)
## 
##     Null deviance: 38.083  on 126  degrees of freedom
## Residual deviance: 27.201  on 121  degrees of freedom
## AIC: 1821.8
## 
## Number of Fisher Scoring iterations: 6

3.3 Interpret Coefficients

With a log link, we exponentiate coefficients to get multiplicative effects.

coefs <- coef(gamma_model)
pvals <- summary(gamma_model)$coefficients[, 4]

results <- data.frame(
  Variable = names(coefs),
  Coefficient = round(coefs, 4),
  Exp_Coef = round(exp(coefs), 3),
  Pct_Change = round((exp(coefs) - 1) * 100, 1),
  P_Value = round(pvals, 4)
)
rownames(results) <- NULL
print(results)

##                Variable Coefficient Exp_Coef Pct_Change P_Value
## 1           (Intercept)      6.4363  624.114    62311.4  0.0000
## 2 sponsorNon-Commercial      0.3511    1.421       42.1  0.0002
## 3           num_sites_z      0.1167    1.124       12.4  0.0728
## 4          enrollment_z      0.0525    1.054        5.4  0.3977
## 5                arms_z     -0.0327    0.968       -3.2  0.4181
## 6        covid_timing_z     -0.1929    0.825      -17.5  0.0000

3.4 Model Diagnostics

par(mfrow = c(1, 2))

plot(fitted(gamma_model), residuals(gamma_model, type = "deviance"),
     xlab = "Fitted Values", ylab = "Deviance Residuals",
     main = "Residuals vs Fitted", col = "steelblue", pch = 16)
abline(h = 0, col = "red", lty = 2)

qqnorm(residuals(gamma_model, type = "deviance"), 
       main = "Q-Q Plot", col = "steelblue", pch = 16)
qqline(residuals(gamma_model, type = "deviance"), col = "red")

par(mfrow = c(1, 1))

3.5 Model Fit Statistics

data.frame(
  Metric = c("AIC", "BIC", "Null Deviance", "Residual Deviance", "Deviance Explained"),
  Value = c(
    round(AIC(gamma_model), 1),
    round(BIC(gamma_model), 1),
    round(gamma_model$null.deviance, 2),
    round(gamma_model$deviance, 2),
    paste0(round((1 - gamma_model$deviance/gamma_model$null.deviance) * 100, 1), "%")
  )
)

##               Metric  Value
## 1                AIC 1821.8
## 2                BIC 1841.7
## 3      Null Deviance  38.08
## 4  Residual Deviance   27.2
## 5 Deviance Explained  28.6%

4 Alternative Model: Inverse Gaussian

ig_model <- glm(
  completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
  data = mydata,
  family = inverse.gaussian(link = "log")
)

summary(ig_model)

## 
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, family = inverse.gaussian(link = "log"), 
##     data = mydata)
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.45296    0.04682 137.839  < 2e-16 ***
## sponsorNon-Commercial  0.31233    0.09147   3.415  0.00087 ***
## num_sites_z            0.09709    0.06911   1.405  0.16263    
## enrollment_z           0.10994    0.08059   1.364  0.17505    
## arms_z                -0.02851    0.03954  -0.721  0.47220    
## covid_timing_z        -0.20436    0.03852  -5.305 5.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.000256135)
## 
##     Null deviance: 0.068246  on 126  degrees of freedom
## Residual deviance: 0.052925  on 121  degrees of freedom
## AIC: 1843.8
## 
## Number of Fisher Scoring iterations: 9

4.1 Model Comparison

data.frame(
  Metric = c("AIC", "BIC"),
  Gamma = c(round(AIC(gamma_model), 1), round(BIC(gamma_model), 1)),
  Inverse_Gaussian = c(round(AIC(ig_model), 1), round(BIC(ig_model), 1))
)

##   Metric  Gamma Inverse_Gaussian
## 1    AIC 1821.8           1843.8
## 2    BIC 1841.7           1863.7

Lower AIC and BIC indicate better fit.

5 Conclusions

# Use better-fitting model
best_model <- gamma_model

sponsor_coef <- coef(best_model)["sponsorNon-Commercial"]
sponsor_pval <- summary(best_model)$coefficients["sponsorNon-Commercial", 4]
sponsor_pct <- round((exp(sponsor_coef) - 1) * 100, 0)

cat("=== KEY FINDING ===\n")

## === KEY FINDING ===

cat("Non-commercial trials take", sponsor_pct, "% longer than commercial trials\n")

## Non-commercial trials take 42 % longer than commercial trials

cat("P-value:", round(sponsor_pval, 4), "\n")

## P-value: 2e-04

cat("Statistically significant:", ifelse(sponsor_pval < 0.05, "Yes", "No"), "\n")

## Statistically significant: Yes

5.1 Summary of All Effects

coefs <- coef(best_model)
pvals <- summary(best_model)$coefficients[, 4]

data.frame(
  Variable = c("Sponsor (Non-Commercial)", "Number of Sites", "Enrollment", 
               "Number of Arms", "COVID Timing"),
  Pct_Change = paste0(round((exp(coefs[-1]) - 1) * 100, 1), "%"),
  P_Value = round(pvals[-1], 4),
  Significant = ifelse(pvals[-1] < 0.05, "Yes", "No")
)

##                                       Variable Pct_Change P_Value Significant
## sponsorNon-Commercial Sponsor (Non-Commercial)      42.1%  0.0002         Yes
## num_sites_z                    Number of Sites      12.4%  0.0728          No
## enrollment_z                        Enrollment       5.4%  0.3977          No
## arms_z                          Number of Arms      -3.2%  0.4181          No
## covid_timing_z                    COVID Timing     -17.5%  0.0000         Yes

Step 3: Exploratory Data Analysis and Model Fitting

PHD 1502: Quantitative Analysis I

Amanda McEwen

2025-11-30