0.1 Research Question

Does trial sponsor type (commercial vs non-commercial) predict how long Phase 2 respiratory disease trials take to complete?

We examine trials initiated during March 2019 through December 2022, a period spanning the onset and evolution of COVID-19. Respiratory trials are particularly interesting during this period because:

Hypotheses: - Hypothesis 1: Sponsor Type H₀: Sponsor type does not predict trial completion time H₁: Sponsor type does predict trial completion time

Variables:

1 Load Packages and Data

library(dplyr)      # Dataset creation, joining, variable creation
library(ggplot2)    # Data visualization
library(scales)     # Formatting for plot axes
library(statmod)    # Inverse Gaussian distribution for GLM
load("trials_analysis.RData")

Sample size: 127 trials


2 Exploratory Data Analysis

2.1 Sample Composition

# Count trials by sponsor type
sponsor_summary <- data.frame(
  Sponsor_Type = c("Commercial", "Non-Commercial", "Total"),
  N_Trials = c(
    sum(mydata$sponsor == "Commercial"),
    sum(mydata$sponsor == "Non-Commercial"),
    nrow(mydata)
  ),
  Percent = c(
    round(sum(mydata$sponsor == "Commercial") / nrow(mydata) * 100, 1),
    round(sum(mydata$sponsor == "Non-Commercial") / nrow(mydata) * 100, 1),
    100
  )
)

print(sponsor_summary)
##     Sponsor_Type N_Trials Percent
## 1     Commercial       83    65.4
## 2 Non-Commercial       44    34.6
## 3          Total      127   100.0

2.2 Distribution of Total Trial Completion Time

# Calculate mean and median
mean_time <- mean(mydata$completion_time)
median_time <- median(mydata$completion_time)

# Create histogram
ggplot(mydata, aes(x = completion_time)) +
  geom_histogram(bins = 30, fill = "blue", color = "white") +
  geom_vline(xintercept = mean_time, color = "red", linewidth = 1) +
  geom_vline(xintercept = median_time, color = "green", linewidth = 1) +
  labs(
    title = "Distribution of Trial Completion Time",
    subtitle = "Red = Mean | Green = Median",
    x = "Completion Time (days)",
    y = "Count"
  ) +
  theme_minimal()

Mean: 736 days | Median: 637 days

Mean > Median confirms right skew. This is why we need Gamma or Inverse Gaussian, not Normal.

2.3 Completion Time by Sponsor Type

# Create boxplot
ggplot(mydata, aes(x = sponsor, y = completion_time, fill = sponsor)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Commercial" = "blue", "Non-Commercial" = "yellow")) +
  labs(
    title = "Completion Time by Sponsor Type",
    x = "Sponsor Type",
    y = "Completion Time (days)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Summary statistics by sponsor
summary_by_sponsor <- mydata %>%
  group_by(sponsor) %>%
  summarise(
    N = n(),
    Mean = round(mean(completion_time), 0),
    Median = round(median(completion_time), 0),
    SD = round(sd(completion_time), 0),
    Min = min(completion_time),
    Max = max(completion_time)
  )

print(summary_by_sponsor)
## # A tibble: 2 × 7
##   sponsor            N  Mean Median    SD   Min   Max
##   <fct>          <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Commercial        83   646    609   307    89  1646
## 2 Non-Commercial    44   906    834   473   288  1917

2.4 Heteroscedasticity Check

# Fit simple linear model to check variance pattern
simple_fit <- lm(completion_time ~ sponsor + num_sites + enrollment, data = mydata)

# Create plot data
plot_data <- data.frame(
  fitted = fitted(simple_fit),
  abs_resid = abs(residuals(simple_fit))
)

# Plot
ggplot(plot_data, aes(x = fitted, y = abs_resid)) +
  geom_point(size = 2, color = "blue", alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Heteroscedasticity Check",
    subtitle = "Upward slope indicates variance increases with predicted completion time",
    x = "Fitted Values (Predicted Completion Time)",
    y = "Absolute Residuals"
  ) +
  theme_minimal()

The upward trend supports using Gamma or Inverse Gaussian GLMs.


3 Model 1: Gamma GLM

3.1 Model Equation

\[\text{CompletionTime}_i \sim \text{Gamma}(\mu_i, \phi)\]

\[\log(\mu_i) = \beta_0 + \beta_1(\text{NonCommercial}) + \beta_2(\text{NumSites}) + \beta_3(\text{Enrollment}) + \beta_4(\text{Arms}) + \beta_5(\text{CovidTiming})\]

3.2 Fitting the Model

# Fit Gamma GLM with log link
gamma_model <- glm(
  completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
  data = mydata,
  family = Gamma(link = "log")
)

# View summary
summary(gamma_model)
## 
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, family = Gamma(link = "log"), data = mydata)
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.43633    0.04888 131.671  < 2e-16 ***
## sponsorNon-Commercial  0.35112    0.09065   3.874 0.000175 ***
## num_sites_z            0.11667    0.06446   1.810 0.072759 .  
## enrollment_z           0.05245    0.06180   0.849 0.397685    
## arms_z                -0.03267    0.04021  -0.813 0.418095    
## covid_timing_z        -0.19290    0.03888  -4.961 2.32e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1782071)
## 
##     Null deviance: 38.083  on 126  degrees of freedom
## Residual deviance: 27.201  on 121  degrees of freedom
## AIC: 1821.8
## 
## Number of Fisher Scoring iterations: 6

3.3 Model Diagnostics

par(mfrow = c(1, 2))

# Residuals vs Fitted
plot(fitted(gamma_model), residuals(gamma_model, type = "deviance"),
     xlab = "Fitted Values", ylab = "Deviance Residuals",
     main = "Gamma: Residuals vs Fitted", col = "blue", pch = 16)
abline(h = 0, col = "red", lty = 2)

# Q-Q Plot
qqnorm(residuals(gamma_model, type = "deviance"), 
       main = "Gamma: Q-Q Plot", col = "blue", pch = 16)
qqline(residuals(gamma_model, type = "deviance"), col = "red")

par(mfrow = c(1, 1))

What we look for:

  • Residuals vs Fitted: Random scatter around zero (no patterns)
  • Q-Q Plot: Points should follow the diagonal line

3.4 Model Fit Statistics

# Calculate fit statistics
gamma_fit_stats <- data.frame(
  Metric = c("AIC", "BIC", "Null Deviance", "Residual Deviance", "Deviance Explained"),
  Value = c(
    round(AIC(gamma_model), 1),
    round(BIC(gamma_model), 1),
    round(gamma_model$null.deviance, 2),
    round(gamma_model$deviance, 2),
    paste0(round((1 - gamma_model$deviance/gamma_model$null.deviance) * 100, 1), "%")
  )
)

print(gamma_fit_stats)
##               Metric  Value
## 1                AIC 1821.8
## 2                BIC 1841.7
## 3      Null Deviance  38.08
## 4  Residual Deviance   27.2
## 5 Deviance Explained  28.6%

4 Model 2: Inverse Gaussian GLM

4.1 Fitting the Model

# Fit Inverse Gaussian GLM with log link
ig_model <- glm(
  completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
  data = mydata,
  family = inverse.gaussian(link = "log"),
  control = glm.control(maxit = 100)
)

# View summary
summary(ig_model)
## 
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, family = inverse.gaussian(link = "log"), 
##     data = mydata, control = glm.control(maxit = 100))
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.45296    0.04682 137.839  < 2e-16 ***
## sponsorNon-Commercial  0.31233    0.09147   3.415  0.00087 ***
## num_sites_z            0.09709    0.06911   1.405  0.16263    
## enrollment_z           0.10994    0.08059   1.364  0.17505    
## arms_z                -0.02851    0.03954  -0.721  0.47220    
## covid_timing_z        -0.20436    0.03852  -5.305 5.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.000256135)
## 
##     Null deviance: 0.068246  on 126  degrees of freedom
## Residual deviance: 0.052925  on 121  degrees of freedom
## AIC: 1843.8
## 
## Number of Fisher Scoring iterations: 9

4.2 Model Diagnostics

par(mfrow = c(1, 2))

# Residuals vs Fitted
plot(fitted(ig_model), residuals(ig_model, type = "deviance"),
     xlab = "Fitted Values", ylab = "Deviance Residuals",
     main = "Inverse Gaussian: Residuals vs Fitted", col = "blue", pch = 16)
abline(h = 0, col = "red", lty = 2)

# Q-Q Plot
qqnorm(residuals(ig_model, type = "deviance"), 
       main = "Inverse Gaussian: Q-Q Plot", col = "blue", pch = 16)
qqline(residuals(ig_model, type = "deviance"), col = "red")

par(mfrow = c(1, 1))

4.3 Model Fit Statistics

# Calculate fit statistics
ig_fit_stats <- data.frame(
  Metric = c("AIC", "BIC", "Null Deviance", "Residual Deviance", "Deviance Explained"),
  Value = c(
    round(AIC(ig_model), 1),
    round(BIC(ig_model), 1),
    round(ig_model$null.deviance, 4),
    round(ig_model$deviance, 4),
    paste0(round((1 - ig_model$deviance/ig_model$null.deviance) * 100, 1), "%")
  )
)

print(ig_fit_stats)
##               Metric  Value
## 1                AIC 1843.8
## 2                BIC 1863.7
## 3      Null Deviance 0.0682
## 4  Residual Deviance 0.0529
## 5 Deviance Explained  22.5%

5 Gamma vs Inverse Gaussian Model Comparison

# Compare the two models
comparison_table <- data.frame(
  Metric = c("AIC", "BIC", "Deviance Explained"),
  Gamma = c(
    round(AIC(gamma_model), 1),
    round(BIC(gamma_model), 1),
    paste0(round((1 - gamma_model$deviance/gamma_model$null.deviance) * 100, 1), "%")
  ),
  Inverse_Gaussian = c(
    round(AIC(ig_model), 1),
    round(BIC(ig_model), 1),
    paste0(round((1 - ig_model$deviance/ig_model$null.deviance) * 100, 1), "%")
  )
)

print(comparison_table)
##               Metric  Gamma Inverse_Gaussian
## 1                AIC 1821.8           1843.8
## 2                BIC 1841.7           1863.7
## 3 Deviance Explained  28.6%            22.5%

The Gamma model has lower AIC and BIC, indicating better fit.

The Gamma’s variance assumption (Var ∝ μ²) is a better match for these data than the Inverse Gaussian’s more extreme assumption (Var ∝ μ³).