0.1 Research Question

Does trial sponsor type (commercial vs non-commercial) predict how long Phase 2 respiratory disease trials take to complete?

We examine trials initiated during March 2019 through December 2022, a period spanning the onset and evolution of COVID-19. Respiratory trials are particularly interesting during this period because:

Hypotheses: - Hypothesis 1: Sponsor Type H₀: Sponsor type does not predict trial completion time H₁: Sponsor type does predict trial completion time

Variables:

1 Load Packages and Data

library(dplyr)      # Dataset creation, joining, variable creation
library(ggplot2)    # Data visualization
library(scales)     # Formatting for plot axes
library(statmod)    # Inverse Gaussian distribution for GLM
load("trials_analysis.RData")

Sample size: 127 trials


2 Exploratory Data Analysis

2.1 Sample Composition

# Count trials by sponsor type
sponsor_summary <- data.frame(
  Sponsor_Type = c("Commercial", "Non-Commercial", "Total"),
  N_Trials = c(
    sum(mydata$sponsor == "Commercial"),
    sum(mydata$sponsor == "Non-Commercial"),
    nrow(mydata)
  ),
  Percent = c(
    round(sum(mydata$sponsor == "Commercial") / nrow(mydata) * 100, 1),
    round(sum(mydata$sponsor == "Non-Commercial") / nrow(mydata) * 100, 1),
    100
  )
)

print(sponsor_summary)
##     Sponsor_Type N_Trials Percent
## 1     Commercial       83    65.4
## 2 Non-Commercial       44    34.6
## 3          Total      127   100.0

2.2 Distribution of Total Trial Completion Time

# mean and median
mean_time <- mean(mydata$completion_time)
median_time <- median(mydata$completion_time)

# histogram
ggplot(mydata, aes(x = completion_time)) +
  geom_histogram(bins = 30, fill = "blue", color = "white") +
  geom_vline(xintercept = mean_time, color = "red", linewidth = 1) +
  geom_vline(xintercept = median_time, color = "green", linewidth = 1) +
  labs(
    title = "Distribution of Trial Completion Time",
    subtitle = "Red = Mean | Green = Median",
    x = "Completion Time (days)",
    y = "Count"
  ) +
  theme_minimal()

Mean: 736 days | Median: 637 days

Mean > Median confirms right skew. This is why we need Gamma or Inverse Gaussian, not Normal.

2.3 Completion Time by Sponsor Type

# boxplot
ggplot(mydata, aes(x = sponsor, y = completion_time, fill = sponsor)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Commercial" = "blue", "Non-Commercial" = "yellow")) +
  labs(
    title = "Completion Time by Sponsor Type",
    x = "Sponsor Type",
    y = "Completion Time (days)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Summary stats by sponsor
summary_by_sponsor <- mydata %>%
  group_by(sponsor) %>%
  summarise(
    N = n(),
    Mean = round(mean(completion_time), 0),
    Median = round(median(completion_time), 0),
    SD = round(sd(completion_time), 0),
    Min = min(completion_time),
    Max = max(completion_time)
  )

print(summary_by_sponsor)
## # A tibble: 2 × 7
##   sponsor            N  Mean Median    SD   Min   Max
##   <fct>          <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Commercial        83   646    609   307    89  1646
## 2 Non-Commercial    44   906    834   473   288  1917

2.4 OLS attempt

# Fit OLS model
ols_model <- lm(completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z, 
                data = mydata)

# View summary
summary(ols_model)
## 
## Call:
## lm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -805.31 -206.11   32.18  164.45  747.92 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             637.11      38.21  16.676  < 2e-16 ***
## sponsorNon-Commercial   284.56      70.85   4.016 0.000103 ***
## num_sites_z              81.75      50.38   1.623 0.107246    
## enrollment_z             51.62      48.30   1.069 0.287329    
## arms_z                  -28.00      31.43  -0.891 0.374713    
## covid_timing_z         -134.09      30.39  -4.412 2.23e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 329.9 on 121 degrees of freedom
## Multiple R-squared:  0.3161, Adjusted R-squared:  0.2878 
## F-statistic: 11.18 on 5 and 121 DF,  p-value: 7.118e-09
(min(fitted(ols_model)))
## [1] 310.6041

3 Model 1: Gamma GLM

3.1 Model Equation

\[\text{CompletionTime}_i \sim \text{Gamma}(\mu_i, \phi)\]

\[\log(\mu_i) = \beta_0 + \beta_1(\text{NonCommercial}) + \beta_2(\text{NumSites}) + \beta_3(\text{Enrollment}) + \beta_4(\text{Arms}) + \beta_5(\text{CovidTiming})\]

3.2 Fitting the Model

# Fit Gamma GLM with log link
gamma_model <- glm(
  completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
  data = mydata,
  family = Gamma(link = "log")
)

# View summary
summary(gamma_model)
## 
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, family = Gamma(link = "log"), data = mydata)
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.43633    0.04888 131.671  < 2e-16 ***
## sponsorNon-Commercial  0.35112    0.09065   3.874 0.000175 ***
## num_sites_z            0.11667    0.06446   1.810 0.072759 .  
## enrollment_z           0.05245    0.06180   0.849 0.397685    
## arms_z                -0.03267    0.04021  -0.813 0.418095    
## covid_timing_z        -0.19290    0.03888  -4.961 2.32e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1782071)
## 
##     Null deviance: 38.083  on 126  degrees of freedom
## Residual deviance: 27.201  on 121  degrees of freedom
## AIC: 1821.8
## 
## Number of Fisher Scoring iterations: 6

3.3 Model Diagnostics

par(mfrow = c(1, 2))

# Residuals vs Fitted
plot(fitted(gamma_model), residuals(gamma_model, type = "deviance"),
     xlab = "Fitted Values", ylab = "Deviance Residuals",
     main = "Gamma: Residuals vs Fitted", col = "blue", pch = 16)
abline(h = 0, col = "red", lty = 2)

# Q-Q Plot
qqnorm(residuals(gamma_model, type = "deviance"), 
       main = "Gamma: Q-Q Plot", col = "blue", pch = 16)
qqline(residuals(gamma_model, type = "deviance"), col = "red")

par(mfrow = c(1, 1))

What to look for:

  • Residuals vs Fitted: Random scatter around zero (no patterns)
  • Q-Q Plot: Points should follow the diagonal line

3.4 Model Fit Statistics

# Calculate fit statistics
gamma_fit_stats <- data.frame(
  Metric = c("AIC", "BIC", "Null Deviance", "Residual Deviance", "Deviance Explained"),
  Value = c(
    round(AIC(gamma_model), 1),
    round(BIC(gamma_model), 1),
    round(gamma_model$null.deviance, 2),
    round(gamma_model$deviance, 2),
    paste0(round((1 - gamma_model$deviance/gamma_model$null.deviance) * 100, 1), "%")
  )
)

print(gamma_fit_stats)
##               Metric  Value
## 1                AIC 1821.8
## 2                BIC 1841.7
## 3      Null Deviance  38.08
## 4  Residual Deviance   27.2
## 5 Deviance Explained  28.6%

4 Model 2: Inverse Gaussian GLM

4.1 Fitting the Model

# Fit Inverse Gaussian GLM with log link
ig_model <- glm(
  completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
  data = mydata,
  family = inverse.gaussian(link = "log"),
  control = glm.control(maxit = 100)
)

# View summary
summary(ig_model)
## 
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, family = inverse.gaussian(link = "log"), 
##     data = mydata, control = glm.control(maxit = 100))
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            6.45296    0.04682 137.839  < 2e-16 ***
## sponsorNon-Commercial  0.31233    0.09147   3.415  0.00087 ***
## num_sites_z            0.09709    0.06911   1.405  0.16263    
## enrollment_z           0.10994    0.08059   1.364  0.17505    
## arms_z                -0.02851    0.03954  -0.721  0.47220    
## covid_timing_z        -0.20436    0.03852  -5.305 5.16e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.000256135)
## 
##     Null deviance: 0.068246  on 126  degrees of freedom
## Residual deviance: 0.052925  on 121  degrees of freedom
## AIC: 1843.8
## 
## Number of Fisher Scoring iterations: 9

4.2 Model Diagnostics

par(mfrow = c(1, 2))

# Residuals vs Fitted
plot(fitted(ig_model), residuals(ig_model, type = "deviance"),
     xlab = "Fitted Values", ylab = "Deviance Residuals",
     main = "Inverse Gaussian: Residuals vs Fitted", col = "blue", pch = 16)
abline(h = 0, col = "red", lty = 2)

# Q-Q Plot
qqnorm(residuals(ig_model, type = "deviance"), 
       main = "Inverse Gaussian: Q-Q Plot", col = "blue", pch = 16)
qqline(residuals(ig_model, type = "deviance"), col = "red")

par(mfrow = c(1, 1))

4.3 Model Fit Statistics

# Calculate fit statistics
ig_fit_stats <- data.frame(
  Metric = c("AIC", "BIC", "Null Deviance", "Residual Deviance", "Deviance Explained"),
  Value = c(
    round(AIC(ig_model), 1),
    round(BIC(ig_model), 1),
    round(ig_model$null.deviance, 4),
    round(ig_model$deviance, 4),
    paste0(round((1 - ig_model$deviance/ig_model$null.deviance) * 100, 1), "%")
  )
)

print(ig_fit_stats)
##               Metric  Value
## 1                AIC 1843.8
## 2                BIC 1863.7
## 3      Null Deviance 0.0682
## 4  Residual Deviance 0.0529
## 5 Deviance Explained  22.5%
# Fit Inverse Gaussian GLM with identity link
ig_model2 <- glm(
  completion_time ~ sponsor + num_sites_z + enrollment_z + arms_z + covid_timing_z,
  data = mydata,
  family = inverse.gaussian(link = "identity"),
  control = glm.control(maxit = 100)
)

# View summary
summary(ig_model2)
## 
## Call:
## glm(formula = completion_time ~ sponsor + num_sites_z + enrollment_z + 
##     arms_z + covid_timing_z, family = inverse.gaussian(link = "identity"), 
##     data = mydata, control = glm.control(maxit = 100))
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             675.24      33.22  20.327  < 2e-16 ***
## sponsorNon-Commercial   195.97      61.86   3.168  0.00194 ** 
## num_sites_z              69.11      52.75   1.310  0.19260    
## enrollment_z            131.68      75.49   1.744  0.08364 .  
## arms_z                  -15.21      24.63  -0.617  0.53809    
## covid_timing_z         -138.46      24.33  -5.691 8.97e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 0.0002564909)
## 
##     Null deviance: 0.068246  on 126  degrees of freedom
## Residual deviance: 0.052311  on 121  degrees of freedom
## AIC: 1842.3
## 
## Number of Fisher Scoring iterations: 12

5 Gamma vs Inverse Gaussian Model Comparison

# Compare the two models
comparison_table <- data.frame(
  Metric = c("AIC", "BIC", "Deviance Explained"),
  Gamma = c(
    round(AIC(gamma_model), 1),
    round(BIC(gamma_model), 1),
    paste0(round((1 - gamma_model$deviance/gamma_model$null.deviance) * 100, 1), "%")
  ),
  Inverse_Gaussian = c(
    round(AIC(ig_model), 1),
    round(BIC(ig_model), 1),
    paste0(round((1 - ig_model$deviance/ig_model$null.deviance) * 100, 1), "%")
  ),
  Inverse_Gaussian2 = c(
    round(AIC(ig_model2), 1),
    round(BIC(ig_model2), 1),
    paste0(round((1 - ig_model$deviance/ig_model$null.deviance) * 100, 1), "%")
  )
)

print(comparison_table)
##               Metric  Gamma Inverse_Gaussian Inverse_Gaussian2
## 1                AIC 1821.8           1843.8            1842.3
## 2                BIC 1841.7           1863.7            1862.2
## 3 Deviance Explained  28.6%            22.5%             22.5%

The Gamma model has lower AIC and BIC, indicating better fit.

The Gamma’s variance assumption (Var ∝ μ²) is a better match for these data than the Inverse Gaussian’s more extreme assumption (Var ∝ μ³).