1 Introduction

1.1 Research Question

Among Phase 2 solid tumor oncology trials initiated during the early COVID-19 period (March 2019 through March 2021), does sponsor type (commercial, academic, or government) predict time to study completion?

1.2 Methodological Approach

This analysis employs Generalized Linear Models (GLMs) to examine the relationship between sponsor type and trial completion time. The dependent variable (time to completion in days) is continuous, strictly positive, and expected to be right-skewed with variance increasing with the mean. These characteristics suggest distributions from the exponential family that accommodate positive, skewed outcomes:

Gamma distribution: Appropriate for positive continuous data where variance is proportional to the square of the mean
Inverse Gaussian distribution: Appropriate for positive continuous data with heavier right tails, where variance is proportional to the cube of the mean

Both models use a log link function, which ensures predicted values remain positive and allows interpretation of coefficients as multiplicative effects on completion time.

1.3 Covariates

In addition to sponsor type, the following covariates are included:

Number of clinical trial sites: Proxy for trial complexity and geographic scope
Enrollment: Target sample size as an indicator of trial scale
Number of arms: Reflects study design complexity
Months from COVID start: Continuous measure of timing relative to March 2020 (WHO pandemic declaration)

2 Data Acquisition

2.1 Load Required Packages

library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(patchwork)
library(statmod)
library(MASS)
library(car)
library(knitr)
library(kableExtra)

theme_set(theme_bw(base_size = 12))

2.2 Load Previously Downloaded Data

load("phase2_solid_tumor_trials.RData")
cat("Loaded dataset with", nrow(trials), "trials\n")

## Loaded dataset with 2125 trials

3 Data Cleaning

3.1 Initial Processing

covid_start <- as.Date("2020-03-01")

trials_processed <- trials %>%
  mutate(
    sponsor_type_clean = case_when(
      sponsor_type == "INDUSTRY" ~ "Commercial",
      sponsor_type %in% c("NIH", "FED", "OTHER_GOV") ~ "Government",
      sponsor_type %in% c("OTHER", "NETWORK", "UNKNOWN", "INDIV", "AMBIG") ~ "Academic",
      TRUE ~ "Academic"
    ),
    start_date = as.Date(start_date),
    completion_date = as.Date(completion_date),
    time_to_final = as.numeric(completion_date - start_date),
    months_from_covid = as.numeric(start_date - covid_start) / 30.44,
    num_sites = as.numeric(num_sites),
    sponsor_type_clean = factor(sponsor_type_clean, 
                                 levels = c("Commercial", "Academic", "Government"))
  )

cat("Initial dataset:", nrow(trials_processed), "trials\n")

## Initial dataset: 2125 trials

3.2 Apply Inclusion Criteria

trials_clean <- trials_processed %>%
  filter(overall_status == "COMPLETED") %>%
  filter(time_to_final > 0) %>%
  filter(!is.na(time_to_final)) %>%
  filter(!is.na(num_sites)) %>%
  filter(!is.na(enrollment)) %>%
  filter(!is.na(number_of_arms))

cat("After filtering to COMPLETED trials:", nrow(trials_clean), "trials\n")

## After filtering to COMPLETED trials: 444 trials

3.3 Create Scaled Variables for GLM

trials_scaled <- trials_clean %>%
  mutate(
    num_sites_scaled = scale(num_sites)[,1],
    enrollment_scaled = scale(enrollment)[,1],
    number_of_arms_scaled = scale(number_of_arms)[,1],
    months_from_covid_scaled = scale(months_from_covid)[,1]
  )

# Store scaling parameters for interpretation
scale_params <- data.frame(
  Variable = c("num_sites", "enrollment", "number_of_arms", "months_from_covid"),
  Mean = c(mean(trials_clean$num_sites), mean(trials_clean$enrollment),
           mean(trials_clean$number_of_arms), mean(trials_clean$months_from_covid)),
  SD = c(sd(trials_clean$num_sites), sd(trials_clean$enrollment),
         sd(trials_clean$number_of_arms), sd(trials_clean$months_from_covid))
)

kable(scale_params, digits = 2, caption = "Scaling Parameters (for back-transformation)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Scaling Parameters (for back-transformation)
Variable	Mean	SD
num_sites	15.93	69.59
enrollment	63.48	71.68
number_of_arms	1.62	1.18
months_from_covid	0.71	7.63

3.4 Cleaning Summary

cleaning_summary <- data.frame(
  Step = c("Raw dataset from AACT",
           "After removing non-COMPLETED status",
           "After removing invalid completion times",
           "After removing missing covariates"),
  N = c(nrow(trials_processed),
        sum(trials_processed$overall_status == "COMPLETED"),
        sum(trials_processed$overall_status == "COMPLETED" & 
            trials_processed$time_to_final > 0, na.rm = TRUE),
        nrow(trials_clean))
)

kable(cleaning_summary, caption = "Data Cleaning Steps") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Data Cleaning Steps
Step	N
Raw dataset from AACT	2125
After removing non-COMPLETED status	445
After removing invalid completion times	444
After removing missing covariates	444

4 Exploratory Data Analysis

4.1 Sample Characteristics

4.1.1 Sponsor Type Distribution

sponsor_table <- trials_clean %>%
  group_by(sponsor_type_clean) %>%
  summarise(
    N = n(),
    Percent = round(n() / nrow(trials_clean) * 100, 1)
  ) %>%
  rename(`Sponsor Type` = sponsor_type_clean)

kable(sponsor_table, caption = "Distribution of Trials by Sponsor Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Distribution of Trials by Sponsor Type
Sponsor Type	N	Percent
Commercial	114	25.7
Academic	318	71.6
Government	12	2.7

4.1.2 Descriptive Statistics for Continuous Variables

desc_stats <- data.frame(
  Variable = c("Time to Completion (days)", "Number of Sites", 
               "Enrollment", "Number of Arms", "Months from COVID"),
  N = rep(nrow(trials_clean), 5),
  Mean = c(mean(trials_clean$time_to_final), mean(trials_clean$num_sites), 
           mean(trials_clean$enrollment), mean(trials_clean$number_of_arms), 
           mean(trials_clean$months_from_covid)),
  SD = c(sd(trials_clean$time_to_final), sd(trials_clean$num_sites), 
         sd(trials_clean$enrollment), sd(trials_clean$number_of_arms), 
         sd(trials_clean$months_from_covid)),
  Median = c(median(trials_clean$time_to_final), median(trials_clean$num_sites), 
             median(trials_clean$enrollment), median(trials_clean$number_of_arms), 
             median(trials_clean$months_from_covid)),
  Min = c(min(trials_clean$time_to_final), min(trials_clean$num_sites), 
          min(trials_clean$enrollment), min(trials_clean$number_of_arms), 
          min(trials_clean$months_from_covid)),
  Max = c(max(trials_clean$time_to_final), max(trials_clean$num_sites), 
          max(trials_clean$enrollment), max(trials_clean$number_of_arms), 
          max(trials_clean$months_from_covid))
)

kable(desc_stats, digits = 2, caption = "Descriptive Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Descriptive Statistics
Variable	N	Mean	SD	Median	Min	Max
Time to Completion (days)	444	1211.34	452.68	1222.00	13.00	2313.00
Number of Sites	444	15.93	69.59	1.00	1.00	950.00
Enrollment	444	63.48	71.68	42.00	1.00	627.00
Number of Arms	444	1.62	1.18	1.00	1.00	15.00
Months from COVID	444	0.71	7.63	1.92	-12.02	12.98

4.1.3 Time to Completion by Sponsor Type

completion_by_sponsor <- trials_clean %>%
  group_by(sponsor_type_clean) %>%
  summarise(
    N = n(),
    `Mean (days)` = mean(time_to_final),
    `SD` = sd(time_to_final),
    `Median (days)` = median(time_to_final),
    `Mean (months)` = mean(time_to_final) / 30.44,
    `Median (months)` = median(time_to_final) / 30.44
  ) %>%
  rename(`Sponsor Type` = sponsor_type_clean)

kable(completion_by_sponsor, digits = 1, 
      caption = "Time to Completion by Sponsor Type") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Time to Completion by Sponsor Type
Sponsor Type	N	Mean (days)	SD	Median (days)	Mean (months)	Median (months)
Commercial	114	1176.2	429.6	1148.5	38.6	37.7
Academic	318	1229.3	460.2	1236.0	40.4	40.6
Government	12	1069.8	457.3	1115.5	35.1	36.6

4.2 Visualizations

4.2.1 Distribution of Dependent Variable

p_hist <- ggplot(trials_clean, aes(x = time_to_final)) +
  geom_histogram(aes(y = after_stat(density)), bins = 40, 
                 fill = "#2E86AB", alpha = 0.7, color = "white") +
  geom_density(color = "#A23B72", linewidth = 1.2) +
  labs(
    title = "Distribution of Time to Final Completion",
    subtitle = "Right-skewed distribution supports Gamma or Inverse Gaussian GLM",
    x = "Time to Completion (days)",
    y = "Density"
  ) +
  scale_x_continuous(labels = comma)

p_hist

4.2.2 Box Plots: Completion Time by Sponsor Type

p_box <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = time_to_final, 
                                   fill = sponsor_type_clean)) +
  geom_boxplot(alpha = 0.7, outlier.alpha = 0.4) +
  scale_fill_manual(values = c("Commercial" = "#2E86AB", 
                                "Academic" = "#A23B72", 
                                "Government" = "#F18F01")) +
  labs(
    title = "Time to Completion by Sponsor Type",
    x = "Sponsor Type",
    y = "Time to Completion (days)"
  ) +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)

p_box

4.2.3 Violin Plots: Distribution Shape by Sponsor Type

p_violin <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = time_to_final, 
                                      fill = sponsor_type_clean)) +
  geom_violin(alpha = 0.5, trim = FALSE) +
  geom_boxplot(width = 0.15, alpha = 0.8, outlier.size = 1) +
  scale_fill_manual(values = c("Commercial" = "#2E86AB", 
                                "Academic" = "#A23B72", 
                                "Government" = "#F18F01")) +
  labs(
    title = "Distribution of Completion Time by Sponsor Type",
    x = "Sponsor Type",
    y = "Time to Completion (days)"
  ) +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)

p_violin

4.2.4 Covariate Relationships

p_sites <- ggplot(trials_clean, aes(x = sponsor_type_clean, y = num_sites, 
                                     fill = sponsor_type_clean)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("Commercial" = "#2E86AB", 
                                "Academic" = "#A23B72", 
                                "Government" = "#F18F01")) +
  labs(title = "Number of Sites by Sponsor Type", x = "", y = "Number of Sites") +
  theme(legend.position = "none") +
  coord_cartesian(ylim = c(0, quantile(trials_clean$num_sites, 0.95)))

p_sites_time <- ggplot(trials_clean, aes(x = num_sites, y = time_to_final, 
                                          color = sponsor_type_clean)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(title = "Completion Time vs Number of Sites", 
       x = "Number of Sites", y = "Time to Completion (days)", color = "Sponsor") +
  coord_cartesian(xlim = c(0, quantile(trials_clean$num_sites, 0.95)))

p_enroll_time <- ggplot(trials_clean, aes(x = enrollment, y = time_to_final, 
                                           color = sponsor_type_clean)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(title = "Completion Time vs Enrollment", 
       x = "Enrollment", y = "Time to Completion (days)", color = "Sponsor") +
  coord_cartesian(xlim = c(0, quantile(trials_clean$enrollment, 0.95)))

p_covid <- ggplot(trials_clean, aes(x = months_from_covid, y = time_to_final, 
                                     color = sponsor_type_clean)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = TRUE, alpha = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(title = "Completion Time vs COVID Timing", 
       subtitle = "Dashed line = March 2020 (WHO pandemic declaration)",
       x = "Months from COVID Start", y = "Time to Completion (days)", color = "Sponsor")

(p_sites | p_sites_time) / (p_enroll_time | p_covid)

4.2.5 Mean-Variance Relationship

mv_check <- trials_clean %>%
  mutate(bin = cut(time_to_final, breaks = 15)) %>%
  group_by(bin) %>%
  summarise(
    bin_mean = mean(time_to_final),
    bin_var = var(time_to_final),
    n = n()
  ) %>%
  filter(n >= 5)

p_mv <- ggplot(mv_check, aes(x = bin_mean, y = bin_var)) +
  geom_point(size = 3, color = "#2E86AB") +
  geom_smooth(method = "lm", se = TRUE, color = "#A23B72") +
  labs(
    title = "Mean-Variance Relationship",
    subtitle = "Positive relationship supports Gamma/Inverse Gaussian over Normal",
    x = "Bin Mean (days)",
    y = "Bin Variance"
  ) +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma)

p_mv

4.2.6 Correlation Matrix

cor_vars <- trials_clean %>%
  dplyr::select(time_to_final, num_sites, enrollment, number_of_arms, months_from_covid)

cor_matrix <- cor(cor_vars, use = "complete.obs")

kable(round(cor_matrix, 3), caption = "Correlation Matrix") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Correlation Matrix
	time_to_final	num_sites	enrollment	number_of_arms	months_from_covid
time_to_final	1.000	0.170	-0.038	0.015	-0.227
num_sites	0.170	1.000	0.121	0.037	-0.100
enrollment	-0.038	0.121	1.000	0.235	0.009
number_of_arms	0.015	0.037	0.235	1.000	0.017
months_from_covid	-0.227	-0.100	0.009	0.017	1.000

5 Generalized Linear Models

5.1 Model Specification

The GLM takes the form:

\[\log(\mu_i) = \beta_0 + \beta_1 \cdot \text{Academic}_i + \beta_2 \cdot \text{Government}_i + \beta_3 \cdot \text{NumSites}_i + \beta_4 \cdot \text{Enrollment}_i + \beta_5 \cdot \text{NumArms}_i + \beta_6 \cdot \text{MonthsFromCOVID}_i\]

where \(\mu_i = E(Y_i)\) is the expected completion time for trial \(i\).

Note: Continuous predictors are standardized (mean-centered and scaled by SD) to improve model convergence and allow comparison of effect sizes.

5.2 Model 1: Gamma GLM with Log Link

5.2.1 Fit the Model

model_gamma <- glm(time_to_final ~ sponsor_type_clean + num_sites_scaled + enrollment_scaled + 
                     number_of_arms_scaled + months_from_covid_scaled,
                   data = trials_scaled,
                   family = Gamma(link = "log"))

summary(model_gamma)

## 
## Call:
## glm(formula = time_to_final ~ sponsor_type_clean + num_sites_scaled + 
##     enrollment_scaled + number_of_arms_scaled + months_from_covid_scaled, 
##     family = Gamma(link = "log"), data = trials_scaled)
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7.08694    0.03534 200.545  < 2e-16 ***
## sponsor_type_cleanAcademic    0.01524    0.04185   0.364  0.71591    
## sponsor_type_cleanGovernment -0.13313    0.10992  -1.211  0.22648    
## num_sites_scaled              0.05021    0.01731   2.900  0.00392 ** 
## enrollment_scaled            -0.02209    0.01822  -1.212  0.22608    
## number_of_arms_scaled         0.01079    0.01777   0.607  0.54399    
## months_from_covid_scaled     -0.07882    0.01737  -4.538 7.37e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1290495)
## 
##     Null deviance: 88.442  on 443  degrees of freedom
## Residual deviance: 83.807  on 437  degrees of freedom
## AIC: 6765.6
## 
## Number of Fisher Scoring iterations: 5

5.2.2 Gamma Model Coefficients

gamma_coefs <- data.frame(
  Term = names(coef(model_gamma)),
  Estimate = coef(model_gamma),
  SE = summary(model_gamma)$coefficients[, 2],
  z_value = summary(model_gamma)$coefficients[, 3],
  p_value = summary(model_gamma)$coefficients[, 4],
  Exp_Estimate = exp(coef(model_gamma))
)
rownames(gamma_coefs) <- NULL

kable(gamma_coefs, digits = 4, 
      caption = "Gamma GLM Coefficients (Log Link)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Gamma GLM Coefficients (Log Link)
Term	Estimate	SE	z_value	p_value	Exp_Estimate
(Intercept)	7.0869	0.0353	200.5454	0.0000	1196.2471
sponsor_type_cleanAcademic	0.0152	0.0419	0.3642	0.7159	1.0154
sponsor_type_cleanGovernment	-0.1331	0.1099	-1.2112	0.2265	0.8753
num_sites_scaled	0.0502	0.0173	2.8997	0.0039	1.0515
enrollment_scaled	-0.0221	0.0182	-1.2122	0.2261	0.9782
number_of_arms_scaled	0.0108	0.0178	0.6073	0.5440	1.0109
months_from_covid_scaled	-0.0788	0.0174	-4.5375	0.0000	0.9242

5.2.3 Interpretation of Gamma Model

cat("Multiplicative Effects (Gamma Model):\n\n")

## Multiplicative Effects (Gamma Model):

academic_effect <- exp(coef(model_gamma)["sponsor_type_cleanAcademic"])
gov_effect <- exp(coef(model_gamma)["sponsor_type_cleanGovernment"])

cat("Academic vs Commercial: ", round((academic_effect - 1) * 100, 1), 
    "% change in completion time\n", sep = "")

## Academic vs Commercial: 1.5% change in completion time

cat("Government vs Commercial: ", round((gov_effect - 1) * 100, 1), 
    "% change in completion time\n\n", sep = "")

## Government vs Commercial: -12.5% change in completion time

cat("Per 1 SD increase in num_sites: ", round((exp(coef(model_gamma)["num_sites_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in num_sites: 5.15% change

cat("Per 1 SD increase in enrollment: ", round((exp(coef(model_gamma)["enrollment_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in enrollment: -2.18% change

cat("Per 1 SD increase in number_of_arms: ", round((exp(coef(model_gamma)["number_of_arms_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in number_of_arms: 1.09% change

cat("Per 1 SD increase in months_from_covid: ", round((exp(coef(model_gamma)["months_from_covid_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in months_from_covid: -7.58% change

5.2.4 Gamma Model Diagnostics

gamma_deviance_resid <- residuals(model_gamma, type = "deviance")
gamma_pearson_resid <- residuals(model_gamma, type = "pearson")
gamma_fitted <- fitted(model_gamma)

gamma_diag <- data.frame(
  fitted = gamma_fitted,
  deviance_resid = gamma_deviance_resid,
  pearson_resid = gamma_pearson_resid,
  observed = trials_scaled$time_to_final
)

p1_gamma <- ggplot(gamma_diag, aes(x = fitted, y = deviance_resid)) +
  geom_point(alpha = 0.4, color = "#2E86AB") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
  labs(title = "Gamma: Deviance Residuals vs Fitted",
       x = "Fitted Values", y = "Deviance Residuals")

p2_gamma <- ggplot(gamma_diag, aes(sample = deviance_resid)) +
  stat_qq(color = "#2E86AB", alpha = 0.5) +
  stat_qq_line(color = "#A23B72", linewidth = 1) +
  labs(title = "Gamma: Q-Q Plot of Deviance Residuals",
       x = "Theoretical Quantiles", y = "Sample Quantiles")

p3_gamma <- ggplot(gamma_diag, aes(x = fitted, y = sqrt(abs(deviance_resid)))) +
  geom_point(alpha = 0.4, color = "#2E86AB") +
  geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
  labs(title = "Gamma: Scale-Location Plot",
       x = "Fitted Values", y = "sqrt|Deviance Residuals|")

p4_gamma <- ggplot(gamma_diag, aes(x = fitted, y = observed)) +
  geom_point(alpha = 0.4, color = "#2E86AB") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Gamma: Observed vs Fitted",
       x = "Fitted Values", y = "Observed Values") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma)

(p1_gamma | p2_gamma) / (p3_gamma | p4_gamma)

5.2.5 Gamma Model Fit Statistics

gamma_stats <- data.frame(
  Metric = c("Null Deviance", "Residual Deviance", "AIC", "BIC", 
             "Dispersion Parameter", "Degrees of Freedom (Residual)"),
  Value = c(
    model_gamma$null.deviance,
    model_gamma$deviance,
    AIC(model_gamma),
    BIC(model_gamma),
    summary(model_gamma)$dispersion,
    model_gamma$df.residual
  )
)

kable(gamma_stats, digits = 2, caption = "Gamma GLM Fit Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Gamma GLM Fit Statistics
Metric	Value
Null Deviance	88.44
Residual Deviance	83.81
AIC	6765.58
BIC	6798.35
Dispersion Parameter	0.13
Degrees of Freedom (Residual)	437.00

5.3 Model 2: Inverse Gaussian GLM with Log Link

5.3.1 Fit the Model

model_ig <- glm(time_to_final ~ sponsor_type_clean + num_sites_scaled + enrollment_scaled + 
                  number_of_arms_scaled + months_from_covid_scaled,
                data = trials_scaled,
                family = inverse.gaussian(link = "log"),
                control = glm.control(maxit = 100))

summary(model_ig)

## 
## Call:
## glm(formula = time_to_final ~ sponsor_type_clean + num_sites_scaled + 
##     enrollment_scaled + number_of_arms_scaled + months_from_covid_scaled, 
##     family = inverse.gaussian(link = "log"), data = trials_scaled, 
##     control = glm.control(maxit = 100))
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)
## (Intercept)                  -1.994e+03  5.762e+17       0        1
## sponsor_type_cleanAcademic    1.784e+02  6.856e+17       0        1
## sponsor_type_cleanGovernment  3.562e+02  1.813e+18       0        1
## num_sites_scaled             -4.087e+02  2.766e+17       0        1
## enrollment_scaled             1.931e+02  3.507e+17       0        1
## number_of_arms_scaled        -1.824e+03  2.874e+17       0        1
## months_from_covid_scaled     -5.977e+02  2.968e+17       0        1
## 
## (Dispersion parameter for inverse.gaussian family taken to be 1.458158e+53)
## 
##     Null deviance: 1.8086e-01  on 443  degrees of freedom
## Residual deviance: 1.0262e+37  on 437  degrees of freedom
## AIC: 45732
## 
## Number of Fisher Scoring iterations: 7

5.3.2 Inverse Gaussian Model Coefficients

ig_coefs <- data.frame(
  Term = names(coef(model_ig)),
  Estimate = coef(model_ig),
  SE = summary(model_ig)$coefficients[, 2],
  z_value = summary(model_ig)$coefficients[, 3],
  p_value = summary(model_ig)$coefficients[, 4],
  Exp_Estimate = exp(coef(model_ig))
)
rownames(ig_coefs) <- NULL

kable(ig_coefs, digits = 4, 
      caption = "Inverse Gaussian GLM Coefficients (Log Link)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Inverse Gaussian GLM Coefficients (Log Link)
Term	Estimate	SE	p_value	Exp_Estimate
(Intercept)	-1994.2730	5.762382e+17	1	0.000000e+00
sponsor_type_cleanAcademic	178.3788	6.855617e+17	1	2.943907e+77
sponsor_type_cleanGovernment	356.2027	1.812901e+18	1	4.976036e+154
num_sites_scaled	-408.6918	2.766446e+17	1	0.000000e+00
enrollment_scaled	193.0600	3.507351e+17	1	6.996602e+83
number_of_arms_scaled	-1824.0322	2.873803e+17	1	0.000000e+00
months_from_covid_scaled	-597.6835	2.967745e+17	1	0.000000e+00

5.3.3 Interpretation of Inverse Gaussian Model

cat("Multiplicative Effects (Inverse Gaussian Model):\n\n")

## Multiplicative Effects (Inverse Gaussian Model):

academic_effect_ig <- exp(coef(model_ig)["sponsor_type_cleanAcademic"])
gov_effect_ig <- exp(coef(model_ig)["sponsor_type_cleanGovernment"])

cat("Academic vs Commercial: ", round((academic_effect_ig - 1) * 100, 1), 
    "% change in completion time\n", sep = "")

## Academic vs Commercial: 2.943907e+79% change in completion time

cat("Government vs Commercial: ", round((gov_effect_ig - 1) * 100, 1), 
    "% change in completion time\n\n", sep = "")

## Government vs Commercial: 4.976036e+156% change in completion time

cat("Per 1 SD increase in num_sites: ", round((exp(coef(model_ig)["num_sites_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in num_sites: -100% change

cat("Per 1 SD increase in enrollment: ", round((exp(coef(model_ig)["enrollment_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in enrollment: 6.996602e+85% change

cat("Per 1 SD increase in number_of_arms: ", round((exp(coef(model_ig)["number_of_arms_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in number_of_arms: -100% change

cat("Per 1 SD increase in months_from_covid: ", round((exp(coef(model_ig)["months_from_covid_scaled"]) - 1) * 100, 2), 
    "% change\n", sep = "")

## Per 1 SD increase in months_from_covid: -100% change

5.3.4 Inverse Gaussian Model Diagnostics

ig_deviance_resid <- residuals(model_ig, type = "deviance")
ig_pearson_resid <- residuals(model_ig, type = "pearson")
ig_fitted <- fitted(model_ig)

ig_diag <- data.frame(
  fitted = ig_fitted,
  deviance_resid = ig_deviance_resid,
  pearson_resid = ig_pearson_resid,
  observed = trials_scaled$time_to_final
)

p1_ig <- ggplot(ig_diag, aes(x = fitted, y = deviance_resid)) +
  geom_point(alpha = 0.4, color = "#2E86AB") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
  labs(title = "Inverse Gaussian: Deviance Residuals vs Fitted",
       x = "Fitted Values", y = "Deviance Residuals")

p2_ig <- ggplot(ig_diag, aes(sample = deviance_resid)) +
  stat_qq(color = "#2E86AB", alpha = 0.5) +
  stat_qq_line(color = "#A23B72", linewidth = 1) +
  labs(title = "Inverse Gaussian: Q-Q Plot of Deviance Residuals",
       x = "Theoretical Quantiles", y = "Sample Quantiles")

p3_ig <- ggplot(ig_diag, aes(x = fitted, y = sqrt(abs(deviance_resid)))) +
  geom_point(alpha = 0.4, color = "#2E86AB") +
  geom_smooth(method = "loess", se = TRUE, color = "#A23B72") +
  labs(title = "Inverse Gaussian: Scale-Location Plot",
       x = "Fitted Values", y = "sqrt|Deviance Residuals|")

p4_ig <- ggplot(ig_diag, aes(x = fitted, y = observed)) +
  geom_point(alpha = 0.4, color = "#2E86AB") +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Inverse Gaussian: Observed vs Fitted",
       x = "Fitted Values", y = "Observed Values") +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(labels = comma)

(p1_ig | p2_ig) / (p3_ig | p4_ig)

5.3.5 Inverse Gaussian Model Fit Statistics

ig_stats <- data.frame(
  Metric = c("Null Deviance", "Residual Deviance", "AIC", "BIC", 
             "Dispersion Parameter", "Degrees of Freedom (Residual)"),
  Value = c(
    model_ig$null.deviance,
    model_ig$deviance,
    AIC(model_ig),
    BIC(model_ig),
    summary(model_ig)$dispersion,
    model_ig$df.residual
  )
)

kable(ig_stats, digits = 2, caption = "Inverse Gaussian GLM Fit Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Inverse Gaussian GLM Fit Statistics
Metric	Value
Null Deviance	1.800000e-01
Residual Deviance	1.026203e+37
AIC	4.573167e+04
BIC	4.576444e+04
Dispersion Parameter	1.458158e+53
Degrees of Freedom (Residual)	4.370000e+02

6 Model Comparison

6.1 Side-by-Side Fit Statistics

comparison <- data.frame(
  Metric = c("AIC", "BIC", "Residual Deviance", "Null Deviance", 
             "Deviance Explained (%)", "Dispersion Parameter"),
  Gamma = c(
    AIC(model_gamma),
    BIC(model_gamma),
    model_gamma$deviance,
    model_gamma$null.deviance,
    (1 - model_gamma$deviance / model_gamma$null.deviance) * 100,
    summary(model_gamma)$dispersion
  ),
  Inverse_Gaussian = c(
    AIC(model_ig),
    BIC(model_ig),
    model_ig$deviance,
    model_ig$null.deviance,
    (1 - model_ig$deviance / model_ig$null.deviance) * 100,
    summary(model_ig)$dispersion
  )
)

comparison$Difference <- comparison$Gamma - comparison$Inverse_Gaussian
comparison$Better <- ifelse(comparison$Difference > 0, "Inverse Gaussian", "Gamma")
comparison$Better[comparison$Metric == "Deviance Explained (%)"] <- 
  ifelse(comparison$Difference[comparison$Metric == "Deviance Explained (%)"] < 0, 
         "Inverse Gaussian", "Gamma")

kable(comparison, digits = 2, 
      caption = "Model Comparison: Gamma vs Inverse Gaussian") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  column_spec(5, bold = TRUE)

Model Comparison: Gamma vs Inverse Gaussian
Metric	Gamma	Inverse_Gaussian	Difference	Better
AIC	6765.58	4.573167e+04	-3.896609e+04	Gamma
BIC	6798.35	4.576444e+04	-3.896609e+04	Gamma
Residual Deviance	83.81	1.026203e+37	-1.026203e+37	Gamma
Null Deviance	88.44	1.800000e-01	8.826000e+01	Inverse Gaussian
Deviance Explained (%)	5.24	-5.674058e+39	5.674058e+39	Gamma
Dispersion Parameter	0.13	1.458158e+53	-1.458158e+53	Gamma

6.2 Coefficient Comparison

coef_compare <- data.frame(
  Term = names(coef(model_gamma)),
  Gamma_Est = coef(model_gamma),
  Gamma_Exp = exp(coef(model_gamma)),
  IG_Est = coef(model_ig),
  IG_Exp = exp(coef(model_ig))
)
rownames(coef_compare) <- NULL

kable(coef_compare, digits = 4, col.names = c("Term", "Gamma Est", "Gamma exp(Est)", 
                                                "IG Est", "IG exp(Est)"),
      caption = "Coefficient Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Coefficient Comparison
Term	Gamma Est	Gamma exp(Est)	IG Est	IG exp(Est)
(Intercept)	7.0869	1196.2471	-1994.2730	0.000000e+00
sponsor_type_cleanAcademic	0.0152	1.0154	178.3788	2.943907e+77
sponsor_type_cleanGovernment	-0.1331	0.8753	356.2027	4.976036e+154
num_sites_scaled	0.0502	1.0515	-408.6918	0.000000e+00
enrollment_scaled	-0.0221	0.9782	193.0600	6.996602e+83
number_of_arms_scaled	0.0108	1.0109	-1824.0322	0.000000e+00
months_from_covid_scaled	-0.0788	0.9242	-597.6835	0.000000e+00

6.3 Residual Comparison

resid_compare <- data.frame(
  Model = rep(c("Gamma", "Inverse Gaussian"), each = nrow(trials_scaled)),
  Fitted = c(gamma_fitted, ig_fitted),
  Deviance_Residuals = c(gamma_deviance_resid, ig_deviance_resid)
)

p_qq_compare <- ggplot(resid_compare, aes(sample = Deviance_Residuals, color = Model)) +
  stat_qq(alpha = 0.5) +
  stat_qq_line() +
  scale_color_manual(values = c("Gamma" = "#2E86AB", "Inverse Gaussian" = "#A23B72")) +
  labs(title = "Q-Q Plot Comparison",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  facet_wrap(~Model)

p_resid_compare <- ggplot(resid_compare, aes(x = Fitted, y = Deviance_Residuals, color = Model)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(method = "loess", se = TRUE) +
  scale_color_manual(values = c("Gamma" = "#2E86AB", "Inverse Gaussian" = "#A23B72")) +
  labs(title = "Residuals vs Fitted Comparison",
       x = "Fitted Values", y = "Deviance Residuals") +
  facet_wrap(~Model)

p_qq_compare / p_resid_compare

6.4 Model Selection Summary

aic_winner <- ifelse(AIC(model_gamma) < AIC(model_ig), "Gamma", "Inverse Gaussian")
bic_winner <- ifelse(BIC(model_gamma) < BIC(model_ig), "Gamma", "Inverse Gaussian")

cat("=== MODEL SELECTION SUMMARY ===\n\n")

## === MODEL SELECTION SUMMARY ===

cat("AIC prefers:", aic_winner, "\n")

## AIC prefers: Gamma

cat("  Gamma AIC:", round(AIC(model_gamma), 2), "\n")

##   Gamma AIC: 6765.58

cat("  Inverse Gaussian AIC:", round(AIC(model_ig), 2), "\n")

##   Inverse Gaussian AIC: 45731.67

cat("  Difference:", round(abs(AIC(model_gamma) - AIC(model_ig)), 2), "\n\n")

##   Difference: 38966.09

cat("BIC prefers:", bic_winner, "\n")

## BIC prefers: Gamma

cat("  Gamma BIC:", round(BIC(model_gamma), 2), "\n")

##   Gamma BIC: 6798.35

cat("  Inverse Gaussian BIC:", round(BIC(model_ig), 2), "\n")

##   Inverse Gaussian BIC: 45764.44

cat("  Difference:", round(abs(BIC(model_gamma) - BIC(model_ig)), 2), "\n")

##   Difference: 38966.09

7 Interaction Model: Sponsor Type x Number of Sites

Based on the model comparison above, we fit an interaction model using the better-fitting distribution.

best_family <- if(AIC(model_gamma) <= AIC(model_ig)) {
  cat("Using Gamma distribution (lower AIC)\n\n")
  Gamma(link = "log")
} else {
  cat("Using Inverse Gaussian distribution (lower AIC)\n\n")
  inverse.gaussian(link = "log")
}

## Using Gamma distribution (lower AIC)

model_interaction <- glm(time_to_final ~ sponsor_type_clean * num_sites_scaled + enrollment_scaled + 
                           number_of_arms_scaled + months_from_covid_scaled,
                         data = trials_scaled,
                         family = best_family,
                         control = glm.control(maxit = 100))

summary(model_interaction)

## 
## Call:
## glm(formula = time_to_final ~ sponsor_type_clean * num_sites_scaled + 
##     enrollment_scaled + number_of_arms_scaled + months_from_covid_scaled, 
##     family = best_family, data = trials_scaled, control = glm.control(maxit = 100))
## 
## Coefficients:
##                                                Estimate Std. Error t value
## (Intercept)                                    7.063732   0.036011 196.154
## sponsor_type_cleanAcademic                     0.036410   0.042219   0.862
## sponsor_type_cleanGovernment                   0.699054   0.676852   1.033
## num_sites_scaled                               0.269275   0.089623   3.005
## enrollment_scaled                             -0.027583   0.018308  -1.507
## number_of_arms_scaled                          0.006153   0.017763   0.346
## months_from_covid_scaled                      -0.079975   0.017357  -4.608
## sponsor_type_cleanAcademic:num_sites_scaled   -0.229341   0.091221  -2.514
## sponsor_type_cleanGovernment:num_sites_scaled  3.774063   3.257933   1.158
##                                               Pr(>|t|)    
## (Intercept)                                    < 2e-16 ***
## sponsor_type_cleanAcademic                     0.38893    
## sponsor_type_cleanGovernment                   0.30227    
## num_sites_scaled                               0.00281 ** 
## enrollment_scaled                              0.13264    
## number_of_arms_scaled                          0.72921    
## months_from_covid_scaled                      5.36e-06 ***
## sponsor_type_cleanAcademic:num_sites_scaled    0.01229 *  
## sponsor_type_cleanGovernment:num_sites_scaled  0.24733    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 0.1277541)
## 
##     Null deviance: 88.442  on 443  degrees of freedom
## Residual deviance: 82.764  on 435  degrees of freedom
## AIC: 6763.8
## 
## Number of Fisher Scoring iterations: 5

7.1 Interaction Model Coefficients

int_coefs <- data.frame(
  Term = names(coef(model_interaction)),
  Estimate = coef(model_interaction),
  SE = summary(model_interaction)$coefficients[, 2],
  z_value = summary(model_interaction)$coefficients[, 3],
  p_value = summary(model_interaction)$coefficients[, 4],
  Exp_Estimate = exp(coef(model_interaction))
)
rownames(int_coefs) <- NULL

kable(int_coefs, digits = 4, 
      caption = "Interaction Model Coefficients") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Interaction Model Coefficients
Term	Estimate	SE	z_value	p_value	Exp_Estimate
(Intercept)	7.0637	0.0360	196.1543	0.0000	1168.7985
sponsor_type_cleanAcademic	0.0364	0.0422	0.8624	0.3889	1.0371
sponsor_type_cleanGovernment	0.6991	0.6769	1.0328	0.3023	2.0118
num_sites_scaled	0.2693	0.0896	3.0045	0.0028	1.3090
enrollment_scaled	-0.0276	0.0183	-1.5066	0.1326	0.9728
number_of_arms_scaled	0.0062	0.0178	0.3464	0.7292	1.0062
months_from_covid_scaled	-0.0800	0.0174	-4.6077	0.0000	0.9231
sponsor_type_cleanAcademic:num_sites_scaled	-0.2293	0.0912	-2.5141	0.0123	0.7951
sponsor_type_cleanGovernment:num_sites_scaled	3.7741	3.2579	1.1584	0.2473	43.5567

7.2 Test Interaction Significance

if(AIC(model_gamma) <= AIC(model_ig)) {
  main_model <- model_gamma
} else {
  main_model <- model_ig
}

lr_test <- anova(main_model, model_interaction, test = "F")
print(lr_test)

## Analysis of Deviance Table
## 
## Model 1: time_to_final ~ sponsor_type_clean + num_sites_scaled + enrollment_scaled + 
##     number_of_arms_scaled + months_from_covid_scaled
## Model 2: time_to_final ~ sponsor_type_clean * num_sites_scaled + enrollment_scaled + 
##     number_of_arms_scaled + months_from_covid_scaled
##   Resid. Df Resid. Dev Df Deviance      F  Pr(>F)  
## 1       437     83.807                             
## 2       435     82.764  2   1.0431 4.0825 0.01752 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\n=== Interaction Test ===\n")

## 
## === Interaction Test ===

cat("Does adding the sponsor x sites interaction significantly improve fit?\n")

## Does adding the sponsor x sites interaction significantly improve fit?

cat("F-test p-value:", round(lr_test$`Pr(>F)`[2], 4), "\n")

## F-test p-value: 0.0175

if(lr_test$`Pr(>F)`[2] < 0.05) {
  cat("Conclusion: YES - Interaction is statistically significant\n")
} else {
  cat("Conclusion: NO - Interaction is not statistically significant\n")
}

## Conclusion: YES - Interaction is statistically significant

7.3 Visualize Interaction

pred_data <- expand.grid(
  sponsor_type_clean = levels(trials_scaled$sponsor_type_clean),
  num_sites_scaled = seq(min(trials_scaled$num_sites_scaled), 
                          quantile(trials_scaled$num_sites_scaled, 0.90), 
                          length.out = 50),
  enrollment_scaled = 0,
  number_of_arms_scaled = 0,
  months_from_covid_scaled = 0
)

pred_data$predicted <- predict(model_interaction, newdata = pred_data, type = "response")

# Back-transform num_sites for x-axis
pred_data$num_sites_original <- pred_data$num_sites_scaled * scale_params$SD[1] + scale_params$Mean[1]

ggplot(pred_data, aes(x = num_sites_original, y = predicted, color = sponsor_type_clean)) +
  geom_line(linewidth = 1.2) +
  scale_color_manual(values = c("Commercial" = "#2E86AB", 
                                 "Academic" = "#A23B72", 
                                 "Government" = "#F18F01")) +
  labs(
    title = "Predicted Completion Time by Number of Sites and Sponsor Type",
    subtitle = "Interaction Model (other covariates held at mean)",
    x = "Number of Clinical Sites",
    y = "Predicted Time to Completion (days)",
    color = "Sponsor Type"
  ) +
  scale_y_continuous(labels = comma) +
  theme(legend.position = "bottom")

8 Conclusions

8.1 Summary of Findings

cat("=== FINAL SUMMARY ===\n\n")

## === FINAL SUMMARY ===

cat("1. RESEARCH QUESTION\n")

## 1. RESEARCH QUESTION

cat("   Does sponsor type predict Phase 2 oncology trial completion time during COVID-19?\n\n")

##    Does sponsor type predict Phase 2 oncology trial completion time during COVID-19?

cat("2. SAMPLE\n")

## 2. SAMPLE

cat("   N =", nrow(trials_clean), "completed Phase 2 solid tumor trials\n")

##    N = 444 completed Phase 2 solid tumor trials

cat("   Period: March 2019 - March 2021\n\n")

##    Period: March 2019 - March 2021

cat("3. MODEL SELECTION\n")

## 3. MODEL SELECTION

cat("   Best fitting distribution:", aic_winner, "\n")

##    Best fitting distribution: Gamma

cat("   AIC difference:", round(abs(AIC(model_gamma) - AIC(model_ig)), 2), "\n\n")

##    AIC difference: 38966.09

cat("4. KEY FINDINGS (from", aic_winner, "model)\n")

## 4. KEY FINDINGS (from Gamma model)

if(aic_winner == "Gamma") {
  best_model <- model_gamma
} else {
  best_model <- model_ig
}

p_academic <- summary(best_model)$coefficients["sponsor_type_cleanAcademic", 4]
p_gov <- summary(best_model)$coefficients["sponsor_type_cleanGovernment", 4]

cat("   Academic vs Commercial: ", 
    round((exp(coef(best_model)["sponsor_type_cleanAcademic"]) - 1) * 100, 1), 
    "% difference (p = ", round(p_academic, 4), ")\n", sep = "")

##    Academic vs Commercial: 1.5% difference (p = 0.7159)

cat("   Government vs Commercial: ", 
    round((exp(coef(best_model)["sponsor_type_cleanGovernment"]) - 1) * 100, 1), 
    "% difference (p = ", round(p_gov, 4), ")\n\n", sep = "")

##    Government vs Commercial: -12.5% difference (p = 0.2265)

cat("5. SIGNIFICANT COVARIATES\n")

## 5. SIGNIFICANT COVARIATES

coef_summary <- summary(best_model)$coefficients
sig_covars <- rownames(coef_summary)[coef_summary[, 4] < 0.05]
sig_covars <- sig_covars[sig_covars != "(Intercept)"]
for(cv in sig_covars) {
  cat("   ", cv, ": exp(b) = ", round(exp(coef(best_model)[cv]), 4), 
      ", p = ", round(coef_summary[cv, 4], 4), "\n", sep = "")
}

##    num_sites_scaled: exp(b) = 1.0515, p = 0.0039
##    months_from_covid_scaled: exp(b) = 0.9242, p = 0

8.2 Why One Distribution Fits Better

cat("\n=== DISTRIBUTION CHOICE RATIONALE ===\n\n")

## 
## === DISTRIBUTION CHOICE RATIONALE ===

if(aic_winner == "Inverse Gaussian") {
  cat("The Inverse Gaussian distribution provided better fit, likely because:\n\n")
  cat("1. HEAVIER RIGHT TAIL: Clinical trial completion times often have extreme\n")
  cat("   outliers where a few trials run much longer than expected. The Inverse\n")
  cat("   Gaussian's heavier tail better accommodates these observations.\n\n")
  cat("2. VARIANCE-MEAN RELATIONSHIP: The Inverse Gaussian assumes variance is\n")
  cat("   proportional to mu^3 (vs mu^2 for Gamma). This may better reflect the\n")
  cat("   reality that uncertainty compounds more dramatically for longer trials.\n\n")
  cat("3. PROCESS INTERPRETATION: The Inverse Gaussian arises naturally as the\n")
  cat("   first passage time distribution, which aligns conceptually with trial\n")
  cat("   completion as a 'time to reach a threshold' process.\n")
} else {
  cat("The Gamma distribution provided better fit, likely because:\n\n")
  cat("1. MODERATE SKEWNESS: The completion time distribution shows right skew\n")
  cat("   but not extreme outliers, which Gamma handles well.\n\n")
  cat("2. VARIANCE-MEAN RELATIONSHIP: The Gamma's assumption that variance is\n")
  cat("   proportional to mu^2 appears to match the observed pattern in the data.\n\n")
  cat("3. ROBUSTNESS: Gamma GLMs are generally more stable and less sensitive\n")
  cat("   to influential observations than Inverse Gaussian models.\n")
}

## The Gamma distribution provided better fit, likely because:
## 
## 1. MODERATE SKEWNESS: The completion time distribution shows right skew
##    but not extreme outliers, which Gamma handles well.
## 
## 2. VARIANCE-MEAN RELATIONSHIP: The Gamma's assumption that variance is
##    proportional to mu^2 appears to match the observed pattern in the data.
## 
## 3. ROBUSTNESS: Gamma GLMs are generally more stable and less sensitive
##    to influential observations than Inverse Gaussian models.

9 Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0 knitr_1.50       car_3.1-3        carData_3.0-5   
##  [5] MASS_7.3-65      statmod_1.5.1    patchwork_1.3.2  scales_1.4.0    
##  [9] ggplot2_4.0.0    tidyr_1.3.1      dplyr_1.1.4     
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.4.0         lattice_0.22-7    
##  [5] stringi_1.8.7      digest_0.6.37      magrittr_2.0.3     evaluate_1.0.5    
##  [9] grid_4.5.1         RColorBrewer_1.1-3 fastmap_1.2.0      Matrix_1.7-3      
## [13] jsonlite_2.0.0     Formula_1.2-5      mgcv_1.9-3         purrr_1.1.0       
## [17] viridisLite_0.4.2  textshaping_1.0.3  jquerylib_0.1.4    abind_1.4-8       
## [21] cli_3.6.5          rlang_1.1.6        splines_4.5.1      withr_3.0.2       
## [25] cachem_1.1.0       yaml_2.3.10        tools_4.5.1        vctrs_0.6.5       
## [29] R6_2.6.1           lifecycle_1.0.4    stringr_1.5.1      pkgconfig_2.0.3   
## [33] pillar_1.11.0      bslib_0.9.0        gtable_0.3.6       glue_1.8.0        
## [37] systemfonts_1.3.1  xfun_0.52          tibble_3.3.0       tidyselect_1.2.1  
## [41] rstudioapi_0.17.1  farver_2.1.2       nlme_3.1-168       htmltools_0.5.8.1 
## [45] rmarkdown_2.30     svglite_2.2.1      labeling_0.4.3     compiler_4.5.1    
## [49] S7_0.2.0

Does Sponsor Type Predict Clinical Trial Completion Time During COVID-19?

PHD 1502 Quantitative Analysis I - Exercise 4

Amanda McEwen

2025-11-24