From Step 1, we have:
aact_studies_raw.RData - 25,419 Phase 2 trials
aact_sponsors_raw.RData - 559,371 sponsor records
aact_conditions_raw.RData - 990,427 condition records
aact_facilities_raw.RData - 3,345,178 facility records
trials_analysis.RData - Final analysis dataset
library(dplyr) # For dataset creation and joining
# Bring the four raw AACT extracts from Step 1 into the workspace.
raw_files <- c(
  "aact_studies_raw.RData",
  "aact_sponsors_raw.RData",
  "aact_conditions_raw.RData",
  "aact_facilities_raw.RData"
)
for (raw_file in raw_files) {
  load(raw_file)
}
These are chronic respiratory diseases relevant to the research question.
# Inclusion list: literal, lowercase substrings searched for in each
# condition name. Order is kept as originally written.
respiratory_keywords <- c(
  "copd", "chronic obstructive pulmonary", "asthma",
  "bronchiectasis", "emphysema", "chronic bronchitis",
  "pulmonary hypertension", "pulmonary fibrosis", "ipf",
  "idiopathic pulmonary fibrosis", "interstitial lung", "sarcoidosis",
  "cystic fibrosis", "alpha-1 antitrypsin", "alpha 1 antitrypsin"
)
Exclude lung cancers (different trial dynamics) and acute infections (different completion patterns).
# Exclusion list: literal, lowercase substrings; a trial with any condition
# containing one of these is dropped. Order is kept as originally written.
exclude_keywords <- c(
  "lung cancer", "lung neoplasm", "non-small cell", "small cell lung",
  "mesothelioma", "covid", "coronavirus", "pneumonia",
  "influenza", "respiratory syncytial", "rsv", "tuberculosis"
)
A trial is included if it has at least one condition matching the respiratory inclusion list and no condition matching the exclusion list.
# Add a lowercase copy of the condition name so the keyword search is
# case-insensitive (downcase_name is presumably lowercase already; tolower()
# makes that certain).
conditions_raw <- conditions_raw %>%
  mutate(condition_lower = tolower(downcase_name))
# Check whether `text` contains at least one of `keywords`.
#
# Matching is a literal (fixed = TRUE), case-sensitive substring search, so
# callers should lowercase `text` first when the keywords are lowercase.
#
# text:     character vector to search (normally one condition name).
# keywords: character vector of literal substrings to look for.
# Returns a single logical: TRUE if any keyword occurs in any element of
# `text`; FALSE otherwise, including when `text` or `keywords` is empty.
contains_any <- function(text, keywords) {
  # vapply() always yields one logical per keyword. sapply() would return a
  # list when `text` has length zero, which any() cannot reduce.
  keyword_found <- vapply(
    keywords,
    function(keyword) any(grepl(keyword, text, fixed = TRUE)),
    logical(1)
  )
  any(keyword_found)
}
# Flag every condition row against the inclusion and exclusion lists.
#
# Each keyword is searched once across the whole column (literal substring
# match) and the per-keyword hits are OR-ed together. The TRUE/FALSE value per
# row is the same as testing each row separately for "contains any keyword",
# but the result is a plain unnamed logical vector and the ~1M rows are not
# looped over at R level.
flag_keyword_hits <- function(texts, keywords) {
  hits_per_keyword <- lapply(
    keywords,
    function(keyword) grepl(keyword, texts, fixed = TRUE)
  )
  # Start from all-FALSE so an empty keyword list flags nothing.
  Reduce(`|`, hits_per_keyword, rep(FALSE, length(texts)))
}
conditions_raw$is_respiratory <- flag_keyword_hits(
  conditions_raw$condition_lower,
  respiratory_keywords
)
conditions_raw$is_excluded <- flag_keyword_hits(
  conditions_raw$condition_lower,
  exclude_keywords
)
Tie condition to specific trial.
# Trials with at least one condition flagged as respiratory
respiratory_trials <- conditions_raw %>%
  filter(is_respiratory) %>%
  distinct(nct_id)
# Trials with at least one condition on the exclusion list
excluded_trials <- conditions_raw %>%
  filter(is_excluded) %>%
  distinct(nct_id)
# Eligible trials: respiratory, and not excluded by any of their conditions
respiratory_ids <- anti_join(respiratory_trials, excluded_trials, by = "nct_id")
# Restrict the studies table to the eligible trials
studies_filtered <- semi_join(studies_raw, respiratory_ids, by = "nct_id")
Merge the sponsor information into the filtered respiratory trials.
# Look at the sponsor table
# Three columns: trial id, sponsor class, and whether the row is a lead or a
# collaborator; only "lead" values are visible in this preview.
str(sponsors_raw)
## 'data.frame': 559371 obs. of 3 variables:
## $ nct_id : chr "NCT00373516" "NCT01870622" "NCT03989297" "NCT04055727" ...
## $ agency_class : chr "INDUSTRY" "OTHER" "OTHER" "OTHER" ...
## $ lead_or_collaborator: chr "lead" "lead" "lead" "lead" ...
# What are the agency_class values?
# useNA = "ifany" adds an <NA> column so missing classes are counted as well.
table(sponsors_raw$agency_class, useNA = "ifany")
##
## AMBIG FED INDIV INDUSTRY NETWORK NIH OTHER OTHER_GOV
## 3 4812 564 125646 4812 11460 396584 14474
## UNKNOWN <NA>
## 72 944
# Merge the lead sponsor's class onto the filtered studies.
#
# sponsors_raw carries a lead_or_collaborator column. If it holds collaborator
# rows as well as lead rows, merging it unfiltered would repeat a trial once
# per sponsor row and attach collaborator classes to it. Keep lead rows only
# and drop exact duplicates so each trial contributes a single row.
lead_sponsors <- unique(
  sponsors_raw[
    sponsors_raw$lead_or_collaborator %in% "lead",
    c("nct_id", "agency_class")
  ]
)
# A trial that still appears twice has conflicting lead classes; surface it
# rather than letting the merge silently inflate the sample.
if (anyDuplicated(lead_sponsors$nct_id) > 0) {
  warning(
    "Some trials have more than one lead sponsor class; ",
    "the merge will repeat those trials.",
    call. = FALSE
  )
}
# Left join: trials without a lead sponsor record keep agency_class = NA
studies_with_sponsor <- merge(
  x = studies_filtered,
  y = lead_sponsors,
  by = "nct_id",
  all.x = TRUE
)
# Number of facility records (sites) per trial
site_counts <- facilities_raw %>%
  group_by(nct_id) %>%
  tally(name = "num_sites")
# Left join onto the studies; trials with no facility record get num_sites = NA
studies_with_sites <- merge(
  studies_with_sponsor,
  site_counts,
  by = "nct_id",
  all.x = TRUE
)
Collapse agency_class into Commercial vs Non-Commercial.
# Look at current agency_class values
# In the recorded run five classes remain after the merge, none missing,
# and the counts add up to 377 rows.
table(studies_with_sites$agency_class, useNA = "ifany")
##
## FED INDUSTRY NIH OTHER OTHER_GOV
## 2 212 2 154 7
# Binary sponsor type built in one step from the INDUSTRY test:
#   TRUE  (agency_class is INDUSTRY) -> "Commercial"
#   FALSE (any other class)          -> "Non-Commercial"
# "Commercial" comes first so it is the reference level; a missing
# agency_class stays NA.
studies_with_sites$sponsor <- factor(
  studies_with_sites$agency_class == "INDUSTRY",
  levels = c(TRUE, FALSE),
  labels = c("Commercial", "Non-Commercial")
)
# Verify transformation
# Recorded run: 212 Commercial + 165 Non-Commercial = 377 rows, no NA.
table(studies_with_sites$sponsor, useNA = "ifany")
##
## Commercial Non-Commercial
## 212 165
# Cross-tabulation to verify mapping
# Every INDUSTRY row should sit under Commercial and every other class under
# Non-Commercial, with zeros elsewhere.
table(studies_with_sites$agency_class, studies_with_sites$sponsor, useNA = "ifany")
##
## Commercial Non-Commercial
## FED 0 2
## INDUSTRY 212 0
## NIH 0 2
## OTHER 0 154
## OTHER_GOV 0 7
Completion time is the number of days from trial start to completion.
# Coerce both date columns to Date, then take the start-to-completion gap
# in whole days as a plain number.
studies_with_sites <- studies_with_sites %>%
  mutate(
    start_date = as.Date(start_date),
    completion_date = as.Date(completion_date),
    completion_time = as.numeric(
      difftime(completion_date, start_date, units = "days")
    )
  )
This measures how many months after 1 March 2020 (the start of the month in which the pandemic was declared) each trial started; negative values mean the trial started before that date.
# Reference date used as the start of the pandemic
covid_start <- as.Date("2020-03-01")
# Signed distance of each trial's start from the reference date, converted
# from days to months using 30.44 days per month (the average month length)
studies_with_sites$months_from_covid <- as.numeric(
  difftime(studies_with_sites$start_date, covid_start, units = "days")
) / 30.44
# What statuses do we have?
# Tabulated before any filtering; the step that follows keeps only COMPLETED.
table(studies_with_sites$overall_status, useNA = "ifany")
##
## ACTIVE_NOT_RECRUITING COMPLETED ENROLLING_BY_INVITATION
## 34 128 6
## NOT_YET_RECRUITING RECRUITING SUSPENDED
## 8 98 2
## TERMINATED UNKNOWN WITHDRAWN
## 41 42 18
# Build the analysis sample in three exclusion steps, logging each one:
#   1. keep completed trials,
#   2. drop missing or non-positive completion times,
#   3. drop rows with any missing covariate.
# Record starting count
n_before <- nrow(studies_with_sites)
# Keep only completed trials
mydata <- studies_with_sites %>%
  filter(overall_status == "COMPLETED")
cat("\nTrials before:", n_before, "\n")
##
## Trials before: 377
cat("Removed:", n_before - nrow(mydata), "\n")
## Removed: 249
n_before <- nrow(mydata)
# Remove invalid values: completion time must be known and strictly positive
mydata <- mydata %>%
  filter(!is.na(completion_time) & completion_time > 0)
# Report this step as well so that every exclusion is visible in the log;
# n_before is refreshed only after the count has been printed.
cat("Removed (invalid completion time):", n_before - nrow(mydata), "\n")
n_before <- nrow(mydata)
# Remove rows with missing covariates
mydata <- mydata %>%
  filter(!is.na(num_sites) &
    !is.na(enrollment) &
    !is.na(number_of_arms) &
    !is.na(sponsor))
cat("\nTrials before:", n_before, "\n")
##
## Trials before: 128
cat("Trials after removing missing:", nrow(mydata), "\n")
## Trials after removing missing: 127
cat("Removed:", n_before - nrow(mydata), "\n")
## Removed: 1
# Coerce the three count covariates to numeric
for (covariate in c("num_sites", "enrollment", "number_of_arms")) {
  mydata[[covariate]] <- as.numeric(mydata[[covariate]])
}
# Standardise each covariate with scale() and store the result as a plain
# vector in a new column (new column name = source column)
z_sources <- c(
  num_sites_z = "num_sites",
  enrollment_z = "enrollment",
  arms_z = "number_of_arms",
  covid_timing_z = "months_from_covid"
)
for (z_name in names(z_sources)) {
  mydata[[z_name]] <- scale(mydata[[z_sources[[z_name]]]])[, 1]
}
# Persist the analysis dataset twice: as an R workspace object and as a CSV
# without row names, then report the final sample size.
save(list = "mydata", file = "trials_analysis.RData")
write.csv(mydata, file = "trials_analysis.csv", row.names = FALSE)
cat("Final analysis dataset saved!\n")
## Final analysis dataset saved!
cat("Final sample size:", nrow(mydata), "completed trials\n")
## Final sample size: 127 completed trials