This document explains how to perform analytics on clinical trials sponsors. The document explains steps for performing data ingestion and data preparation using R programming language.
For more detailed explaination and interpretations of results, and also to read other articles on Clinical Trials visit my blog here at https://businessintelligencedw.blogspot.com/.
#import required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(stringr)
library(ggplot2)
library(broom)
library(caret)
## Loading required package: lattice
#set paths for data files
in_path_studies<-"C:/Users/ranamanohar/Documents/MSR_Docs/BigData/Data/ACCT/Data/unzipSrcFiles/studies.txt"
in_path_sponsors<-"C:/Users/ranamanohar/Documents/MSR_Docs/BigData/Data/ACCT/Data/unzipSrcFiles/sponsors.txt"
#reads the data files into dataframes
studies<-read.csv(in_path_studies, header=TRUE, sep = "|",na.strings = "NA", nrows = -100)
sponsors<-read.csv(in_path_sponsors, header=TRUE, sep = "|",na.strings = "NA", nrows = -100)
Once you download the files and import the data into dataframes,we would create Subsets of that data so that we can fetch selective data:
#create subset df with selected columns
studies_1<-subset.data.frame(studies, select =c("nct_id",
"study_first_submitted_date",
"start_date",
"start_date_type",
"completion_date",
"completion_date_type",
"results_first_posted_date",
"results_first_posted_date_type",
"overall_status"
)
)
#head(studies_1,10)
#view the structure of df
#str(studies_1)
#create new columns
studies_1<-mutate(studies_1,
"submission_year"=year(study_first_submitted_date),
"submission_month"=month(study_first_submitted_date, label = TRUE),
"submission_yrmonth"=str_sub(study_first_submitted_date,1,7),
"flag_study_submitted"=if_else(str_length(study_first_submitted_date)>0,1,0),
"flag_completed_status"=if_else (overall_status=="Completed",1,0),
"flag_recruiting_status"=if_else (overall_status=="Recruiting",1,0),
"start_year"=str_sub(start_date,1,4),
"start_month"=str_sub(start_date,6,7),
"start_yrmonth"=str_sub(start_date,1,7),
"flag_act_started"=if_else(start_date_type=="Actual",1,0),
"completed_year"=str_sub(completion_date,1,4),
"completed_month"=str_sub(completion_date,6,7),
"completed_yrmonth"=str_sub(completion_date,1,7),
"flag_act_completed"=if_else(completion_date_type=="Actual",1,0),
"flag_act_started"=if_else(start_date_type=="Actual",1,0),
"results_submit_year"=str_sub(results_first_posted_date,1,4),
"results_submit_month"=str_sub(results_first_posted_date,6,7),
"results_submit_yrmonth"=str_sub(results_first_posted_date,1,7),
"flag_results_submitted"=if_else(str_length(results_first_posted_date)>0,1,0)
)
#create new df with required columns and filter condition
lead_sponsors<-subset.data.frame(sponsors, subset=lead_or_collaborator=="lead", select = c("nct_id",
"agency_class",
"name",
"id")
)
#create new columns lead sponsor
lead_sponsors<-mutate(lead_sponsors,
"flag_sponsor_industry"=if_else(agency_class=="Industry",1,0),
"sponsor_type"=if_else(
grepl("university",casefold(name)) |
grepl("univerzi",casefold(name)) |
grepl("institut",casefold(name)) |
grepl("school",casefold(name)) |
grepl("campus",casefold(name)) |
grepl("college",casefold(name)) |
grepl("education",casefold(name)) |
grepl("academ",casefold(name)) |
grepl("univers",casefold(name)), "Academic",
if_else(grepl("hospital",casefold(name))|
grepl("clinic",casefold(name))|
grepl("medical center",casefold(name))|
grepl("health center",casefold(name))|
grepl("center",casefold(name))|
grepl("centre",casefold(name))|
grepl("hôpital",casefold(name))|
grepl("hopital",casefold(name)), "Hospital"
,"NA")),
"flag_sponsor_type_academic"=if_else(sponsor_type=="Academic",1,0),
"flag_sponsor_type_hospital"=if_else(sponsor_type=="Hospital",1,0)
)
#str(lead_sponsors)
#change datatype of categorical variables to factor
lead_sponsors$flag_sponsor_type_academic<-as.factor(lead_sponsors$flag_sponsor_type_academic)
lead_sponsors$flag_sponsor_type_hospital<-as.factor(lead_sponsors$flag_sponsor_type_hospital)
lead_sponsors$flag_sponsor_industry<-as.factor(lead_sponsors$flag_sponsor_industry)
#head(lead_sponsors)
#left join lead_sponsors to studies - has 1-1 relationship
studies_f<-left_join(studies_1,lead_sponsors,by="nct_id")
#create parameters
var_current_year<-year(today())
var_last_year<-(var_current_year-1)
var_last_year_2<-(var_current_year-2)
var_last_year_3<-(var_current_year-3)
var_last_year_4<-(var_current_year-4)
var_last_year_5<-(var_current_year-5)
#create sponsor stats table
sponsor_f<-studies_f %>%
group_by(name,agency_class, sponsor_type, flag_sponsor_industry, flag_sponsor_type_academic, flag_sponsor_type_hospital) %>%
summarise(cnt_studies=n(),##counts number of records
cnt_study_submitted=sum(flag_study_submitted),
cnt_started_actual=sum(flag_act_started),
cnt_recruiting_status=sum(flag_recruiting_status),
cnt_results_submitted=sum(flag_results_submitted),
cnt_completed_status=sum(flag_completed_status),
ratio_results_to_completed=100*(sum(flag_results_submitted)/sum(flag_completed_status)),
cnt_study_submitted_lstyr=sum(if_else(submission_year==var_last_year[1],flag_study_submitted,0)),
cnt_started_actual_lstyr=sum(if_else(start_year==var_last_year[1],flag_act_started,0)),
cnt_completed_status_lstyr=sum(if_else(completed_year==var_last_year[1],flag_completed_status,0)),
cnt_results_submitted_lstyr=sum(if_else(results_submit_year==var_last_year[1],flag_results_submitted,0)),
ratio_results_to_completed_lstyr=100*(sum(cnt_results_submitted_lstyr)/sum(cnt_completed_status_lstyr)),
cnt_study_submitted_lstyr2=sum(if_else(submission_year==var_last_year_2[1],flag_study_submitted,0)),
cnt_started_actual_lstyr2=sum(if_else(start_year==var_last_year_2[1],flag_act_started,0)),
cnt_completed_status_lstyr2=sum(if_else(completed_year==var_last_year_2[1],flag_completed_status,0)),
cnt_results_submitted_lstyr2=sum(if_else(results_submit_year==var_last_year_2[1],flag_results_submitted,0)),
ratio_results_to_completed_lstyr2=100*(sum(cnt_results_submitted_lstyr2)/sum(cnt_completed_status_lstyr2)),
cnt_study_submitted_lstyr3=sum(if_else(submission_year==var_last_year_3[1],flag_study_submitted,0)),
cnt_started_actual_lstyr3=sum(if_else(start_year==var_last_year_3[1],flag_act_started,0)),
cnt_completed_status_lstyr3=sum(if_else(completed_year==var_last_year_3[1],flag_completed_status,0)),
cnt_results_submitted_lstyr3=sum(if_else(results_submit_year==var_last_year_3[1],flag_results_submitted,0)),
ratio_results_to_completed_lstyr3=100*(sum(cnt_results_submitted_lstyr3)/sum(cnt_completed_status_lstyr3)),
cnt_study_submitted_lstyr4=sum(if_else(submission_year==var_last_year_4[1],flag_study_submitted,0)),
cnt_started_actual_lstyr4=sum(if_else(start_year==var_last_year_4[1],flag_act_started,0)),
cnt_completed_status_lstyr4=sum(if_else(completed_year==var_last_year_4[1],flag_completed_status,0)),
cnt_results_submitted_lstyr4=sum(if_else(results_submit_year==var_last_year_4[1],flag_results_submitted,0)),
ratio_results_to_completed_lstyr4=100*(sum(cnt_results_submitted_lstyr4)/sum(cnt_completed_status_lstyr4)),
cnt_study_submitted_lstyr5=sum(if_else(submission_year==var_last_year_5[1],flag_study_submitted,0)),
cnt_started_actual_lstyr5=sum(if_else(start_year==var_last_year_5[1],flag_act_started,0)),
cnt_completed_status_lstyr5=sum(if_else(completed_year==var_last_year_5[1],flag_completed_status,0)),
cnt_results_submitted_lstyr5=sum(if_else(results_submit_year==var_last_year_5[1],flag_results_submitted,0)),
ratio_results_to_completed_lstyr5=100*(sum(cnt_results_submitted_lstyr5)/sum(cnt_completed_status_lstyr5)),
ratio_results_to_completed_lstyr_change=sum(ratio_results_to_completed_lstyr)-sum(ratio_results_to_completed_lstyr2),
ratio_results_to_completed_lstyr2_change=sum(ratio_results_to_completed_lstyr2)-sum(ratio_results_to_completed_lstyr3),
ratio_results_to_completed_lstyr3_change=sum(ratio_results_to_completed_lstyr3)-sum(ratio_results_to_completed_lstyr4),
ratio_results_to_completed_lstyr4_change=sum(ratio_results_to_completed_lstyr4)-sum(ratio_results_to_completed_lstyr5),
cnt_completed_status_lstyr_change=sum(cnt_completed_status_lstyr)-sum(cnt_completed_status_lstyr2),
cnt_completed_status_lstyr2_change=sum(cnt_completed_status_lstyr2)-sum(cnt_completed_status_lstyr3),
cnt_completed_status_lstyr3_change=sum(cnt_completed_status_lstyr3)-sum(cnt_completed_status_lstyr4),
cnt_completed_status_lstyr4_change=sum(cnt_completed_status_lstyr4)-sum(cnt_completed_status_lstyr5),
cnt_completed_status_lstyr_chgpc=100*(sum(cnt_completed_status_lstyr_change)/sum(cnt_completed_status_lstyr2)),
cnt_completed_status_lstyr2_chgpc=100*(sum(cnt_completed_status_lstyr2_change)/sum(cnt_completed_status_lstyr3)),
cnt_completed_status_lstyr3_chgpc=100*(sum(cnt_completed_status_lstyr3_change)/sum(cnt_completed_status_lstyr4)),
cnt_completed_status_lstyr4_chgpc=100*(sum(cnt_completed_status_lstyr4_change)/sum(cnt_completed_status_lstyr5)),
cnt_results_submitted_lstyr_change=sum(cnt_results_submitted_lstyr)-sum(cnt_results_submitted_lstyr2),
cnt_results_submitted_lstyr2_change=sum(cnt_results_submitted_lstyr2)-sum(cnt_results_submitted_lstyr3),
cnt_results_submitted_lstyr3_change=sum(cnt_results_submitted_lstyr3)-sum(cnt_results_submitted_lstyr4),
cnt_results_submitted_lstyr4_change=sum(cnt_results_submitted_lstyr4)-sum(cnt_results_submitted_lstyr5),
cnt_results_submitted_lstyr_chgpc=100*(sum(cnt_results_submitted_lstyr_change)/sum(cnt_results_submitted_lstyr2)),
cnt_results_submitted_lstyr2_chgpc=100*(sum(cnt_results_submitted_lstyr2_change)/sum(cnt_results_submitted_lstyr3)),
cnt_results_submitted_lstyr3_chgpc=100*(sum(cnt_results_submitted_lstyr3_change)/sum(cnt_results_submitted_lstyr4)),
cnt_results_submitted_lstyr4_chgpc=100*(sum(cnt_results_submitted_lstyr4_change)/sum(cnt_results_submitted_lstyr5))
)
#str(sponsor_f)
#change datatype of categorical var to factor
sponsor_f$sponsor_type<-as.factor(sponsor_f$sponsor_type)
#View summary stats
#summary(sponsor_f$cnt_results_submitted_lstyr)
#create yearly fact
year_f_completed<-studies_f %>%
filter(completion_date_type=="Actual" & overall_status=="Completed") %>%
group_by(completed_year) %>%
summarise("cnt_completed"=n())
#rename the year column - use any method
names(year_f_completed)[1]<-"common_year"
colnames(year_f_completed)[colnames(year_f_completed)=="completed_year"]<-"common_year"
year_f_results<-studies_f %>%
filter(overall_status=="Completed") %>%
group_by(results_submit_year) %>%
summarise("cnt_results"=n())
names(year_f_results)[1]<-"common_year"
#combine to create final yearly fact
year_f_1<-full_join(year_f_completed,year_f_results,by="common_year")
year_f_1$common_year<- as.factor(year_f_1$common_year)
year_f<-year_f_1 %>%
filter(common_year>=2005 & common_year<var_current_year)
#change datatype of column to factor
year_f$common_year<- as.factor(year_f$common_year)
End of Document