##Dataset downloaded from kaggle: https://www.kaggle.com/datasets/isabelladil/phase-iii-clinical-trial-dataset ## This synthetic dataset simulates a Phase III randomized controlled clinical trial evaluating CardioX (Drug A) versus an active comparator (Drug B) and a placebo for treating hypertension. It is designed for anomaly detection, and risk-based monitoring (RBM) applications.
##The dataset includes 1,000 patients across 50 trial sites, with realistic patient demographics, blood pressure readings, cholesterol levels, dropout rates, and adverse event reporting. Several anomalies have been embedded to simulate real-world data quality issues commonly encountered in clinical trials.
##This dataset is ideal for data quality assessments, statistical anomaly detection (Z-scores, IQR, clustering), and risk-based management (RBM) in clinical research.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#Read data
#If the file is in your working directory, just use the file name.
#Otherwise, give the full path
clinical_data <- read.csv("synthetic_clinical_trial_data.csv", stringsAsFactors = FALSE)
head(clinical_data)
## Subject_ID Site_ID Age Gender Enrollment_Date Treatment_Group Adverse_Events
## 1 1 49 54 Male 1/1/22 Drug A 0
## 2 2 37 44 Male 1/2/22 Placebo 1
## 3 3 1 58 Male 1/3/22 Drug A 0
## 4 4 25 48 Male 1/4/22 Drug B 0
## 5 5 10 57 Female 1/5/22 Drug A 2
## 6 6 36 59 Male 1/6/22 Placebo 1
## Dropout Systolic_BP Diastolic_BP Cholesterol_Level
## 1 0 117 74 229
## 2 0 111 57 173
## 3 1 122 89 220
## 4 0 122 85 175
## 5 0 105 90 185
## 6 0 128 85 206
##Detecting anomalies in dropout rate ##“Are some sites showing unusually high or low dropout beyond what we’d expect by chance?”
site_dropout <- clinical_data |>
group_by(Site_ID) |>
summarise(
Total_subjects = n(),
Dropout_Count = sum(Dropout, na.rm = TRUE),
Dropout_Rate = Dropout_Count/ Total_subjects,
.groups = "drop"
)
#.groups = "drop" works like ungroup
#Dropout_Rate = (Dropout_Count / Total_subjects) * 100 will give percentage
##Now to calculate binomial confidence intervals with Exact Clopper Pearson intervals
library(purrr)
site_dropout <- site_dropout |>
mutate(
CI = map2(Dropout_Count, Total_subjects,
~binom.test(.x,.y)$conf.int),
Lower_CI = map_dbl(CI,1),
Upper_CI = map_dbl(CI,2)
) |>
select(-CI)
##To flag unusual sites ##First calculate overall drop out rate of study
overall_rate <- sum(site_dropout$Dropout_Count) /
sum(site_dropout$Total_subjects)
print(overall_rate)
## [1] 0.161
##To flag sites whose CI does not overlap overall rate
site_dropout <- site_dropout |>
mutate(
Flag = case_when(
Lower_CI > overall_rate ~"Higher than expected",
Upper_CI < overall_rate ~ "Lower than expected",
TRUE ~ "Within expected range"
)
)
##Forest plot may be made; mixed effects model can be done. Not tried here.
##Now to evaluate the outliers in blood pressure.
clinical_data |>
ggplot(aes(x = Site_ID, y = Systolic_BP))+
geom_boxplot(fill = "red", alpha = 0.5, outlier.color = "black")+
geom_jitter(width = 0.1, alpha = 0.3)+
labs(
title = "Systolic Blood Pressure Distribution",
x = "Site ID",
y = "Systolic BP (mmHg)"
)+
theme_minimal()
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
##Without geom_jitter
clinical_data |>
ggplot(aes(x = Site_ID, y = Systolic_BP))+
geom_boxplot(fill = "red", alpha = 0.5, outlier.color = "black")+
labs(
title = "Systolic Blood Pressure Distribution",
x = "Site ID",
y = "Systolic BP (mmHg)"
)+
theme_minimal()
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
##For diastolic BP
clinical_data |>
ggplot(aes(x = Site_ID, y = Diastolic_BP))+
geom_boxplot(fill = "blue", alpha = 0.5, outlier.color = "black")+
labs(
title = "Diastolic Blood Pressure Distribution",
x = "Site ID",
y = "Diastolic BP (mmHg)"
)+
theme_minimal()
## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
##Distribution of AE across sites
clinical_data |>
ggplot(aes(x = Adverse_Events))+
geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7)+
labs(title = "Distribution of adverse events",
x = "Number of adverse events",
y = "Number of subjects")
#The below graph is better if it is discrete data with small counts.
clinical_data %>%
ggplot(aes(x = factor(Adverse_Events))) +
geom_bar(fill = "blue", alpha = 0.7) +
labs(
title = "Frequency of Adverse Events",
x = "Number of Adverse Events",
y = "Count"
) +
theme_minimal()
##Stratifying AE by treatment group
clinical_data %>%
ggplot(aes(x = Adverse_Events, fill = Treatment_Group)) +
geom_histogram(binwidth = 1, alpha = 0.6, position = "dodge") +
labs(title = "Adverse Events by Treatment Group") +
theme_minimal()
##Stratification shown without the position argument.
clinical_data %>%
ggplot(aes(x = Adverse_Events, fill = Treatment_Group)) +
geom_histogram(binwidth = 1, alpha = 0.6) +
labs(title = "Adverse Events by Treatment Group") +
theme_minimal()