kaggle_synthetic_clinical_trial

##Dataset downloaded from kaggle: https://www.kaggle.com/datasets/isabelladil/phase-iii-clinical-trial-dataset ## This synthetic dataset simulates a Phase III randomized controlled clinical trial evaluating CardioX (Drug A) versus an active comparator (Drug B) and a placebo for treating hypertension. It is designed for anomaly detection, and risk-based monitoring (RBM) applications.

##The dataset includes 1,000 patients across 50 trial sites, with realistic patient demographics, blood pressure readings, cholesterol levels, dropout rates, and adverse event reporting. Several anomalies have been embedded to simulate real-world data quality issues commonly encountered in clinical trials.

##This dataset is ideal for data quality assessments, statistical anomaly detection (Z-scores, IQR, clustering), and risk-based management (RBM) in clinical research.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
#Read data
#If the file is in your working directory, just use the file name.
#Otherwise, give the full path
clinical_data <- read.csv("synthetic_clinical_trial_data.csv", stringsAsFactors = FALSE)

head(clinical_data)

##   Subject_ID Site_ID Age Gender Enrollment_Date Treatment_Group Adverse_Events
## 1          1      49  54   Male          1/1/22          Drug A              0
## 2          2      37  44   Male          1/2/22         Placebo              1
## 3          3       1  58   Male          1/3/22          Drug A              0
## 4          4      25  48   Male          1/4/22          Drug B              0
## 5          5      10  57 Female          1/5/22          Drug A              2
## 6          6      36  59   Male          1/6/22         Placebo              1
##   Dropout Systolic_BP Diastolic_BP Cholesterol_Level
## 1       0         117           74               229
## 2       0         111           57               173
## 3       1         122           89               220
## 4       0         122           85               175
## 5       0         105           90               185
## 6       0         128           85               206

##Detecting anomalies in dropout rate ##“Are some sites showing unusually high or low dropout beyond what we’d expect by chance?”

site_dropout <- clinical_data |> 
  group_by(Site_ID) |> 
  summarise(
    Total_subjects = n(),
    Dropout_Count = sum(Dropout, na.rm = TRUE),
    Dropout_Rate = Dropout_Count/ Total_subjects, 
    .groups = "drop"
  )

#.groups = "drop" works like ungroup
#Dropout_Rate = (Dropout_Count / Total_subjects) * 100 will give percentage

##Now to calculate binomial confidence intervals with Exact Clopper Pearson intervals

library(purrr)
site_dropout <-  site_dropout |> 
  mutate(
    CI = map2(Dropout_Count, Total_subjects,
              ~binom.test(.x,.y)$conf.int),
    Lower_CI = map_dbl(CI,1),
    Upper_CI = map_dbl(CI,2)
  ) |> 
  select(-CI)

##To flag unusual sites ##First calculate overall drop out rate of study

overall_rate <- sum(site_dropout$Dropout_Count) / 
                sum(site_dropout$Total_subjects)
print(overall_rate)

## [1] 0.161

##To flag sites whose CI does not overlap overall rate

site_dropout <- site_dropout |> 
  mutate(
    Flag = case_when(
      Lower_CI > overall_rate ~"Higher than expected",
      Upper_CI < overall_rate ~ "Lower than expected", 
      TRUE ~ "Within expected range"
    )
  )

##Forest plot may be made; mixed effects model can be done. Not tried here.

##Now to evaluate the outliers in blood pressure.

clinical_data |> 
  ggplot(aes(x = Site_ID, y = Systolic_BP))+
  geom_boxplot(fill = "red", alpha = 0.5, outlier.color = "black")+
  geom_jitter(width = 0.1, alpha = 0.3)+
  labs(
    title = "Systolic Blood Pressure Distribution",
    x = "Site ID",
    y = "Systolic BP (mmHg)"
  )+
  theme_minimal()

## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

##Without geom_jitter

clinical_data |> 
  ggplot(aes(x = Site_ID, y = Systolic_BP))+
  geom_boxplot(fill = "red", alpha = 0.5, outlier.color = "black")+
  labs(
    title = "Systolic Blood Pressure Distribution",
    x = "Site ID",
    y = "Systolic BP (mmHg)"
  )+
  theme_minimal()

## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

##For diastolic BP

clinical_data |> 
  ggplot(aes(x = Site_ID, y = Diastolic_BP))+
  geom_boxplot(fill = "blue", alpha = 0.5, outlier.color = "black")+
  labs(
    title = "Diastolic Blood Pressure Distribution",
    x = "Site ID",
    y = "Diastolic BP (mmHg)"
  )+
  theme_minimal()

## Warning: Orientation is not uniquely specified when both the x and y aesthetics are
## continuous. Picking default orientation 'x'.

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

##Distribution of AE across sites

clinical_data |> 
  ggplot(aes(x = Adverse_Events))+
  geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7)+
  labs(title = "Distribution of adverse events",
       x = "Number of adverse events",
       y = "Number of subjects")

#The below graph is better if it is discrete data with small counts.

clinical_data %>%
  ggplot(aes(x = factor(Adverse_Events))) +
  geom_bar(fill = "blue", alpha = 0.7) +
  labs(
    title = "Frequency of Adverse Events",
    x = "Number of Adverse Events",
    y = "Count"
  ) +
  theme_minimal()

##Stratifying AE by treatment group

clinical_data %>%
  ggplot(aes(x = Adverse_Events, fill = Treatment_Group)) +
  geom_histogram(binwidth = 1, alpha = 0.6, position = "dodge") +
  labs(title = "Adverse Events by Treatment Group") +
  theme_minimal()

##Stratification shown without the position argument.

clinical_data %>%
  ggplot(aes(x = Adverse_Events, fill = Treatment_Group)) +
  geom_histogram(binwidth = 1, alpha = 0.6) +
  labs(title = "Adverse Events by Treatment Group") +
  theme_minimal()

kaggle_synthetic_clinical_trial_data

2026-05-30