Kaggle_CYPdataset

##CYP-GUIDES (Cytochrome Psychotropic Genotyping Under Investigation for Decision Support) is a randomized controlled trial (RCT) comparing 2 outcomes in hospitalized patients with severe depressive disorders treated according to the patient’s CYP2D6 genotype and functional status versus standard psychotropic therapy. The primary outcome was hospital Length of Stay (LOS) and the secondary outcome was the Re-Admission Rate (RAR) 30 days after discharge. The RCT recruited 1500 patients, genotyped CYP2D6 in 1459, and randomized 477 to standard therapy (Group S), for whom treatment-as-usual guidance was delivered without consideration of patient CYP2D6 genotype, and 982 to genetically-guided therapy (Group G) where CYP2D6-based treatment recommendations were provided via EMR to physicians. For inpatients in Group G whose CYP2D6 function was sub- or supra-normal, medications primarily metabolized by the CYP2D6 enzyme were proscribed.

cypdata <- read.csv("Dataset.csv", stringsAsFactors = FALSE)
dim(cypdata) ##gives number of rows and columns. If NULL, the object is not a data frame or matrix (it might be a vector). Note: dim(df) returns NULL here because no object called df was created, so df resolves to the F-distribution density function stats::df.

## [1] 1500   43

##To confirm the object type

class(cypdata)

## [1] "data.frame"

##Normality testing of age; just to learn.

library(ggplot2)

# Q-Q plot of AGE against the default theoretical distribution (normal),
# with a reference line; points close to the line suggest approximate normality.
ggplot(data = cypdata, mapping = aes(sample = AGE)) +
  geom_qq() +
  geom_qq_line()

##To roughly summarise the data

# Column-by-column overview: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns.
summary(cypdata)

##        ID            GENDER               AGE        RACE.ETHNICITY    
##  Min.   :   1.0   Length:1500        Min.   :18.00   Length:1500       
##  1st Qu.: 375.8   Class :character   1st Qu.:24.00   Class :character  
##  Median : 750.5   Mode  :character   Median :37.00   Mode  :character  
##  Mean   : 750.5                      Mean   :38.78                     
##  3rd Qu.:1125.2                      3rd Qu.:51.00                     
##  Max.   :1500.0                      Max.   :87.00                     
##   Diagnosis              MD             Assignment            EMR           
##  Length:1500        Length:1500        Length:1500        Length:1500       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       LOS              RAR                A                 B          
##  Min.   :   6.0   Min.   :0.00000   Min.   : 0.0000   Min.   : 0.0000  
##  1st Qu.:  90.0   1st Qu.:0.00000   1st Qu.: 0.0000   1st Qu.: 0.0000  
##  Median : 138.0   Median :0.00000   Median : 0.0000   Median : 0.0000  
##  Mean   : 176.5   Mean   :0.09467   Mean   : 0.1093   Mean   : 0.8773  
##  3rd Qu.: 191.2   3rd Qu.:0.00000   3rd Qu.: 0.0000   3rd Qu.: 0.0000  
##  Max.   :2776.0   Max.   :1.00000   Max.   :20.0000   Max.   :49.0000  
##        C                D                 E                 F         
##  Min.   : 0.000   Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median : 0.000   Median : 0.0000   Median : 0.0000   Median : 0.000  
##  Mean   : 0.092   Mean   : 0.8247   Mean   : 0.3513   Mean   : 1.091  
##  3rd Qu.: 0.000   3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.000  
##  Max.   :26.000   Max.   :72.0000   Max.   :33.0000   Max.   :49.000  
##        G                  H                  I                J          
##  Min.   : 0.00000   Min.   :  0.0000   Min.   : 0.000   Min.   : 0.0000  
##  1st Qu.: 0.00000   1st Qu.:  0.0000   1st Qu.: 0.000   1st Qu.: 0.0000  
##  Median : 0.00000   Median :  0.0000   Median : 0.000   Median : 0.0000  
##  Mean   : 0.02867   Mean   :  0.6427   Mean   : 0.102   Mean   : 0.6373  
##  3rd Qu.: 0.00000   3rd Qu.:  0.0000   3rd Qu.: 0.000   3rd Qu.: 0.0000  
##  Max.   :17.00000   Max.   :104.0000   Max.   :30.000   Max.   :31.0000  
##        K                 L                 M                N          
##  Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.000   Min.   : 0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.: 0.0000  
##  Median : 0.0000   Median : 0.0000   Median : 0.000   Median : 0.0000  
##  Mean   : 0.9633   Mean   : 0.6813   Mean   : 0.088   Mean   : 0.1513  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.: 0.000   3rd Qu.: 0.0000  
##  Max.   :59.0000   Max.   :24.0000   Max.   :25.000   Max.   :63.0000  
##        O                  P                 Q                  R          
##  Min.   : 0.00000   Min.   : 0.0000   Min.   : 0.00000   Min.   :  0.000  
##  1st Qu.: 0.00000   1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.:  0.000  
##  Median : 0.00000   Median : 0.0000   Median : 0.00000   Median :  0.000  
##  Mean   : 0.02867   Mean   : 0.7927   Mean   : 0.05267   Mean   :  0.874  
##  3rd Qu.: 0.00000   3rd Qu.: 0.0000   3rd Qu.: 0.00000   3rd Qu.:  0.000  
##  Max.   :20.00000   Max.   :38.0000   Max.   :63.00000   Max.   :135.000  
##        S                  T                 U                V          
##  Min.   : 0.00000   Min.   : 0.0000   Min.   : 0.000   Min.   :  0.000  
##  1st Qu.: 0.00000   1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.:  0.000  
##  Median : 0.00000   Median : 0.0000   Median : 0.000   Median :  0.000  
##  Mean   : 0.07867   Mean   : 0.8727   Mean   : 0.028   Mean   :  0.774  
##  3rd Qu.: 0.00000   3rd Qu.: 0.0000   3rd Qu.: 0.000   3rd Qu.:  0.000  
##  Max.   :23.00000   Max.   :47.0000   Max.   :14.000   Max.   :115.000  
##        W                X                Y                 Z         
##  Min.   : 0.000   Min.   : 0.000   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 0.000   Median :  0.000   Median : 0.000  
##  Mean   : 0.032   Mean   : 0.158   Mean   :  1.996   Mean   : 1.173  
##  3rd Qu.: 0.000   3rd Qu.: 0.000   3rd Qu.:  0.000   3rd Qu.: 0.000  
##  Max.   :14.000   Max.   :59.000   Max.   :134.000   Max.   :73.000  
##        AA               AB                AC                AD        
##  Min.   : 0.000   Min.   :  0.000   Min.   : 0.0000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.:  0.000   1st Qu.: 0.0000   1st Qu.: 0.000  
##  Median : 0.000   Median :  1.000   Median : 0.0000   Median : 0.000  
##  Mean   : 0.954   Mean   :  3.111   Mean   : 0.6087   Mean   : 0.148  
##  3rd Qu.: 0.000   3rd Qu.:  4.000   3rd Qu.: 0.0000   3rd Qu.: 0.000  
##  Max.   :50.000   Max.   :111.000   Max.   :31.0000   Max.   :41.000  
##  X..Psychotropic.Medications X..Administrations Therapeutic.Guidances
##  Min.   :0.00                Min.   :  0.00     Length:1500          
##  1st Qu.:2.00                1st Qu.:  6.00     Class :character     
##  Median :3.00                Median : 12.00     Mode  :character     
##  Mean   :2.66                Mean   : 18.32                          
##  3rd Qu.:4.00                3rd Qu.: 22.00                          
##  Max.   :9.00                Max.   :448.00

##Gender needs to be a factor; diagnosis needs to be extracted into three columns and coded.

##Clean column names

library(janitor)

## 
## Attaching package: 'janitor'

## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

# Standardise the column names (the result is lower snake_case, e.g. RACE.ETHNICITY -> race_ethnicity)
cypdata <- janitor::clean_names(cypdata)

##Dealing with missing data ##R can identify every row that has a missing value in any column and delete that entire row. This is aggressive; you might lose a lot of data. What you usually need is to delete only the rows with missing values in specific columns, or to impute the missing values instead.
##Let us assume you want to delete rows where age is missing

# Keep only the rows whose age is not missing
cypdata_clean <- subset(cypdata, !is.na(age))

##Suppose you want to do imputation. ##Identify columns with missing data

# Number of NA values in each column
vapply(cypdata, function(column) sum(is.na(column)), numeric(1))

##                         id                     gender 
##                          0                          0 
##                        age             race_ethnicity 
##                          0                          0 
##                  diagnosis                         md 
##                          0                          0 
##                 assignment                        emr 
##                          0                          0 
##                        los                        rar 
##                          0                          0 
##                          a                          b 
##                          0                          0 
##                          c                          d 
##                          0                          0 
##                          e                          f 
##                          0                          0 
##                          g                          h 
##                          0                          0 
##                          i                          j 
##                          0                          0 
##                          k                          l 
##                          0                          0 
##                          m                          n 
##                          0                          0 
##                          o                          p 
##                          0                          0 
##                          q                          r 
##                          0                          0 
##                          s                          t 
##                          0                          0 
##                          u                          v 
##                          0                          0 
##                          w                          x 
##                          0                          0 
##                          y                          z 
##                          0                          0 
##                         aa                         ab 
##                          0                          0 
##                         ac                         ad 
##                          0                          0 
## x_psychotropic_medications          x_administrations 
##                          0                          0 
##      therapeutic_guidances 
##                          0

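##This returned zero for every column. But evidently there is missing data. It may be coded as strings (for example empty cells or the text "NA"), or it may not have been imported correctly from the CSV. So we can convert such values to proper missing values (NA) that R can recognise.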

# Recode common text placeholders for "missing" as real NA values, column by column
cypdata <- as.data.frame(lapply(cypdata, function(column) {
  replace(column, column %in% c("", "NA", "N/A", "NULL", "."), NA)
}))

##Now to run the test again. 
# Number of NA values in each column, after recoding the placeholders
vapply(cypdata, function(column) sum(is.na(column)), numeric(1))

##                         id                     gender 
##                          0                          0 
##                        age             race_ethnicity 
##                          0                          0 
##                  diagnosis                         md 
##                          0                          0 
##                 assignment                        emr 
##                         41                          0 
##                        los                        rar 
##                          0                          0 
##                          a                          b 
##                          0                          0 
##                          c                          d 
##                          0                          0 
##                          e                          f 
##                          0                          0 
##                          g                          h 
##                          0                          0 
##                          i                          j 
##                          0                          0 
##                          k                          l 
##                          0                          0 
##                          m                          n 
##                          0                          0 
##                          o                          p 
##                          0                          0 
##                          q                          r 
##                          0                          0 
##                          s                          t 
##                          0                          0 
##                          u                          v 
##                          0                          0 
##                          w                          x 
##                          0                          0 
##                          y                          z 
##                          0                          0 
##                         aa                         ab 
##                          0                          0 
##                         ac                         ad 
##                          0                          0 
## x_psychotropic_medications          x_administrations 
##                          0                          0 
##      therapeutic_guidances 
##                         41

##In this dataset, none of the columns with missing values is a numeric variable that could sensibly be imputed with its mean. But suppose age had missing values; this is what you'd do:
## Replace NA in age with the mean age
##cypdata$age[is.na(cypdata$age)] <- mean(cypdata$age, na.rm = TRUE)

##Now one of the columns with missing data is assignment (41 missing values). To delete the rows with missing assignment:
# Keep only the rows that have a recorded assignment
cypdata_clean <- subset(cypdata, !is.na(assignment))

##Now to convert data to proper data types.

# Convert categorical variables
# But first find if dataset is a dataframe
class(cypdata_clean)

## [1] "data.frame"

# Convert the categorical text columns to factors in a single assignment
cypdata_clean[c("gender", "diagnosis", "therapeutic_guidances")] <- lapply(
  cypdata_clean[c("gender", "diagnosis", "therapeutic_guidances")],
  as.factor
)

##Basic descriptive analysis ##Numerical variables

summary(cypdata_clean[["age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   24.00   37.00   38.72   51.00   87.00

summary(cypdata_clean[["los"]])  # Length of stay

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     6.0    90.0   138.0   176.6   191.0  2776.0

##Categorical variables

table(cypdata_clean[["gender"]])

## 
##   F   M 
## 744 715

table(cypdata_clean[["diagnosis"]])

## 
##                                                    \tDepression, unspecified type 
##                                                                                 5 
##                                    \tMDD, recurrent episode with anxious distress 
##                                                                                16 
##                                           Adjustment Disorder With Depressed Mood 
##                                                                                 1 
##                                                               Bipolar II Disorder 
##                                                                                 1 
##                                                                        Depression 
##                                                                                 5 
##                                                 Depression with suicidal ideation 
##                                                                                 2 
##                                                           Depressive Disorder NOS 
##                                                                               256 
##                                                         Dissociative Disorder NOS 
##                                                                                 1 
##                                                Major depression, melancholic type 
##                                                                                 1 
##                Major depressive disorder, recurrent episode with anxious distress 
##                                                                                 1 
##              Major depressive disorder, recurrent episode, severe, with psychosis 
##                                                                                 1 
##                       Major depressive disorder, recurrent, with postpartum onset 
##                                                                                 1 
##                                                                               MDD 
##                                                                               114 
##                                                           MDD, recurrent episodes 
##                                                                                 2 
##                                                           MDD, Recurrent, Chronic 
##                                                                                13 
##                                                              MDD, Recurrent, Mild 
##                                                                                 2 
##                                                          MDD, Recurrent, Moderate 
##                                                                                16 
##                                                            MDD, recurrent, severe 
##                                                                                10 
##                                     MDD, recurrent, severe with atypical features 
##                                                                                 1 
##                                    MDD, Recurrent, Severe With Psychotic Features 
##                                                                                81 
##                                 MDD, Recurrent, Severe Without Psychotic Features 
##                                                                               316 
##                                                       MDD, Recurrent, Unspecified 
##                                                                               234 
##                        MDD, single episode with psychotic features, mood-conguent 
##                                                                                 1 
##                                                         MDD, Single Episode, Mild 
##                                                                                 2 
##                                                     MDD, Single Episode, Moderate 
##                                                                                 5 
##                               MDD, Single Episode, Severe With Psychotic Features 
##                                                                               111 
##                                                  MDD, Single Episode, Unspecified 
##                                                                                25 
##                             MDD, Single Episode,Severe Without Psychotic Features 
##                                                                               228 
##                                                                 Mood Disorder NOS 
##                                                                                 1 
##                                                          Schizoaffective Disorder 
##                                                                                 1 
## Severe episode of recurrent major depressive disorder, without psychotic features 
##                                                                                 4 
##                                                   Unspecified Depressive Disorder 
##                                                                                 1

##Visualization

##Age
# Histogram of patient age
hist(
  x = cypdata_clean[["age"]],
  xlab = "Age",
  main = "Age Distribution"
)

##LOS by gender
# One box per gender level, showing the distribution of length of stay
boxplot(
  los ~ gender,
  data = cypdata_clean,
  ylab = "Length of Stay",
  xlab = "Gender",
  main = "LOS by Gender"
)

##Installing packages for tests

##install.packages(c("tidyverse", "GGally"))
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(GGally) ##for correlation

##Correlation of numeric variables.

# Keep only the numeric columns
numeric_cypdata_clean <- dplyr::select(cypdata_clean, where(is.numeric))

# Pairwise plot matrix (scatterplots, densities, correlations) of the numeric columns
# NOTE(review): this still includes `id` and every medication count column (a ... ad),
# so the plot matrix is very large - consider selecting a handful of columns first.
ggpairs(data = numeric_cypdata_clean)

# Scatter plot of length of stay against the number of psychotropic medications,
# with a fitted linear regression line and its confidence band.
ggplot(
  data = cypdata_clean,
  mapping = aes(x = x_psychotropic_medications, y = los)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  ggtitle("LOS vs Psychotropic Medications") +
  xlab("Number of Medications") +
  ylab("Length of Stay") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Pearson correlation between number of psychotropic medications and length of stay
with(cypdata_clean, cor(x_psychotropic_medications, los, method = "pearson"))

## [1] 0.3623815

##Paused the analysis at this point.

Kaggle_CYPdataset

2026-06-03