##CYP-GUIDES (Cytochrome Psychotropic Genotyping Under Investigation for Decision Support) is a randomized controlled trial (RCT) comparing 2 outcomes in hospitalized patients with severe depressive disorders treated according to the patient’s CYP2D6 genotype and functional status versus standard psychotropic therapy. The primary outcome was hospital Length of Stay (LOS) and the secondary outcome was the Re-Admission Rate (RAR) 30 days after discharge. The RCT recruited 1500 patients, genotyped CYP2D6 in 1459, and randomized 477 to standard therapy (Group S), for whom treatment-as-usual guidance was delivered without consideration of patient CYP2D6 genotype, and 982 to genetically-guided therapy (Group G) where CYP2D6-based treatment recommendations were provided via EMR to physicians. For inpatients in Group G whose CYP2D6 function was sub- or supra-normal, medications primarily metabolized by the CYP2D6 enzyme were proscribed.
## Load the trial dataset; text columns are kept as character vectors.
cypdata <- read.csv("Dataset.csv", stringsAsFactors = FALSE)
## Number of rows and columns. dim() returns NULL for objects without a dim
## attribute (e.g. plain vectors). The previous call inspected `df`, which is
## never assigned in this script and therefore resolves to the stats::df()
## function, which is why it printed NULL. The dataset object is `cypdata`.
dim(cypdata)
## Expected: 1500 rows and 43 columns, going by the summary() output recorded
## further down -- re-run to confirm.
## Confirm the class of the loaded object.
class(cypdata)
## [1] "data.frame"
##Normality testing of age; just to learn.
library(ggplot2)
## Q-Q plot of AGE (sample quantiles against theoretical quantiles) with a
## reference line, as a visual normality check.
ggplot(data = cypdata, mapping = aes(sample = AGE)) +
  stat_qq() +
  stat_qq_line()
## To roughly summarise the data: per-column summaries (quartiles and mean for
## numeric columns; length, class and mode for character columns).
summary(cypdata)
## ID GENDER AGE RACE.ETHNICITY
## Min. : 1.0 Length:1500 Min. :18.00 Length:1500
## 1st Qu.: 375.8 Class :character 1st Qu.:24.00 Class :character
## Median : 750.5 Mode :character Median :37.00 Mode :character
## Mean : 750.5 Mean :38.78
## 3rd Qu.:1125.2 3rd Qu.:51.00
## Max. :1500.0 Max. :87.00
## Diagnosis MD Assignment EMR
## Length:1500 Length:1500 Length:1500 Length:1500
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## LOS RAR A B
## Min. : 6.0 Min. :0.00000 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 90.0 1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.: 0.0000
## Median : 138.0 Median :0.00000 Median : 0.0000 Median : 0.0000
## Mean : 176.5 Mean :0.09467 Mean : 0.1093 Mean : 0.8773
## 3rd Qu.: 191.2 3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.0000
## Max. :2776.0 Max. :1.00000 Max. :20.0000 Max. :49.0000
## C D E F
## Min. : 0.000 Min. : 0.0000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.000 Median : 0.0000 Median : 0.0000 Median : 0.000
## Mean : 0.092 Mean : 0.8247 Mean : 0.3513 Mean : 1.091
## 3rd Qu.: 0.000 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.000
## Max. :26.000 Max. :72.0000 Max. :33.0000 Max. :49.000
## G H I J
## Min. : 0.00000 Min. : 0.0000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.00000 Median : 0.0000 Median : 0.000 Median : 0.0000
## Mean : 0.02867 Mean : 0.6427 Mean : 0.102 Mean : 0.6373
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. :17.00000 Max. :104.0000 Max. :30.000 Max. :31.0000
## K L M N
## Min. : 0.0000 Min. : 0.0000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0000
## Median : 0.0000 Median : 0.0000 Median : 0.000 Median : 0.0000
## Mean : 0.9633 Mean : 0.6813 Mean : 0.088 Mean : 0.1513
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.0000
## Max. :59.0000 Max. :24.0000 Max. :25.000 Max. :63.0000
## O P Q R
## Min. : 0.00000 Min. : 0.0000 Min. : 0.00000 Min. : 0.000
## 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.000
## Median : 0.00000 Median : 0.0000 Median : 0.00000 Median : 0.000
## Mean : 0.02867 Mean : 0.7927 Mean : 0.05267 Mean : 0.874
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.00000 3rd Qu.: 0.000
## Max. :20.00000 Max. :38.0000 Max. :63.00000 Max. :135.000
## S T U V
## Min. : 0.00000 Min. : 0.0000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.00000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.00000 Median : 0.0000 Median : 0.000 Median : 0.000
## Mean : 0.07867 Mean : 0.8727 Mean : 0.028 Mean : 0.774
## 3rd Qu.: 0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :23.00000 Max. :47.0000 Max. :14.000 Max. :115.000
## W X Y Z
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000 Median : 0.000 Median : 0.000
## Mean : 0.032 Mean : 0.158 Mean : 1.996 Mean : 1.173
## 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0.000 3rd Qu.: 0.000
## Max. :14.000 Max. :59.000 Max. :134.000 Max. :73.000
## AA AB AC AD
## Min. : 0.000 Min. : 0.000 Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.0000 1st Qu.: 0.000
## Median : 0.000 Median : 1.000 Median : 0.0000 Median : 0.000
## Mean : 0.954 Mean : 3.111 Mean : 0.6087 Mean : 0.148
## 3rd Qu.: 0.000 3rd Qu.: 4.000 3rd Qu.: 0.0000 3rd Qu.: 0.000
## Max. :50.000 Max. :111.000 Max. :31.0000 Max. :41.000
## X..Psychotropic.Medications X..Administrations Therapeutic.Guidances
## Min. :0.00 Min. : 0.00 Length:1500
## 1st Qu.:2.00 1st Qu.: 6.00 Class :character
## Median :3.00 Median : 12.00 Mode :character
## Mean :2.66 Mean : 18.32
## 3rd Qu.:4.00 3rd Qu.: 22.00
## Max. :9.00 Max. :448.00
##Gender needs to be a factor; diagnosis needs to be extracted into three columns and coded.
##Clean column names
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
## Standardise the column names (e.g. `RACE.ETHNICITY` becomes
## `race_ethnicity`, as seen in the column listings further down).
cypdata <- janitor::clean_names(cypdata)
## Dealing with missing data
## R can drop every row that has a missing value in any column, but this is
## aggressive; you might lose data. What you usually need is to delete rows
## with missing values in specific columns. Or you can do imputation.
## Let us assume you want to delete rows where data is missing in age.
## NOTE: cypdata_clean is assigned again later in this script (rows with a
## missing assignment are dropped from cypdata), so this step is illustrative.
cypdata_clean <- cypdata[!is.na(cypdata$age), ]
## Suppose you want to do imputation.
## First identify the columns with missing data: count the NA entries in
## each column.
vapply(cypdata, function(column) sum(is.na(column)), numeric(1))
## id gender
## 0 0
## age race_ethnicity
## 0 0
## diagnosis md
## 0 0
## assignment emr
## 0 0
## los rar
## 0 0
## a b
## 0 0
## c d
## 0 0
## e f
## 0 0
## g h
## 0 0
## i j
## 0 0
## k l
## 0 0
## m n
## 0 0
## o p
## 0 0
## q r
## 0 0
## s t
## 0 0
## u v
## 0 0
## w x
## 0 0
## y z
## 0 0
## aa ab
## 0 0
## ac ad
## 0 0
## x_psychotropic_medications x_administrations
## 0 0
## therapeutic_guidances
## 0
## Every count above is zero, yet the data evidently has gaps. The missing
## entries may have been coded as strings, or not imported correctly from the
## CSV, so recode the usual placeholder strings as real NA values.
cypdata <- as.data.frame(lapply(cypdata, function(column) {
  replace(column, column %in% c("", "NA", "N/A", "NULL", "."), NA)
}))
## Now run the per-column missing-value count again.
colSums(is.na(cypdata))
## id gender
## 0 0
## age race_ethnicity
## 0 0
## diagnosis md
## 0 0
## assignment emr
## 41 0
## los rar
## 0 0
## a b
## 0 0
## c d
## 0 0
## e f
## 0 0
## g h
## 0 0
## i j
## 0 0
## k l
## 0 0
## m n
## 0 0
## o p
## 0 0
## q r
## 0 0
## s t
## 0 0
## u v
## 0 0
## w x
## 0 0
## y z
## 0 0
## aa ab
## 0 0
## ac ad
## 0 0
## x_psychotropic_medications x_administrations
## 0 0
## therapeutic_guidances
## 41
##In this dataset, no variable with missing values can sensibly be imputed with a mean. But suppose it was age; this is what you would do:
## Replace NA in age with the mean of the observed ages
##cypdata$age[is.na(cypdata$age)] <- mean(cypdata$age, na.rm = TRUE)
## assignment is one of the columns with missing entries (41 rows in the
## count above). Keep only the rows where assignment is present.
cypdata_clean <- cypdata[complete.cases(cypdata$assignment), ]
## Convert the data to proper data types.
## First check that the cleaned dataset is still a data frame.
class(cypdata_clean)
## [1] "data.frame"
## Convert the categorical columns to factors in a single step.
## NOTE(review): the recorded table() output for diagnosis shows two labels
## starting with a tab character ("\t..."), which become separate factor
## levels; consider trimming whitespace before this conversion.
cypdata_clean[c("gender", "diagnosis", "therapeutic_guidances")] <- lapply(
  cypdata_clean[c("gender", "diagnosis", "therapeutic_guidances")],
  as.factor
)
## Basic descriptive analysis
## Numerical variables: five-number summary plus mean.
summary(cypdata_clean[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 24.00 37.00 38.72 51.00 87.00
summary(cypdata_clean[["los"]]) # Length of stay
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.0 90.0 138.0 176.6 191.0 2776.0
## Categorical variables: frequency counts per level.
table(cypdata_clean[["gender"]])
##
## F M
## 744 715
table(cypdata_clean[["diagnosis"]])
##
## \tDepression, unspecified type
## 5
## \tMDD, recurrent episode with anxious distress
## 16
## Adjustment Disorder With Depressed Mood
## 1
## Bipolar II Disorder
## 1
## Depression
## 5
## Depression with suicidal ideation
## 2
## Depressive Disorder NOS
## 256
## Dissociative Disorder NOS
## 1
## Major depression, melancholic type
## 1
## Major depressive disorder, recurrent episode with anxious distress
## 1
## Major depressive disorder, recurrent episode, severe, with psychosis
## 1
## Major depressive disorder, recurrent, with postpartum onset
## 1
## MDD
## 114
## MDD, recurrent episodes
## 2
## MDD, Recurrent, Chronic
## 13
## MDD, Recurrent, Mild
## 2
## MDD, Recurrent, Moderate
## 16
## MDD, recurrent, severe
## 10
## MDD, recurrent, severe with atypical features
## 1
## MDD, Recurrent, Severe With Psychotic Features
## 81
## MDD, Recurrent, Severe Without Psychotic Features
## 316
## MDD, Recurrent, Unspecified
## 234
## MDD, single episode with psychotic features, mood-conguent
## 1
## MDD, Single Episode, Mild
## 2
## MDD, Single Episode, Moderate
## 5
## MDD, Single Episode, Severe With Psychotic Features
## 111
## MDD, Single Episode, Unspecified
## 25
## MDD, Single Episode,Severe Without Psychotic Features
## 228
## Mood Disorder NOS
## 1
## Schizoaffective Disorder
## 1
## Severe episode of recurrent major depressive disorder, without psychotic features
## 4
## Unspecified Depressive Disorder
## 1
## Visualization
## Histogram of patient age.
hist(cypdata_clean[["age"]], xlab = "Age", main = "Age Distribution")
## Box plot of length of stay, one box per gender.
boxplot(
  los ~ gender,
  data = cypdata_clean,
  xlab = "Gender",
  ylab = "Length of Stay",
  main = "LOS by Gender"
)
##Installing packages for tests
##install.packages(c("tidyverse", "GGally"))
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally) ##for correlation
## Correlation of numeric variables.
## Select the numeric columns but leave out `id`: it appears to be a running
## row identifier (1 to 1500 in the summary output), so its correlations with
## the other variables carry no meaning and only add an extra row and column
## of panels to the plot matrix. any_of() keeps the selection from failing if
## the column is absent.
numeric_cypdata_clean <- cypdata_clean %>%
  select(where(is.numeric), -any_of("id"))
## Pairs-plot matrix of the selected numeric columns.
ggpairs(numeric_cypdata_clean)
## Scatter plot of length of stay against the number of psychotropic
## medications, with a fitted linear trend line and its standard-error band.
ggplot(
  data = cypdata_clean,
  mapping = aes(x = x_psychotropic_medications, y = los)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(
    title = "LOS vs Psychotropic Medications",
    x = "Number of Medications",
    y = "Length of Stay"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Pearson correlation coefficient for the same pair of variables.
with(
  cypdata_clean,
  cor(x_psychotropic_medications, los, method = "pearson")
)
## [1] 0.3623815
##Paused the analysis at this point.