Loading packages and reading in the data
# Installing and loading the packages I will use
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(corrplot)
## corrplot 0.95 loaded
# Reading in the data sets
personalized <- read.csv("personalized_medication_dataset.csv")
drug <- read.csv("Drug_clean.csv")
# Checking the structure
head(personalized)
## Patient_ID Age Gender Weight_kg Height_cm BMI Chronic_Conditions
## 1 P0001 78 Other 88.7 196.3 21.1 None
## 2 P0002 57 Female 90.5 195.6 30.2 Hypertension
## 3 P0003 29 Female 87.0 168.2 27.0 None
## 4 P0004 56 Female 81.4 188.9 26.9 Hypertension
## 5 P0005 90 Male 64.2 157.0 33.3 None
## 6 P0006 75 Female 75.2 152.9 19.3 None
## Drug_Allergies Genetic_Disorders Diagnosis Symptoms
## 1 Penicillin Cystic Fibrosis Inflammation Fever
## 2 None Cystic Fibrosis Depression Fatigue, Headache, Dizziness
## 3 Sulfa None Inflammation Joint Pain, Headache, Nausea
## 4 Penicillin Cystic Fibrosis Infection Joint Pain
## 5 Sulfa Sickle Cell Anemia Inflammation Fatigue, Fever, Headache
## 6 Penicillin None Arthritis Cough
## Recommended_Medication Dosage Duration Treatment_Effectiveness
## 1 Amlodipine None 30 days Effective
## 2 Amoxicillin 5 mg None Neutral
## 3 None None 7 days Effective
## 4 Ibuprofen 200 mg 7 days Very Effective
## 5 Amlodipine 500 mg 10 days Ineffective
## 6 Amoxicillin 400 mg None Very Effective
## Adverse_Reactions Recovery_Time_Days
## 1 Yes 18
## 2 No 24
## 3 Yes 12
## 4 No 22
## 5 Yes 25
## 6 Yes 16
head(drug)
## Condition Drug EaseOfUse Effective
## 1 Acute Bacterial Sinusitis Amoxicillin 3.852353 3.655882
## 2 Acute Bacterial Sinusitis Amoxicillin-Pot Clavulanate 3.470000 3.290000
## 3 Acute Bacterial Sinusitis Amoxicillin-Pot Clavulanate 3.121429 2.962857
## 4 Acute Bacterial Sinusitis Ampicillin 2.000000 3.000000
## 5 Acute Bacterial Sinusitis Ampicillin 3.250000 3.000000
## 6 Acute Bacterial Sinusitis Ampicillin Sodium 3.000000 3.000000
## Form Indication Price Reviews Satisfaction Type
## 1 Capsule On Label 12.59000 86.29412 3.197647 RX
## 2 Liquid (Drink) Off Label 287.37000 43.00000 2.590000 RX
## 3 Tablet On Label 70.60857 267.28571 2.248571 RX
## 4 Capsule On Label 12.59000 1.00000 1.000000 RX
## 5 Tablet On Label 125.24000 15.00000 3.000000 RX
## 6 Tablet Off Label 143.21500 1.00000 3.000000 RX
Data cleaning
# Making column names lowercase so everything matches
names(personalized) <- tolower(names(personalized))
names(drug) <- tolower(names(drug))
# Converting ID columns to character values
personalized$patient_id <- as.character(personalized$patient_id)
# Converting the drug column to a character format
drug$drug <- as.character(drug$drug)
# Combining the personalized data with drug information
merged <- left_join(
personalized,
drug,
by = c("recommended_medication" = "drug")
)
## Warning in left_join(personalized, drug, by = c(recommended_medication = "drug")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 527 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Looking at the merged data set
dim(merged)
## [1] 2444 26
head(merged)
## patient_id age gender weight_kg height_cm bmi chronic_conditions
## 1 P0001 78 Other 88.7 196.3 21.1 None
## 2 P0002 57 Female 90.5 195.6 30.2 Hypertension
## 3 P0002 57 Female 90.5 195.6 30.2 Hypertension
## 4 P0003 29 Female 87.0 168.2 27.0 None
## 5 P0004 56 Female 81.4 188.9 26.9 Hypertension
## 6 P0004 56 Female 81.4 188.9 26.9 Hypertension
## drug_allergies genetic_disorders diagnosis symptoms
## 1 Penicillin Cystic Fibrosis Inflammation Fever
## 2 None Cystic Fibrosis Depression Fatigue, Headache, Dizziness
## 3 None Cystic Fibrosis Depression Fatigue, Headache, Dizziness
## 4 Sulfa None Inflammation Joint Pain, Headache, Nausea
## 5 Penicillin Cystic Fibrosis Infection Joint Pain
## 6 Penicillin Cystic Fibrosis Infection Joint Pain
## recommended_medication dosage duration treatment_effectiveness
## 1 Amlodipine None 30 days Effective
## 2 Amoxicillin 5 mg None Neutral
## 3 Amoxicillin 5 mg None Neutral
## 4 None None 7 days Effective
## 5 Ibuprofen 200 mg 7 days Very Effective
## 6 Ibuprofen 200 mg 7 days Very Effective
## adverse_reactions recovery_time_days
## 1 Yes 18
## 2 No 24
## 3 No 24
## 4 Yes 12
## 5 No 22
## 6 No 22
## condition easeofuse effective form
## 1 hypertension 3.920000 3.270000 Tablet
## 2 Acute Bacterial Sinusitis 3.852353 3.655882 Capsule
## 3 Pharyngitis due to Streptococcus Pyogenes 4.070000 3.561765 Capsule
## 4 <NA> NA NA <NA>
## 5 fever 4.200000 3.954400 Tablet
## 6 fever 4.375000 4.125000 Tablet
## indication price reviews satisfaction type
## 1 On Label 57.99 730.00000 2.605000 RX
## 2 On Label 12.59 86.29412 3.197647 RX
## 3 On Label 12.59 87.41176 3.327059 RX
## 4 <NA> NA NA NA <NA>
## 5 On Label 13.49 5.72000 3.833600 OTC
## 6 On Label 13.49 1.50000 3.500000 RX
# Naming the merged data into a data frame
df <- merged
# Scaling certain numeric variables to prepare them for modeling
df$satisfaction <- scale(df$satisfaction)
df$reviews <- scale(df$reviews)
df$easeofuse <- scale(df$easeofuse)
df$price <- scale(df$price)
# Checking the structure of the treatment effectiveness column
str(df$treatment_effectiveness)
## chr [1:2444] "Effective" "Neutral" "Neutral" "Effective" "Very Effective" ...
table(df$treatment_effectiveness, useNA = "ifany")
##
## Effective Ineffective Neutral Very Effective
## 541 672 586 645
# Converting treatment_effectiveness into an ordered numeric scale to prepare for modeling
df$treatment_effectiveness_num <- case_when(
df$treatment_effectiveness == "Ineffective" ~ 1,
df$treatment_effectiveness == "Neutral" ~ 2,
df$treatment_effectiveness == "Effective" ~ 3,
df$treatment_effectiveness == "Very Effective" ~ 4,
TRUE ~ NA_real_
)
# Converting treatment effectiveness into categories
df$treatment_effectiveness_cat <- cut(
df$treatment_effectiveness_num,
breaks = c(0, 2, 3, 4),
labels = c("low", "medium", "high"),
include.lowest = TRUE
)
# Converting recovery time into groups for modeling
df$recovery_time_cat <- cut(
df$recovery_time_days,
breaks = quantile(df$recovery_time_days, c(0, .33, .66, 1), na.rm = TRUE),
labels = c("short", "medium", "long"),
include.lowest = TRUE
)
# Converting adverse reactions to a yes or no for modeling
df$adverse_flag <- ifelse(df$adverse_reactions == "None", 0, 1)
# Converting the effective column to a numeric flag
df$effective_flag <- ifelse(df$effective == "Yes", 1,
ifelse(df$effective == "No", 0, NA))
# Removing the leaking continuous variables
df$treatment_effectiveness <- NULL
df$recovery_time_days <- NULL
df$adverse_reactions <- NULL
df$effective <- NULL
# Loooking at the final structure of the data frame
str(df)
## 'data.frame': 2444 obs. of 27 variables:
## $ patient_id : chr "P0001" "P0002" "P0002" "P0003" ...
## $ age : int 78 57 57 29 56 56 56 56 56 56 ...
## $ gender : chr "Other" "Female" "Female" "Female" ...
## $ weight_kg : num 88.7 90.5 90.5 87 81.4 81.4 81.4 81.4 81.4 81.4 ...
## $ height_cm : num 196 196 196 168 189 ...
## $ bmi : num 21.1 30.2 30.2 27 26.9 26.9 26.9 26.9 26.9 26.9 ...
## $ chronic_conditions : chr "None" "Hypertension" "Hypertension" "None" ...
## $ drug_allergies : chr "Penicillin" "None" "None" "Sulfa" ...
## $ genetic_disorders : chr "Cystic Fibrosis" "Cystic Fibrosis" "Cystic Fibrosis" "None" ...
## $ diagnosis : chr "Inflammation" "Depression" "Depression" "Inflammation" ...
## $ symptoms : chr "Fever" "Fatigue, Headache, Dizziness" "Fatigue, Headache, Dizziness" "Joint Pain, Headache, Nausea" ...
## $ recommended_medication : chr "Amlodipine" "Amoxicillin" "Amoxicillin" "None" ...
## $ dosage : chr "None" "5 mg" "5 mg" "None" ...
## $ duration : chr "30 days" "None" "None" "7 days" ...
## $ condition : chr "hypertension" "Acute Bacterial Sinusitis" "Pharyngitis due to Streptococcus Pyogenes" NA ...
## $ easeofuse : num [1:2444, 1] -1.209 -1.581 -0.383 NA 0.333 ...
## ..- attr(*, "scaled:center")= num 4.14
## ..- attr(*, "scaled:scale")= num 0.182
## $ form : chr "Tablet" "Capsule" "Capsule" NA ...
## $ indication : chr "On Label" "On Label" "On Label" NA ...
## $ price : num [1:2444, 1] 2.809 -0.402 -0.402 NA -0.339 ...
## ..- attr(*, "scaled:center")= num 18.3
## ..- attr(*, "scaled:scale")= num 14.1
## $ reviews : num [1:2444, 1] 2.711 -0.23 -0.225 NA -0.598 ...
## ..- attr(*, "scaled:center")= num 137
## ..- attr(*, "scaled:scale")= num 219
## $ satisfaction : num [1:2444, 1] -2.198 -0.569 -0.213 NA 1.179 ...
## ..- attr(*, "scaled:center")= num 3.4
## ..- attr(*, "scaled:scale")= num 0.364
## $ type : chr "RX" "RX" "RX" NA ...
## $ treatment_effectiveness_num: num 3 2 2 3 4 4 4 4 4 4 ...
## $ treatment_effectiveness_cat: Factor w/ 3 levels "low","medium",..: 2 1 1 2 3 3 3 3 3 3 ...
## $ recovery_time_cat : Factor w/ 3 levels "short","medium",..: 2 3 3 1 3 3 3 3 3 3 ...
## $ adverse_flag : num 1 1 1 1 1 1 1 1 1 1 ...
## $ effective_flag : logi NA NA NA NA NA NA ...
Exploratory data analysis
# Age distribution plot
ggplot(df, aes(age)) +
geom_histogram(fill="blue", bins=30) +
ggtitle("Age Distribution")

# BMI distribution plot
ggplot(df, aes(bmi)) +
geom_histogram(fill="green", bins=30) +
ggtitle("BMI Distribution")

# Gender distribution plot
ggplot(df, aes(gender)) +
geom_bar(fill="orange") +
ggtitle("Gender Distribution")

# Creating a correlation matrix of numeric variables
num_vars <- df %>% select(where(is.numeric))
corrplot(cor(num_vars, use="complete.obs"), method="color")
## Warning in cor(num_vars, use = "complete.obs"): the standard deviation is zero

# Looking at the final structure of df
str(df)
## 'data.frame': 2444 obs. of 27 variables:
## $ patient_id : chr "P0001" "P0002" "P0002" "P0003" ...
## $ age : int 78 57 57 29 56 56 56 56 56 56 ...
## $ gender : chr "Other" "Female" "Female" "Female" ...
## $ weight_kg : num 88.7 90.5 90.5 87 81.4 81.4 81.4 81.4 81.4 81.4 ...
## $ height_cm : num 196 196 196 168 189 ...
## $ bmi : num 21.1 30.2 30.2 27 26.9 26.9 26.9 26.9 26.9 26.9 ...
## $ chronic_conditions : chr "None" "Hypertension" "Hypertension" "None" ...
## $ drug_allergies : chr "Penicillin" "None" "None" "Sulfa" ...
## $ genetic_disorders : chr "Cystic Fibrosis" "Cystic Fibrosis" "Cystic Fibrosis" "None" ...
## $ diagnosis : chr "Inflammation" "Depression" "Depression" "Inflammation" ...
## $ symptoms : chr "Fever" "Fatigue, Headache, Dizziness" "Fatigue, Headache, Dizziness" "Joint Pain, Headache, Nausea" ...
## $ recommended_medication : chr "Amlodipine" "Amoxicillin" "Amoxicillin" "None" ...
## $ dosage : chr "None" "5 mg" "5 mg" "None" ...
## $ duration : chr "30 days" "None" "None" "7 days" ...
## $ condition : chr "hypertension" "Acute Bacterial Sinusitis" "Pharyngitis due to Streptococcus Pyogenes" NA ...
## $ easeofuse : num [1:2444, 1] -1.209 -1.581 -0.383 NA 0.333 ...
## ..- attr(*, "scaled:center")= num 4.14
## ..- attr(*, "scaled:scale")= num 0.182
## $ form : chr "Tablet" "Capsule" "Capsule" NA ...
## $ indication : chr "On Label" "On Label" "On Label" NA ...
## $ price : num [1:2444, 1] 2.809 -0.402 -0.402 NA -0.339 ...
## ..- attr(*, "scaled:center")= num 18.3
## ..- attr(*, "scaled:scale")= num 14.1
## $ reviews : num [1:2444, 1] 2.711 -0.23 -0.225 NA -0.598 ...
## ..- attr(*, "scaled:center")= num 137
## ..- attr(*, "scaled:scale")= num 219
## $ satisfaction : num [1:2444, 1] -2.198 -0.569 -0.213 NA 1.179 ...
## ..- attr(*, "scaled:center")= num 3.4
## ..- attr(*, "scaled:scale")= num 0.364
## $ type : chr "RX" "RX" "RX" NA ...
## $ treatment_effectiveness_num: num 3 2 2 3 4 4 4 4 4 4 ...
## $ treatment_effectiveness_cat: Factor w/ 3 levels "low","medium",..: 2 1 1 2 3 3 3 3 3 3 ...
## $ recovery_time_cat : Factor w/ 3 levels "short","medium",..: 2 3 3 1 3 3 3 3 3 3 ...
## $ adverse_flag : num 1 1 1 1 1 1 1 1 1 1 ...
## $ effective_flag : logi NA NA NA NA NA NA ...
Setting up the data for modeling
# Creating a training and test split
colnames(df)
## [1] "patient_id" "age"
## [3] "gender" "weight_kg"
## [5] "height_cm" "bmi"
## [7] "chronic_conditions" "drug_allergies"
## [9] "genetic_disorders" "diagnosis"
## [11] "symptoms" "recommended_medication"
## [13] "dosage" "duration"
## [15] "condition" "easeofuse"
## [17] "form" "indication"
## [19] "price" "reviews"
## [21] "satisfaction" "type"
## [23] "treatment_effectiveness_num" "treatment_effectiveness_cat"
## [25] "recovery_time_cat" "adverse_flag"
## [27] "effective_flag"
df$effect_bin <- df$treatment_effectiveness_cat
df$effect_bin <- as.factor(df$effect_bin)
df$effect_bin <- ifelse(df$treatment_effectiveness_cat == "high", 1, 0)
df$effect_bin <- factor(df$effect_bin, levels = c(0,1))
set.seed(8634)
split <- createDataPartition(df$effect_bin, p = 0.8, list = FALSE)
train <- df[split, ]
test <- df[-split, ]
Logistic regression model
# Logistic regression model
log_model <- glm(effect_bin ~ age + bmi + easeofuse + satisfaction,
data = train,
family = "binomial")
summary(log_model)
##
## Call:
## glm(formula = effect_bin ~ age + bmi + easeofuse + satisfaction,
## family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.308441 0.349785 -6.600 4.12e-11 ***
## age 0.003132 0.002575 1.216 0.223875
## bmi 0.040821 0.011845 3.446 0.000569 ***
## easeofuse 0.060279 0.079099 0.762 0.446019
## satisfaction 0.023386 0.078066 0.300 0.764504
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1992.2 on 1744 degrees of freedom
## Residual deviance: 1977.0 on 1740 degrees of freedom
## (211 observations deleted due to missingness)
## AIC: 1987
##
## Number of Fisher Scoring iterations: 4
# Creating predictions using the logistic regression model
log_prob <- predict(log_model, newdata = test, type = "response")
log_pred <- ifelse(log_prob > 0.5, "1", "0")
log_pred <- as.factor(log_pred)
# Looking at the output of the model
levels(test$effect_bin)
## [1] "0" "1"
levels(log_pred)
## [1] "0"
table(test$effect_bin)
##
## 0 1
## 359 129
table(log_pred)
## log_pred
## 0
## 436
# Evaluating the accuracy of the model
confusionMatrix(log_pred, test$effect_bin)
## Warning in confusionMatrix.default(log_pred, test$effect_bin): Levels are not
## in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 324 112
## 1 0 0
##
## Accuracy : 0.7431
## 95% CI : (0.6994, 0.7835)
## No Information Rate : 0.7431
## P-Value [Acc > NIR] : 0.5254
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7431
## Neg Pred Value : NaN
## Prevalence : 0.7431
## Detection Rate : 0.7431
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
# Plotting the predicted probabilities
hist(log_prob, main="Logistic Regression Probabilities", xlab="Probability")

Random forest model
# Dropping NA values
train_clean <- train %>% drop_na(age, bmi, easeofuse, satisfaction, effect_bin)
test_clean <- test %>% drop_na(age, bmi, easeofuse, satisfaction, effect_bin)
# Making the effect_bin a factor
train_clean$effect_bin <- factor(train_clean$effect_bin, levels=c(0,1))
test_clean$effect_bin <- factor(test_clean$effect_bin, levels=c(0,1))
# Random forest model
rf_model <- randomForest(effect_bin ~ age + bmi + easeofuse + satisfaction,
data=train_clean,
ntree=200)
rf_pred <- predict(rf_model, newdata=test)
# Evaluating the accuracy of the model
confusionMatrix(rf_pred, test$effect_bin)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 320 41
## 1 4 71
##
## Accuracy : 0.8968
## 95% CI : (0.8643, 0.9237)
## No Information Rate : 0.7431
## P-Value [Acc > NIR] : 7.449e-16
##
## Kappa : 0.6969
##
## Mcnemar's Test P-Value : 8.025e-08
##
## Sensitivity : 0.9877
## Specificity : 0.6339
## Pos Pred Value : 0.8864
## Neg Pred Value : 0.9467
## Prevalence : 0.7431
## Detection Rate : 0.7339
## Detection Prevalence : 0.8280
## Balanced Accuracy : 0.8108
##
## 'Positive' Class : 0
##
# Making a variable importance plot
varImpPlot(rf_model)
