Loading packages and reading in the data

# Installing and loading the packages I will use
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(randomForest)

## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

library(corrplot)

## corrplot 0.95 loaded

# Reading in the data sets
personalized <- read.csv("personalized_medication_dataset.csv")
drug <- read.csv("Drug_clean.csv")

# Checking the structure 
head(personalized)

##   Patient_ID Age Gender Weight_kg Height_cm  BMI Chronic_Conditions
## 1      P0001  78  Other      88.7     196.3 21.1               None
## 2      P0002  57 Female      90.5     195.6 30.2       Hypertension
## 3      P0003  29 Female      87.0     168.2 27.0               None
## 4      P0004  56 Female      81.4     188.9 26.9       Hypertension
## 5      P0005  90   Male      64.2     157.0 33.3               None
## 6      P0006  75 Female      75.2     152.9 19.3               None
##   Drug_Allergies  Genetic_Disorders    Diagnosis                     Symptoms
## 1     Penicillin    Cystic Fibrosis Inflammation                        Fever
## 2           None    Cystic Fibrosis   Depression Fatigue, Headache, Dizziness
## 3          Sulfa               None Inflammation Joint Pain, Headache, Nausea
## 4     Penicillin    Cystic Fibrosis    Infection                   Joint Pain
## 5          Sulfa Sickle Cell Anemia Inflammation     Fatigue, Fever, Headache
## 6     Penicillin               None    Arthritis                        Cough
##   Recommended_Medication Dosage Duration Treatment_Effectiveness
## 1             Amlodipine   None  30 days               Effective
## 2            Amoxicillin   5 mg     None                 Neutral
## 3                   None   None   7 days               Effective
## 4              Ibuprofen 200 mg   7 days          Very Effective
## 5             Amlodipine 500 mg  10 days             Ineffective
## 6            Amoxicillin 400 mg     None          Very Effective
##   Adverse_Reactions Recovery_Time_Days
## 1               Yes                 18
## 2                No                 24
## 3               Yes                 12
## 4                No                 22
## 5               Yes                 25
## 6               Yes                 16

head(drug)

##                   Condition                        Drug EaseOfUse Effective
## 1 Acute Bacterial Sinusitis                 Amoxicillin  3.852353  3.655882
## 2 Acute Bacterial Sinusitis Amoxicillin-Pot Clavulanate  3.470000  3.290000
## 3 Acute Bacterial Sinusitis Amoxicillin-Pot Clavulanate  3.121429  2.962857
## 4 Acute Bacterial Sinusitis                  Ampicillin  2.000000  3.000000
## 5 Acute Bacterial Sinusitis                  Ampicillin  3.250000  3.000000
## 6 Acute Bacterial Sinusitis           Ampicillin Sodium  3.000000  3.000000
##             Form Indication     Price   Reviews Satisfaction Type
## 1        Capsule   On Label  12.59000  86.29412     3.197647   RX
## 2 Liquid (Drink)  Off Label 287.37000  43.00000     2.590000   RX
## 3         Tablet   On Label  70.60857 267.28571     2.248571   RX
## 4        Capsule   On Label  12.59000   1.00000     1.000000   RX
## 5         Tablet   On Label 125.24000  15.00000     3.000000   RX
## 6         Tablet  Off Label 143.21500   1.00000     3.000000   RX

Data cleaning

# Making column names lowercase so everything matches
names(personalized) <- tolower(names(personalized))
names(drug) <- tolower(names(drug))

# Converting ID columns to character values
personalized$patient_id <- as.character(personalized$patient_id)

# Converting the drug column to a character format
drug$drug <- as.character(drug$drug)

# Combining the personalized data with drug information
merged <- left_join(
  personalized,
  drug,
  by = c("recommended_medication" = "drug")
)

## Warning in left_join(personalized, drug, by = c(recommended_medication = "drug")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 527 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Looking at the merged data set
dim(merged)

## [1] 2444   26

head(merged)

##   patient_id age gender weight_kg height_cm  bmi chronic_conditions
## 1      P0001  78  Other      88.7     196.3 21.1               None
## 2      P0002  57 Female      90.5     195.6 30.2       Hypertension
## 3      P0002  57 Female      90.5     195.6 30.2       Hypertension
## 4      P0003  29 Female      87.0     168.2 27.0               None
## 5      P0004  56 Female      81.4     188.9 26.9       Hypertension
## 6      P0004  56 Female      81.4     188.9 26.9       Hypertension
##   drug_allergies genetic_disorders    diagnosis                     symptoms
## 1     Penicillin   Cystic Fibrosis Inflammation                        Fever
## 2           None   Cystic Fibrosis   Depression Fatigue, Headache, Dizziness
## 3           None   Cystic Fibrosis   Depression Fatigue, Headache, Dizziness
## 4          Sulfa              None Inflammation Joint Pain, Headache, Nausea
## 5     Penicillin   Cystic Fibrosis    Infection                   Joint Pain
## 6     Penicillin   Cystic Fibrosis    Infection                   Joint Pain
##   recommended_medication dosage duration treatment_effectiveness
## 1             Amlodipine   None  30 days               Effective
## 2            Amoxicillin   5 mg     None                 Neutral
## 3            Amoxicillin   5 mg     None                 Neutral
## 4                   None   None   7 days               Effective
## 5              Ibuprofen 200 mg   7 days          Very Effective
## 6              Ibuprofen 200 mg   7 days          Very Effective
##   adverse_reactions recovery_time_days
## 1               Yes                 18
## 2                No                 24
## 3                No                 24
## 4               Yes                 12
## 5                No                 22
## 6                No                 22
##                                   condition easeofuse effective    form
## 1                              hypertension  3.920000  3.270000  Tablet
## 2                 Acute Bacterial Sinusitis  3.852353  3.655882 Capsule
## 3 Pharyngitis due to Streptococcus Pyogenes  4.070000  3.561765 Capsule
## 4                                      <NA>        NA        NA    <NA>
## 5                                     fever  4.200000  3.954400  Tablet
## 6                                     fever  4.375000  4.125000  Tablet
##   indication price   reviews satisfaction type
## 1   On Label 57.99 730.00000     2.605000   RX
## 2   On Label 12.59  86.29412     3.197647   RX
## 3   On Label 12.59  87.41176     3.327059   RX
## 4       <NA>    NA        NA           NA <NA>
## 5   On Label 13.49   5.72000     3.833600  OTC
## 6   On Label 13.49   1.50000     3.500000   RX

# Naming the merged data into a data frame
df <- merged

# Scaling certain numeric variables to prepare them for modeling
df$satisfaction <- scale(df$satisfaction)
df$reviews <- scale(df$reviews)
df$easeofuse <- scale(df$easeofuse)
df$price <- scale(df$price)

# Checking the structure of the treatment effectiveness column 
str(df$treatment_effectiveness)

##  chr [1:2444] "Effective" "Neutral" "Neutral" "Effective" "Very Effective" ...

table(df$treatment_effectiveness, useNA = "ifany")

## 
##      Effective    Ineffective        Neutral Very Effective 
##            541            672            586            645

# Converting treatment_effectiveness into an ordered numeric scale to prepare for modeling
df$treatment_effectiveness_num <- case_when(
  df$treatment_effectiveness == "Ineffective" ~ 1,
  df$treatment_effectiveness == "Neutral" ~ 2,
  df$treatment_effectiveness == "Effective" ~ 3,
  df$treatment_effectiveness == "Very Effective" ~ 4,
  TRUE ~ NA_real_
)

# Converting treatment effectiveness into categories
df$treatment_effectiveness_cat <- cut(
  df$treatment_effectiveness_num,
  breaks = c(0, 2, 3, 4),
  labels = c("low", "medium", "high"),
  include.lowest = TRUE
)

# Converting recovery time into groups for modeling
df$recovery_time_cat <- cut(
  df$recovery_time_days,
  breaks = quantile(df$recovery_time_days, c(0, .33, .66, 1), na.rm = TRUE),
  labels = c("short", "medium", "long"),
  include.lowest = TRUE
)

# Converting adverse reactions to a yes or no for modeling
df$adverse_flag <- ifelse(df$adverse_reactions == "None", 0, 1)

# Converting the effective column to a numeric flag
df$effective_flag <- ifelse(df$effective == "Yes", 1,
                            ifelse(df$effective == "No", 0, NA))

# Removing the leaking continuous variables
df$treatment_effectiveness <- NULL
df$recovery_time_days <- NULL
df$adverse_reactions <- NULL
df$effective <- NULL

# Loooking at the final structure of the data frame
str(df)

## 'data.frame':    2444 obs. of  27 variables:
##  $ patient_id                 : chr  "P0001" "P0002" "P0002" "P0003" ...
##  $ age                        : int  78 57 57 29 56 56 56 56 56 56 ...
##  $ gender                     : chr  "Other" "Female" "Female" "Female" ...
##  $ weight_kg                  : num  88.7 90.5 90.5 87 81.4 81.4 81.4 81.4 81.4 81.4 ...
##  $ height_cm                  : num  196 196 196 168 189 ...
##  $ bmi                        : num  21.1 30.2 30.2 27 26.9 26.9 26.9 26.9 26.9 26.9 ...
##  $ chronic_conditions         : chr  "None" "Hypertension" "Hypertension" "None" ...
##  $ drug_allergies             : chr  "Penicillin" "None" "None" "Sulfa" ...
##  $ genetic_disorders          : chr  "Cystic Fibrosis" "Cystic Fibrosis" "Cystic Fibrosis" "None" ...
##  $ diagnosis                  : chr  "Inflammation" "Depression" "Depression" "Inflammation" ...
##  $ symptoms                   : chr  "Fever" "Fatigue, Headache, Dizziness" "Fatigue, Headache, Dizziness" "Joint Pain, Headache, Nausea" ...
##  $ recommended_medication     : chr  "Amlodipine" "Amoxicillin" "Amoxicillin" "None" ...
##  $ dosage                     : chr  "None" "5 mg" "5 mg" "None" ...
##  $ duration                   : chr  "30 days" "None" "None" "7 days" ...
##  $ condition                  : chr  "hypertension" "Acute Bacterial Sinusitis" "Pharyngitis due to Streptococcus Pyogenes" NA ...
##  $ easeofuse                  : num [1:2444, 1] -1.209 -1.581 -0.383 NA 0.333 ...
##   ..- attr(*, "scaled:center")= num 4.14
##   ..- attr(*, "scaled:scale")= num 0.182
##  $ form                       : chr  "Tablet" "Capsule" "Capsule" NA ...
##  $ indication                 : chr  "On Label" "On Label" "On Label" NA ...
##  $ price                      : num [1:2444, 1] 2.809 -0.402 -0.402 NA -0.339 ...
##   ..- attr(*, "scaled:center")= num 18.3
##   ..- attr(*, "scaled:scale")= num 14.1
##  $ reviews                    : num [1:2444, 1] 2.711 -0.23 -0.225 NA -0.598 ...
##   ..- attr(*, "scaled:center")= num 137
##   ..- attr(*, "scaled:scale")= num 219
##  $ satisfaction               : num [1:2444, 1] -2.198 -0.569 -0.213 NA 1.179 ...
##   ..- attr(*, "scaled:center")= num 3.4
##   ..- attr(*, "scaled:scale")= num 0.364
##  $ type                       : chr  "RX" "RX" "RX" NA ...
##  $ treatment_effectiveness_num: num  3 2 2 3 4 4 4 4 4 4 ...
##  $ treatment_effectiveness_cat: Factor w/ 3 levels "low","medium",..: 2 1 1 2 3 3 3 3 3 3 ...
##  $ recovery_time_cat          : Factor w/ 3 levels "short","medium",..: 2 3 3 1 3 3 3 3 3 3 ...
##  $ adverse_flag               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ effective_flag             : logi  NA NA NA NA NA NA ...

Exploratory data analysis

# Age distribution plot
ggplot(df, aes(age)) +
  geom_histogram(fill="blue", bins=30) +
  ggtitle("Age Distribution")

# BMI distribution plot
ggplot(df, aes(bmi)) +
  geom_histogram(fill="green", bins=30) +
  ggtitle("BMI Distribution")

# Gender distribution plot
ggplot(df, aes(gender)) +
  geom_bar(fill="orange") +
  ggtitle("Gender Distribution")

# Creating a correlation matrix of numeric variables
num_vars <- df %>% select(where(is.numeric))
corrplot(cor(num_vars, use="complete.obs"), method="color")

## Warning in cor(num_vars, use = "complete.obs"): the standard deviation is zero

# Looking at the final structure of df
str(df)

## 'data.frame':    2444 obs. of  27 variables:
##  $ patient_id                 : chr  "P0001" "P0002" "P0002" "P0003" ...
##  $ age                        : int  78 57 57 29 56 56 56 56 56 56 ...
##  $ gender                     : chr  "Other" "Female" "Female" "Female" ...
##  $ weight_kg                  : num  88.7 90.5 90.5 87 81.4 81.4 81.4 81.4 81.4 81.4 ...
##  $ height_cm                  : num  196 196 196 168 189 ...
##  $ bmi                        : num  21.1 30.2 30.2 27 26.9 26.9 26.9 26.9 26.9 26.9 ...
##  $ chronic_conditions         : chr  "None" "Hypertension" "Hypertension" "None" ...
##  $ drug_allergies             : chr  "Penicillin" "None" "None" "Sulfa" ...
##  $ genetic_disorders          : chr  "Cystic Fibrosis" "Cystic Fibrosis" "Cystic Fibrosis" "None" ...
##  $ diagnosis                  : chr  "Inflammation" "Depression" "Depression" "Inflammation" ...
##  $ symptoms                   : chr  "Fever" "Fatigue, Headache, Dizziness" "Fatigue, Headache, Dizziness" "Joint Pain, Headache, Nausea" ...
##  $ recommended_medication     : chr  "Amlodipine" "Amoxicillin" "Amoxicillin" "None" ...
##  $ dosage                     : chr  "None" "5 mg" "5 mg" "None" ...
##  $ duration                   : chr  "30 days" "None" "None" "7 days" ...
##  $ condition                  : chr  "hypertension" "Acute Bacterial Sinusitis" "Pharyngitis due to Streptococcus Pyogenes" NA ...
##  $ easeofuse                  : num [1:2444, 1] -1.209 -1.581 -0.383 NA 0.333 ...
##   ..- attr(*, "scaled:center")= num 4.14
##   ..- attr(*, "scaled:scale")= num 0.182
##  $ form                       : chr  "Tablet" "Capsule" "Capsule" NA ...
##  $ indication                 : chr  "On Label" "On Label" "On Label" NA ...
##  $ price                      : num [1:2444, 1] 2.809 -0.402 -0.402 NA -0.339 ...
##   ..- attr(*, "scaled:center")= num 18.3
##   ..- attr(*, "scaled:scale")= num 14.1
##  $ reviews                    : num [1:2444, 1] 2.711 -0.23 -0.225 NA -0.598 ...
##   ..- attr(*, "scaled:center")= num 137
##   ..- attr(*, "scaled:scale")= num 219
##  $ satisfaction               : num [1:2444, 1] -2.198 -0.569 -0.213 NA 1.179 ...
##   ..- attr(*, "scaled:center")= num 3.4
##   ..- attr(*, "scaled:scale")= num 0.364
##  $ type                       : chr  "RX" "RX" "RX" NA ...
##  $ treatment_effectiveness_num: num  3 2 2 3 4 4 4 4 4 4 ...
##  $ treatment_effectiveness_cat: Factor w/ 3 levels "low","medium",..: 2 1 1 2 3 3 3 3 3 3 ...
##  $ recovery_time_cat          : Factor w/ 3 levels "short","medium",..: 2 3 3 1 3 3 3 3 3 3 ...
##  $ adverse_flag               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ effective_flag             : logi  NA NA NA NA NA NA ...

Setting up the data for modeling

# Creating a training and test split
colnames(df)

##  [1] "patient_id"                  "age"                        
##  [3] "gender"                      "weight_kg"                  
##  [5] "height_cm"                   "bmi"                        
##  [7] "chronic_conditions"          "drug_allergies"             
##  [9] "genetic_disorders"           "diagnosis"                  
## [11] "symptoms"                    "recommended_medication"     
## [13] "dosage"                      "duration"                   
## [15] "condition"                   "easeofuse"                  
## [17] "form"                        "indication"                 
## [19] "price"                       "reviews"                    
## [21] "satisfaction"                "type"                       
## [23] "treatment_effectiveness_num" "treatment_effectiveness_cat"
## [25] "recovery_time_cat"           "adverse_flag"               
## [27] "effective_flag"

df$effect_bin <- df$treatment_effectiveness_cat
df$effect_bin <- as.factor(df$effect_bin)

df$effect_bin <- ifelse(df$treatment_effectiveness_cat == "high", 1, 0)
df$effect_bin <- factor(df$effect_bin, levels = c(0,1))


set.seed(8634)
split <- createDataPartition(df$effect_bin, p = 0.8, list = FALSE)

train <- df[split, ]
test <- df[-split, ]

Logistic regression model

# Logistic regression model 
log_model <- glm(effect_bin ~ age + bmi + easeofuse + satisfaction,
                 data = train,
                 family = "binomial")

summary(log_model)

## 
## Call:
## glm(formula = effect_bin ~ age + bmi + easeofuse + satisfaction, 
##     family = "binomial", data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.308441   0.349785  -6.600 4.12e-11 ***
## age           0.003132   0.002575   1.216 0.223875    
## bmi           0.040821   0.011845   3.446 0.000569 ***
## easeofuse     0.060279   0.079099   0.762 0.446019    
## satisfaction  0.023386   0.078066   0.300 0.764504    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1992.2  on 1744  degrees of freedom
## Residual deviance: 1977.0  on 1740  degrees of freedom
##   (211 observations deleted due to missingness)
## AIC: 1987
## 
## Number of Fisher Scoring iterations: 4

# Creating predictions using the logistic regression model 
log_prob <- predict(log_model, newdata = test, type = "response")
log_pred <- ifelse(log_prob > 0.5, "1", "0")
log_pred <- as.factor(log_pred)

# Looking at the output of the model
levels(test$effect_bin)

## [1] "0" "1"

levels(log_pred)

## [1] "0"

table(test$effect_bin)

## 
##   0   1 
## 359 129

table(log_pred)

## log_pred
##   0 
## 436

# Evaluating the accuracy of the model 
confusionMatrix(log_pred, test$effect_bin)

## Warning in confusionMatrix.default(log_pred, test$effect_bin): Levels are not
## in the same order for reference and data. Refactoring data to match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 324 112
##          1   0   0
##                                           
##                Accuracy : 0.7431          
##                  95% CI : (0.6994, 0.7835)
##     No Information Rate : 0.7431          
##     P-Value [Acc > NIR] : 0.5254          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7431          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7431          
##          Detection Rate : 0.7431          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
##

# Plotting the predicted probabilities
hist(log_prob, main="Logistic Regression Probabilities", xlab="Probability")

Random forest model

# Dropping NA values 
train_clean <- train %>% drop_na(age, bmi, easeofuse, satisfaction, effect_bin)
test_clean <- test %>% drop_na(age, bmi, easeofuse, satisfaction, effect_bin)

# Making the effect_bin a factor
train_clean$effect_bin <- factor(train_clean$effect_bin, levels=c(0,1))
test_clean$effect_bin <- factor(test_clean$effect_bin, levels=c(0,1))

# Random forest model
rf_model <- randomForest(effect_bin ~ age + bmi + easeofuse + satisfaction,
                         data=train_clean,
                         ntree=200)


rf_pred <- predict(rf_model, newdata=test)

# Evaluating the accuracy of the model 
confusionMatrix(rf_pred, test$effect_bin)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 320  41
##          1   4  71
##                                           
##                Accuracy : 0.8968          
##                  95% CI : (0.8643, 0.9237)
##     No Information Rate : 0.7431          
##     P-Value [Acc > NIR] : 7.449e-16       
##                                           
##                   Kappa : 0.6969          
##                                           
##  Mcnemar's Test P-Value : 8.025e-08       
##                                           
##             Sensitivity : 0.9877          
##             Specificity : 0.6339          
##          Pos Pred Value : 0.8864          
##          Neg Pred Value : 0.9467          
##              Prevalence : 0.7431          
##          Detection Rate : 0.7339          
##    Detection Prevalence : 0.8280          
##       Balanced Accuracy : 0.8108          
##                                           
##        'Positive' Class : 0               
##

# Making a variable importance plot
varImpPlot(rf_model)

MS Thesis

Ursula Podosenin

2025-11-30

Loading packages and reading in the data

Data cleaning

Exploratory data analysis

Setting up the data for modeling

Logistic regression model

Random forest model