library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Accessing the dataset

url1 <- "https://raw.githubusercontent.com/josephdamilare01/Data-Mining-Data-Mining-the-Healthcare-Dataset/main/disease_train(5).csv"
Train <- read.csv(url1)
head(Train)

Data exploration

str(Train)
## 'data.frame':    4250 obs. of  24 variables:
##  $ id             : chr  "PA1001" "PA1002" "PA1003" "PA1004" ...
##  $ age            : int  59 48 77 42 38 44 90 69 33 42 ...
##  $ gender         : chr  "male" "female" "male" "female" ...
##  $ sick           : chr  "no" "no" "no" "no" ...
##  $ pregnant       : chr  "no" "no" "no" "no" ...
##  $ test_X1        : num  7.8 1.5 7.3 1.2 0.6 3 1.5 6.9 0.1 1.9 ...
##  $ test_X2        : num  NA 2.5 1.2 2.5 1.9 2 1.8 NA 1.7 2.2 ...
##  $ test_X3        : num  89 101 57 106 95 115 98 109 104 126 ...
##  $ test_X4        : num  0.85 0.97 1.28 0.98 NA 1.1 0.94 1.03 0.8 0.97 ...
##  $ test_X5        : num  105 104 44 108 NA 104 104 106 130 130 ...
##  $ test_X6        : num  NA NA NA 27 NA NA NA NA NA NA ...
##  $ concern_type1  : chr  "no" "no" "no" "no" ...
##  $ concern_type2  : chr  "yes" "no" "no" "no" ...
##  $ enlargement    : chr  "no" "no" "no" "no" ...
##  $ tumor          : chr  "no" "no" "no" "no" ...
##  $ disorder       : chr  "no" "no" "no" "no" ...
##  $ medication_A   : chr  "no" "yes" "no" "no" ...
##  $ medication_B   : chr  "no" "no" "no" "no" ...
##  $ mental_health  : chr  "no" "no" "no" "no" ...
##  $ mood_stabiliser: chr  "no" "yes" "no" "no" ...
##  $ surgery        : chr  "no" "no" "no" "no" ...
##  $ treatment_type1: chr  "no" "no" "no" "no" ...
##  $ suspect        : chr  "no" "no" "no" "no" ...
##  $ target         : chr  "moderate_risk" "low_risk" "moderate_risk" "low_risk" ...

Checking for missing values using dlookr package

library(dlookr)
## Warning: package 'dlookr' was built under R version 4.4.0
## Registered S3 methods overwritten by 'dlookr':
##   method          from  
##   plot.transform  scales
##   print.transform scales
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:base':
## 
##     transform
plot_na_pareto(Train)

Checking for outliers

diagnose_outlier(Train)
plot_outlier(Train)

Cleaning the data and handling data issues

Handling missing variables and outliers

Train_corrected <- Train %>% mutate(test_X3 = as.numeric( imputate_na(Train, xvar =test_X3, method = "mean")) %>% as.data.frame(), 
                                    test_X2 = as.numeric( imputate_na(Train, xvar =test_X2, method = "mean")) %>% as.data.frame(),
                                    test_X1 = as.numeric( imputate_na(Train, xvar =test_X1, method = "mean")) %>% as.data.frame(),
                                    test_X4 = as.numeric( imputate_na(Train, xvar =test_X4, method = "mean")) %>% as.data.frame(),
                                    test_X5 = as.numeric( imputate_na(Train, xvar =test_X5, method = "mean")) %>% as.data.frame() )
library(visdat)
## Warning: package 'visdat' was built under R version 4.3.3
vis_miss(Train_corrected)

## Checking if there is duplicate

length(unique(Train_corrected$id))
## [1] 4250
length(Train_corrected$id)
## [1] 4250

We remove test_X6 due to the percent of missing values present in it (96%)

Train_corrected <-Train_corrected %>% select(-c(test_X6, id))

Checking the percentage of empty rows to the decide if to remove it or not

n <- Train_corrected %>% filter(gender == "" )
a <-length(n$sick)
s <- length(Train_corrected$sick)
(a/s)*100 
## [1] 3.317647

Filtering out the rows with empty space

Train_corrected <- Train_corrected %>% filter(!gender =="")

Data exploration

Descriptive Analysis

library(summarytools)
## Warning: package 'summarytools' was built under R version 4.3.3
## 
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
## 
##     view
K1 <- Train_corrected %>% select(starts_with("test_X")) 

summary(K1)
##      test_X1..           test_X2..          test_X3..           test_X4..     
##  Min.   :  0.0050   Min.   : 0.050000   Min.   :  2.0000   Min.   :0.2500000  
##  1st Qu.:  0.7000   1st Qu.: 1.700000   1st Qu.: 88.0000   1st Qu.:0.8800000  
##  Median :  1.7000   Median : 2.035580   Median :104.0000   Median :0.9708460  
##  Mean   :  7.2855   Mean   : 2.029092   Mean   :104.8029   Mean   :0.9717627  
##  3rd Qu.:  4.4000   3rd Qu.: 2.100000   3rd Qu.:120.0000   3rd Qu.:1.0500000  
##  Max.   :530.0000   Max.   :18.000000   Max.   :430.0000   Max.   :1.9600000  
##      test_X5..    
##  Min.   :  1.400  
##  1st Qu.: 94.000  
##  Median :110.000  
##  Mean   :109.593  
##  3rd Qu.:123.000  
##  Max.   :642.000

Correlation Analysis

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
l <- cor(K1)
corrplot(l, "pie")

Visualization

ggplot(Train_corrected, aes(x = gender, fill = gender)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue")) 
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot(Train_corrected, aes(x=sick, fill = sick))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=pregnant, fill= pregnant))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x= suspect, fill=suspect))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=treatment_type1, fill=treatment_type1))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=surgery, fill= surgery))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=mood_stabiliser, fill= mood_stabiliser))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=mental_health, fill=mental_health))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=medication_B, fill=medication_B))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=disorder, fill=disorder))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=tumor, fill=tumor))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=enlargement, fill=enlargement))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=concern_type2, fill= concern_type2))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

ggplot(Train_corrected, aes(x=concern_type1, fill=concern_type1))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
  facet_wrap(~ target, scale = "free") +
  scale_fill_manual(values = c("red", "blue"))

### Data preprocessing - Scaling method

Model_data <- scale(K1)

Adding the Target variable to model data

target <- Train_corrected %>% select(target)
Model_data <- cbind(Model_data, target)

Splitting the dataset into 25% test and 75% train set

library(caTools)
sample <- sample.split(Model_data$target, SplitRatio = 0.75)
Model_data_Train <- subset(Model_data, sample==T)
Model_data_Test <- subset(Model_data, sample==F)

Model buiding and training with parameter tuning - Cart Decision tree

library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
ctrl <- trainControl(method = "cv", number = 10)
tunegrid <- expand.grid(.cp = seq(0.01, 0.5, by = 0.01))
model <- train(target ~ ., data = Model_data, method = "rpart", trControl = ctrl, tuneGrid = tunegrid)

Selecting the best decison tree based on the parameter

best_model <- model$finalModel

Visualizing the tree

library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 4.3.3
rpart.plot(best_model)

Evaluating the performance of the decision tree

pp <- predict(best_model, newdata = Model_data_Test, type = "class")
con_pp <- confusionMatrix(as.factor(Model_data_Test$target), pp)
con_pp
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      high_risk low_risk moderate_risk
##   high_risk            22       12             0
##   low_risk              3      867             6
##   moderate_risk         0        0           118
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9796          
##                  95% CI : (0.9689, 0.9873)
##     No Information Rate : 0.8551          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9204          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: high_risk Class: low_risk Class: moderate_risk
## Sensitivity                   0.88000          0.9863               0.9516
## Specificity                   0.98804          0.9396               1.0000
## Pos Pred Value                0.64706          0.9897               1.0000
## Neg Pred Value                0.99698          0.9211               0.9934
## Prevalence                    0.02432          0.8551               0.1206
## Detection Rate                0.02140          0.8434               0.1148
## Detection Prevalence          0.03307          0.8521               0.1148
## Balanced Accuracy             0.93402          0.9630               0.9758

Building a support vector machine model

library(e1071)
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:dlookr':
## 
##     kurtosis, skewness
sv <- svm(as.factor(target) ~., data = Model_data_Train, cost = 10, kernel = "sigmoid", scale = T)
summary(sv)
## 
## Call:
## svm(formula = as.factor(target) ~ ., data = Model_data_Train, cost = 10, 
##     kernel = "sigmoid", scale = T)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  10 
##      coef.0:  0 
## 
## Number of Support Vectors:  544
## 
##  ( 179 272 93 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  high_risk low_risk moderate_risk

Tuning the svm model

tune <- tune(svm, as.factor(target)~., data = Model_data_Train, kernel="sigmoid", ranges = list(cost = c(0.1, 1,10,20,30,40,50)))
summary(tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##   0.1
## 
## - best performance: 0.113919 
## 
## - Detailed performance results:
##   cost     error dispersion
## 1  0.1 0.1139190 0.02325320
## 2  1.0 0.1577355 0.01753581
## 3 10.0 0.1713687 0.01752048
## 4 20.0 0.1755895 0.01737883
## 5 30.0 0.1755895 0.01710712
## 6 40.0 0.1768882 0.01675247
## 7 50.0 0.1765635 0.01660723

We then retrained the model with cost value of 0.1

sv_r <- svm(as.factor(target) ~., data = Model_data_Train, cost = 1, kernel = "sigmoid", scale = T)
summary(sv_r)
## 
## Call:
## svm(formula = as.factor(target) ~ ., data = Model_data_Train, cost = 1, 
##     kernel = "sigmoid", scale = T)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  1 
##      coef.0:  0 
## 
## Number of Support Vectors:  572
## 
##  ( 192 286 94 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  high_risk low_risk moderate_risk

Model evaluation: Retrained vs trained

sv_pred <- predict(sv, newdata = Model_data_Test, type ="class")
acc <- confusionMatrix(as.factor(Model_data_Test$target), sv_pred)
acc
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      high_risk low_risk moderate_risk
##   high_risk             5       11            18
##   low_risk             29      795            52
##   moderate_risk         0       59            59
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8356          
##                  95% CI : (0.8115, 0.8578)
##     No Information Rate : 0.8414          
##     P-Value [Acc > NIR] : 0.7128          
##                                           
##                   Kappa : 0.3854          
##                                           
##  Mcnemar's Test P-Value : 7.346e-06       
## 
## Statistics by Class:
## 
##                      Class: high_risk Class: low_risk Class: moderate_risk
## Sensitivity                  0.147059          0.9191              0.45736
## Specificity                  0.970825          0.5031              0.93437
## Pos Pred Value               0.147059          0.9075              0.50000
## Neg Pred Value               0.970825          0.5395              0.92308
## Prevalence                   0.033074          0.8414              0.12549
## Detection Rate               0.004864          0.7733              0.05739
## Detection Prevalence         0.033074          0.8521              0.11479
## Balanced Accuracy            0.558942          0.7111              0.69587
sv_rp <- predict(sv_r, newdata = Model_data_Test, type ="class")
acc_r <- confusionMatrix(as.factor(Model_data_Test$target), sv_rp)
acc_r
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      high_risk low_risk moderate_risk
##   high_risk             4       11            19
##   low_risk             22      813            41
##   moderate_risk         0       63            55
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8482          
##                  95% CI : (0.8248, 0.8696)
##     No Information Rate : 0.8628          
##     P-Value [Acc > NIR] : 0.9185          
##                                           
##                   Kappa : 0.3956          
##                                           
##  Mcnemar's Test P-Value : 5.043e-06       
## 
## Statistics by Class:
## 
##                      Class: high_risk Class: low_risk Class: moderate_risk
## Sensitivity                  0.153846          0.9166               0.4783
## Specificity                  0.970060          0.5532               0.9310
## Pos Pred Value               0.117647          0.9281               0.4661
## Neg Pred Value               0.977867          0.5132               0.9341
## Prevalence                   0.025292          0.8628               0.1119
## Detection Rate               0.003891          0.7909               0.0535
## Detection Prevalence         0.033074          0.8521               0.1148
## Balanced Accuracy            0.561953          0.7349               0.7046

Conclusion

In summary, the Decision Tree model exhibited superior performance in accurately predicting medical diagnoses with three categories (high risk, low risk, and moderate) compared to the Support Vector Machine model, achieving higher overall accuracy, precision, and agreement with actual diagnoses.