library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
url1 <- "https://raw.githubusercontent.com/josephdamilare01/Data-Mining-Data-Mining-the-Healthcare-Dataset/main/disease_train(5).csv"
Train <- read.csv(url1)
head(Train)
str(Train)
## 'data.frame': 4250 obs. of 24 variables:
## $ id : chr "PA1001" "PA1002" "PA1003" "PA1004" ...
## $ age : int 59 48 77 42 38 44 90 69 33 42 ...
## $ gender : chr "male" "female" "male" "female" ...
## $ sick : chr "no" "no" "no" "no" ...
## $ pregnant : chr "no" "no" "no" "no" ...
## $ test_X1 : num 7.8 1.5 7.3 1.2 0.6 3 1.5 6.9 0.1 1.9 ...
## $ test_X2 : num NA 2.5 1.2 2.5 1.9 2 1.8 NA 1.7 2.2 ...
## $ test_X3 : num 89 101 57 106 95 115 98 109 104 126 ...
## $ test_X4 : num 0.85 0.97 1.28 0.98 NA 1.1 0.94 1.03 0.8 0.97 ...
## $ test_X5 : num 105 104 44 108 NA 104 104 106 130 130 ...
## $ test_X6 : num NA NA NA 27 NA NA NA NA NA NA ...
## $ concern_type1 : chr "no" "no" "no" "no" ...
## $ concern_type2 : chr "yes" "no" "no" "no" ...
## $ enlargement : chr "no" "no" "no" "no" ...
## $ tumor : chr "no" "no" "no" "no" ...
## $ disorder : chr "no" "no" "no" "no" ...
## $ medication_A : chr "no" "yes" "no" "no" ...
## $ medication_B : chr "no" "no" "no" "no" ...
## $ mental_health : chr "no" "no" "no" "no" ...
## $ mood_stabiliser: chr "no" "yes" "no" "no" ...
## $ surgery : chr "no" "no" "no" "no" ...
## $ treatment_type1: chr "no" "no" "no" "no" ...
## $ suspect : chr "no" "no" "no" "no" ...
## $ target : chr "moderate_risk" "low_risk" "moderate_risk" "low_risk" ...
library(dlookr)
## Warning: package 'dlookr' was built under R version 4.4.0
## Registered S3 methods overwritten by 'dlookr':
## method from
## plot.transform scales
## print.transform scales
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
plot_na_pareto(Train)
diagnose_outlier(Train)
plot_outlier(Train)
Train_corrected <- Train %>% mutate(test_X3 = as.numeric( imputate_na(Train, xvar =test_X3, method = "mean")) %>% as.data.frame(),
test_X2 = as.numeric( imputate_na(Train, xvar =test_X2, method = "mean")) %>% as.data.frame(),
test_X1 = as.numeric( imputate_na(Train, xvar =test_X1, method = "mean")) %>% as.data.frame(),
test_X4 = as.numeric( imputate_na(Train, xvar =test_X4, method = "mean")) %>% as.data.frame(),
test_X5 = as.numeric( imputate_na(Train, xvar =test_X5, method = "mean")) %>% as.data.frame() )
library(visdat)
## Warning: package 'visdat' was built under R version 4.3.3
vis_miss(Train_corrected)
length(unique(Train_corrected$id))
## [1] 4250
length(Train_corrected$id)
## [1] 4250
Train_corrected <-Train_corrected %>% select(-c(test_X6, id))
n <- Train_corrected %>% filter(gender == "" )
a <-length(n$sick)
s <- length(Train_corrected$sick)
(a/s)*100
## [1] 3.317647
Train_corrected <- Train_corrected %>% filter(!gender =="")
library(summarytools)
## Warning: package 'summarytools' was built under R version 4.3.3
##
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
##
## view
K1 <- Train_corrected %>% select(starts_with("test_X"))
summary(K1)
## test_X1.. test_X2.. test_X3.. test_X4..
## Min. : 0.0050 Min. : 0.050000 Min. : 2.0000 Min. :0.2500000
## 1st Qu.: 0.7000 1st Qu.: 1.700000 1st Qu.: 88.0000 1st Qu.:0.8800000
## Median : 1.7000 Median : 2.035580 Median :104.0000 Median :0.9708460
## Mean : 7.2855 Mean : 2.029092 Mean :104.8029 Mean :0.9717627
## 3rd Qu.: 4.4000 3rd Qu.: 2.100000 3rd Qu.:120.0000 3rd Qu.:1.0500000
## Max. :530.0000 Max. :18.000000 Max. :430.0000 Max. :1.9600000
## test_X5..
## Min. : 1.400
## 1st Qu.: 94.000
## Median :110.000
## Mean :109.593
## 3rd Qu.:123.000
## Max. :642.000
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
l <- cor(K1)
corrplot(l, "pie")
ggplot(Train_corrected, aes(x = gender, fill = gender)) +
geom_bar() +
geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(Train_corrected, aes(x=sick, fill = sick))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=pregnant, fill= pregnant))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x= suspect, fill=suspect))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=treatment_type1, fill=treatment_type1))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=surgery, fill= surgery))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=mood_stabiliser, fill= mood_stabiliser))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=mental_health, fill=mental_health))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=medication_B, fill=medication_B))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=disorder, fill=disorder))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=tumor, fill=tumor))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=enlargement, fill=enlargement))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=concern_type2, fill= concern_type2))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
ggplot(Train_corrected, aes(x=concern_type1, fill=concern_type1))+geom_bar() + geom_text(stat = "count", aes(label = paste0(round((..count..)/sum(..count..)*100, 2), "%")), vjust = -0.5) +
facet_wrap(~ target, scale = "free") +
scale_fill_manual(values = c("red", "blue"))
Model_data <- scale(K1)
target <- Train_corrected %>% select(target)
Model_data <- cbind(Model_data, target)
library(caTools)
sample <- sample.split(Model_data$target, SplitRatio = 0.75)
Model_data_Train <- subset(Model_data, sample==T)
Model_data_Test <- subset(Model_data, sample==F)
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
ctrl <- trainControl(method = "cv", number = 10)
tunegrid <- expand.grid(.cp = seq(0.01, 0.5, by = 0.01))
model <- train(target ~ ., data = Model_data, method = "rpart", trControl = ctrl, tuneGrid = tunegrid)
best_model <- model$finalModel
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 4.3.3
rpart.plot(best_model)
pp <- predict(best_model, newdata = Model_data_Test, type = "class")
con_pp <- confusionMatrix(as.factor(Model_data_Test$target), pp)
con_pp
## Confusion Matrix and Statistics
##
## Reference
## Prediction high_risk low_risk moderate_risk
## high_risk 25 9 0
## low_risk 3 867 6
## moderate_risk 0 0 118
##
## Overall Statistics
##
## Accuracy : 0.9825
## 95% CI : (0.9725, 0.9896)
## No Information Rate : 0.8521
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9324
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: high_risk Class: low_risk Class: moderate_risk
## Sensitivity 0.89286 0.9897 0.9516
## Specificity 0.99100 0.9408 1.0000
## Pos Pred Value 0.73529 0.9897 1.0000
## Neg Pred Value 0.99698 0.9408 0.9934
## Prevalence 0.02724 0.8521 0.1206
## Detection Rate 0.02432 0.8434 0.1148
## Detection Prevalence 0.03307 0.8521 0.1148
## Balanced Accuracy 0.94193 0.9653 0.9758
library(e1071)
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:dlookr':
##
## kurtosis, skewness
sv <- svm(as.factor(target) ~., data = Model_data_Train, cost = 10, kernel = "sigmoid", scale = T)
summary(sv)
##
## Call:
## svm(formula = as.factor(target) ~ ., data = Model_data_Train, cost = 10,
## kernel = "sigmoid", scale = T)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 10
## coef.0: 0
##
## Number of Support Vectors: 542
##
## ( 270 181 91 )
##
##
## Number of Classes: 3
##
## Levels:
## high_risk low_risk moderate_risk
tune <- tune(svm, as.factor(target)~., data = Model_data_Train, kernel="sigmoid", ranges = list(cost = c(0.1, 1,10,20,30,40,50)))
summary(tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.1
##
## - best performance: 0.1113311
##
## - Detailed performance results:
## cost error dispersion
## 1 0.1 0.1113311 0.01548683
## 2 1.0 0.1583911 0.01823425
## 3 10.0 0.1720243 0.02139150
## 4 20.0 0.1729973 0.02122325
## 5 30.0 0.1739713 0.02225767
## 6 40.0 0.1739713 0.02225767
## 7 50.0 0.1739713 0.02225767
sv_r <- svm(as.factor(target) ~., data = Model_data_Train, cost = 1, kernel = "sigmoid", scale = T)
summary(sv_r)
##
## Call:
## svm(formula = as.factor(target) ~ ., data = Model_data_Train, cost = 1,
## kernel = "sigmoid", scale = T)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 1
## coef.0: 0
##
## Number of Support Vectors: 570
##
## ( 284 191 95 )
##
##
## Number of Classes: 3
##
## Levels:
## high_risk low_risk moderate_risk
sv_pred <- predict(sv, newdata = Model_data_Test, type ="class")
acc <- confusionMatrix(as.factor(Model_data_Test$target), sv_pred)
acc
## Confusion Matrix and Statistics
##
## Reference
## Prediction high_risk low_risk moderate_risk
## high_risk 3 22 9
## low_risk 38 783 55
## moderate_risk 0 49 69
##
## Overall Statistics
##
## Accuracy : 0.8317
## 95% CI : (0.8074, 0.8541)
## No Information Rate : 0.8307
## P-Value [Acc > NIR] : 0.487069
##
## Kappa : 0.3901
##
## Mcnemar's Test P-Value : 0.003482
##
## Statistics by Class:
##
## Class: high_risk Class: low_risk Class: moderate_risk
## Sensitivity 0.073171 0.9169 0.51880
## Specificity 0.968592 0.4655 0.94525
## Pos Pred Value 0.088235 0.8938 0.58475
## Neg Pred Value 0.961771 0.5329 0.92967
## Prevalence 0.039883 0.8307 0.12938
## Detection Rate 0.002918 0.7617 0.06712
## Detection Prevalence 0.033074 0.8521 0.11479
## Balanced Accuracy 0.520881 0.6912 0.73202
sv_rp <- predict(sv_r, newdata = Model_data_Test, type ="class")
acc_r <- confusionMatrix(as.factor(Model_data_Test$target), sv_rp)
acc_r
## Confusion Matrix and Statistics
##
## Reference
## Prediction high_risk low_risk moderate_risk
## high_risk 2 22 10
## low_risk 34 796 46
## moderate_risk 0 50 68
##
## Overall Statistics
##
## Accuracy : 0.8424
## 95% CI : (0.8187, 0.8642)
## No Information Rate : 0.8444
## P-Value [Acc > NIR] : 0.588808
##
## Kappa : 0.4064
##
## Mcnemar's Test P-Value : 0.005239
##
## Statistics by Class:
##
## Class: high_risk Class: low_risk Class: moderate_risk
## Sensitivity 0.055556 0.9171 0.54839
## Specificity 0.967742 0.5000 0.94469
## Pos Pred Value 0.058824 0.9087 0.57627
## Neg Pred Value 0.965795 0.5263 0.93846
## Prevalence 0.035019 0.8444 0.12062
## Detection Rate 0.001946 0.7743 0.06615
## Detection Prevalence 0.033074 0.8521 0.11479
## Balanced Accuracy 0.511649 0.7085 0.74654
In summary, the Decision Tree model exhibited superior performance in accurately predicting medical diagnoses with three categories (high risk, low risk, and moderate) compared to the Support Vector Machine model, achieving higher overall accuracy, precision, and agreement with actual diagnoses.