library(here)
library(tidyverse)
library(e1071)
library(caret)
library(pROC)
library(ROSE)
knitr::include_graphics(here("camilo-jimenez-vGu08RYjO-s-unsplash.jpg"))
Photo by @camstejim from Unsplash
The objective of this project is to make an SVM model capable of predicting whether a patient will die or not.
data = read_csv(here('dados/data01.csv'))
data=data[,-2]
data |> head() |> knitr::kable()
| group | outcome | age | gendera | BMI | hypertensive | atrialfibrillation | CHD with no MI | diabetes | deficiencyanemias | depression | Hyperlipemia | Renal failure | COPD | heart rate | Systolic blood pressure | Diastolic blood pressure | Respiratory rate | temperature | SP O2 | Urine output | hematocrit | RBC | MCH | MCHC | MCV | RDW | Leucocyte | Platelets | Neutrophils | Basophils | Lymphocyte | PT | INR | NT-proBNP | Creatine kinase | Creatinine | Urea nitrogen | glucose | Blood potassium | Blood sodium | Blood calcium | Chloride | Anion gap | Magnesium ion | PH | Bicarbonate | Lactic acid | PCO2 | EF |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 72 | 1 | 37.58818 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 68.83784 | 155.8667 | 68.33333 | 16.62162 | 36.71429 | 98.39474 | 2155 | 26.27273 | 2.960000 | 28.25000 | 31.52000 | 89.900 | 16.22000 | 7.650000 | 305.100 | 74.65 | 0.40 | 13.3 | 10.60000 | 1.000000 | 1956 | 148.0000 | 1.9583333 | 50.00000 | 114.63636 | 4.816667 | 138.7500 | 7.463636 | 109.16667 | 13.16667 | 2.618182 | 7.230 | 21.16667 | 0.5 | 40.0 | 55 |
| 1 | 0 | 75 | 2 | NA | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 101.37037 | 140.0000 | 65.00000 | 20.85185 | 36.68254 | 96.92308 | 1425 | 30.78000 | 3.138000 | 31.06000 | 31.66000 | 98.200 | 14.26000 | 12.740000 | 246.400 | NA | NA | NA | NA | NA | 2384 | 60.6000 | 1.1222222 | 20.33333 | 147.50000 | 4.450000 | 138.8889 | 8.162500 | 98.44444 | 11.44444 | 1.887500 | 7.225 | 33.44444 | 0.5 | 78.0 | 55 |
| 1 | 0 | 83 | 2 | 26.57263 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 72.31818 | 135.3333 | 61.37500 | 23.64000 | 36.45370 | 95.29167 | 2425 | 27.70000 | 2.620000 | 34.32000 | 31.30000 | 109.800 | 23.82000 | 5.480000 | 204.200 | 68.10 | 0.55 | 24.5 | 11.27500 | 0.950000 | 4081 | 16.0000 | 1.8714286 | 33.85714 | 149.00000 | 5.825000 | 140.7143 | 8.266667 | 105.85714 | 10.00000 | 2.157143 | 7.268 | 30.57143 | 0.5 | 71.5 | 35 |
| 1 | 0 | 43 | 2 | 83.26463 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 94.50000 | 126.4000 | 73.20000 | 21.85714 | 36.28704 | 93.84615 | 8760 | 36.63750 | 4.277500 | 26.06250 | 30.41250 | 85.625 | 17.03750 | 8.225000 | 216.375 | 81.80 | 0.15 | 14.5 | 27.06667 | 2.666667 | 668 | 85.0000 | 0.5857143 | 15.28571 | 128.25000 | 4.386667 | 138.5000 | 9.476923 | 92.07143 | 12.35714 | 1.942857 | 7.370 | 38.57143 | 0.6 | 75.0 | 55 |
| 1 | 0 | 75 | 2 | 31.82484 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 67.92000 | 156.5600 | 58.12000 | 21.36000 | 36.76190 | 99.28000 | 4455 | 29.93333 | 3.286667 | 30.66667 | 33.66667 | 91.000 | 16.26667 | 8.833333 | 251.000 | NA | NA | NA | NA | NA | 30802 | 111.6667 | 1.9500000 | 43.00000 | 145.75000 | 4.783333 | 136.6667 | 8.733333 | 104.50000 | 15.16667 | 1.650000 | 7.250 | 22.00000 | 0.6 | 50.0 | 55 |
| 1 | 0 | 76 | 1 | 24.26229 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 | 74.18182 | 118.1000 | 52.95000 | 20.54545 | 35.26667 | 96.81818 | 1840 | 27.33333 | 3.235000 | 26.56667 | 31.48333 | 84.500 | 16.51667 | 9.516667 | 273.000 | 85.40 | 0.30 | 9.3 | 18.78333 | 1.700000 | 34183 | 28.0000 | 1.6125000 | 26.62500 | 98.33333 | 4.075000 | 136.2500 | 8.466667 | 96.75000 | 13.12500 | 1.771429 | 7.310 | 30.50000 | 0.6 | 65.5 | 35 |
apply(data, 2, is.numeric)
## group outcome age
## TRUE TRUE TRUE
## gendera BMI hypertensive
## TRUE TRUE TRUE
## atrialfibrillation CHD with no MI diabetes
## TRUE TRUE TRUE
## deficiencyanemias depression Hyperlipemia
## TRUE TRUE TRUE
## Renal failure COPD heart rate
## TRUE TRUE TRUE
## Systolic blood pressure Diastolic blood pressure Respiratory rate
## TRUE TRUE TRUE
## temperature SP O2 Urine output
## TRUE TRUE TRUE
## hematocrit RBC MCH
## TRUE TRUE TRUE
## MCHC MCV RDW
## TRUE TRUE TRUE
## Leucocyte Platelets Neutrophils
## TRUE TRUE TRUE
## Basophils Lymphocyte PT
## TRUE TRUE TRUE
## INR NT-proBNP Creatine kinase
## TRUE TRUE TRUE
## Creatinine Urea nitrogen glucose
## TRUE TRUE TRUE
## Blood potassium Blood sodium Blood calcium
## TRUE TRUE TRUE
## Chloride Anion gap Magnesium ion
## TRUE TRUE TRUE
## PH Bicarbonate Lactic acid
## TRUE TRUE TRUE
## PCO2 EF
## TRUE TRUE
data_no_na= data |>
mutate_if(is.numeric, ~replace_na(.,mean(., na.rm = TRUE)))
data_no_na |> is.na() |> sum()
## [1] 0
data_no_na$group = as.factor(data_no_na$group)
table(data_no_na$group) |> prop.table()
##
## 1 2
## 0.7009346 0.2990654
data_to_model = data_no_na
data_to_model = rename(data_to_model, "Class"="group")
data_to_model = as.data.frame(data_to_model)
names_data_to_model= str_replace_all( names(data_to_model),"-", "_")
names_data_to_model =str_replace_all(names_data_to_model," ","_")
names(data_to_model) = names_data_to_model
names(data_to_model)
## [1] "Class" "outcome"
## [3] "age" "gendera"
## [5] "BMI" "hypertensive"
## [7] "atrialfibrillation" "CHD_with_no_MI"
## [9] "diabetes" "deficiencyanemias"
## [11] "depression" "Hyperlipemia"
## [13] "Renal_failure" "COPD"
## [15] "heart_rate" "Systolic_blood_pressure"
## [17] "Diastolic_blood_pressure" "Respiratory_rate"
## [19] "temperature" "SP_O2"
## [21] "Urine_output" "hematocrit"
## [23] "RBC" "MCH"
## [25] "MCHC" "MCV"
## [27] "RDW" "Leucocyte"
## [29] "Platelets" "Neutrophils"
## [31] "Basophils" "Lymphocyte"
## [33] "PT" "INR"
## [35] "NT_proBNP" "Creatine_kinase"
## [37] "Creatinine" "Urea_nitrogen"
## [39] "glucose" "Blood_potassium"
## [41] "Blood_sodium" "Blood_calcium"
## [43] "Chloride" "Anion_gap"
## [45] "Magnesium_ion" "PH"
## [47] "Bicarbonate" "Lactic_acid"
## [49] "PCO2" "EF"
rows_over=filter(data_to_model, Class == "Alive") |> nrow()
rows_to_fill= nrow(data_to_model) + (nrow(data_to_model) - rows_over)
data_balanced_over <- ovun.sample(Class ~ ., data = data_to_model, method = "over",N = rows_to_fill)$data
data_to_model = data_balanced_over
set.seed(123)
sample_to_model <- sample(c(TRUE, FALSE), nrow(data_to_model), replace=TRUE, prob=c(0.7,0.3))
train <- data_to_model[sample_to_model, ]
test <- data_to_model[!sample_to_model, ]
set.seed (123)
tune.out = tune(svm,
Class ~ .,
data=train,
kernel ="radial",
ranges =list (cost=c(0.1 ,1 ,10 ,100 ,1000),
gamma =c(0.5 ,1 ,2 ,3 ,4) ))
svm_model_tune = svm(Class ~ ., data = train, kernel = "radial", cost = 1,scale = TRUE,gamma=0.5)
caret::confusionMatrix(svm_model_tune$fitted,train$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 581 0
## 2 0 1075
##
## Accuracy : 1
## 95% CI : (0.9978, 1)
## No Information Rate : 0.6492
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.3508
## Detection Rate : 0.3508
## Detection Prevalence : 0.3508
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 1
##
results_tune= predict(svm_model_tune,test[,!names(test) %in% c("Class")])
table(results_tune,test$Class)
##
## results_tune 1 2
## 1 244 23
## 2 0 431
caret::confusionMatrix(results_tune,test$Class)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2
## 1 244 23
## 2 0 431
##
## Accuracy : 0.967
## 95% CI : (0.951, 0.979)
## No Information Rate : 0.6504
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9291
##
## Mcnemar's Test P-Value : 4.49e-06
##
## Sensitivity : 1.0000
## Specificity : 0.9493
## Pos Pred Value : 0.9139
## Neg Pred Value : 1.0000
## Prevalence : 0.3496
## Detection Rate : 0.3496
## Detection Prevalence : 0.3825
## Balanced Accuracy : 0.9747
##
## 'Positive' Class : 1
##
roc_svm_tune_test <- roc(response = test$Class, predictor =as.numeric(results_tune),
smoothed = TRUE,
# arguments for ci
ci=TRUE, ci.alpha=0.9, stratified=FALSE,
# arguments for plot
plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
print.auc=TRUE, show.thres=TRUE)
sens.ci <- ci.se(roc_svm_tune_test)
plot(sens.ci, type="shape", col="lightblue")
plot(sens.ci, type="bars")
This model has great accuracy in predicting which patient is more likely to die or live. Perhaps it can be tested with other datasets and help healthcare staff save more lives.