Library

library(here)
library(tidyverse)
library(e1071)
library(caret)
library(pROC)
library(ROSE)
knitr::include_graphics(here("camilo-jimenez-vGu08RYjO-s-unsplash.jpg"))

Photo by @camstejim from Unsplash

Objective

The objective of this project is to make an SVM model capable of predicting whether a patient will die or not.

Import the data

data = read_csv(here('dados/data01.csv'))
data=data[,-2]

data |> head() |> knitr::kable()
group outcome age gendera BMI hypertensive atrialfibrillation CHD with no MI diabetes deficiencyanemias depression Hyperlipemia Renal failure COPD heart rate Systolic blood pressure Diastolic blood pressure Respiratory rate temperature SP O2 Urine output hematocrit RBC MCH MCHC MCV RDW Leucocyte Platelets Neutrophils Basophils Lymphocyte PT INR NT-proBNP Creatine kinase Creatinine Urea nitrogen glucose Blood potassium Blood sodium Blood calcium Chloride Anion gap Magnesium ion PH Bicarbonate Lactic acid PCO2 EF
1 0 72 1 37.58818 0 0 0 1 1 0 1 1 0 68.83784 155.8667 68.33333 16.62162 36.71429 98.39474 2155 26.27273 2.960000 28.25000 31.52000 89.900 16.22000 7.650000 305.100 74.65 0.40 13.3 10.60000 1.000000 1956 148.0000 1.9583333 50.00000 114.63636 4.816667 138.7500 7.463636 109.16667 13.16667 2.618182 7.230 21.16667 0.5 40.0 55
1 0 75 2 NA 0 0 0 0 1 0 0 0 1 101.37037 140.0000 65.00000 20.85185 36.68254 96.92308 1425 30.78000 3.138000 31.06000 31.66000 98.200 14.26000 12.740000 246.400 NA NA NA NA NA 2384 60.6000 1.1222222 20.33333 147.50000 4.450000 138.8889 8.162500 98.44444 11.44444 1.887500 7.225 33.44444 0.5 78.0 55
1 0 83 2 26.57263 0 0 0 0 1 0 0 1 0 72.31818 135.3333 61.37500 23.64000 36.45370 95.29167 2425 27.70000 2.620000 34.32000 31.30000 109.800 23.82000 5.480000 204.200 68.10 0.55 24.5 11.27500 0.950000 4081 16.0000 1.8714286 33.85714 149.00000 5.825000 140.7143 8.266667 105.85714 10.00000 2.157143 7.268 30.57143 0.5 71.5 35
1 0 43 2 83.26463 0 0 0 0 0 0 0 0 0 94.50000 126.4000 73.20000 21.85714 36.28704 93.84615 8760 36.63750 4.277500 26.06250 30.41250 85.625 17.03750 8.225000 216.375 81.80 0.15 14.5 27.06667 2.666667 668 85.0000 0.5857143 15.28571 128.25000 4.386667 138.5000 9.476923 92.07143 12.35714 1.942857 7.370 38.57143 0.6 75.0 55
1 0 75 2 31.82484 1 0 0 0 1 0 0 1 1 67.92000 156.5600 58.12000 21.36000 36.76190 99.28000 4455 29.93333 3.286667 30.66667 33.66667 91.000 16.26667 8.833333 251.000 NA NA NA NA NA 30802 111.6667 1.9500000 43.00000 145.75000 4.783333 136.6667 8.733333 104.50000 15.16667 1.650000 7.250 22.00000 0.6 50.0 55
1 0 76 1 24.26229 1 1 0 0 1 0 1 1 1 74.18182 118.1000 52.95000 20.54545 35.26667 96.81818 1840 27.33333 3.235000 26.56667 31.48333 84.500 16.51667 9.516667 273.000 85.40 0.30 9.3 18.78333 1.700000 34183 28.0000 1.6125000 26.62500 98.33333 4.075000 136.2500 8.466667 96.75000 13.12500 1.771429 7.310 30.50000 0.6 65.5 35

Cleaning

apply(data, 2, is.numeric) 
##                    group                  outcome                      age 
##                     TRUE                     TRUE                     TRUE 
##                  gendera                      BMI             hypertensive 
##                     TRUE                     TRUE                     TRUE 
##       atrialfibrillation           CHD with no MI                 diabetes 
##                     TRUE                     TRUE                     TRUE 
##        deficiencyanemias               depression             Hyperlipemia 
##                     TRUE                     TRUE                     TRUE 
##            Renal failure                     COPD               heart rate 
##                     TRUE                     TRUE                     TRUE 
##  Systolic blood pressure Diastolic blood pressure         Respiratory rate 
##                     TRUE                     TRUE                     TRUE 
##              temperature                    SP O2             Urine output 
##                     TRUE                     TRUE                     TRUE 
##               hematocrit                      RBC                      MCH 
##                     TRUE                     TRUE                     TRUE 
##                     MCHC                      MCV                      RDW 
##                     TRUE                     TRUE                     TRUE 
##                Leucocyte                Platelets              Neutrophils 
##                     TRUE                     TRUE                     TRUE 
##                Basophils               Lymphocyte                       PT 
##                     TRUE                     TRUE                     TRUE 
##                      INR                NT-proBNP          Creatine kinase 
##                     TRUE                     TRUE                     TRUE 
##               Creatinine            Urea nitrogen                  glucose 
##                     TRUE                     TRUE                     TRUE 
##          Blood potassium             Blood sodium            Blood calcium 
##                     TRUE                     TRUE                     TRUE 
##                 Chloride                Anion gap            Magnesium ion 
##                     TRUE                     TRUE                     TRUE 
##                       PH              Bicarbonate              Lactic acid 
##                     TRUE                     TRUE                     TRUE 
##                     PCO2                       EF 
##                     TRUE                     TRUE
data_no_na= data |> 
  mutate_if(is.numeric, ~replace_na(.,mean(., na.rm = TRUE)))

data_no_na |>  is.na() |> sum()
## [1] 0

Turning grouping in factor

data_no_na$group = as.factor(data_no_na$group)
table(data_no_na$group) |> prop.table()
## 
##         1         2 
## 0.7009346 0.2990654

Rename Columns

data_to_model = data_no_na
data_to_model = rename(data_to_model, "Class"="group")
data_to_model = as.data.frame(data_to_model)

names_data_to_model= str_replace_all( names(data_to_model),"-", "_")  
names_data_to_model =str_replace_all(names_data_to_model," ","_")


names(data_to_model) = names_data_to_model
names(data_to_model)
##  [1] "Class"                    "outcome"                 
##  [3] "age"                      "gendera"                 
##  [5] "BMI"                      "hypertensive"            
##  [7] "atrialfibrillation"       "CHD_with_no_MI"          
##  [9] "diabetes"                 "deficiencyanemias"       
## [11] "depression"               "Hyperlipemia"            
## [13] "Renal_failure"            "COPD"                    
## [15] "heart_rate"               "Systolic_blood_pressure" 
## [17] "Diastolic_blood_pressure" "Respiratory_rate"        
## [19] "temperature"              "SP_O2"                   
## [21] "Urine_output"             "hematocrit"              
## [23] "RBC"                      "MCH"                     
## [25] "MCHC"                     "MCV"                     
## [27] "RDW"                      "Leucocyte"               
## [29] "Platelets"                "Neutrophils"             
## [31] "Basophils"                "Lymphocyte"              
## [33] "PT"                       "INR"                     
## [35] "NT_proBNP"                "Creatine_kinase"         
## [37] "Creatinine"               "Urea_nitrogen"           
## [39] "glucose"                  "Blood_potassium"         
## [41] "Blood_sodium"             "Blood_calcium"           
## [43] "Chloride"                 "Anion_gap"               
## [45] "Magnesium_ion"            "PH"                      
## [47] "Bicarbonate"              "Lactic_acid"             
## [49] "PCO2"                     "EF"

OverSample

rows_over=filter(data_to_model, Class == "Alive") |> nrow() 
rows_to_fill= nrow(data_to_model) + (nrow(data_to_model) - rows_over)


data_balanced_over <- ovun.sample(Class ~ ., data = data_to_model, method = "over",N = rows_to_fill)$data

Split in test and training

data_to_model = data_balanced_over
set.seed(123)
sample_to_model <- sample(c(TRUE, FALSE), nrow(data_to_model), replace=TRUE, prob=c(0.7,0.3))
train  <- data_to_model[sample_to_model, ]
test   <- data_to_model[!sample_to_model, ]

Training the model

set.seed (123)
tune.out = tune(svm,
                Class ~ .,
                data=train,
                kernel ="radial",
                ranges =list (cost=c(0.1 ,1 ,10 ,100 ,1000),
                                gamma =c(0.5 ,1 ,2 ,3 ,4) ))



svm_model_tune = svm(Class ~ ., data = train, kernel = "radial", cost = 1,scale = TRUE,gamma=0.5)


caret::confusionMatrix(svm_model_tune$fitted,train$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1  581    0
##          2    0 1075
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9978, 1)
##     No Information Rate : 0.6492     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.3508     
##          Detection Rate : 0.3508     
##    Detection Prevalence : 0.3508     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
## 

Test Model

results_tune= predict(svm_model_tune,test[,!names(test) %in% c("Class")])

table(results_tune,test$Class)
##             
## results_tune   1   2
##            1 244  23
##            2   0 431
caret::confusionMatrix(results_tune,test$Class)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 244  23
##          2   0 431
##                                         
##                Accuracy : 0.967         
##                  95% CI : (0.951, 0.979)
##     No Information Rate : 0.6504        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.9291        
##                                         
##  Mcnemar's Test P-Value : 4.49e-06      
##                                         
##             Sensitivity : 1.0000        
##             Specificity : 0.9493        
##          Pos Pred Value : 0.9139        
##          Neg Pred Value : 1.0000        
##              Prevalence : 0.3496        
##          Detection Rate : 0.3496        
##    Detection Prevalence : 0.3825        
##       Balanced Accuracy : 0.9747        
##                                         
##        'Positive' Class : 1             
## 
roc_svm_tune_test <- roc(response = test$Class, predictor =as.numeric(results_tune),
                    smoothed = TRUE,
                    # arguments for ci
                    ci=TRUE, ci.alpha=0.9, stratified=FALSE,
                    # arguments for plot
                    plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                    print.auc=TRUE, show.thres=TRUE)

sens.ci <- ci.se(roc_svm_tune_test)
plot(sens.ci, type="shape", col="lightblue")
plot(sens.ci, type="bars")

Conclusion

This model has great accuracy in predicting which patient is more likely to die or live. Perhaps it can be tested with other datasets and help healthcare staff save more lives.