Library

library(here)
library(tidyverse)
library(e1071)
library(caret)
library(pROC)
library(ROSE)

knitr::include_graphics(here("camilo-jimenez-vGu08RYjO-s-unsplash.jpg"))

Photo by @camstejim from Unsplash

Objective

The objective of this project is to make an SVM model capable of predicting whether a patient will die or not.

Import the data

data = read_csv(here('dados/data01.csv'))
data=data[,-2]

data |> head() |> knitr::kable()

group	age	gendera	BMI	hypertensive	atrialfibrillation	diabetes	deficiencyanemias	Hyperlipemia	Renal failure	COPD	heart rate	Systolic blood pressure	Diastolic blood pressure	Respiratory rate	temperature	SP O2	Urine output	hematocrit	RBC	MCH	MCHC	MCV	RDW	Leucocyte	Platelets	Neutrophils	Basophils	Lymphocyte	PT	INR	NT-proBNP	Creatine kinase	Creatinine	Urea nitrogen	glucose	Blood potassium	Blood sodium	Blood calcium	Chloride	Anion gap	Magnesium ion	PH	Bicarbonate	Lactic acid	PCO2	EF
1	72	1	37.58818	0	0	1	1	1	1	0	68.83784	155.8667	68.33333	16.62162	36.71429	98.39474	2155	26.27273	2.960000	28.25000	31.52000	89.900	16.22000	7.650000	305.100	74.65	0.40	13.3	10.60000	1.000000	1956	148.0000	1.9583333	50.00000	114.63636	4.816667	138.7500	7.463636	109.16667	13.16667	2.618182	7.230	21.16667	0.5	40.0	55
1	75	2	NA	0	0	0	1	0	0	1	101.37037	140.0000	65.00000	20.85185	36.68254	96.92308	1425	30.78000	3.138000	31.06000	31.66000	98.200	14.26000	12.740000	246.400	NA	NA	NA	NA	NA	2384	60.6000	1.1222222	20.33333	147.50000	4.450000	138.8889	8.162500	98.44444	11.44444	1.887500	7.225	33.44444	0.5	78.0	55
1	83	2	26.57263	0	0	0	1	0	1	0	72.31818	135.3333	61.37500	23.64000	36.45370	95.29167	2425	27.70000	2.620000	34.32000	31.30000	109.800	23.82000	5.480000	204.200	68.10	0.55	24.5	11.27500	0.950000	4081	16.0000	1.8714286	33.85714	149.00000	5.825000	140.7143	8.266667	105.85714	10.00000	2.157143	7.268	30.57143	0.5	71.5	35
1	43	2	83.26463	0	0	0	0	0	0	0	94.50000	126.4000	73.20000	21.85714	36.28704	93.84615	8760	36.63750	4.277500	26.06250	30.41250	85.625	17.03750	8.225000	216.375	81.80	0.15	14.5	27.06667	2.666667	668	85.0000	0.5857143	15.28571	128.25000	4.386667	138.5000	9.476923	92.07143	12.35714	1.942857	7.370	38.57143	0.6	75.0	55
1	75	2	31.82484	1	0	0	1	0	1	1	67.92000	156.5600	58.12000	21.36000	36.76190	99.28000	4455	29.93333	3.286667	30.66667	33.66667	91.000	16.26667	8.833333	251.000	NA	NA	NA	NA	NA	30802	111.6667	1.9500000	43.00000	145.75000	4.783333	136.6667	8.733333	104.50000	15.16667	1.650000	7.250	22.00000	0.6	50.0	55
1	76	1	24.26229	1	1	0	1	1	1	1	74.18182	118.1000	52.95000	20.54545	35.26667	96.81818	1840	27.33333	3.235000	26.56667	31.48333	84.500	16.51667	9.516667	273.000	85.40	0.30	9.3	18.78333	1.700000	34183	28.0000	1.6125000	26.62500	98.33333	4.075000	136.2500	8.466667	96.75000	13.12500	1.771429	7.310	30.50000	0.6	65.5	35

Cleaning

apply(data, 2, is.numeric)

##                    group                  outcome                      age 
##                     TRUE                     TRUE                     TRUE 
##                  gendera                      BMI             hypertensive 
##                     TRUE                     TRUE                     TRUE 
##       atrialfibrillation           CHD with no MI                 diabetes 
##                     TRUE                     TRUE                     TRUE 
##        deficiencyanemias               depression             Hyperlipemia 
##                     TRUE                     TRUE                     TRUE 
##            Renal failure                     COPD               heart rate 
##                     TRUE                     TRUE                     TRUE 
##  Systolic blood pressure Diastolic blood pressure         Respiratory rate 
##                     TRUE                     TRUE                     TRUE 
##              temperature                    SP O2             Urine output 
##                     TRUE                     TRUE                     TRUE 
##               hematocrit                      RBC                      MCH 
##                     TRUE                     TRUE                     TRUE 
##                     MCHC                      MCV                      RDW 
##                     TRUE                     TRUE                     TRUE 
##                Leucocyte                Platelets              Neutrophils 
##                     TRUE                     TRUE                     TRUE 
##                Basophils               Lymphocyte                       PT 
##                     TRUE                     TRUE                     TRUE 
##                      INR                NT-proBNP          Creatine kinase 
##                     TRUE                     TRUE                     TRUE 
##               Creatinine            Urea nitrogen                  glucose 
##                     TRUE                     TRUE                     TRUE 
##          Blood potassium             Blood sodium            Blood calcium 
##                     TRUE                     TRUE                     TRUE 
##                 Chloride                Anion gap            Magnesium ion 
##                     TRUE                     TRUE                     TRUE 
##                       PH              Bicarbonate              Lactic acid 
##                     TRUE                     TRUE                     TRUE 
##                     PCO2                       EF 
##                     TRUE                     TRUE

data_no_na= data |> 
  mutate_if(is.numeric, ~replace_na(.,mean(., na.rm = TRUE)))

data_no_na |>  is.na() |> sum()

## [1] 0

Turning grouping in factor

data_no_na$group = as.factor(data_no_na$group)
table(data_no_na$group) |> prop.table()

## 
##         1         2 
## 0.7009346 0.2990654

Rename Columns

data_to_model = data_no_na
data_to_model = rename(data_to_model, "Class"="group")
data_to_model = as.data.frame(data_to_model)

names_data_to_model= str_replace_all( names(data_to_model),"-", "_")  
names_data_to_model =str_replace_all(names_data_to_model," ","_")


names(data_to_model) = names_data_to_model
names(data_to_model)

##  [1] "Class"                    "outcome"                 
##  [3] "age"                      "gendera"                 
##  [5] "BMI"                      "hypertensive"            
##  [7] "atrialfibrillation"       "CHD_with_no_MI"          
##  [9] "diabetes"                 "deficiencyanemias"       
## [11] "depression"               "Hyperlipemia"            
## [13] "Renal_failure"            "COPD"                    
## [15] "heart_rate"               "Systolic_blood_pressure" 
## [17] "Diastolic_blood_pressure" "Respiratory_rate"        
## [19] "temperature"              "SP_O2"                   
## [21] "Urine_output"             "hematocrit"              
## [23] "RBC"                      "MCH"                     
## [25] "MCHC"                     "MCV"                     
## [27] "RDW"                      "Leucocyte"               
## [29] "Platelets"                "Neutrophils"             
## [31] "Basophils"                "Lymphocyte"              
## [33] "PT"                       "INR"                     
## [35] "NT_proBNP"                "Creatine_kinase"         
## [37] "Creatinine"               "Urea_nitrogen"           
## [39] "glucose"                  "Blood_potassium"         
## [41] "Blood_sodium"             "Blood_calcium"           
## [43] "Chloride"                 "Anion_gap"               
## [45] "Magnesium_ion"            "PH"                      
## [47] "Bicarbonate"              "Lactic_acid"             
## [49] "PCO2"                     "EF"

OverSample

rows_over=filter(data_to_model, Class == "Alive") |> nrow() 
rows_to_fill= nrow(data_to_model) + (nrow(data_to_model) - rows_over)


data_balanced_over <- ovun.sample(Class ~ ., data = data_to_model, method = "over",N = rows_to_fill)$data

Split in test and training

data_to_model = data_balanced_over
set.seed(123)
sample_to_model <- sample(c(TRUE, FALSE), nrow(data_to_model), replace=TRUE, prob=c(0.7,0.3))
train  <- data_to_model[sample_to_model, ]
test   <- data_to_model[!sample_to_model, ]

Training the model

set.seed (123)
tune.out = tune(svm,
                Class ~ .,
                data=train,
                kernel ="radial",
                ranges =list (cost=c(0.1 ,1 ,10 ,100 ,1000),
                                gamma =c(0.5 ,1 ,2 ,3 ,4) ))



svm_model_tune = svm(Class ~ ., data = train, kernel = "radial", cost = 1,scale = TRUE,gamma=0.5)


caret::confusionMatrix(svm_model_tune$fitted,train$Class)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    1    2
##          1  581    0
##          2    0 1075
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9978, 1)
##     No Information Rate : 0.6492     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.3508     
##          Detection Rate : 0.3508     
##    Detection Prevalence : 0.3508     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : 1          
##

Test Model

results_tune= predict(svm_model_tune,test[,!names(test) %in% c("Class")])

table(results_tune,test$Class)

##             
## results_tune   1   2
##            1 244  23
##            2   0 431

caret::confusionMatrix(results_tune,test$Class)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2
##          1 244  23
##          2   0 431
##                                         
##                Accuracy : 0.967         
##                  95% CI : (0.951, 0.979)
##     No Information Rate : 0.6504        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.9291        
##                                         
##  Mcnemar's Test P-Value : 4.49e-06      
##                                         
##             Sensitivity : 1.0000        
##             Specificity : 0.9493        
##          Pos Pred Value : 0.9139        
##          Neg Pred Value : 1.0000        
##              Prevalence : 0.3496        
##          Detection Rate : 0.3496        
##    Detection Prevalence : 0.3825        
##       Balanced Accuracy : 0.9747        
##                                         
##        'Positive' Class : 1             
##

roc_svm_tune_test <- roc(response = test$Class, predictor =as.numeric(results_tune),
                    smoothed = TRUE,
                    # arguments for ci
                    ci=TRUE, ci.alpha=0.9, stratified=FALSE,
                    # arguments for plot
                    plot=TRUE, auc.polygon=TRUE, max.auc.polygon=TRUE, grid=TRUE,
                    print.auc=TRUE, show.thres=TRUE)

sens.ci <- ci.se(roc_svm_tune_test)
plot(sens.ci, type="shape", col="lightblue")
plot(sens.ci, type="bars")

Conclusion

This model has great accuracy in predicting which patient is more likely to die or live. Perhaps it can be tested with other datasets and help healthcare staff save more lives.

Kaggle - Hospital Mortality Model

Romario Gomes

2023-04-26