#Predicting survival of patients with heart failure. This is a classification problem.
#Import the dataset
library(readr)
# Read the heart-failure records from CSV into a tibble.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file exists.
heartFailure <- read_csv(
  file = "C:/Users/dnred/OneDrive/Desktop/CS 583/heartFailure.csv"
)
## Rows: 299 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): age, anaemia, creatinine_phosphokinase, diabetes, ejection_fractio...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# View() opens a GUI data viewer, so only call it in an interactive session;
# this keeps the script runnable under Rscript / knitr / headless machines.
if (interactive()) {
  View(heartFailure)
}
#Load the packages
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.3.3
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
##
## Attaching package: 'dendextend'
##
## The following object is masked from 'package:stats':
##
## cutree
library(cluster)
library(fpc)
## Warning: package 'fpc' was built under R version 4.3.3
library(clValid)
## Warning: package 'clValid' was built under R version 4.3.3
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
library(rpart)
##
## Attaching package: 'rpart'
##
## The following object is masked from 'package:dendextend':
##
## prune
library(rattle)
## Warning: package 'rattle' was built under R version 4.3.3
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.3.3
library(RColorBrewer)
library(class)
## Warning: package 'class' was built under R version 4.3.3
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.3.3
library(kernlab)
##
## Attaching package: 'kernlab'
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(C50)
## Warning: package 'C50' was built under R version 4.3.3
# Statistical Analysis ----
# Work on `hf`; the imported object stays available as `heartFailure`.
hf <- heartFailure
# Compact overview: column names, types and leading values.
str(object = hf)
## spc_tbl_ [299 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:299] 75 55 65 50 65 90 75 60 65 80 ...
## $ anaemia : num [1:299] 0 0 0 1 1 1 1 1 0 1 ...
## $ creatinine_phosphokinase: num [1:299] 582 7861 146 111 160 ...
## $ diabetes : num [1:299] 0 0 0 0 1 0 0 1 0 0 ...
## $ ejection_fraction : num [1:299] 20 38 20 20 20 40 15 60 65 35 ...
## $ high_blood_pressure : num [1:299] 1 0 0 0 0 1 0 0 0 1 ...
## $ platelets : num [1:299] 265000 263358 162000 210000 327000 ...
## $ serum_creatinine : num [1:299] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
## $ serum_sodium : num [1:299] 130 136 129 137 116 132 137 131 138 133 ...
## $ sex : num [1:299] 1 1 1 1 0 1 1 1 0 1 ...
## $ smoking : num [1:299] 0 0 1 0 0 1 0 1 0 1 ...
## $ time : num [1:299] 4 6 7 7 8 8 10 10 10 10 ...
## $ DEATH_EVENT : num [1:299] 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. anaemia = col_double(),
## .. creatinine_phosphokinase = col_double(),
## .. diabetes = col_double(),
## .. ejection_fraction = col_double(),
## .. high_blood_pressure = col_double(),
## .. platelets = col_double(),
## .. serum_creatinine = col_double(),
## .. serum_sodium = col_double(),
## .. sex = col_double(),
## .. smoking = col_double(),
## .. time = col_double(),
## .. DEATH_EVENT = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(hf)
}
# Summarize the dataset: min, quartiles, mean and max of every column.
summary(hf)
## age anaemia creatinine_phosphokinase diabetes
## Min. :40.00 Min. :0.0000 Min. : 23.0 Min. :0.0000
## 1st Qu.:51.00 1st Qu.:0.0000 1st Qu.: 116.5 1st Qu.:0.0000
## Median :60.00 Median :0.0000 Median : 250.0 Median :0.0000
## Mean :60.83 Mean :0.4314 Mean : 581.8 Mean :0.4181
## 3rd Qu.:70.00 3rd Qu.:1.0000 3rd Qu.: 582.0 3rd Qu.:1.0000
## Max. :95.00 Max. :1.0000 Max. :7861.0 Max. :1.0000
## ejection_fraction high_blood_pressure platelets serum_creatinine
## Min. :14.00 Min. :0.0000 Min. : 25100 Min. :0.500
## 1st Qu.:30.00 1st Qu.:0.0000 1st Qu.:212500 1st Qu.:0.900
## Median :38.00 Median :0.0000 Median :262000 Median :1.100
## Mean :38.08 Mean :0.3512 Mean :263358 Mean :1.394
## 3rd Qu.:45.00 3rd Qu.:1.0000 3rd Qu.:303500 3rd Qu.:1.400
## Max. :80.00 Max. :1.0000 Max. :850000 Max. :9.400
## serum_sodium sex smoking time
## Min. :113.0 Min. :0.0000 Min. :0.0000 Min. : 4.0
## 1st Qu.:134.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 73.0
## Median :137.0 Median :1.0000 Median :0.0000 Median :115.0
## Mean :136.6 Mean :0.6488 Mean :0.3211 Mean :130.3
## 3rd Qu.:140.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:203.0
## Max. :148.0 Max. :1.0000 Max. :1.0000 Max. :285.0
## DEATH_EVENT
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3211
## 3rd Qu.:1.0000
## Max. :1.0000
# Display the first 20 rows of the data.
head(x = hf, n = 20L)
## # A tibble: 20 × 13
## age anaemia creatinine_phosphokinase diabetes ejection_fraction
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 75 0 582 0 20
## 2 55 0 7861 0 38
## 3 65 0 146 0 20
## 4 50 1 111 0 20
## 5 65 1 160 1 20
## 6 90 1 47 0 40
## 7 75 1 246 0 15
## 8 60 1 315 1 60
## 9 65 0 157 0 65
## 10 80 1 123 0 35
## 11 75 1 81 0 38
## 12 62 0 231 0 25
## 13 45 1 981 0 30
## 14 50 1 168 0 38
## 15 49 1 80 0 30
## 16 82 1 379 0 50
## 17 87 1 149 0 38
## 18 45 0 582 0 14
## 19 70 1 125 0 25
## 20 48 1 582 1 55
## # ℹ 8 more variables: high_blood_pressure <dbl>, platelets <dbl>,
## # serum_creatinine <dbl>, serum_sodium <dbl>, sex <dbl>, smoking <dbl>,
## # time <dbl>, DEATH_EVENT <dbl>
# Display the dimensions of the dataset (rows, columns).
dim(hf)
## [1] 299 13
# List the type of each attribute.
# vapply() always returns a named character vector (one class per column),
# whereas sapply() silently changes its return type with the input.
vapply(hf, function(column) class(column)[1], character(1))
## age anaemia creatinine_phosphokinase
## "numeric" "numeric" "numeric"
## diabetes ejection_fraction high_blood_pressure
## "numeric" "numeric" "numeric"
## platelets serum_creatinine serum_sodium
## "numeric" "numeric" "numeric"
## sex smoking time
## "numeric" "numeric" "numeric"
## DEATH_EVENT
## "numeric"
# Distribution of the class variable: absolute counts and percentages.
y <- hf$DEATH_EVENT
class_counts <- table(y)
cbind(freq = class_counts, percentage = prop.table(class_counts) * 100)
## freq percentage
## 0 203 67.89298
## 1 96 32.10702
# Standard deviation of every attribute (one numeric value per column).
vapply(hf[, 1:13], sd, numeric(1))
## age anaemia creatinine_phosphokinase
## 1.189481e+01 4.961073e-01 9.702879e+02
## diabetes ejection_fraction high_blood_pressure
## 4.940671e-01 1.183484e+01 4.781364e-01
## platelets serum_creatinine serum_sodium
## 9.780424e+04 1.034510e+00 4.412477e+00
## sex smoking time
## 4.781364e-01 4.676704e-01 7.761421e+01
## DEATH_EVENT
## 4.676704e-01
# Skewness of each variable (skewness() presumably comes from e1071, loaded
# above). Iterate over the columns directly: apply() would first coerce the
# whole tibble to a matrix, which is unnecessary and unsafe for mixed types.
skew <- vapply(hf[, 1:13], skewness, numeric(1))
# Display skewness; values further from 0 (in either direction) mean more skew.
print(skew)
## age anaemia creatinine_phosphokinase
## 0.4188266 0.2754750 4.4184296
## diabetes ejection_fraction high_blood_pressure
## 0.3305857 0.5498228 0.6204576
## platelets serum_creatinine serum_sodium
## 1.4476814 4.4113866 -1.0376430
## sex smoking time
## -0.6204576 0.7626368 0.1265232
## DEATH_EVENT
## 0.7626368
# Pairwise correlation matrix of all 13 columns (Pearson, the cor() default).
correlations <- cor(hf[, 1:13], method = "pearson")
# Display the correlation matrix.
print(correlations)
## age anaemia creatinine_phosphokinase
## age 1.00000000 0.08800644 -0.081583900
## anaemia 0.08800644 1.00000000 -0.190741030
## creatinine_phosphokinase -0.08158390 -0.19074103 1.000000000
## diabetes -0.10101239 -0.01272905 -0.009638514
## ejection_fraction 0.06009836 0.03155697 -0.044079554
## high_blood_pressure 0.09328868 0.03818200 -0.070589980
## platelets -0.05235437 -0.04378555 0.024463389
## serum_creatinine 0.15918713 0.05217360 -0.016408480
## serum_sodium -0.04596584 0.04188161 0.059550156
## sex 0.06542952 -0.09476896 0.079790629
## smoking 0.01866787 -0.10728984 0.002421235
## time -0.22406842 -0.14141398 -0.009345653
## DEATH_EVENT 0.25372854 0.06627010 0.062728160
## diabetes ejection_fraction high_blood_pressure
## age -0.101012385 0.06009836 0.093288685
## anaemia -0.012729046 0.03155697 0.038182003
## creatinine_phosphokinase -0.009638514 -0.04407955 -0.070589980
## diabetes 1.000000000 -0.00485031 -0.012732382
## ejection_fraction -0.004850310 1.00000000 0.024444731
## high_blood_pressure -0.012732382 0.02444473 1.000000000
## platelets 0.092192828 0.07217747 0.049963481
## serum_creatinine -0.046975315 -0.01130247 -0.004934525
## serum_sodium -0.089550619 0.17590228 0.037109470
## sex -0.157729504 -0.14838597 -0.104614629
## smoking -0.147173413 -0.06731457 -0.055711369
## time 0.033725509 0.04172924 -0.196439479
## DEATH_EVENT -0.001942883 -0.26860331 0.079351058
## platelets serum_creatinine serum_sodium sex
## age -0.05235437 0.159187133 -0.045965841 0.065429524
## anaemia -0.04378555 0.052173604 0.041881610 -0.094768961
## creatinine_phosphokinase 0.02446339 -0.016408480 0.059550156 0.079790629
## diabetes 0.09219283 -0.046975315 -0.089550619 -0.157729504
## ejection_fraction 0.07217747 -0.011302475 0.175902282 -0.148385965
## high_blood_pressure 0.04996348 -0.004934525 0.037109470 -0.104614629
## platelets 1.00000000 -0.041198077 0.062124619 -0.125120483
## serum_creatinine -0.04119808 1.000000000 -0.189095210 0.006969778
## serum_sodium 0.06212462 -0.189095210 1.000000000 -0.027566123
## sex -0.12512048 0.006969778 -0.027566123 1.000000000
## smoking 0.02823445 -0.027414135 0.004813195 0.445891712
## time 0.01051391 -0.149315418 0.087640000 -0.015608220
## DEATH_EVENT -0.04913887 0.294277561 -0.195203596 -0.004316376
## smoking time DEATH_EVENT
## age 0.018667868 -0.224068420 0.253728543
## anaemia -0.107289838 -0.141413982 0.066270098
## creatinine_phosphokinase 0.002421235 -0.009345653 0.062728160
## diabetes -0.147173413 0.033725509 -0.001942883
## ejection_fraction -0.067314567 0.041729235 -0.268603312
## high_blood_pressure -0.055711369 -0.196439479 0.079351058
## platelets 0.028234448 0.010513909 -0.049138868
## serum_creatinine -0.027414135 -0.149315418 0.294277561
## serum_sodium 0.004813195 0.087640000 -0.195203596
## sex 0.445891712 -0.015608220 -0.004316376
## smoking 1.000000000 -0.022838942 -0.012623153
## time -0.022838942 1.000000000 -0.526963779
## DEATH_EVENT -0.012623153 -0.526963779 1.000000000
# Correlation plot: one circle per pair of variables.
corrplot(corr = correlations, method = "circle")

# Proportion of death events: the mean of the 0/1 DEATH_EVENT column.
death_proportion <- mean(hf[["DEATH_EVENT"]])
# Print the proportion.
print(death_proportion)
## [1] 0.3210702
# Removing missing (NA) values
# Report whether any cell of the dataset is NA.
anyNA(hf)
## [1] FALSE
# Drop every row that contains an NA, then re-check the structure.
hf <- na.omit(hf)
str(object = hf)
## tibble [299 × 13] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:299] 75 55 65 50 65 90 75 60 65 80 ...
## $ anaemia : num [1:299] 0 0 0 1 1 1 1 1 0 1 ...
## $ creatinine_phosphokinase: num [1:299] 582 7861 146 111 160 ...
## $ diabetes : num [1:299] 0 0 0 0 1 0 0 1 0 0 ...
## $ ejection_fraction : num [1:299] 20 38 20 20 20 40 15 60 65 35 ...
## $ high_blood_pressure : num [1:299] 1 0 0 0 0 1 0 0 0 1 ...
## $ platelets : num [1:299] 265000 263358 162000 210000 327000 ...
## $ serum_creatinine : num [1:299] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
## $ serum_sodium : num [1:299] 130 136 129 137 116 132 137 131 138 133 ...
## $ sex : num [1:299] 1 1 1 1 0 1 1 1 0 1 ...
## $ smoking : num [1:299] 0 0 1 0 0 1 0 1 0 1 ...
## $ time : num [1:299] 4 6 7 7 8 8 10 10 10 10 ...
## $ DEATH_EVENT : num [1:299] 1 1 1 1 1 1 1 1 1 1 ...
# View() opens a GUI data viewer, so only call it in an interactive session.
if (interactive()) {
  View(hf)
}
# Scale Data
# The scale transform calculates the standard deviation for an attribute and
# divides each value by that standard deviation.
# Summarize the data before transforming it.
summary(hf[, 1:13])
## age anaemia creatinine_phosphokinase diabetes
## Min. :40.00 Min. :0.0000 Min. : 23.0 Min. :0.0000
## 1st Qu.:51.00 1st Qu.:0.0000 1st Qu.: 116.5 1st Qu.:0.0000
## Median :60.00 Median :0.0000 Median : 250.0 Median :0.0000
## Mean :60.83 Mean :0.4314 Mean : 581.8 Mean :0.4181
## 3rd Qu.:70.00 3rd Qu.:1.0000 3rd Qu.: 582.0 3rd Qu.:1.0000
## Max. :95.00 Max. :1.0000 Max. :7861.0 Max. :1.0000
## ejection_fraction high_blood_pressure platelets serum_creatinine
## Min. :14.00 Min. :0.0000 Min. : 25100 Min. :0.500
## 1st Qu.:30.00 1st Qu.:0.0000 1st Qu.:212500 1st Qu.:0.900
## Median :38.00 Median :0.0000 Median :262000 Median :1.100
## Mean :38.08 Mean :0.3512 Mean :263358 Mean :1.394
## 3rd Qu.:45.00 3rd Qu.:1.0000 3rd Qu.:303500 3rd Qu.:1.400
## Max. :80.00 Max. :1.0000 Max. :850000 Max. :9.400
## serum_sodium sex smoking time
## Min. :113.0 Min. :0.0000 Min. :0.0000 Min. : 4.0
## 1st Qu.:134.0 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 73.0
## Median :137.0 Median :1.0000 Median :0.0000 Median :115.0
## Mean :136.6 Mean :0.6488 Mean :0.3211 Mean :130.3
## 3rd Qu.:140.0 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:203.0
## Max. :148.0 Max. :1.0000 Max. :1.0000 Max. :285.0
## DEATH_EVENT
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3211
## 3rd Qu.:1.0000
## Max. :1.0000
# Estimate the scaling parameters from the dataset.
# NOTE(review): columns 1:13 include the label DEATH_EVENT, so the outcome is
# rescaled together with the predictors -- confirm this is intended.
preprocessParams <- preProcess(x = hf[, 1:13], method = "scale")
# Summarize the transform parameters.
print(preprocessParams)
## Created from 299 samples and 13 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (13)
# Apply the stored parameters to the dataset.
transformed <- predict(object = preprocessParams, newdata = hf[, 1:13])
# Summarize the transformed dataset.
summary(transformed)
## age anaemia creatinine_phosphokinase diabetes
## Min. :3.363 Min. :0.0000 Min. :0.0237 Min. :0.0000
## 1st Qu.:4.288 1st Qu.:0.0000 1st Qu.:0.1201 1st Qu.:0.0000
## Median :5.044 Median :0.0000 Median :0.2577 Median :0.0000
## Mean :5.114 Mean :0.8696 Mean :0.5997 Mean :0.8462
## 3rd Qu.:5.885 3rd Qu.:2.0157 3rd Qu.:0.5998 3rd Qu.:2.0240
## Max. :7.987 Max. :2.0157 Max. :8.1017 Max. :2.0240
## ejection_fraction high_blood_pressure platelets serum_creatinine
## Min. :1.183 Min. :0.0000 Min. :0.2566 Min. :0.4833
## 1st Qu.:2.535 1st Qu.:0.0000 1st Qu.:2.1727 1st Qu.:0.8700
## Median :3.211 Median :0.0000 Median :2.6788 Median :1.0633
## Mean :3.218 Mean :0.7345 Mean :2.6927 Mean :1.3474
## 3rd Qu.:3.802 3rd Qu.:2.0915 3rd Qu.:3.1031 3rd Qu.:1.3533
## Max. :6.760 Max. :2.0915 Max. :8.6908 Max. :9.0864
## serum_sodium sex smoking time
## Min. :25.61 Min. :0.000 Min. :0.0000 Min. :0.05154
## 1st Qu.:30.37 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.94055
## Median :31.05 Median :2.091 Median :0.0000 Median :1.48169
## Mean :30.96 Mean :1.357 Mean :0.6865 Mean :1.67831
## 3rd Qu.:31.73 3rd Qu.:2.091 3rd Qu.:2.1383 3rd Qu.:2.61550
## Max. :33.54 Max. :2.091 Max. :2.1383 Max. :3.67201
## DEATH_EVENT
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.6865
## 3rd Qu.:2.1383
## Max. :2.1383
# Center Data
# The center transform calculates the mean for an attribute and subtracts it
# from each value.
# Estimate the centering parameters from the dataset.
# NOTE(review): columns 1:13 include the label DEATH_EVENT, so the outcome is
# centered together with the predictors -- confirm this is intended.
preprocessParams <- preProcess(x = hf[, 1:13], method = "center")
# Summarize the transform parameters.
print(preprocessParams)
## Created from 299 samples and 13 variables
##
## Pre-processing:
## - centered (13)
## - ignored (0)
# Apply the stored parameters to the dataset.
transformed <- predict(object = preprocessParams, newdata = hf[, 1:13])
# Summarize the transformed dataset.
summary(transformed)
## age anaemia creatinine_phosphokinase
## Min. :-20.8339 Min. :-0.4314 Min. :-558.839
## 1st Qu.: -9.8339 1st Qu.:-0.4314 1st Qu.:-465.339
## Median : -0.8339 Median :-0.4314 Median :-331.839
## Mean : 0.0000 Mean : 0.0000 Mean : 0.000
## 3rd Qu.: 9.1661 3rd Qu.: 0.5686 3rd Qu.: 0.161
## Max. : 34.1661 Max. : 0.5686 Max. :7279.161
## diabetes ejection_fraction high_blood_pressure platelets
## Min. :-0.4181 Min. :-24.08361 Min. :-0.3512 Min. :-238258
## 1st Qu.:-0.4181 1st Qu.: -8.08361 1st Qu.:-0.3512 1st Qu.: -50858
## Median :-0.4181 Median : -0.08361 Median :-0.3512 Median : -1358
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0
## 3rd Qu.: 0.5819 3rd Qu.: 6.91639 3rd Qu.: 0.6488 3rd Qu.: 40142
## Max. : 0.5819 Max. : 41.91639 Max. : 0.6488 Max. : 586642
## serum_creatinine serum_sodium sex smoking
## Min. :-0.89388 Min. :-23.6254 Min. :-0.6488 Min. :-0.3211
## 1st Qu.:-0.49388 1st Qu.: -2.6254 1st Qu.:-0.6488 1st Qu.:-0.3211
## Median :-0.29388 Median : 0.3746 Median : 0.3512 Median :-0.3211
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.00612 3rd Qu.: 3.3746 3rd Qu.: 0.3512 3rd Qu.: 0.6789
## Max. : 8.00612 Max. : 11.3746 Max. : 0.3512 Max. : 0.6789
## time DEATH_EVENT
## Min. :-126.26 Min. :-0.3211
## 1st Qu.: -57.26 1st Qu.:-0.3211
## Median : -15.26 Median :-0.3211
## Mean : 0.00 Mean : 0.0000
## 3rd Qu.: 72.74 3rd Qu.: 0.6789
## Max. : 154.74 Max. : 0.6789
# Standardize Data (center, then scale)
# Estimate the centering and scaling parameters from the dataset.
# NOTE(review): columns 1:13 include the label DEATH_EVENT, so the outcome is
# standardized together with the predictors -- confirm this is intended.
preprocessParams <- preProcess(x = hf[, 1:13], method = c("center", "scale"))
# Summarize the transform parameters.
print(preprocessParams)
## Created from 299 samples and 13 variables
##
## Pre-processing:
## - centered (13)
## - ignored (0)
## - scaled (13)
# Apply the stored parameters to the dataset.
transformed <- predict(object = preprocessParams, newdata = hf[, 1:13])
# Summarize the transformed dataset.
summary(transformed)
## age anaemia creatinine_phosphokinase
## Min. :-1.75151 Min. :-0.8696 Min. :-0.575952
## 1st Qu.:-0.82674 1st Qu.:-0.8696 1st Qu.:-0.479589
## Median :-0.07011 Median :-0.8696 Median :-0.342001
## Mean : 0.00000 Mean : 0.0000 Mean : 0.000000
## 3rd Qu.: 0.77060 3rd Qu.: 1.1460 3rd Qu.: 0.000165
## Max. : 2.87235 Max. : 1.1460 Max. : 7.502063
## diabetes ejection_fraction high_blood_pressure platelets
## Min. :-0.8462 Min. :-2.034976 Min. :-0.7345 Min. :-2.43607
## 1st Qu.:-0.8462 1st Qu.:-0.683035 1st Qu.:-0.7345 1st Qu.:-0.52000
## Median :-0.8462 Median :-0.007065 Median :-0.7345 Median :-0.01388
## Mean : 0.0000 Mean : 0.000000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 1.1779 3rd Qu.: 0.584409 3rd Qu.: 1.3570 3rd Qu.: 0.41043
## Max. : 1.1779 Max. : 3.541779 Max. : 1.3570 Max. : 5.99812
## serum_creatinine serum_sodium sex smoking
## Min. :-0.864061 Min. :-5.35423 Min. :-1.3570 Min. :-0.6865
## 1st Qu.:-0.477404 1st Qu.:-0.59500 1st Qu.:-1.3570 1st Qu.:-0.6865
## Median :-0.284076 Median : 0.08489 Median : 0.7345 Median :-0.6865
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.005916 3rd Qu.: 0.76478 3rd Qu.: 0.7345 3rd Qu.: 1.4517
## Max. : 7.739045 Max. : 2.57782 Max. : 0.7345 Max. : 1.4517
## time DEATH_EVENT
## Min. :-1.6268 Min. :-0.6865
## 1st Qu.:-0.7378 1st Qu.:-0.6865
## Median :-0.1966 Median :-0.6865
## Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.9372 3rd Qu.: 1.4517
## Max. : 1.9937 Max. : 1.4517
# Normalize Data (re-scale every column to [0, 1])
# Estimate the per-column ranges from the dataset.
# NOTE(review): columns 1:13 include the label DEATH_EVENT -- confirm this is
# intended.
preprocessParams <- preProcess(x = hf[, 1:13], method = "range")
# Summarize the transform parameters.
print(preprocessParams)
## Created from 299 samples and 13 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (13)
# Apply the stored parameters to the dataset.
transformed <- predict(object = preprocessParams, newdata = hf[, 1:13])
# Summarize the transformed dataset.
summary(transformed)
## age anaemia creatinine_phosphokinase diabetes
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.2000 1st Qu.:0.0000 1st Qu.:0.01193 1st Qu.:0.0000
## Median :0.3636 Median :0.0000 Median :0.02896 Median :0.0000
## Mean :0.3788 Mean :0.4314 Mean :0.07130 Mean :0.4181
## 3rd Qu.:0.5455 3rd Qu.:1.0000 3rd Qu.:0.07132 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## ejection_fraction high_blood_pressure platelets serum_creatinine
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.2424 1st Qu.:0.0000 1st Qu.:0.2272 1st Qu.:0.04494
## Median :0.3636 Median :0.0000 Median :0.2872 Median :0.06742
## Mean :0.3649 Mean :0.3512 Mean :0.2888 Mean :0.10044
## 3rd Qu.:0.4697 3rd Qu.:1.0000 3rd Qu.:0.3375 3rd Qu.:0.10112
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## serum_sodium sex smoking time
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.2456
## Median :0.6857 Median :1.0000 Median :0.0000 Median :0.3950
## Mean :0.6750 Mean :0.6488 Mean :0.3211 Mean :0.4493
## 3rd Qu.:0.7714 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.7082
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## DEATH_EVENT
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3211
## 3rd Qu.:1.0000
## Max. :1.0000
#REGRESSION TREE
# (method = "class" below means rpart fits a classification tree.)
# Set the seed for reproducibility.
set.seed(200)
# Position-based split: the first 239 rows train, the last 60 rows test.
# NOTE(review): the rows look ordered by `time` (see the head() output), so
# this split is not random -- confirm it is intended. `testing_data` is not
# used in the part of the script visible here.
train_indices <- seq_len(239)
test_indices <- 240:299
# Create the training and testing datasets.
training_data <- hf[train_indices, ]
testing_data <- hf[test_indices, ]
# Train the learner on the training dataset and keep the model in `fit.rg`.
fit.rg <- rpart(
  formula = DEATH_EVENT ~ age + ejection_fraction + serum_creatinine +
    serum_sodium + time,
  data = training_data,
  method = "class"
)
plot(fit.rg)

# Draw the same tree into regression_tree.png, then close the PNG device.
png(filename = "regression_tree.png")
plot(fit.rg)
dev.off()
## png
## 2
# Draw a more readable version of the tree.
fancyRpartPlot(model = fit.rg)

# Examine the tree: CP table, variable importance and per-node details.
summary(object = fit.rg)
## Call:
## rpart(formula = DEATH_EVENT ~ age + ejection_fraction + serum_creatinine +
## serum_sodium + time, data = training_data, method = "class")
## n= 239
##
## CP nsplit rel error xerror xstd
## 1 0.53763441 0 1.0000000 1.0000000 0.08104682
## 2 0.05376344 1 0.4623656 0.5053763 0.06607203
## 3 0.03225806 2 0.4086022 0.5376344 0.06761359
## 4 0.01075269 3 0.3763441 0.5268817 0.06711088
## 5 0.01000000 4 0.3655914 0.5376344 0.06761359
##
## Variable importance
## time serum_creatinine ejection_fraction age
## 62 24 6 6
## serum_sodium
## 2
##
## Node number 1: 239 observations, complexity param=0.5376344
## predicted class=0 expected loss=0.3891213 P(node) =1
## class counts: 146 93
## probabilities: 0.611 0.389
## left son=2 (169 obs) right son=3 (70 obs)
## Primary splits:
## time < 67.5 to the right, improve=43.368150, (0 missing)
## serum_creatinine < 1.55 to the left, improve=18.100590, (0 missing)
## ejection_fraction < 22.5 to the right, improve=10.911660, (0 missing)
## serum_sodium < 135.5 to the right, improve= 7.373314, (0 missing)
## age < 79.5 to the left, improve= 6.949012, (0 missing)
## Surrogate splits:
## age < 86.5 to the left, agree=0.732, adj=0.086, (0 split)
## serum_creatinine < 1.815 to the left, agree=0.732, adj=0.086, (0 split)
## ejection_fraction < 22.5 to the right, agree=0.724, adj=0.057, (0 split)
## serum_sodium < 122.5 to the right, agree=0.711, adj=0.014, (0 split)
##
## Node number 2: 169 observations, complexity param=0.05376344
## predicted class=0 expected loss=0.1952663 P(node) =0.707113
## class counts: 136 33
## probabilities: 0.805 0.195
## left son=4 (138 obs) right son=5 (31 obs)
## Primary splits:
## serum_creatinine < 1.55 to the left, improve=11.276520, (0 missing)
## ejection_fraction < 32.5 to the right, improve= 6.255623, (0 missing)
## serum_sodium < 135.5 to the right, improve= 4.114378, (0 missing)
## age < 71 to the left, improve= 3.057750, (0 missing)
## time < 182.5 to the right, improve= 1.205818, (0 missing)
## Surrogate splits:
## serum_sodium < 124.5 to the right, agree=0.828, adj=0.065, (0 split)
##
## Node number 3: 70 observations
## predicted class=1 expected loss=0.1428571 P(node) =0.292887
## class counts: 10 60
## probabilities: 0.143 0.857
##
## Node number 4: 138 observations
## predicted class=0 expected loss=0.1086957 P(node) =0.5774059
## class counts: 123 15
## probabilities: 0.891 0.109
##
## Node number 5: 31 observations, complexity param=0.03225806
## predicted class=1 expected loss=0.4193548 P(node) =0.1297071
## class counts: 13 18
## probabilities: 0.419 0.581
## left son=10 (7 obs) right son=11 (24 obs)
## Primary splits:
## serum_creatinine < 2.85 to the right, improve=1.5729650, (0 missing)
## time < 109.5 to the left, improve=1.1808080, (0 missing)
## ejection_fraction < 32.5 to the right, improve=1.1808080, (0 missing)
## serum_sodium < 135.5 to the right, improve=1.0529150, (0 missing)
## age < 71 to the left, improve=0.7331378, (0 missing)
## Surrogate splits:
## ejection_fraction < 55 to the right, agree=0.839, adj=0.286, (0 split)
##
## Node number 10: 7 observations
## predicted class=0 expected loss=0.2857143 P(node) =0.0292887
## class counts: 5 2
## probabilities: 0.714 0.286
##
## Node number 11: 24 observations, complexity param=0.01075269
## predicted class=1 expected loss=0.3333333 P(node) =0.1004184
## class counts: 8 16
## probabilities: 0.333 0.667
## left son=22 (9 obs) right son=23 (15 obs)
## Primary splits:
## ejection_fraction < 32.5 to the right, improve=1.4222220, (0 missing)
## time < 92.5 to the left, improve=1.1204480, (0 missing)
## serum_sodium < 135.5 to the right, improve=0.6666667, (0 missing)
## age < 71 to the left, improve=0.6095238, (0 missing)
## serum_creatinine < 1.815 to the left, improve=0.3333333, (0 missing)
## Surrogate splits:
## age < 74 to the right, agree=0.792, adj=0.444, (0 split)
## serum_creatinine < 1.75 to the right, agree=0.667, adj=0.111, (0 split)
## serum_sodium < 141.5 to the right, agree=0.667, adj=0.111, (0 split)
## time < 80 to the left, agree=0.667, adj=0.111, (0 split)
##
## Node number 22: 9 observations
## predicted class=0 expected loss=0.4444444 P(node) =0.0376569
## class counts: 5 4
## probabilities: 0.556 0.444
##
## Node number 23: 15 observations
## predicted class=1 expected loss=0.2 P(node) =0.06276151
## class counts: 3 12
## probabilities: 0.200 0.800
#KNN
# We must split the dataset in order to evaluate how good our classification is:
#   - the training dataset is what we use to build the kNN model,
#   - the test dataset lets us determine how well our model performs.
# NOTE(review): the split is positional (rows 1-239 vs. 240-299); the test rows
# contain very few death events -- consider a stratified random split.

# Predictor columns only. The label DEATH_EVENT must NOT be part of the kNN
# inputs; previously it was left in both sets, so the distance computation
# could see the very value it was supposed to predict.
knn_features <- setdiff(names(hf), "DEATH_EVENT")
hf_knn_train <- hf[1:239, knn_features]
hf_knn_test <- hf[240:299, knn_features]

# kNN is distance based, so re-scale every predictor to [0, 1]; otherwise
# large-valued columns (e.g. platelets) dominate the Euclidean distance.
# The ranges are estimated on the training rows only and then applied to both
# sets, so no information from the test rows leaks into the model.
knn_range_params <- preProcess(hf_knn_train, method = "range")
hf_knn_train <- predict(knn_range_params, hf_knn_train)
hf_knn_test <- predict(knn_range_params, hf_knn_test)
head(hf_knn_train)
head(hf_knn_test)
# NOTE: the console output previously recorded in this section for the
# feature sets, the predictions and the cross table came from the model that
# still had DEATH_EVENT among its inputs; re-run the script to regenerate it.

# The death event label is excluded from the training and test feature sets,
# but we need it for training and evaluating the kNN model, so store it
# separately (one-column tibbles).
hf_knn_train_labels <- hf[1:239, "DEATH_EVENT"]
hf_knn_test_labels <- hf[240:299, "DEATH_EVENT"]
head(hf_knn_test_labels)
## # A tibble: 6 × 1
## DEATH_EVENT
## <dbl>
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
# Rule of thumb: take k close to the square root of the number of training rows.
sqrt(239)
## [1] 15.45962
# Begin training a model on the data.
# To classify the test instances we use knn() from the class package.
hf_knn_test_pred <- knn(
  train = hf_knn_train,
  test = hf_knn_test,
  cl = hf_knn_train_labels$DEATH_EVENT,
  k = 15
)
hf_knn_test_pred
# Test the model: cross-tabulate actual (rows) vs. predicted (columns) classes.
CrossTable(x = hf_knn_test_labels$DEATH_EVENT, y = hf_knn_test_pred, prop.chisq = FALSE)
# This is called a confusion matrix. With 1 (death) as the positive class:
#   actual 0 / predicted 0 = true negatives  (TN)
#   actual 0 / predicted 1 = false positives (FP)
#   actual 1 / predicted 0 = false negatives (FN)
#   actual 1 / predicted 1 = true positives  (TP)
#SVM
# Position-based split of the imported data: rows 1-239 train, 240-299 test.
heartfailure_train <- heartFailure[seq_len(239), ]
heartfailure_test <- heartFailure[240:299, ]
# Store the training outcome as a factor before fitting.
heartfailure_train[["DEATH_EVENT"]] <- factor(heartfailure_train[["DEATH_EVENT"]])
# SVM with a linear ("vanilladot") kernel using all other columns as predictors.
heartfailure_classifier <- ksvm(
  DEATH_EVENT ~ .,
  data = heartfailure_train,
  kernel = "vanilladot"
)
## Setting default kernel parameters
heartfailure_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 115
##
## Objective Function Value : -108.2006
## Training error : 0.171548
# Predict the class of every test row with the linear-kernel model.
hf_predict <- predict(heartfailure_classifier, heartfailure_test)
hf_predict
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
head(hf_predict)
## [1] 0 0 0 0 0 0
## Levels: 0 1
# Accuracy of the linear kernel: TRUE where the prediction equals the
# recorded outcome of the test row.
agreement <- hf_predict == heartfailure_test$DEATH_EVENT
agreement
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [25] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
# Counts of wrong / right predictions, then the same as proportions.
agreement_counts <- table(agreement)
agreement_counts
## agreement
## FALSE TRUE
## 3 57
prop.table(agreement_counts)
## agreement
## FALSE TRUE
## 0.05 0.95
# Confusion table: rows = predicted class, columns = actual class.
table(hf_predict, heartfailure_test$DEATH_EVENT)
##
## hf_predict 0 1
## 0 57 3
## 1 0 0
# SVM with a Gaussian radial basis ("rbfdot") kernel on the same training data.
heartfailure_classifier_rbf <- ksvm(
  DEATH_EVENT ~ .,
  data = heartfailure_train,
  kernel = "rbfdot"
)
heartfailure_classifier_rbf
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Gaussian Radial Basis kernel function.
## Hyperparameter : sigma = 0.0562980380456991
##
## Number of Support Vectors : 152
##
## Objective Function Value : -108.7988
## Training error : 0.133891
# Predict the class of every test row with the RBF-kernel model.
hf_predict_rbf <- predict(heartfailure_classifier_rbf, heartfailure_test)
hf_predict_rbf
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [39] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## Levels: 0 1
head(hf_predict_rbf)
## [1] 0 0 0 0 0 0
## Levels: 0 1
# Confusion table: rows = predicted class, columns = actual class.
table(hf_predict_rbf, heartfailure_test$DEATH_EVENT)
##
## hf_predict_rbf 0 1
## 0 56 3
## 1 1 0
# Accuracy of the RBF kernel: TRUE where the prediction equals the recorded
# outcome of the test row.
agreement_rbf <- hf_predict_rbf == heartfailure_test$DEATH_EVENT
agreement_rbf
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [13] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [25] TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [37] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [49] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
# Counts of wrong / right predictions, then the same as proportions.
agreement_rbf_counts <- table(agreement_rbf)
agreement_rbf_counts
## agreement_rbf
## FALSE TRUE
## 4 56
prop.table(agreement_rbf_counts)
## agreement_rbf
## FALSE TRUE
## 0.06666667 0.93333333
# DECISION TREE
# The outcome column of the training set is (re)built as a factor before the
# tree is fitted.
heartfailure_train[["DEATH_EVENT"]] <- factor(heartfailure_train[["DEATH_EVENT"]])
# Inspect the structure to confirm DEATH_EVENT is now a factor.
str(heartfailure_train)
## tibble [239 × 13] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:239] 75 55 65 50 65 90 75 60 65 80 ...
## $ anaemia : num [1:239] 0 0 0 1 1 1 1 1 0 1 ...
## $ creatinine_phosphokinase: num [1:239] 582 7861 146 111 160 ...
## $ diabetes : num [1:239] 0 0 0 0 1 0 0 1 0 0 ...
## $ ejection_fraction : num [1:239] 20 38 20 20 20 40 15 60 65 35 ...
## $ high_blood_pressure : num [1:239] 1 0 0 0 0 1 0 0 0 1 ...
## $ platelets : num [1:239] 265000 263358 162000 210000 327000 ...
## $ serum_creatinine : num [1:239] 1.9 1.1 1.3 1.9 2.7 2.1 1.2 1.1 1.5 9.4 ...
## $ serum_sodium : num [1:239] 130 136 129 137 116 132 137 131 138 133 ...
## $ sex : num [1:239] 1 1 1 1 0 1 1 1 0 1 ...
## $ smoking : num [1:239] 0 0 1 0 0 1 0 1 0 1 ...
## $ time : num [1:239] 4 6 7 7 8 8 10 10 10 10 ...
## $ DEATH_EVENT : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Fit a single C5.0 decision tree. Column 13 (DEATH_EVENT) is dropped from the
# predictor set and supplied separately as the outcome.
heartfailure_model <- C5.0(
  x = heartfailure_train[-13],
  y = heartfailure_train$DEATH_EVENT
)
# Print the fitted tree, its training-set evaluation and attribute usage.
summary(heartfailure_model)
##
## Call:
## C5.0.default(x = heartfailure_train[-13], y = heartfailure_train$DEATH_EVENT)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue May 7 18:34:17 2024
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 239 cases (13 attributes) from undefined.data
##
## Decision tree:
##
## time <= 67: 1 (70/10)
## time > 67:
## :...serum_creatinine > 1.5:
## :...creatinine_phosphokinase <= 62: 0 (4)
## : creatinine_phosphokinase > 62:
## : :...high_blood_pressure > 0: 1 (8)
## : high_blood_pressure <= 0:
## : :...diabetes <= 0:
## : :...platelets <= 277000: 1 (7)
## : : platelets > 277000: 0 (3)
## : diabetes > 0:
## : :...sex <= 0: 1 (2)
## : sex > 0: 0 (7/1)
## serum_creatinine <= 1.5:
## :...age > 79:
## :...serum_sodium <= 135: 1 (5/1)
## : serum_sodium > 135: 0 (3)
## age <= 79:
## :...ejection_fraction > 30: 0 (99/3)
## ejection_fraction <= 30:
## :...time <= 78: 1 (3)
## time > 78:
## :...high_blood_pressure > 0: 0 (8)
## high_blood_pressure <= 0:
## :...diabetes <= 0: 0 (12/1)
## diabetes > 0:
## :...serum_creatinine <= 0.9: 0 (2)
## serum_creatinine > 0.9:
## :...time <= 175: 1 (4)
## time > 175: 0 (2)
##
##
## Evaluation on training data (239 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 16 16( 6.7%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 135 11 (a): class 0
## 5 88 (b): class 1
##
##
## Attribute usage:
##
## 100.00% time
## 70.71% serum_creatinine
## 57.74% age
## 54.39% ejection_fraction
## 23.01% high_blood_pressure
## 16.32% diabetes
## 12.97% creatinine_phosphokinase
## 4.18% platelets
## 3.77% sex
## 3.35% serum_sodium
##
##
## Time: 0.0 secs
# Predict the test rows with the single C5.0 tree and echo the predicted classes.
heartfailure_pred <- predict(heartfailure_model, heartfailure_test)
heartfailure_pred
## [1] 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## [39] 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
# Cross-tabulate observed against predicted outcome; only counts and
# table-total proportions are requested.
library(gmodels)
CrossTable(
  heartfailure_test$DEATH_EVENT, heartfailure_pred,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual death", "predicted death")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 60
##
##
## | predicted death
## actual death | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 53 | 4 | 57 |
## | 0.883 | 0.067 | |
## -------------|-----------|-----------|-----------|
## 1 | 2 | 1 | 3 |
## | 0.033 | 0.017 | |
## -------------|-----------|-----------|-----------|
## Column Total | 55 | 5 | 60 |
## -------------|-----------|-----------|-----------|
##
##
# Boost the C5.0 tree over 10 trials and analyze the outcome.
# The argument is spelled out in full as `trials`: the abbreviated `trial`
# only reached the right parameter through R's partial argument matching.
heartfailure_boost <- C5.0(heartfailure_train[-13], heartfailure_train$DEATH_EVENT,
                           trials = 10)
# Print every boosting trial's tree plus the ensemble's training-set evaluation.
summary(heartfailure_boost)
##
## Call:
## C5.0.default(x = heartfailure_train[-13], y =
## heartfailure_train$DEATH_EVENT, trials = 10)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue May 7 18:34:17 2024
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 239 cases (13 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## time <= 67: 1 (70/10)
## time > 67:
## :...serum_creatinine > 1.5:
## :...creatinine_phosphokinase <= 62: 0 (4)
## : creatinine_phosphokinase > 62:
## : :...high_blood_pressure > 0: 1 (8)
## : high_blood_pressure <= 0:
## : :...diabetes <= 0:
## : :...platelets <= 277000: 1 (7)
## : : platelets > 277000: 0 (3)
## : diabetes > 0:
## : :...sex <= 0: 1 (2)
## : sex > 0: 0 (7/1)
## serum_creatinine <= 1.5:
## :...age > 79:
## :...serum_sodium <= 135: 1 (5/1)
## : serum_sodium > 135: 0 (3)
## age <= 79:
## :...ejection_fraction > 30: 0 (99/3)
## ejection_fraction <= 30:
## :...time <= 78: 1 (3)
## time > 78:
## :...high_blood_pressure > 0: 0 (8)
## high_blood_pressure <= 0:
## :...diabetes <= 0: 0 (12/1)
## diabetes > 0:
## :...serum_creatinine <= 0.9: 0 (2)
## serum_creatinine > 0.9:
## :...time <= 175: 1 (4)
## time > 175: 0 (2)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## ejection_fraction <= 20: 1 (20.4/2.3)
## ejection_fraction > 20:
## :...time > 50:
## :...ejection_fraction <= 30:
## : :...anaemia <= 0: 0 (18.4/5.4)
## : : anaemia > 0: 1 (18.8/6.9)
## : ejection_fraction > 30:
## : :...platelets <= 153000: 1 (14.6/5.4)
## : platelets > 153000: 0 (109.9/12.7)
## time <= 50:
## :...smoking > 0: 1 (10.8)
## smoking <= 0:
## :...serum_sodium > 139: 1 (6.1)
## serum_sodium <= 139:
## :...age <= 73: 0 (34.6/9.2)
## age > 73: 1 (5.4)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## time <= 11: 1 (12.3)
## time > 11:
## :...serum_creatinine > 1.5:
## :...serum_sodium <= 135:
## : :...creatinine_phosphokinase <= 607: 1 (30.6/1.9)
## : : creatinine_phosphokinase > 607: 0 (2.8)
## : serum_sodium > 135:
## : :...sex <= 0: 1 (7.4/1.9)
## : sex > 0: 0 (13/2.8)
## serum_creatinine <= 1.5:
## :...time > 172: 0 (31.2)
## time <= 172:
## :...time > 148: 1 (10.2/0.6)
## time <= 148:
## :...age > 67:
## :...time <= 115: 1 (30.6/8.3)
## : time > 115: 0 (7.7/1.5)
## age <= 67:
## :...creatinine_phosphokinase > 2281: 1 (7.7/1.2)
## creatinine_phosphokinase <= 2281:
## :...serum_sodium <= 131: 1 (4.3/0.6)
## serum_sodium > 131:
## :...time > 78: 0 (44.1)
## time <= 78:
## :...serum_sodium <= 140: 0 (34.3/4.9)
## serum_sodium > 140: 1 (2.8)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## ejection_fraction <= 20:
## :...time <= 140: 1 (17)
## : time > 140: 0 (4.6/1)
## ejection_fraction > 20:
## :...time > 50:
## :...serum_creatinine <= 1.18:
## : :...smoking > 0: 0 (38.4/2.2)
## : : smoking <= 0:
## : : :...creatinine_phosphokinase > 2281: 1 (6.2/1.2)
## : : creatinine_phosphokinase <= 2281:
## : : :...serum_creatinine <= 0.6: 1 (2.6)
## : : serum_creatinine > 0.6: 0 (45.1/2.9)
## : serum_creatinine > 1.18:
## : :...creatinine_phosphokinase <= 75: 0 (14.9)
## : creatinine_phosphokinase > 75:
## : :...sex <= 0: 1 (20.9/4.8)
## : sex > 0:
## : :...diabetes <= 0: 1 (19.7/7.2)
## : diabetes > 0: 0 (14.6/2.9)
## time <= 50:
## :...smoking > 0: 1 (11.8)
## smoking <= 0:
## :...serum_sodium > 139: 1 (7.2)
## serum_sodium <= 139:
## :...serum_sodium <= 133: 1 (4.6)
## serum_sodium > 133:
## :...time <= 11: 1 (4.8)
## time > 11:
## :...time <= 38: 0 (24.2/6)
## time > 38: 1 (2.4)
##
## ----- Trial 4: -----
##
## Decision tree:
##
## time > 73:
## :...serum_creatinine > 1.4:
## : :...ejection_fraction <= 45: 1 (30.4/10.8)
## : : ejection_fraction > 45: 0 (9.5/0.4)
## : serum_creatinine <= 1.4:
## : :...time > 172: 0 (28.9)
## : time <= 172:
## : :...time <= 148: 0 (71.3/13.8)
## : time > 148: 1 (10.2/1.7)
## time <= 73:
## :...age > 73: 1 (17.5)
## age <= 73:
## :...ejection_fraction <= 20: 1 (9.8)
## ejection_fraction > 20:
## :...serum_sodium > 139: 1 (11.3/0.4)
## serum_sodium <= 139:
## :...serum_sodium > 136: 0 (25.2/6.8)
## serum_sodium <= 136:
## :...age <= 70: 1 (21.1/2.1)
## age > 70: 0 (3.9)
##
## ----- Trial 5: -----
##
## Decision tree:
##
## time <= 67:
## :...platelets <= 213000: 1 (20.3)
## : platelets > 213000:
## : :...platelets <= 224000: 0 (7.1/0.7)
## : platelets > 224000:
## : :...creatinine_phosphokinase <= 109: 0 (12.6/4.1)
## : creatinine_phosphokinase > 109: 1 (38.4/4.9)
## time > 67:
## :...ejection_fraction > 38:
## :...diabetes > 0: 0 (14.9)
## : diabetes <= 0:
## : :...creatinine_phosphokinase <= 124: 0 (15.6)
## : creatinine_phosphokinase > 124:
## : :...serum_creatinine > 1.6: 1 (5.2)
## : serum_creatinine <= 1.6:
## : :...creatinine_phosphokinase <= 145: 1 (7.5/0.6)
## : creatinine_phosphokinase > 145: 0 (24.3/1.6)
## ejection_fraction <= 38:
## :...platelets <= 87000: 1 (5.9)
## platelets > 87000:
## :...serum_creatinine > 2.9: 0 (6.8)
## serum_creatinine <= 2.9:
## :...ejection_fraction > 35: 1 (13.9/4.6)
## ejection_fraction <= 35:
## :...time <= 78: 1 (8/0.7)
## time > 78:
## :...ejection_fraction > 30: 0 (17.5)
## ejection_fraction <= 30:
## :...time <= 110: 0 (15.3/0.9)
## time > 110:
## :...time <= 198: 1 (21.4/6.9)
## time > 198: 0 (4.2)
##
## ----- Trial 6: -----
##
## Decision tree:
##
## time <= 67:
## :...age > 73: 1 (13.9)
## : age <= 73:
## : :...high_blood_pressure > 0: 1 (28.7/5)
## : high_blood_pressure <= 0:
## : :...time <= 16: 1 (6)
## : time > 16:
## : :...serum_sodium > 139: 1 (5)
## : serum_sodium <= 139:
## : :...serum_sodium <= 136: 1 (9.2/2.7)
## : serum_sodium > 136: 0 (12.9)
## time > 67:
## :...sex <= 0:
## :...creatinine_phosphokinase <= 122: 0 (16.2/0.2)
## : creatinine_phosphokinase > 122:
## : :...serum_creatinine > 1.2: 1 (16.1/2.3)
## : serum_creatinine <= 1.2:
## : :...platelets <= 153000: 1 (6.4/0.6)
## : platelets > 153000: 0 (19.1/2.5)
## sex > 0:
## :...high_blood_pressure > 0: 0 (26.2/2.8)
## high_blood_pressure <= 0:
## :...age <= 55: 0 (25.7/1.9)
## age > 55:
## :...ejection_fraction <= 20: 1 (3.7)
## ejection_fraction > 20:
## :...platelets <= 201000: 0 (14.7)
## platelets > 201000:
## :...platelets <= 228000: 1 (6.4)
## platelets > 228000: 0 (28.7/9.2)
##
## ----- Trial 7: -----
##
## Decision tree:
##
## time > 180: 0 (29.9/2.6)
## time <= 180:
## :...serum_creatinine <= 0.9: 0 (38.4/8.4)
## serum_creatinine > 0.9:
## :...ejection_fraction <= 20: 1 (11.6)
## ejection_fraction > 20:
## :...age > 73:
## :...serum_sodium <= 132: 1 (11.7)
## : serum_sodium > 132:
## : :...creatinine_phosphokinase > 855: 0 (3.6)
## : creatinine_phosphokinase <= 855:
## : :...platelets <= 334000: 1 (22.2/2.5)
## : platelets > 334000: 0 (3.8/0.2)
## age <= 73:
## :...platelets > 328000:
## :...time <= 20: 0 (4/0.2)
## : time > 20: 1 (24.5/1.7)
## platelets <= 328000:
## :...time > 82:
## :...time <= 148: 0 (30.8/0.9)
## : time > 148: 1 (9.8/3.8)
## time <= 82:
## :...platelets <= 217000: 1 (14.4/1.1)
## platelets > 217000:
## :...serum_sodium > 139: 1 (7/0.2)
## serum_sodium <= 139:
## :...time <= 13: 1 (3.4)
## time > 13: 0 (23.8/3.5)
##
## ----- Trial 8: -----
##
## Decision tree:
##
## time <= 73:
## :...serum_sodium <= 133: 1 (17.6)
## : serum_sodium > 133:
## : :...ejection_fraction > 45: 0 (17.5/5.9)
## : ejection_fraction <= 45:
## : :...serum_sodium <= 136: 1 (14.1)
## : serum_sodium > 136:
## : :...sex <= 0: 0 (11.5/3.6)
## : sex > 0:
## : :...serum_creatinine <= 2.1: 1 (21.5/3.7)
## : serum_creatinine > 2.1: 0 (2.6)
## time > 73:
## :...platelets <= 126000: 1 (9.6/0.7)
## platelets > 126000:
## :...creatinine_phosphokinase <= 72: 0 (13.7)
## creatinine_phosphokinase > 72:
## :...serum_creatinine > 1.4: 1 (36.8/12)
## serum_creatinine <= 1.4:
## :...time > 172: 0 (20.5)
## time <= 172:
## :...age > 77: 1 (12.5/2.8)
## age <= 77:
## :...time > 148: 1 (11.6/3.2)
## time <= 148:
## :...platelets <= 300000: 0 (27.5)
## platelets > 300000:
## :...smoking <= 0: 1 (13.9/4.4)
## smoking > 0: 0 (8.1)
##
## ----- Trial 9: -----
##
## Decision tree:
##
## time <= 50:
## :...smoking > 0: 1 (13.8)
## : smoking <= 0:
## : :...serum_sodium > 138: 1 (12.6)
## : serum_sodium <= 138:
## : :...platelets <= 362000: 1 (33.3/8.4)
## : platelets > 362000: 0 (5.8/0.1)
## time > 50:
## :...ejection_fraction > 30:
## :...serum_sodium > 135: 0 (64.7/7.9)
## : serum_sodium <= 135:
## : :...anaemia > 0: 0 (14.5)
## : anaemia <= 0:
## : :...time <= 172: 1 (23/3.7)
## : time > 172: 0 (7.3)
## ejection_fraction <= 30:
## :...time <= 78: 1 (11.5)
## time > 78:
## :...serum_sodium > 139: 0 (12.5/0.3)
## serum_sodium <= 139:
## :...serum_creatinine <= 1: 0 (9.7/0.5)
## serum_creatinine > 1:
## :...time <= 109: 0 (9.3/2.4)
## time > 109: 1 (21.1/3.2)
##
##
## Evaluation on training data (239 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 16 16( 6.7%)
## 1 9 50(20.9%)
## 2 14 31(13.0%)
## 3 16 34(14.2%)
## 4 11 29(12.1%)
## 5 17 34(14.2%)
## 6 16 18( 7.5%)
## 7 15 36(15.1%)
## 8 15 43(18.0%)
## 9 13 27(11.3%)
## boost 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 146 (a): class 0
## 93 (b): class 1
##
##
## Attribute usage:
##
## 100.00% ejection_fraction
## 100.00% platelets
## 100.00% serum_creatinine
## 100.00% serum_sodium
## 100.00% time
## 97.07% age
## 95.40% creatinine_phosphokinase
## 83.68% sex
## 75.31% high_blood_pressure
## 70.71% smoking
## 50.21% diabetes
## 33.05% anaemia
##
##
## Time: 0.0 secs
# Predict the test rows with the boosted model, then cross-tabulate observed
# against predicted outcome (counts and table-total proportions only).
heartfailureboost_pred <- predict(heartfailure_boost, heartfailure_test)
CrossTable(
  heartfailure_test$DEATH_EVENT, heartfailureboost_pred,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual death", "predicted death")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 60
##
##
## | predicted death
## actual death | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 54 | 3 | 57 |
## | 0.900 | 0.050 | |
## -------------|-----------|-----------|-----------|
## 1 | 3 | 0 | 3 |
## | 0.050 | 0.000 | |
## -------------|-----------|-----------|-----------|
## Column Total | 57 | 3 | 60 |
## -------------|-----------|-----------|-----------|
##
##
# Build the error cost matrix that is fed to C5.0() through `costs`.
# Both dimensions are labelled with the DEATH_EVENT factor levels ("0", "1") so
# the labels agree with the outcome the model is trained on, and the same list
# is reused as the matrix dimnames instead of being defined and then ignored.
matrix_dimensions <- list(predicted = c("0", "1"), actual = c("0", "1"))
matrix_dimensions
## $predicted
## [1] "0" "1"
##
## $actual
## [1] "0" "1"
# Define the cost matrix. matrix() fills column by column, so predicting "1"
# for an actual "0" costs 1, while predicting "0" for an actual "1" (a missed
# death) costs 4; correct predictions cost 0.
error_cost <- matrix(c(0, 1, 4, 0), nrow = 2, dimnames = matrix_dimensions)
error_cost
## actual
## predicted 0 1
## 0 0 4
## 1 1 0
# Fit C5.0 with the error cost matrix, predict the test rows and cross-tabulate
# observed against predicted outcome.
heartfailure_cost <- C5.0(heartfailure_train[-13], heartfailure_train$DEATH_EVENT,
                          costs = error_cost)
heartfailure_cost_pred <- predict(heartfailure_cost, heartfailure_test)
# The table is labelled "death" like the other CrossTable calls in this script;
# the earlier "default" labels did not describe the DEATH_EVENT outcome.
CrossTable(heartfailure_test$DEATH_EVENT, heartfailure_cost_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c("actual death", "predicted death"))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 60
##
##
## | predicted death
## actual death | 0 | 1 | Row Total |
## -------------|-----------|-----------|-----------|
## 0 | 48 | 9 | 57 |
## | 0.800 | 0.150 | |
## -------------|-----------|-----------|-----------|
## 1 | 2 | 1 | 3 |
## | 0.033 | 0.017 | |
## -------------|-----------|-----------|-----------|
## Column Total | 50 | 10 | 60 |
## -------------|-----------|-----------|-----------|
##
##
# Comparing performance
# 3. Evaluation
# NOTE(review): the confusion matrices below contain only 3 positive cases among
# the 60 evaluated rows, so accuracy alone says little about how well deaths
# are detected; read each accuracy together with its confusion matrix.
# Regression Tree Evaluation
rg_pred <- predict(fit.rg, testing_data, type = "class")
rg_accuracy <- mean(rg_pred == testing_data$DEATH_EVENT)
rg_conf_matrix <- table(Actual = testing_data$DEATH_EVENT, Predicted = rg_pred)
# KNN Evaluation
# Extract the DEATH_EVENT column from hf_knn_test to use as reference labels
hf_knn_test_labels <- hf_knn_test$DEATH_EVENT
# Now compare hf_knn_test_pred with hf_knn_test_labels
knn_accuracy <- mean(hf_knn_test_pred == hf_knn_test_labels)
knn_conf_matrix <- table(Actual = hf_knn_test_labels, Predicted = hf_knn_test_pred)
# The SVM and C5.0 predictions were generated from heartfailure_test, so they
# are scored against that data frame's own DEATH_EVENT column.
# SVM Kernel Evaluation
svm_accuracy <- mean(hf_predict == heartfailure_test$DEATH_EVENT)
svm_conf_matrix <- table(Actual = heartfailure_test$DEATH_EVENT, Predicted = hf_predict)
# SVM rbf Evaluation
# The RBF model gets its own confusion matrix built from hf_predict_rbf; the
# previous code rebuilt svm_conf_matrix from the linear predictions here.
svm_accuracy_rbf <- mean(hf_predict_rbf == heartfailure_test$DEATH_EVENT)
svm_conf_matrix_rbf <- table(Actual = heartfailure_test$DEATH_EVENT, Predicted = hf_predict_rbf)
# Boosted Decision Tree Evaluation
dt_accuracy <- mean(heartfailureboost_pred == heartfailure_test$DEATH_EVENT)
dt_conf_matrix <- table(Actual = heartfailure_test$DEATH_EVENT, Predicted = heartfailureboost_pred)
# Cost-sensitive Decision Tree Evaluation
dt_accuracy_cost <- mean(heartfailure_cost_pred == heartfailure_test$DEATH_EVENT)
dt_conf_matrix_cost <- table(Actual = heartfailure_test$DEATH_EVENT, Predicted = heartfailure_cost_pred)
# 4. Comparison
# Print evaluation metrics for each model
cat("Regression Tree Accuracy:", rg_accuracy, "\n")
## Regression Tree Accuracy: 0.9333333
cat("KNN Accuracy:", knn_accuracy, "\n")
## KNN Accuracy: 0.8
cat("SVM kernel Accuracy:", svm_accuracy, "\n")
## SVM kernel Accuracy: 0.95
cat("SVM rbf Accuracy:", svm_accuracy_rbf, "\n")
## SVM rbf Accuracy: 0.9333333
# The two tree variants get distinct labels so their lines can be told apart.
cat("Boosted Decision Tree Accuracy:", dt_accuracy, "\n")
## Boosted Decision Tree Accuracy: 0.9
cat("Cost-sensitive Decision Tree Accuracy:", dt_accuracy_cost, "\n")
## Cost-sensitive Decision Tree Accuracy: 0.8166667
# Print confusion matrix for each model
cat("\nRegression Tree Confusion Matrix:\n")
##
## Regression Tree Confusion Matrix:
print(rg_conf_matrix)
## Predicted
## Actual 0 1
## 0 55 2
## 1 2 1
cat("\nKNN Confusion Matrix:\n")
##
## KNN Confusion Matrix:
print(knn_conf_matrix)
## Predicted
## Actual 0 1
## 0 47 10
## 1 2 1
cat("\nSVM kernel Confusion Matrix:\n")
##
## SVM kernel Confusion Matrix:
print(svm_conf_matrix)
## Predicted
## Actual 0 1
## 0 57 0
## 1 3 0
cat("\nSVM rbf Confusion Matrix:\n")
##
## SVM rbf Confusion Matrix:
print(svm_conf_matrix_rbf)
## Predicted
## Actual 0 1
## 0 56 1
## 1 3 0
cat("\nBoosted Decision Tree Confusion Matrix:\n")
##
## Boosted Decision Tree Confusion Matrix:
print(dt_conf_matrix)
## Predicted
## Actual 0 1
## 0 54 3
## 1 3 0
cat("\nCost-sensitive Decision Tree Confusion Matrix:\n")
##
## Cost-sensitive Decision Tree Confusion Matrix:
print(dt_conf_matrix_cost)
## Predicted
## Actual 0 1
## 0 48 9
## 1 2 1